diff other/evalResults.m @ 1:995546d09284
add gensim notebook and matlab scripts
| author | DaveM |
|---|---|
| date | Tue, 24 Jan 2017 17:44:45 +0000 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/other/evalResults.m	Tue Jan 24 17:44:45 2017 +0000
@@ -0,0 +1,174 @@
+
+load('Adobe.mat')
+load('Results1Percent.mat')
+
+%%
+datamap = featuredata(end).IdxVar;
+reduceData = Data(:,datamap);
+reduceLabels = Labels(datamap);
+
+%%
+reduceFeatures = FeatureNames(datamap);
+
+%%
+load('Results1Percent.mat')
+
+%%
+reduceFeatures = featuredata(1).FeatureNamesRanked;
+
+% random sample (with replacement) of row indices, to keep the
+% pairwise-distance computation manageable
+dataToUseSize = 500;
+dataToUse = ceil(rand(dataToUseSize,1)*size(reduceData,1))';
+
+dMap = pdist(reduceData(dataToUse,:));
+clusterMethod = 'ward';
+% 'average'   Unweighted average distance (UPGMA)
+% 'centroid'  Centroid distance (UPGMC), appropriate for Euclidean distances only
+% 'complete'  Furthest distance
+% 'median'    Weighted center of mass distance (WPGMC), appropriate for Euclidean distances only
+% 'single'    Shortest distance
+% 'ward'      Inner squared distance (minimum variance algorithm), appropriate for Euclidean distances only
+% 'weighted'  Weighted average distance (WPGMA)
+
+dl = linkage(dMap, clusterMethod);
+dendrogram(dl)
+% figure; imagesc(squareform(dMap_sp))
+% title('euclidean self similarity');
+
+%%
+incon_sp = inconsistent(dl)
+
+%%
+% Use all data
+
+dMapAll = pdist(reduceData);
+clusterMethod = 'ward';   % see the method list above for alternatives
+
+dl_all = linkage(dMapAll, clusterMethod);
+% [~,T] = dendrogram(dl_all,0)
+
+%%
+% print the file list for each cluster
+
+numClusters = 25;
+fnames = cell(1,numClusters);
+[~,T] = dendrogram(dl_all,numClusters);
+for i = 1:numClusters
+    numFiles = sum(T==i);
+    fnames{i} = Filenames(find(T==i));
+end
+
+%%
+% make a CSV for Weka: one row per file, selected features plus cluster label
+
+feats = reduceData;
+
+% csvOut = mat2cell(feats,ones(size(feats,1),1), ones(size(feats,2),1))
+csvOut = num2cell(feats);
+csvOut = [csvOut, num2cell(T)];
+% size(csvOut)
+% size([FeatureNames(datamap)', {'Class'}])
+csvOut = [[FeatureNames(datamap)', {'Class'}]; csvOut];
+
+%%
+% fnames to CSV (note: this reuses csvOut, overwriting the Weka cells above)
+
+maxLen = size(fnames,2);
+
+for i = 1:maxLen
+    depth = size(fnames{i},1);
+    for ii = 1:depth
+        csvOut(i,ii) = fnames{i}(ii);
+    end
+end
+
+printString = '';
+for i = 1:maxLen
+    printString = [printString ' %s, '];
+end
+
+fid = fopen('junk.csv','w');
+fprintf(fid,[printString '\n'],csvOut{1:end,:});
+% fprintf(fid,'%f, %f, %f\n',c{2:end,:})
+fclose(fid);
+% dlmwrite('test.csv', csvOut, '-append')
+
+%%
+% flat clusters from the full-data linkage, via an inconsistency cutoff
+T = cluster(dl_all,'cutoff',1.3);
+figure; plot(T);
+
+%%
+% sweep over cluster counts
+for k = 2:10
+    T = cluster(dl_all,'maxclust',k);
+    figure; plot(T);
+end
+
+%%
+T = cluster(dl_all,'maxclust',100);
+plot(T)
+
+%%
+median(T)
+
+T = cluster(dl_all,'maxclust',1000);
+median(T)
+
+plot(T)
+csvwrite('dataOutput',reduceData);
+
+% earlier exploration, kept for reference:
+% dMap_euc = pdist(reduceData);
+% dMap_cos = pdist(reduceData,'cos');
+% dMap_cos = pdist(reduceData,'cosine');
+% dl_euc = linkage(dMap_euc);
+% dl_cos = linkage(dMap_cos);
+% % dl_sp
+% dl_sp(10,:)
+% dl_sp(1:10,:)
+% sprintf('%f', dl_sp(1:10,:))
+% dl_sp(1:10,:)
+% format short g
+% dl_sp(1:10,:)
+% plot(dl_sp(:))
+% plot(dl_sp(:,3))
+% incon_sp = inconsistent(dl_sp)
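The script depends on variables loaded from Adobe.mat (Data, Labels, Filenames, FeatureNames) and Results1Percent.mat (featuredata), neither of which is part of this changeset. For readers without those files, here is a minimal, self-contained sketch of the same pdist → linkage → cluster pipeline; the two-group matrix X is a hypothetical stand-in for reduceData, and the calls require the Statistics and Machine Learning Toolbox, as do pdist/linkage/cluster above.

% Minimal sketch of the clustering pipeline in evalResults.m, on
% synthetic data so it runs stand-alone.
X = [randn(50,4); randn(50,4) + 3];   % two well-separated synthetic groups

D = pdist(X);                         % pairwise Euclidean distances
Z = linkage(D, 'ward');               % Ward linkage, as in the script
figure; dendrogram(Z);                % inspect the merge tree

T = cluster(Z, 'maxclust', 2);        % flat clustering at k = 2
tabulate(T)                           % cluster sizes

Swapping 'ward' for another linkage method, or passing 'cosine' to pdist, reproduces the variants left commented out at the end of the script.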