%SOM_DEMO4 Data analysis using the SOM.

% Contributed to SOM Toolbox 2.0, February 11th, 2000 by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 1.0beta juuso 071197
% Version 2.0beta juuso 090200 070600

clf reset;
f0 = gcf;
echo on




clc
% ==========================================================
% SOM_DEMO4 - DATA ANALYSIS USING THE SOM
% ==========================================================

% In this demo, the IRIS data set is analysed using the SOM. First,
% the data is read from an ASCII file (please make sure it can be
% found on the current path), normalized, and a map is trained.
% Since the data also has labels, the map is labelled.

try,
  sD = som_read_data('iris.data');
catch
  echo off

  warning('File ''iris.data'' not found. Using simulated data instead.')

  D = randn(50,4);
  D(:,1) = D(:,1)+5;     D(:,2) = D(:,2)+3.5;
  D(:,3) = D(:,3)/2+1.5; D(:,4) = D(:,4)/2+0.3;
  D2 = randn(100,4); D2(:,2) = sort(D2(:,2));
  D2(:,1) = D2(:,1)+6.5; D2(:,2) = D2(:,2)+2.8;
  D2(:,3) = D2(:,3)+5;   D2(:,4) = D2(:,4)/2+1.5;
  sD = som_data_struct([D; D2],'name','iris (simulated)',...
                       'comp_names',{'SepalL','SepalW','PetalL','PetalW'});
  sD = som_label(sD,'add',[1:50]','Setosa');
  sD = som_label(sD,'add',[51:100]','Versicolor');
  sD = som_label(sD,'add',[101:150]','Virginica');

  echo on
end

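% Below, the 'var' normalization scales each component linearly to zero
% mean and unit variance, SOM_MAKE trains a map with automatically
% selected parameters, and the 'vote' mode of SOM_AUTOLABEL gives each
% map unit the label that is most frequent among the samples mapped to it.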
sD = som_normalize(sD,'var');
sM = som_make(sD);
sM = som_autolabel(sM,sD,'vote');

pause % Strike any key to visualize the map...

clc
% VISUAL INSPECTION OF THE MAP
% ============================

% The first step in the analysis of the map is visual inspection.
% Here are the U-matrix, the component planes and the labels (you may
% need to enlarge the figure in order to make out the labels).

som_show(sM,'umat','all','comp',[1:4],'empty','Labels','norm','d');
som_show_add('label',sM.labels,'textsize',8,'textcolor','r','subplot',6);

% From this first visualization, one can see that:
%  - there are essentially two clusters
%  - PetalL and PetalW are highly correlated
%  - SepalL is somewhat correlated to PetalL and PetalW
%  - one cluster corresponds to the Setosa species and exhibits
%    small petals and short but wide sepals
%  - the other cluster corresponds to Virginica and Versicolor
%    such that Versicolor has smaller leaves (both sepal and
%    petal) than Virginica
%  - inside both clusters, SepalL and SepalW are highly correlated

pause % Strike any key to continue...

% Next, the projection of the data set is investigated. A principal
% component projection is made for the data and applied to the map.
% The map units are assigned colors by spreading a colormap over the
% projection. Distance matrix information is extracted from the
% U-matrix, and it is modified using knowledge of the zero-hit
% (interpolative) units. Finally, three visualizations are shown:
% the color code with clustering information and the number of hits
% in each unit, the projection, and the labels.

echo off

f1=figure;
[Pd,V,me,l] = pcaproj(sD,2); Pm = pcaproj(sM,V,me); % PC-projection
Code = som_colorcode(Pm);                           % color coding
hits = som_hits(sM,sD);                             % hits
U = som_umat(sM);                                   % U-matrix
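% The U-matrix has elements both on the map units and between
% neighboring units; taking every other row and column below keeps
% just the one value associated with each unit.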
Dm = U(1:2:size(U,1),1:2:size(U,2));                % distance matrix
Dm = 1-Dm(:)/max(Dm(:)); Dm(find(hits==0)) = 0;     % clustering info

subplot(1,3,1)
som_cplane(sM,Code,Dm);
hold on
som_grid(sM,'Label',cellstr(int2str(hits)),...
         'Line','none','Marker','none','Labelcolor','k');
hold off
title('Color code')

subplot(1,3,2)
som_grid(sM,'Coord',Pm,'MarkerColor',Code,'Linecolor','k');
hold on, plot(Pd(:,1),Pd(:,2),'k+'), hold off, axis tight, axis equal
title('PC projection')

subplot(1,3,3)
som_cplane(sM,'none')
hold on
som_grid(sM,'Label',sM.labels,'Labelsize',8,...
         'Line','none','Marker','none','Labelcolor','r');
hold off
title('Labels')

echo on

% From these figures one can see that:
%  - the projection confirms the existence of two different clusters
%  - interpolative units seem to divide the Virginica
%    flowers into two classes, the difference being in the size of
%    sepal leaves

pause % Strike any key to continue...

% Finally, perhaps the most informative figure of all: simple
% scatter plots and histograms of all variables. The species
% information is coded as a fifth variable: 1 for Setosa, 2 for
% Versicolor and 3 for Virginica. Original data points are in the
% upper triangle, map prototype values in the lower triangle, and
% histograms on the diagonal: black for the data set and red for
% the map prototype values. The color coding of the data samples
% has been copied from the map (from the BMU of each sample). Note
% that the variable values have been denormalized.

echo off

% denormalize and add species information
names = sD.comp_names; names{end+1} = 'species';
D = som_denormalize(sD.data,sD); dlen = size(D,1);
s = zeros(dlen,1)+NaN; s(strcmp(sD.labels,'Setosa'))=1;
s(strcmp(sD.labels,'Versicolor'))=2; s(strcmp(sD.labels,'Virginica'))=3;
D = [D, s];
M = som_denormalize(sM.codebook,sM); munits = size(M,1);
s = zeros(munits,1)+NaN; s(strcmp(sM.labels,'Setosa'))=1;
s(strcmp(sM.labels,'Versicolor'))=2; s(strcmp(sM.labels,'Virginica'))=3;
M = [M, s];

f2=figure;

% color coding copied from the map
bmus = som_bmus(sM,sD); Code_data = Code(bmus,:);

k=1;
for i=1:5,
  for j=1:5,
    if i<j, i1=i; i2=j; else i1=j; i2=i; end
    subplot(5,5,k); cla
    if i<j,
      som_grid('rect',[dlen 1],'coord',D(:,[i1 i2]),...
               'Line','none','MarkerColor',Code_data,'Markersize',2);
      title(sprintf('%s vs. %s',names{i1},names{i2}))
    elseif i>j,
      som_grid(sM,'coord',M(:,[i1 i2]),...
               'markersize',2,'MarkerColor',Code);
      title(sprintf('%s vs. %s',names{i1},names{i2}))
    else
      if i1<5, b = 10; else b = 3; end
      [nd,x] = hist(D(:,i1),b); nd=nd/sum(nd);
      nm = hist(M(:,i1),x); nm = nm/sum(nm);
      h=bar(x,nd,0.8); set(h,'EdgeColor','none','FaceColor','k');
      hold on
      h=bar(x,nm,0.3); set(h,'EdgeColor','none','FaceColor','r');
      hold off
      title(names{i1})
    end
    k=k+1;
  end
end

echo on

% This visualization shows quite a lot of information: the
% distributions of single variables and of pairs of variables, both
% in the data and on the map. If the number of variables were even
% slightly higher, a really big display would be needed for this to
% remain convenient to use.

% From this visualization we can confirm many of the earlier
% conclusions, for example:
%  - there are two clusters: 'Setosa' (blue, dark green) and
%    'Virginica'/'Versicolor' (light green, yellow, reds)
%    (see almost any of the subplots)
%  - PetalL and PetalW have a high linear correlation (see
%    subplots 4,3 and 3,4)
%  - SepalL is correlated (at least in the bigger cluster) with
%    PetalL and PetalW (in subplots 1,3 1,4 3,1 and 4,1)
%  - SepalL and SepalW have a clear linear correlation, but it
%    is slightly different for the two main clusters (in
%    subplots 2,1 and 1,2)

pause % Strike any key to cluster the map...

close(f1), close(f2), figure(f0), clf

clc
% CLUSTERING OF THE MAP
% =====================

% Visual inspection already hinted that there are at least two
% clusters in the data, and that the properties of the clusters are
% different from each other (esp. relation of SepalL and
% SepalW). For further investigation, the map needs to be
% partitioned.

% Here, the KMEANS_CLUSTERS function is used to find an initial
% partitioning. The plot shows the Davies-Bouldin clustering
% index, which is minimized by the best clustering.

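% For k clusters with centroids c_i and average within-cluster
% distances S_i, the Davies-Bouldin index is
%   DB = (1/k) * sum_i max_{j~=i} (S_i + S_j) / d(c_i,c_j)
% so compact, well-separated clusterings give low values.
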
subplot(1,3,1)
[c,p,err,ind] = kmeans_clusters(sM, 7); % find at most 7 clusters
plot(1:length(ind),ind,'x-')
[dummy,i] = min(ind)
cl = p{i};

% The Davies-Bouldin index seems to indicate that there are
% two clusters on the map. Here is the clustering info
% calculated previously and the partitioning result:

subplot(1,3,2)
som_cplane(sM,Code,Dm)
subplot(1,3,3)
som_cplane(sM,cl)

% You could also use the function SOM_SELECT to manually make or
% modify the partitioning.

% After this, the analysis would proceed with a summary of the
% results and an analysis of each cluster one at a time.
% Unfortunately, you have to do that yourself. The SOM Toolbox does
% not yet have functions for those purposes.

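% As a rough starting point for such a summary, the denormalized
% prototype vectors (M from above) can be averaged within each
% cluster of the partitioning cl:

for c = unique(cl)',
  fprintf('cluster %d mean: %s\n', c, num2str(mean(M(cl==c,1:4),1),3));
end
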
pause % Strike any key to continue...


clf
clc
% MODELING
% ========

% One can also build models on top of the SOM. Typically, these
% models are simple local or nearest-neighbor models.

% Here, the SOM is used for probability density estimation. Each map
% prototype is the center of a Gaussian kernel, the parameters
% of which are estimated from the data. The Gaussian mixture
% model is estimated with the function SOM_ESTIMATE_GMM and the
% probabilities can be calculated with SOM_PROBABILITY_GMM.

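% In such a mixture, the density of a sample x is
%   p(x) = sum_m P(m) * p(x|m)
% where p(x|m) is the Gaussian kernel centered at prototype m and
% P(m) is the mixture weight of map unit m.
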
[K,P] = som_estimate_gmm(sM,sD);
[pd,Pdm,pmd] = som_probability_gmm(sD,sM,K,P);

% Here is the probability density function value of the first data
% sample (x = sD.data(1,:)) in terms of each map unit (m):

som_cplane(sM,Pdm(:,1))
colorbar
title('p(x|m)')

pause % Strike any key to continue...

% Here, the SOM is used for classification. Although the SOM can be
% used for classification as such, one has to remember that it does
% not utilize class information at all, and thus its results are
% inherently suboptimal. However, with small modifications, the
% network can take the class into account. The function
% SOM_SUPERVISED does this.

% Learning vector quantization (LVQ) is an algorithm that is very
% similar to the SOM in many aspects. However, it is specifically
% designed for classification. In the SOM Toolbox, there are
% functions LVQ1 and LVQ3 that implement two versions of this
% algorithm.

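% As a rough sketch, LVQ fine-tuning of an already labelled map might
% look like the call below (the argument order here is an assumption;
% see HELP LVQ1 for the exact interface before using it):
%
%   sMlvq = lvq1(sM, sD, 50*size(sM.codebook,1), 0.05);
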
% Here, the function SOM_SUPERVISED is used to create a classifier
% for the IRIS data set:

sM = som_supervised(sD,'small');

som_show(sM,'umat','all');
som_show_add('label',sM.labels,'TextSize',8,'TextColor','r')

sD2 = som_label(sD,'clear','all');
sD2 = som_autolabel(sD2,sM);       % classification
ok = strcmp(sD2.labels,sD.labels); % errors
100*(1-sum(ok)/length(ok))         % error percentage (%)

echo off
