%SOM_DEMO4 Data analysis using the SOM.

% Contributed to SOM Toolbox 2.0, February 11th, 2000 by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 1.0beta juuso 071197
% Version 2.0beta juuso 090200 070600

clf reset;
f0 = gcf;
echo on




clc
% ==========================================================
% SOM_DEMO4 - DATA ANALYSIS USING THE SOM
% ==========================================================

% In this demo, the IRIS data set is analysed using the SOM. First,
% the data is read from an ASCII file (please make sure it can be
% found on the current path), normalized, and a map is trained.
% Since the data also has labels, the map is labelled.

try,
  sD = som_read_data('iris.data');
catch
  echo off

  warning('File ''iris.data'' not found. Using simulated data instead.')

  D = randn(50,4);
  D(:,1) = D(:,1)+5;     D(:,2) = D(:,2)+3.5;
  D(:,3) = D(:,3)/2+1.5; D(:,4) = D(:,4)/2+0.3;
  D2 = randn(100,4); D2(:,2) = sort(D2(:,2));
  D2(:,1) = D2(:,1)+6.5; D2(:,2) = D2(:,2)+2.8;
  D2(:,3) = D2(:,3)+5;   D2(:,4) = D2(:,4)/2+1.5;
  sD = som_data_struct([D; D2],'name','iris (simulated)',...
                       'comp_names',{'SepalL','SepalW','PetalL','PetalW'});
  sD = som_label(sD,'add',[1:50]','Setosa');
  sD = som_label(sD,'add',[51:100]','Versicolor');
  sD = som_label(sD,'add',[101:150]','Virginica');

  echo on
end

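% Below, the 'var' normalization scales each component linearly to zero
% mean and unit variance, SOM_MAKE trains a map with automatically
% selected parameters, and the 'vote' mode of SOM_AUTOLABEL gives each
% map unit the label that is most frequent among the samples mapped to it.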
sD = som_normalize(sD,'var');
sM = som_make(sD);
sM = som_autolabel(sM,sD,'vote');

pause % Strike any key to visualize the map...

clc
% VISUAL INSPECTION OF THE MAP
% ============================

% The first step in the analysis of the map is visual inspection.
% Here are the U-matrix, the component planes and the labels (you may
% need to enlarge the figure in order to make out the labels).

som_show(sM,'umat','all','comp',[1:4],'empty','Labels','norm','d');
som_show_add('label',sM.labels,'textsize',8,'textcolor','r','subplot',6);

% From this first visualization, one can see that:
%  - there are essentially two clusters
%  - PetalL and PetalW are highly correlated
%  - SepalL is somewhat correlated to PetalL and PetalW
%  - one cluster corresponds to the Setosa species and exhibits
%    small petals and short but wide sepals
%  - the other cluster corresponds to Virginica and Versicolor
%    such that Versicolor has smaller leaves (both sepal and
%    petal) than Virginica
%  - inside both clusters, SepalL and SepalW are highly correlated

pause % Strike any key to continue...

% Next, the projection of the data set is investigated. A principal
% component projection is made for the data and applied to the map.
% The map units are assigned colors by spreading a colormap over the
% projection. Distance matrix information is extracted from the
% U-matrix, and it is modified using knowledge of the zero-hit
% (interpolative) units. Finally, three visualizations are shown:
% the color code with clustering information and the number of hits
% in each unit, the projection, and the labels.

echo off

f1=figure;
[Pd,V,me,l] = pcaproj(sD,2); Pm = pcaproj(sM,V,me); % PC-projection
Code = som_colorcode(Pm);                           % color coding
hits = som_hits(sM,sD);                             % hits
U = som_umat(sM);                                   % U-matrix
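% The U-matrix has elements both on the map units and between
% neighboring units; taking every other row and column below keeps
% just the one value associated with each unit.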
Dm = U(1:2:size(U,1),1:2:size(U,2));                % distance matrix
Dm = 1-Dm(:)/max(Dm(:)); Dm(find(hits==0)) = 0;     % clustering info

subplot(1,3,1)
som_cplane(sM,Code,Dm);
hold on
som_grid(sM,'Label',cellstr(int2str(hits)),...
         'Line','none','Marker','none','Labelcolor','k');
hold off
title('Color code')

subplot(1,3,2)
som_grid(sM,'Coord',Pm,'MarkerColor',Code,'Linecolor','k');
hold on, plot(Pd(:,1),Pd(:,2),'k+'), hold off, axis tight, axis equal
title('PC projection')

subplot(1,3,3)
som_cplane(sM,'none')
hold on
som_grid(sM,'Label',sM.labels,'Labelsize',8,...
         'Line','none','Marker','none','Labelcolor','r');
hold off
title('Labels')

echo on

% From these figures one can see that:
%  - the projection confirms the existence of two different clusters
%  - interpolative units seem to divide the Virginica
%    flowers into two classes, the difference being in the size of
%    sepal leaves

pause % Strike any key to continue...

% Finally, perhaps the most informative figure of all: simple
% scatter plots and histograms of all variables. The species
% information is coded as a fifth variable: 1 for Setosa, 2 for
% Versicolor and 3 for Virginica. Original data points are in the
% upper triangle, map prototype values in the lower triangle, and
% histograms on the diagonal: black for the data set and red for
% the map prototype values. The color coding of the data samples
% has been copied from the map (from the BMU of each sample). Note
% that the variable values have been denormalized.

echo off

% denormalize and add species information
names = sD.comp_names; names{end+1} = 'species';
D = som_denormalize(sD.data,sD); dlen = size(D,1);
s = zeros(dlen,1)+NaN; s(strcmp(sD.labels,'Setosa'))=1;
s(strcmp(sD.labels,'Versicolor'))=2; s(strcmp(sD.labels,'Virginica'))=3;
D = [D, s];
M = som_denormalize(sM.codebook,sM); munits = size(M,1);
s = zeros(munits,1)+NaN; s(strcmp(sM.labels,'Setosa'))=1;
s(strcmp(sM.labels,'Versicolor'))=2; s(strcmp(sM.labels,'Virginica'))=3;
M = [M, s];

f2=figure;

% color coding copied from the map
bmus = som_bmus(sM,sD); Code_data = Code(bmus,:);

k=1;
for i=1:5,
  for j=1:5,
    if i<j, i1=i; i2=j; else i1=j; i2=i; end
    subplot(5,5,k); cla
    if i<j,
      som_grid('rect',[dlen 1],'coord',D(:,[i1 i2]),...
               'Line','none','MarkerColor',Code_data,'Markersize',2);
      title(sprintf('%s vs. %s',names{i1},names{i2}))
    elseif i>j,
      som_grid(sM,'coord',M(:,[i1 i2]),...
               'markersize',2,'MarkerColor',Code);
      title(sprintf('%s vs. %s',names{i1},names{i2}))
    else
      if i1<5, b = 10; else b = 3; end
      [nd,x] = hist(D(:,i1),b); nd=nd/sum(nd);
      nm = hist(M(:,i1),x); nm = nm/sum(nm);
      h=bar(x,nd,0.8); set(h,'EdgeColor','none','FaceColor','k');
      hold on
      h=bar(x,nm,0.3); set(h,'EdgeColor','none','FaceColor','r');
      hold off
      title(names{i1})
    end
    k=k+1;
  end
end

echo on

% This visualization shows quite a lot of information: the
% distributions of single variables and of pairs of variables, both
% in the data and on the map. If the number of variables were even
% slightly higher, a really big display would be needed for this to
% remain convenient to use.

% From this visualization we can confirm many of the earlier
% conclusions, for example:
%  - there are two clusters: 'Setosa' (blue, dark green) and
%    'Virginica'/'Versicolor' (light green, yellow, reds)
%    (see almost any of the subplots)
%  - PetalL and PetalW have a high linear correlation (see
%    subplots 4,3 and 3,4)
%  - SepalL is correlated (at least in the bigger cluster) with
%    PetalL and PetalW (in subplots 1,3 1,4 3,1 and 4,1)
%  - SepalL and SepalW have a clear linear correlation, but it
%    is slightly different for the two main clusters (in
%    subplots 2,1 and 1,2)

pause % Strike any key to cluster the map...

close(f1), close(f2), figure(f0), clf

clc
% CLUSTERING OF THE MAP
% =====================

% Visual inspection already hinted that there are at least two
% clusters in the data, and that the properties of the clusters are
% different from each other (esp. relation of SepalL and
% SepalW). For further investigation, the map needs to be
% partitioned.

% Here, the KMEANS_CLUSTERS function is used to find an initial
% partitioning. The plot shows the Davies-Bouldin clustering
% index, which is minimized by the best clustering.

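% For k clusters with centroids c_i and average within-cluster
% distances S_i, the Davies-Bouldin index is
%   DB = (1/k) * sum_i max_{j~=i} (S_i + S_j) / d(c_i,c_j)
% so compact, well-separated clusterings give low values.
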
subplot(1,3,1)
[c,p,err,ind] = kmeans_clusters(sM, 7); % find at most 7 clusters
plot(1:length(ind),ind,'x-')
[dummy,i] = min(ind)
cl = p{i};

% The Davies-Bouldin index seems to indicate that there are
% two clusters on the map. Here is the clustering info
% calculated previously and the partitioning result:

subplot(1,3,2)
som_cplane(sM,Code,Dm)
subplot(1,3,3)
som_cplane(sM,cl)

% You could also use the function SOM_SELECT to manually make or
% modify the partitioning.

% After this, the analysis would proceed with a summary of the
% results and an analysis of each cluster one at a time.
% Unfortunately, you have to do that yourself. The SOM Toolbox does
% not yet have functions for those purposes.

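% As a rough starting point for such a summary, the denormalized
% prototype vectors (M from above) can be averaged within each
% cluster of the partitioning cl:

for c = unique(cl)',
  fprintf('cluster %d mean: %s\n', c, num2str(mean(M(cl==c,1:4),1),3));
end
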
pause % Strike any key to continue...


clf
clc
% MODELING
% ========

% One can also build models on top of the SOM. Typically, these
% models are simple local or nearest-neighbor models.

% Here, the SOM is used for probability density estimation. Each map
% prototype is the center of a Gaussian kernel, the parameters
% of which are estimated from the data. The Gaussian mixture
% model is estimated with the function SOM_ESTIMATE_GMM and the
% probabilities can be calculated with SOM_PROBABILITY_GMM.

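% In such a mixture, the density of a sample x is
%   p(x) = sum_m P(m) * p(x|m)
% where p(x|m) is the Gaussian kernel centered at prototype m and
% P(m) is the mixture weight of map unit m.
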
[K,P] = som_estimate_gmm(sM,sD);
[pd,Pdm,pmd] = som_probability_gmm(sD,sM,K,P);

% Here is the probability density function value of the first data
% sample (x = sD.data(1,:)) in terms of each map unit (m):

som_cplane(sM,Pdm(:,1))
colorbar
title('p(x|m)')

pause % Strike any key to continue...

% Here, the SOM is used for classification. Although the SOM can be
% used for classification as such, one has to remember that it does
% not utilize class information at all, and thus its results are
% inherently suboptimal. However, with small modifications, the
% network can take the class into account. The function
% SOM_SUPERVISED does this.

% Learning vector quantization (LVQ) is an algorithm that is very
% similar to the SOM in many aspects. However, it is specifically
% designed for classification. In the SOM Toolbox, there are
% functions LVQ1 and LVQ3 that implement two versions of this
% algorithm.

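% As a rough sketch, LVQ fine-tuning of an already labelled map might
% look like the call below (the argument order here is an assumption;
% see HELP LVQ1 for the exact interface before using it):
%
%   sMlvq = lvq1(sM, sD, 50*size(sM.codebook,1), 0.05);
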
% Here, the function SOM_SUPERVISED is used to create a classifier
% for the IRIS data set:

sM = som_supervised(sD,'small');

som_show(sM,'umat','all');
som_show_add('label',sM.labels,'TextSize',8,'TextColor','r')

sD2 = som_label(sD,'clear','all');
sD2 = som_autolabel(sD2,sM);       % classification
ok = strcmp(sD2.labels,sD.labels); % errors
100*(1-sum(ok)/length(ok))         % error percentage (%)

echo off
