Mercurial > hg > camir-aes2014
comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_cllinkage.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
1 function sC = som_cllinkage(sM,varargin) | |
2 | |
3 %SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units. | |
4 % | |
5 % sC = som_cllinkage(sM, [[argID,] value, ...]) | |
6 % | |
7 % sC = som_cllinkage(sM); | |
8 % sC = som_cllinkage(D,'complete'); | |
9 % sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D))); | |
10 % sC = som_cllinkage(sM,pdist(sM.codebook,'mahal')); | |
11 % som_clplot(sC); | |
12 % | |
13 % Input and output arguments ([]'s are optional): | |
14 % sM (struct) map or data struct to be clustered | |
15 % (matrix) size dlen x dim, a data set: the matrix must not | |
16 % contain any NaN's! | |
17 % [argID, (string) See below. The values which are unambiguous can | |
18 % value] (varies) be given without the preceeding argID. | |
19 % | |
20 % sC (struct) a clustering struct with e.g. the following fields | |
21 % (for more information see SOMCL_STRUCT) | |
22 % .base (vector) if base partitioning is given, this is a newly | |
23 % coded version of it so that the cluster indices | |
24 % go from 1 to the number of clusters. | |
25 % .tree (matrix) size clen-1 x 3, the linkage info | |
26 % Z(i,1) and Z(i,2) hold the indeces of clusters | |
27 % combined on level i (starting from bottom). The new | |
28 % cluster has index dlen+i. The initial cluster | |
29 % index of each unit is its linear index in the | |
30 % original data matrix. Z(i,3) is the distance | |
31 % between the combined clusters. See LINKAGE | |
32 % function in the Statistics Toolbox. | |
33 % | |
34 % Here are the valid argument IDs and corresponding values. The values | |
35 % which are unambiguous (marked with '*') can be given without the | |
36 % preceeding argID. | |
37 % 'topol' *(struct) topology struct | |
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the | |
39 % connections should be allowed only between | |
40 % neighbors or between any vectors | |
41 % (matrix) size dlen x dlen indicating the connections | |
42 % between vectors | |
43 % 'linkage' *(string) the linkage criteria to use: 'single' (the | |
44 % default), 'average', 'complete', 'centroid', or 'ward' | |
45 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to | |
46 % be used instead of euclidian distances | |
47 % (vector) as the output of PDIST function | |
48 % (scalar) distance norm to use (default is euclidian = 2) | |
49 % 'mask' (vector) size dim x 1, the search mask used to | |
50 % weight distance calculation. By default | |
51 % sM.mask or a vector of ones is used. | |
52 % 'base' (vector) giving the base partitioning of the data: | |
53 % base(i) = j denotes that vector i belongs to | |
54 % base cluster j, and base(i) = NaN that vector | |
55 % i does not belong to any cluster, but should be | |
56 % ignored. At the beginning of the clustering, the | |
57 % vector of each cluster are averaged, and these | |
58 % averaged vectors are then clustered using | |
59 % hierarchical clustering. | |
60 % 'ignore' (vector) units to be ignored (in addition to those listed | |
61 % in base argument) | |
62 % 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0) | |
63 % | |
64 % Note that if 'connect'='neighbors' and some vector are ignored (as denoted | |
65 % by NaNs in the base vector), there may be areas on the map which will | |
66 % never be connected: connections across the ignored map units simply do not | |
67 % exist. In such a case, the neighborhood is gradually increased until | |
68 % the areas can be connected. | |
69 % | |
70 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. | |
71 | |
72 % Copyright (c) 2000 by Juha Vesanto | |
73 % Contributed to SOM Toolbox on XXX by Juha Vesanto | |
74 % http://www.cis.hut.fi/projects/somtoolbox/ | |
75 | |
76 % Version 2.0beta juuso 160600 250800 | |
77 | |
78 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
79 %% input arguments | |
80 | |
81 % the data | |
82 if isstruct(sM), | |
83 switch sM.type, | |
84 case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end); | |
85 case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = []; | |
86 case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1); | |
87 sTr = som_set('som_train','neigh','gaussian','radius_fin',1); | |
88 otherwise, error('Bad first argument'); | |
89 end | |
90 else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = []; | |
91 end | |
92 [dlen dim] = size(M); | |
93 if isempty(mask), mask = ones(dim,1); end | |
94 if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end | |
95 | |
96 % varargin | |
97 q = 2; | |
98 Md = []; | |
99 linkage = 'single'; | |
100 ignore = []; | |
101 Ne = 'any'; | |
102 base = []; | |
103 tracking = 0; | |
104 i=1; | |
105 while i<=length(varargin), | |
106 argok = 1; | |
107 if ischar(varargin{i}), | |
108 switch varargin{i}, | |
109 % argument IDs | |
110 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; | |
111 case 'connect', i=i+1; Ne = varargin{i}; | |
112 case 'ignore', i=i+1; ignore = varargin{i}; | |
113 case 'dist', i=i+1; Md = varargin{i}; | |
114 case 'linkage', i=i+1; linkage = varargin{i}; | |
115 case 'mask', i=i+1; mask = varargin{i}; | |
116 case 'tracking',i=i+1; tracking = varargin{i}; | |
117 case 'base', i=i+1; base = varargin{i}; | |
118 % unambiguous values | |
119 case 'neighbors', Ne = varargin{i}; | |
120 case 'any', Ne = varargin{i}; | |
121 case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i}; | |
122 otherwise argok=0; | |
123 end | |
124 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), | |
125 switch varargin{i}(1).type, | |
126 case 'som_topol', sT = varargin{i}; | |
127 otherwise argok=0; | |
128 end | |
129 else | |
130 argok = 0; | |
131 end | |
132 if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end | |
133 i = i+1; | |
134 end | |
135 | |
136 % check distance metric | |
137 if prod(size(Md))==1, q = Md; Md = []; end | |
138 if ~isempty(Md) & prod(size(Md))<dlen^2, Md = squareform(Md); end | |
139 if prod(size(Md))>0 & any(strcmp(linkage,{'ward','centroid'})), | |
140 warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']); | |
141 end | |
142 | |
143 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
144 %% distance matrix and connections between units | |
145 | |
146 % base partition | |
147 if isempty(base), base = 1:dlen; end | |
148 if ~isempty(ignore), base(ignore) = NaN; end | |
149 cid = unique(base(isfinite(base))); | |
150 nc = length(cid); | |
151 if max(cid)>nc | min(cid)<1, | |
152 b = base; for i=1:nc, base(find(b==cid(i))) = i; end | |
153 end | |
154 | |
155 % initial clusters | |
156 clinds = cell(nc,1); | |
157 for i=1:nc, clinds{i} = find(base==i); end | |
158 | |
159 % neighborhood constraint (calculate connection matrix Ne) | |
160 if ischar(Ne), | |
161 switch Ne, | |
162 case 'any', Ne = []; | |
163 case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end | |
164 otherwise, error(['Unrecognized connection mode ' Ne]); | |
165 end | |
166 end | |
167 if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1 | |
168 if all(Ne(:)>0), Ne = []; end | |
169 | |
170 % neighborhood function values | |
171 if strcmp(linkage,'neighf') | |
172 if isempty(sTr), error('Cannot use neighf linkage.'); end | |
173 q = som_unit_dists(sT).^2; | |
174 r = sTr.radius_fin^2; | |
175 if isnan(r) | isempty(r), r = 1; end | |
176 switch sTr.neigh, | |
177 case 'bubble', q = (q <= r); | |
178 case 'gaussian', q = exp(-q/(2*r)); | |
179 case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r); | |
180 case 'ep', q = (1-q/r) .* (q <= r); | |
181 end | |
182 end | |
183 | |
184 % mutual distances and initial cluster distances | |
185 Cd = []; | |
186 if any(strcmp(linkage,{'single','average','complete','neighf'})), | |
187 M = som_mdist(M,2,mask,Ne); | |
188 if (nc == dlen & all(base==[1:dlen])), Cd = M; end | |
189 end | |
190 if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end | |
191 Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal | |
192 | |
193 % check out from Ne which of the clusters are not connected | |
194 if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})), | |
195 Clconn = sparse(nc); | |
196 for i=1:nc-1, | |
197 for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end | |
198 Clconn(i+1:nc,i) = Clconn(i,i+1:nc)'; | |
199 end | |
200 Cd(Clconn==0) = Inf; | |
201 else | |
202 Clconn = []; | |
203 end | |
204 | |
205 | |
206 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
207 %% construct dendrogram | |
208 | |
209 clen = nc; | |
210 cid = 1:clen; | |
211 Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step | |
212 if tracking, h = waitbar(0,'Making hierarchical clustering'); end | |
213 | |
214 for i=1:clen-1, | |
215 if tracking, waitbar(i/clen,h); end | |
216 | |
217 % find two closest clusters and combine them | |
218 [d,c1] = min(min(Cd)); % cluster1 | |
219 [d,c2] = min(Cd(:,c1)); % cluster2 | |
220 i1 = clinds{c1}; % vectors belonging to c1 | |
221 i2 = clinds{c2}; % vectors belonging to c2 | |
222 clinds{c1} = [i1; i2]; % insert clusters to c1 | |
223 Z(i,:) = [cid(c1), cid(c2), d]; % update tree info | |
224 | |
225 % remove cluster c2 | |
226 notc2 = [1:c2-1,c2+1:nc]; | |
227 nc = nc-1; if nc<=1, break; end | |
228 if c1>c2, c1=c1-1; end | |
229 clinds = clinds(notc2); | |
230 Cd = Cd(notc2,notc2); | |
231 cid = cid(notc2); | |
232 if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end | |
233 | |
234 % update cluster distances | |
235 notc1 = [1:c1-1,c1+1:nc]; | |
236 Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask); | |
237 Cd(notc1,c1) = Cd(c1,notc1)'; | |
238 if ~isempty(Clconn), | |
239 for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end | |
240 Clconn(notc1,c1) = Clconn(c1,notc1)'; | |
241 Cd(Clconn==0) = Inf; | |
242 end | |
243 | |
244 end | |
245 | |
246 if tracking, close(h); end | |
247 | |
248 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
249 %% return values | |
250 | |
251 % to maintain compatibility with Statistics Toolbox, the values in | |
252 % Z must be yet transformed so that they are similar to the output | |
253 % of LINKAGE function | |
254 | |
255 clen = size(Z,1)+1; | |
256 Zs = Z; | |
257 current_cluster = 1:clen; | |
258 for i=1:size(Z,1), | |
259 Zs(i,1) = current_cluster(Z(i,1)); | |
260 Zs(i,2) = current_cluster(Z(i,2)); | |
261 current_cluster(Z(i,[1 2])) = clen+i; | |
262 end | |
263 Z = Zs; | |
264 | |
265 % make a clustering struct | |
266 name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0)); | |
267 sC = som_clstruct(Z,'base',base,'name',name); | |
268 | |
269 return; | |
270 | |
271 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
272 |