wolffd@0
|
1 function sC = som_cllinkage(sM,varargin)
|
wolffd@0
|
2
|
wolffd@0
|
3 %SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units.
|
wolffd@0
|
4 %
|
wolffd@0
|
5 % sC = som_cllinkage(sM, [[argID,] value, ...])
|
wolffd@0
|
6 %
|
wolffd@0
|
7 % sC = som_cllinkage(sM);
|
wolffd@0
|
8 % sC = som_cllinkage(D,'complete');
|
wolffd@0
|
9 % sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D)));
|
wolffd@0
|
10 % sC = som_cllinkage(sM,pdist(sM.codebook,'mahal'));
|
wolffd@0
|
11 % som_clplot(sC);
|
wolffd@0
|
12 %
|
wolffd@0
|
13 % Input and output arguments ([]'s are optional):
|
wolffd@0
|
14 % sM (struct) map or data struct to be clustered
|
wolffd@0
|
15 % (matrix) size dlen x dim, a data set: the matrix must not
|
wolffd@0
|
16 % contain any NaN's!
|
wolffd@0
|
17 % [argID, (string) See below. The values which are unambiguous can
|
wolffd@0
|
18 % value] (varies) be given without the preceeding argID.
|
wolffd@0
|
19 %
|
wolffd@0
|
20 % sC (struct) a clustering struct with e.g. the following fields
|
wolffd@0
|
21 % (for more information see SOMCL_STRUCT)
|
wolffd@0
|
22 % .base (vector) if base partitioning is given, this is a newly
|
wolffd@0
|
23 % coded version of it so that the cluster indices
|
wolffd@0
|
24 % go from 1 to the number of clusters.
|
wolffd@0
|
25 % .tree (matrix) size clen-1 x 3, the linkage info
|
wolffd@0
|
26 % Z(i,1) and Z(i,2) hold the indeces of clusters
|
wolffd@0
|
27 % combined on level i (starting from bottom). The new
|
wolffd@0
|
28 % cluster has index dlen+i. The initial cluster
|
wolffd@0
|
29 % index of each unit is its linear index in the
|
wolffd@0
|
30 % original data matrix. Z(i,3) is the distance
|
wolffd@0
|
31 % between the combined clusters. See LINKAGE
|
wolffd@0
|
32 % function in the Statistics Toolbox.
|
wolffd@0
|
33 %
|
wolffd@0
|
34 % Here are the valid argument IDs and corresponding values. The values
|
wolffd@0
|
35 % which are unambiguous (marked with '*') can be given without the
|
wolffd@0
|
36 % preceeding argID.
|
wolffd@0
|
37 % 'topol' *(struct) topology struct
|
wolffd@0
|
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
|
wolffd@0
|
39 % connections should be allowed only between
|
wolffd@0
|
40 % neighbors or between any vectors
|
wolffd@0
|
41 % (matrix) size dlen x dlen indicating the connections
|
wolffd@0
|
42 % between vectors
|
wolffd@0
|
43 % 'linkage' *(string) the linkage criteria to use: 'single' (the
|
wolffd@0
|
44 % default), 'average', 'complete', 'centroid', or 'ward'
|
wolffd@0
|
45 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
|
wolffd@0
|
46 % be used instead of euclidian distances
|
wolffd@0
|
47 % (vector) as the output of PDIST function
|
wolffd@0
|
48 % (scalar) distance norm to use (default is euclidian = 2)
|
wolffd@0
|
49 % 'mask' (vector) size dim x 1, the search mask used to
|
wolffd@0
|
50 % weight distance calculation. By default
|
wolffd@0
|
51 % sM.mask or a vector of ones is used.
|
wolffd@0
|
52 % 'base' (vector) giving the base partitioning of the data:
|
wolffd@0
|
53 % base(i) = j denotes that vector i belongs to
|
wolffd@0
|
54 % base cluster j, and base(i) = NaN that vector
|
wolffd@0
|
55 % i does not belong to any cluster, but should be
|
wolffd@0
|
56 % ignored. At the beginning of the clustering, the
|
wolffd@0
|
57 % vector of each cluster are averaged, and these
|
wolffd@0
|
58 % averaged vectors are then clustered using
|
wolffd@0
|
59 % hierarchical clustering.
|
wolffd@0
|
60 % 'ignore' (vector) units to be ignored (in addition to those listed
|
wolffd@0
|
61 % in base argument)
|
wolffd@0
|
62 % 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0)
|
wolffd@0
|
63 %
|
wolffd@0
|
64 % Note that if 'connect'='neighbors' and some vector are ignored (as denoted
|
wolffd@0
|
65 % by NaNs in the base vector), there may be areas on the map which will
|
wolffd@0
|
66 % never be connected: connections across the ignored map units simply do not
|
wolffd@0
|
67 % exist. In such a case, the neighborhood is gradually increased until
|
wolffd@0
|
68 % the areas can be connected.
|
wolffd@0
|
69 %
|
wolffd@0
|
70 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
|
wolffd@0
|
71
|
wolffd@0
|
72 % Copyright (c) 2000 by Juha Vesanto
|
wolffd@0
|
73 % Contributed to SOM Toolbox on XXX by Juha Vesanto
|
wolffd@0
|
74 % http://www.cis.hut.fi/projects/somtoolbox/
|
wolffd@0
|
75
|
wolffd@0
|
76 % Version 2.0beta juuso 160600 250800
|
wolffd@0
|
77
|
wolffd@0
|
78 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
79 %% input arguments
|
wolffd@0
|
80
|
wolffd@0
|
81 % the data
|
wolffd@0
|
82 if isstruct(sM),
|
wolffd@0
|
83 switch sM.type,
|
wolffd@0
|
84 case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end);
|
wolffd@0
|
85 case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = [];
|
wolffd@0
|
86 case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1);
|
wolffd@0
|
87 sTr = som_set('som_train','neigh','gaussian','radius_fin',1);
|
wolffd@0
|
88 otherwise, error('Bad first argument');
|
wolffd@0
|
89 end
|
wolffd@0
|
90 else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = [];
|
wolffd@0
|
91 end
|
wolffd@0
|
92 [dlen dim] = size(M);
|
wolffd@0
|
93 if isempty(mask), mask = ones(dim,1); end
|
wolffd@0
|
94 if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end
|
wolffd@0
|
95
|
wolffd@0
|
96 % varargin
|
wolffd@0
|
97 q = 2;
|
wolffd@0
|
98 Md = [];
|
wolffd@0
|
99 linkage = 'single';
|
wolffd@0
|
100 ignore = [];
|
wolffd@0
|
101 Ne = 'any';
|
wolffd@0
|
102 base = [];
|
wolffd@0
|
103 tracking = 0;
|
wolffd@0
|
104 i=1;
|
wolffd@0
|
105 while i<=length(varargin),
|
wolffd@0
|
106 argok = 1;
|
wolffd@0
|
107 if ischar(varargin{i}),
|
wolffd@0
|
108 switch varargin{i},
|
wolffd@0
|
109 % argument IDs
|
wolffd@0
|
110 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
|
wolffd@0
|
111 case 'connect', i=i+1; Ne = varargin{i};
|
wolffd@0
|
112 case 'ignore', i=i+1; ignore = varargin{i};
|
wolffd@0
|
113 case 'dist', i=i+1; Md = varargin{i};
|
wolffd@0
|
114 case 'linkage', i=i+1; linkage = varargin{i};
|
wolffd@0
|
115 case 'mask', i=i+1; mask = varargin{i};
|
wolffd@0
|
116 case 'tracking',i=i+1; tracking = varargin{i};
|
wolffd@0
|
117 case 'base', i=i+1; base = varargin{i};
|
wolffd@0
|
118 % unambiguous values
|
wolffd@0
|
119 case 'neighbors', Ne = varargin{i};
|
wolffd@0
|
120 case 'any', Ne = varargin{i};
|
wolffd@0
|
121 case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i};
|
wolffd@0
|
122 otherwise argok=0;
|
wolffd@0
|
123 end
|
wolffd@0
|
124 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
|
wolffd@0
|
125 switch varargin{i}(1).type,
|
wolffd@0
|
126 case 'som_topol', sT = varargin{i};
|
wolffd@0
|
127 otherwise argok=0;
|
wolffd@0
|
128 end
|
wolffd@0
|
129 else
|
wolffd@0
|
130 argok = 0;
|
wolffd@0
|
131 end
|
wolffd@0
|
132 if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end
|
wolffd@0
|
133 i = i+1;
|
wolffd@0
|
134 end
|
wolffd@0
|
135
|
wolffd@0
|
136 % check distance metric
|
wolffd@0
|
137 if prod(size(Md))==1, q = Md; Md = []; end
|
wolffd@0
|
138 if ~isempty(Md) & prod(size(Md))<dlen^2, Md = squareform(Md); end
|
wolffd@0
|
139 if prod(size(Md))>0 & any(strcmp(linkage,{'ward','centroid'})),
|
wolffd@0
|
140 warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']);
|
wolffd@0
|
141 end
|
wolffd@0
|
142
|
wolffd@0
|
143 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
144 %% distance matrix and connections between units
|
wolffd@0
|
145
|
wolffd@0
|
146 % base partition
|
wolffd@0
|
147 if isempty(base), base = 1:dlen; end
|
wolffd@0
|
148 if ~isempty(ignore), base(ignore) = NaN; end
|
wolffd@0
|
149 cid = unique(base(isfinite(base)));
|
wolffd@0
|
150 nc = length(cid);
|
wolffd@0
|
151 if max(cid)>nc | min(cid)<1,
|
wolffd@0
|
152 b = base; for i=1:nc, base(find(b==cid(i))) = i; end
|
wolffd@0
|
153 end
|
wolffd@0
|
154
|
wolffd@0
|
155 % initial clusters
|
wolffd@0
|
156 clinds = cell(nc,1);
|
wolffd@0
|
157 for i=1:nc, clinds{i} = find(base==i); end
|
wolffd@0
|
158
|
wolffd@0
|
159 % neighborhood constraint (calculate connection matrix Ne)
|
wolffd@0
|
160 if ischar(Ne),
|
wolffd@0
|
161 switch Ne,
|
wolffd@0
|
162 case 'any', Ne = [];
|
wolffd@0
|
163 case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end
|
wolffd@0
|
164 otherwise, error(['Unrecognized connection mode ' Ne]);
|
wolffd@0
|
165 end
|
wolffd@0
|
166 end
|
wolffd@0
|
167 if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1
|
wolffd@0
|
168 if all(Ne(:)>0), Ne = []; end
|
wolffd@0
|
169
|
wolffd@0
|
170 % neighborhood function values
|
wolffd@0
|
171 if strcmp(linkage,'neighf')
|
wolffd@0
|
172 if isempty(sTr), error('Cannot use neighf linkage.'); end
|
wolffd@0
|
173 q = som_unit_dists(sT).^2;
|
wolffd@0
|
174 r = sTr.radius_fin^2;
|
wolffd@0
|
175 if isnan(r) | isempty(r), r = 1; end
|
wolffd@0
|
176 switch sTr.neigh,
|
wolffd@0
|
177 case 'bubble', q = (q <= r);
|
wolffd@0
|
178 case 'gaussian', q = exp(-q/(2*r));
|
wolffd@0
|
179 case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r);
|
wolffd@0
|
180 case 'ep', q = (1-q/r) .* (q <= r);
|
wolffd@0
|
181 end
|
wolffd@0
|
182 end
|
wolffd@0
|
183
|
wolffd@0
|
184 % mutual distances and initial cluster distances
|
wolffd@0
|
185 Cd = [];
|
wolffd@0
|
186 if any(strcmp(linkage,{'single','average','complete','neighf'})),
|
wolffd@0
|
187 M = som_mdist(M,2,mask,Ne);
|
wolffd@0
|
188 if (nc == dlen & all(base==[1:dlen])), Cd = M; end
|
wolffd@0
|
189 end
|
wolffd@0
|
190 if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end
|
wolffd@0
|
191 Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal
|
wolffd@0
|
192
|
wolffd@0
|
193 % check out from Ne which of the clusters are not connected
|
wolffd@0
|
194 if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})),
|
wolffd@0
|
195 Clconn = sparse(nc);
|
wolffd@0
|
196 for i=1:nc-1,
|
wolffd@0
|
197 for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end
|
wolffd@0
|
198 Clconn(i+1:nc,i) = Clconn(i,i+1:nc)';
|
wolffd@0
|
199 end
|
wolffd@0
|
200 Cd(Clconn==0) = Inf;
|
wolffd@0
|
201 else
|
wolffd@0
|
202 Clconn = [];
|
wolffd@0
|
203 end
|
wolffd@0
|
204
|
wolffd@0
|
205
|
wolffd@0
|
206 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
207 %% construct dendrogram
|
wolffd@0
|
208
|
wolffd@0
|
209 clen = nc;
|
wolffd@0
|
210 cid = 1:clen;
|
wolffd@0
|
211 Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step
|
wolffd@0
|
212 if tracking, h = waitbar(0,'Making hierarchical clustering'); end
|
wolffd@0
|
213
|
wolffd@0
|
214 for i=1:clen-1,
|
wolffd@0
|
215 if tracking, waitbar(i/clen,h); end
|
wolffd@0
|
216
|
wolffd@0
|
217 % find two closest clusters and combine them
|
wolffd@0
|
218 [d,c1] = min(min(Cd)); % cluster1
|
wolffd@0
|
219 [d,c2] = min(Cd(:,c1)); % cluster2
|
wolffd@0
|
220 i1 = clinds{c1}; % vectors belonging to c1
|
wolffd@0
|
221 i2 = clinds{c2}; % vectors belonging to c2
|
wolffd@0
|
222 clinds{c1} = [i1; i2]; % insert clusters to c1
|
wolffd@0
|
223 Z(i,:) = [cid(c1), cid(c2), d]; % update tree info
|
wolffd@0
|
224
|
wolffd@0
|
225 % remove cluster c2
|
wolffd@0
|
226 notc2 = [1:c2-1,c2+1:nc];
|
wolffd@0
|
227 nc = nc-1; if nc<=1, break; end
|
wolffd@0
|
228 if c1>c2, c1=c1-1; end
|
wolffd@0
|
229 clinds = clinds(notc2);
|
wolffd@0
|
230 Cd = Cd(notc2,notc2);
|
wolffd@0
|
231 cid = cid(notc2);
|
wolffd@0
|
232 if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end
|
wolffd@0
|
233
|
wolffd@0
|
234 % update cluster distances
|
wolffd@0
|
235 notc1 = [1:c1-1,c1+1:nc];
|
wolffd@0
|
236 Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask);
|
wolffd@0
|
237 Cd(notc1,c1) = Cd(c1,notc1)';
|
wolffd@0
|
238 if ~isempty(Clconn),
|
wolffd@0
|
239 for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end
|
wolffd@0
|
240 Clconn(notc1,c1) = Clconn(c1,notc1)';
|
wolffd@0
|
241 Cd(Clconn==0) = Inf;
|
wolffd@0
|
242 end
|
wolffd@0
|
243
|
wolffd@0
|
244 end
|
wolffd@0
|
245
|
wolffd@0
|
246 if tracking, close(h); end
|
wolffd@0
|
247
|
wolffd@0
|
248 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
249 %% return values
|
wolffd@0
|
250
|
wolffd@0
|
251 % to maintain compatibility with Statistics Toolbox, the values in
|
wolffd@0
|
252 % Z must be yet transformed so that they are similar to the output
|
wolffd@0
|
253 % of LINKAGE function
|
wolffd@0
|
254
|
wolffd@0
|
255 clen = size(Z,1)+1;
|
wolffd@0
|
256 Zs = Z;
|
wolffd@0
|
257 current_cluster = 1:clen;
|
wolffd@0
|
258 for i=1:size(Z,1),
|
wolffd@0
|
259 Zs(i,1) = current_cluster(Z(i,1));
|
wolffd@0
|
260 Zs(i,2) = current_cluster(Z(i,2));
|
wolffd@0
|
261 current_cluster(Z(i,[1 2])) = clen+i;
|
wolffd@0
|
262 end
|
wolffd@0
|
263 Z = Zs;
|
wolffd@0
|
264
|
wolffd@0
|
265 % make a clustering struct
|
wolffd@0
|
266 name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0));
|
wolffd@0
|
267 sC = som_clstruct(Z,'base',base,'name',name);
|
wolffd@0
|
268
|
wolffd@0
|
269 return;
|
wolffd@0
|
270
|
wolffd@0
|
271 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
272
|