comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_cllinkage.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
1 function sC = som_cllinkage(sM,varargin)
2
3 %SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units.
4 %
5 % sC = som_cllinkage(sM, [[argID,] value, ...])
6 %
7 % sC = som_cllinkage(sM);
8 % sC = som_cllinkage(D,'complete');
9 % sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D)));
10 % sC = som_cllinkage(sM,pdist(sM.codebook,'mahal'));
11 % som_clplot(sC);
12 %
13 % Input and output arguments ([]'s are optional):
14 % sM (struct) map or data struct to be clustered
15 % (matrix) size dlen x dim, a data set: the matrix must not
16 % contain any NaN's!
17 % [argID, (string) See below. The values which are unambiguous can
18 % value] (varies) be given without the preceeding argID.
19 %
20 % sC (struct) a clustering struct with e.g. the following fields
21 % (for more information see SOMCL_STRUCT)
22 % .base (vector) if base partitioning is given, this is a newly
23 % coded version of it so that the cluster indices
24 % go from 1 to the number of clusters.
25 % .tree (matrix) size clen-1 x 3, the linkage info
26 % Z(i,1) and Z(i,2) hold the indeces of clusters
27 % combined on level i (starting from bottom). The new
28 % cluster has index dlen+i. The initial cluster
29 % index of each unit is its linear index in the
30 % original data matrix. Z(i,3) is the distance
31 % between the combined clusters. See LINKAGE
32 % function in the Statistics Toolbox.
33 %
34 % Here are the valid argument IDs and corresponding values. The values
35 % which are unambiguous (marked with '*') can be given without the
36 % preceeding argID.
37 % 'topol' *(struct) topology struct
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
39 % connections should be allowed only between
40 % neighbors or between any vectors
41 % (matrix) size dlen x dlen indicating the connections
42 % between vectors
43 % 'linkage' *(string) the linkage criteria to use: 'single' (the
44 % default), 'average', 'complete', 'centroid', or 'ward'
45 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
46 % be used instead of euclidian distances
47 % (vector) as the output of PDIST function
48 % (scalar) distance norm to use (default is euclidian = 2)
49 % 'mask' (vector) size dim x 1, the search mask used to
50 % weight distance calculation. By default
51 % sM.mask or a vector of ones is used.
52 % 'base' (vector) giving the base partitioning of the data:
53 % base(i) = j denotes that vector i belongs to
54 % base cluster j, and base(i) = NaN that vector
55 % i does not belong to any cluster, but should be
56 % ignored. At the beginning of the clustering, the
57 % vector of each cluster are averaged, and these
58 % averaged vectors are then clustered using
59 % hierarchical clustering.
60 % 'ignore' (vector) units to be ignored (in addition to those listed
61 % in base argument)
62 % 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0)
63 %
64 % Note that if 'connect'='neighbors' and some vector are ignored (as denoted
65 % by NaNs in the base vector), there may be areas on the map which will
66 % never be connected: connections across the ignored map units simply do not
67 % exist. In such a case, the neighborhood is gradually increased until
68 % the areas can be connected.
69 %
70 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
71
72 % Copyright (c) 2000 by Juha Vesanto
73 % Contributed to SOM Toolbox on XXX by Juha Vesanto
74 % http://www.cis.hut.fi/projects/somtoolbox/
75
76 % Version 2.0beta juuso 160600 250800
77
78 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
79 %% input arguments
80
81 % the data
82 if isstruct(sM),
83 switch sM.type,
84 case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end);
85 case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = [];
86 case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1);
87 sTr = som_set('som_train','neigh','gaussian','radius_fin',1);
88 otherwise, error('Bad first argument');
89 end
90 else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = [];
91 end
92 [dlen dim] = size(M);
93 if isempty(mask), mask = ones(dim,1); end
94 if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end
95
96 % varargin
97 q = 2;
98 Md = [];
99 linkage = 'single';
100 ignore = [];
101 Ne = 'any';
102 base = [];
103 tracking = 0;
104 i=1;
105 while i<=length(varargin),
106 argok = 1;
107 if ischar(varargin{i}),
108 switch varargin{i},
109 % argument IDs
110 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
111 case 'connect', i=i+1; Ne = varargin{i};
112 case 'ignore', i=i+1; ignore = varargin{i};
113 case 'dist', i=i+1; Md = varargin{i};
114 case 'linkage', i=i+1; linkage = varargin{i};
115 case 'mask', i=i+1; mask = varargin{i};
116 case 'tracking',i=i+1; tracking = varargin{i};
117 case 'base', i=i+1; base = varargin{i};
118 % unambiguous values
119 case 'neighbors', Ne = varargin{i};
120 case 'any', Ne = varargin{i};
121 case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i};
122 otherwise argok=0;
123 end
124 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
125 switch varargin{i}(1).type,
126 case 'som_topol', sT = varargin{i};
127 otherwise argok=0;
128 end
129 else
130 argok = 0;
131 end
132 if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end
133 i = i+1;
134 end
135
136 % check distance metric
137 if prod(size(Md))==1, q = Md; Md = []; end
138 if ~isempty(Md) & prod(size(Md))<dlen^2, Md = squareform(Md); end
139 if prod(size(Md))>0 & any(strcmp(linkage,{'ward','centroid'})),
140 warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']);
141 end
142
143 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
144 %% distance matrix and connections between units
145
146 % base partition
147 if isempty(base), base = 1:dlen; end
148 if ~isempty(ignore), base(ignore) = NaN; end
149 cid = unique(base(isfinite(base)));
150 nc = length(cid);
151 if max(cid)>nc | min(cid)<1,
152 b = base; for i=1:nc, base(find(b==cid(i))) = i; end
153 end
154
155 % initial clusters
156 clinds = cell(nc,1);
157 for i=1:nc, clinds{i} = find(base==i); end
158
159 % neighborhood constraint (calculate connection matrix Ne)
160 if ischar(Ne),
161 switch Ne,
162 case 'any', Ne = [];
163 case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end
164 otherwise, error(['Unrecognized connection mode ' Ne]);
165 end
166 end
167 if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1
168 if all(Ne(:)>0), Ne = []; end
169
170 % neighborhood function values
171 if strcmp(linkage,'neighf')
172 if isempty(sTr), error('Cannot use neighf linkage.'); end
173 q = som_unit_dists(sT).^2;
174 r = sTr.radius_fin^2;
175 if isnan(r) | isempty(r), r = 1; end
176 switch sTr.neigh,
177 case 'bubble', q = (q <= r);
178 case 'gaussian', q = exp(-q/(2*r));
179 case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r);
180 case 'ep', q = (1-q/r) .* (q <= r);
181 end
182 end
183
184 % mutual distances and initial cluster distances
185 Cd = [];
186 if any(strcmp(linkage,{'single','average','complete','neighf'})),
187 M = som_mdist(M,2,mask,Ne);
188 if (nc == dlen & all(base==[1:dlen])), Cd = M; end
189 end
190 if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end
191 Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal
192
193 % check out from Ne which of the clusters are not connected
194 if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})),
195 Clconn = sparse(nc);
196 for i=1:nc-1,
197 for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end
198 Clconn(i+1:nc,i) = Clconn(i,i+1:nc)';
199 end
200 Cd(Clconn==0) = Inf;
201 else
202 Clconn = [];
203 end
204
205
206 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
207 %% construct dendrogram
208
209 clen = nc;
210 cid = 1:clen;
211 Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step
212 if tracking, h = waitbar(0,'Making hierarchical clustering'); end
213
214 for i=1:clen-1,
215 if tracking, waitbar(i/clen,h); end
216
217 % find two closest clusters and combine them
218 [d,c1] = min(min(Cd)); % cluster1
219 [d,c2] = min(Cd(:,c1)); % cluster2
220 i1 = clinds{c1}; % vectors belonging to c1
221 i2 = clinds{c2}; % vectors belonging to c2
222 clinds{c1} = [i1; i2]; % insert clusters to c1
223 Z(i,:) = [cid(c1), cid(c2), d]; % update tree info
224
225 % remove cluster c2
226 notc2 = [1:c2-1,c2+1:nc];
227 nc = nc-1; if nc<=1, break; end
228 if c1>c2, c1=c1-1; end
229 clinds = clinds(notc2);
230 Cd = Cd(notc2,notc2);
231 cid = cid(notc2);
232 if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end
233
234 % update cluster distances
235 notc1 = [1:c1-1,c1+1:nc];
236 Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask);
237 Cd(notc1,c1) = Cd(c1,notc1)';
238 if ~isempty(Clconn),
239 for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end
240 Clconn(notc1,c1) = Clconn(c1,notc1)';
241 Cd(Clconn==0) = Inf;
242 end
243
244 end
245
246 if tracking, close(h); end
247
248 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
249 %% return values
250
251 % to maintain compatibility with Statistics Toolbox, the values in
252 % Z must be yet transformed so that they are similar to the output
253 % of LINKAGE function
254
255 clen = size(Z,1)+1;
256 Zs = Z;
257 current_cluster = 1:clen;
258 for i=1:size(Z,1),
259 Zs(i,1) = current_cluster(Z(i,1));
260 Zs(i,2) = current_cluster(Z(i,2));
261 current_cluster(Z(i,[1 2])) = clen+i;
262 end
263 Z = Zs;
264
265 % make a clustering struct
266 name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0));
267 sC = som_clstruct(Z,'base',base,'name',name);
268
269 return;
270
271 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
272