comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
1 function [Z,order,Md] = som_linkage(sM,varargin)
2
3 %SOM_LINKAGE Make a hierarchical linkage of the SOM map units.
4 %
5 % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...])
6 %
7 % Z = som_linkage(sM);
8 % Z = som_linkage(D,'complete');
9 % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D)));
10 % Z = som_linkage(sM,pdist(sM.codebook,'mahal'));
11 % som_dendrogram(Z);
12 %
13 % Input and output arguments ([]'s are optional):
14 % sM (struct) map or data struct to be clustered
15 % (matrix) size dlen x dim, a data set: the matrix must not
16 % contain any NaN's!
17 % [argID, (string) See below. The values which are unambiguous can
18 % value] (varies) be given without the preceeding argID.
19 %
20 % Z (matrix) size dlen-1 x 3, the linkage info
21 % Z(i,1) and Z(i,2) hold the indeces of clusters
22 % combined on level i (starting from bottom). The new
23 % cluster has index dlen+i. The initial cluster
24 % index of each unit is its linear index in the
25 % original data matrix. Z(i,3) is the distance
26 % between the combined clusters. See LINKAGE
27 % function in the Statistics Toolbox.
28 % The ignored samples are listed at the
29 % end of the Z-matrix and have Z(*,3) == Inf
30 % Dist (matrix) size dlen x dlen, pairwise distance matrix
31 %
32 % Here are the valid argument IDs and corresponding values. The values
33 % which are unambiguous (marked with '*') can be given without the
34 % preceeding argID.
35 % 'linkage' *(string) the linkage criteria to use: 'single' (the
36 % default), 'average' or 'complete'
37 % 'topol' *(struct) topology struct
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
39 % connections should be allowed only between
40 % neighbors or between any vectors
41 % (matrix) size dlen x dlen indicating the connections
42 % between vectors
43 % (scalar) the N-neighborhood upto which the connections
44 % should be formed (implies 'neighbors')
45 % 'ignore' (vector) the units/vectors which should be ignored
46 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
47 % be used instead of euclidian distances
48 % (vector) as the output of PDIST function
49 % (scalar) distance norm to use (euclidian = 2)
50 % 'mask' (vector) size dim x 1, the search mask used to
51 % weight distance calculation. By default
52 % sM.mask or a vector of ones is used.
53 %
54 % Note that together 'connect'='neighbors' and 'ignore' may form
55 % areas on the map which will never be connected: connections
56 % across the ignored map units simply do not exist.
57 %
58 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
59
60 % Copyright (c) 2000 by Juha Vesanto
61 % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto
62 % http://www.cis.hut.fi/projects/somtoolbox/
63
64 % Version 2.0beta juuso 160600
65
66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
67 %% input arguments
68
69 % the data
70 if isstruct(sM),
71 if isfield(sM,'data'), D = sM.data; sT = []; mask = [];
72 else D = sM.codebook; sT = sM.topol; mask = sM.mask;
73 end
74 else
75 D = sM; sT = []; mask = [];
76 end
77 [dlen dim] = size(D);
78 if isempty(mask), mask = ones(dim,1); end
79 if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end
80
81 % varargin
82 Md = 2;
83 linkage = 'single';
84 ignore_units = [];
85 constrained = 0;
86 i=1;
87 while i<=length(varargin),
88 argok = 1;
89 if ischar(varargin{i}),
90 switch varargin{i},
91 % argument IDs
92 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
93 case 'connect', i=i+1;
94 if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any');
95 else constrained = varargin{i}; end
96 case 'ignore', i=i+1; ignore_units = varargin{i};
97 case 'dist', i=i+1; Md = varargin{i};
98 case 'linkage', i=i+1; linkage = varargin{i};
99 case 'mask', i=i+1; mask = varargin{i};
100 case 'tracking',i=i+1; tracking = varargin{i};
101 % unambiguous values
102 case 'neighbors', constrained = 1;
103 case 'any', constrained = 0;
104 case {'single','average','complete'}, linkage = varargin{i};
105 otherwise argok=0;
106 end
107 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
108 switch varargin{i}(1).type,
109 case 'som_topol', sTopol = varargin{i};
110 otherwise argok=0;
111 end
112 else
113 argok = 0;
114 end
115 if ~argok,
116 disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]);
117 end
118 i = i+1;
119 end
120
121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
122 %% distance matrix
123
124 % given distance matrix % jh corrected this place totally 27.3. 03
125 if (prod(size(Md))==1), % no explicit distance matrix, set flag
126 q=2; % 17.2.03 kr added some brackets
127 else
128 if (prod(size(Md))<dlen^2), % check pdist form
129 Md = squareform(Md); % transform to ordinary square diastance matrix
130 end
131 % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below
132 % sets self-distance to Inf! This must be set here also,
133 % otherwise clustering fails for user defined dist. matrix!
134 Md(eye(dlen)==1)=Inf;
135 end
136
137 % neighborhood constraint
138 if length(constrained)==1 & constrained>0,
139 Ne1 = som_unit_neighs(sT);
140 Conn = som_neighborhood(Ne1,constrained);
141 Conn(~isfinite(Conn(:))) = 0;
142 else Conn = constrained; end
143 if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end
144
145 % pairwise distance matrix across connected units
146 n = size(D,1);
147 if prod(size(Md))>1,
148 % remove distances between non-neighbors
149 if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end
150 else
151 % calculate pairwise distance matrix
152 q = Md;
153 Md = zeros(n,n)+Inf;
154 if ~constrained & q==2, % fast for the default case
155 for i = 1:n-1,
156 x = D(i,:);
157 inds = [(i+1):n];
158 Diff = D(inds,:) - x(ones(n-i,1),:);
159 Md(inds,i) = sqrt((Diff.^2)*mask);
160 Md(i,inds) = Md(inds,i)';
161 end
162 else
163 for i = 1:n-1,
164 inds = find(Conn(i,:)==1);
165 inds = inds(find(inds>i));
166 Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:));
167 switch q,
168 case 1, dist = Diff*mask;
169 case 2, dist = sqrt((Diff.^2)*mask);
170 case Inf, dist = max(Diff,[],2);
171 otherwise, dist = ((Diff.^q)*mask).^(1/q);
172 end
173 Md(inds,i) = dist;
174 Md(i,inds) = dist';
175 end
176 end
177 end
178
179 % set distances to ignored units to Inf
180 if ~isempty(ignore_units),
181 Md(ignore_units,:) = Inf;
182 Md(:,ignore_units) = Inf;
183 end
184
185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
186 %% construct dendrogram
187
188 Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step
189 clusters = 1:dlen; % each vector is at first in its own cluster
190 Cd = Md; % distances between clusters
191
192 h = waitbar(0,'Constructing hierarchical clustering');
193
194 for i=1:n-1,
195
196 % tracking
197 waitbar(i/(n-1),h);
198
199 %% combine two closest clusters
200 % find the clusters which are closest to each other (c1 and c2)
201 [d,ind] = min(min(Cd));
202 if ~isfinite(d), break; end % no more connected clusters
203 [d,c1] = min(Cd(:,ind)); % cluster1
204 c2 = clusters(ind); % cluster2
205 % combine the two clusters
206 c1_inds = find(clusters==c1); % vectors belonging to c1
207 c2_inds = find(clusters==c2); % vectors belonging to c2
208 c_inds = [c1_inds, c2_inds]; % members of the new cluster
209 % new cluster index = bigger cluster
210 if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end
211 clusters(c_inds) = c; % update cluster info
212 Z(i,:) = [c, k, d]; % save info into Z
213
214 %% update cluster distances
215 % remove the subclusters from the Cd table
216 Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf
217 k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster
218 Cd(k_inds,:) = Inf; % distance of the subclusters to
219 Cd(:,k_inds) = Inf; % other clusters = Inf
220 % update the distance of this cluster to the other clusters
221 cl = unique(clusters(clusters ~= c)); % indeces of all other clusters
222 if ~isempty(cl), % added since v6.5 works differently than 6.1
223 for l=cl,
224 o_inds = find(clusters==l); % indeces belonging to cluster k
225 vd = Md(c_inds,o_inds); % distances between vectors in c and k
226 vd = vd(isfinite(vd(:))); % remove infinite distances (no connection)
227 len = length(vd);
228 if ~len, % if the two clusters are not connected, their distance in Inf
229 sd = Inf;
230 else % otherwise, calculate the distance between them
231 switch linkage,
232 case 'single', sd = min(vd);
233 case 'average', sd = sum(vd)/len;
234 case 'complete', sd = max(vd);
235 otherwise, error(['Unknown set distance: ' linkage]);
236 end
237 end
238 Cd(c,l) = sd; Cd(l,c) = sd;
239 end
240 end
241 end
242 close(h);
243
244 last = Z(i,1);
245 if isnan(last),
246 last = Z(i-1,1);
247 rest = setdiff(unique(clusters),last);
248 Z(i:n-1,1) = rest';
249 Z(i:n-1,2) = last;
250 Z(i:n-1,3) = Inf;
251 i = i - 1;
252 else
253 rest = [];
254 end
255
256 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
257 %% return values
258
259 % calculate the order of samples
260 order = last;
261 % go through the combination matrix from top to down
262 for k=i:-1:1,
263 c = Z(k,1); k = Z(k,2); % what (k) change to what (c)
264 j = find(order==c); % the occurance of c in order
265 if j == length(order), order = [order k]; % put k behind c
266 else order = [order(1:j) k order(j+1:end)];
267 end
268 end
269 order = [rest, order];
270
271 % to maintain compatibility with Statistics Toolbox, the values in
272 % Z must be yet transformed so that they are similar to the output
273 % of LINKAGE function
274
275 Zs = Z;
276 current_cluster = 1:dlen;
277 for i=1:size(Z,1),
278 Zs(i,1) = current_cluster(Z(i,1));
279 Zs(i,2) = current_cluster(Z(i,2));
280 current_cluster(Z(i,[1 2])) = dlen+i;
281 end
282
283 Z = Zs;
284
285 return;
286
287 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
288
289
290
291