wolffd@0
|
1 function [Z,order,Md] = som_linkage(sM,varargin)
|
wolffd@0
|
2
|
wolffd@0
|
3 %SOM_LINKAGE Make a hierarchical linkage of the SOM map units.
|
wolffd@0
|
4 %
|
wolffd@0
|
5 % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...])
|
wolffd@0
|
6 %
|
wolffd@0
|
7 % Z = som_linkage(sM);
|
wolffd@0
|
8 % Z = som_linkage(D,'complete');
|
wolffd@0
|
9 % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D)));
|
wolffd@0
|
10 % Z = som_linkage(sM,pdist(sM.codebook,'mahal'));
|
wolffd@0
|
11 % som_dendrogram(Z);
|
wolffd@0
|
12 %
|
wolffd@0
|
13 % Input and output arguments ([]'s are optional):
|
wolffd@0
|
14 % sM (struct) map or data struct to be clustered
|
wolffd@0
|
15 % (matrix) size dlen x dim, a data set: the matrix must not
|
wolffd@0
|
16 % contain any NaN's!
|
wolffd@0
|
17 % [argID, (string) See below. The values which are unambiguous can
|
wolffd@0
|
18 % value] (varies) be given without the preceeding argID.
|
wolffd@0
|
19 %
|
wolffd@0
|
20 % Z (matrix) size dlen-1 x 3, the linkage info
|
wolffd@0
|
21 % Z(i,1) and Z(i,2) hold the indeces of clusters
|
wolffd@0
|
22 % combined on level i (starting from bottom). The new
|
wolffd@0
|
23 % cluster has index dlen+i. The initial cluster
|
wolffd@0
|
24 % index of each unit is its linear index in the
|
wolffd@0
|
25 % original data matrix. Z(i,3) is the distance
|
wolffd@0
|
26 % between the combined clusters. See LINKAGE
|
wolffd@0
|
27 % function in the Statistics Toolbox.
|
wolffd@0
|
28 % The ignored samples are listed at the
|
wolffd@0
|
29 % end of the Z-matrix and have Z(*,3) == Inf
|
wolffd@0
|
30 % Dist (matrix) size dlen x dlen, pairwise distance matrix
|
wolffd@0
|
31 %
|
wolffd@0
|
32 % Here are the valid argument IDs and corresponding values. The values
|
wolffd@0
|
33 % which are unambiguous (marked with '*') can be given without the
|
wolffd@0
|
34 % preceeding argID.
|
wolffd@0
|
35 % 'linkage' *(string) the linkage criteria to use: 'single' (the
|
wolffd@0
|
36 % default), 'average' or 'complete'
|
wolffd@0
|
37 % 'topol' *(struct) topology struct
|
wolffd@0
|
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
|
wolffd@0
|
39 % connections should be allowed only between
|
wolffd@0
|
40 % neighbors or between any vectors
|
wolffd@0
|
41 % (matrix) size dlen x dlen indicating the connections
|
wolffd@0
|
42 % between vectors
|
wolffd@0
|
43 % (scalar) the N-neighborhood upto which the connections
|
wolffd@0
|
44 % should be formed (implies 'neighbors')
|
wolffd@0
|
45 % 'ignore' (vector) the units/vectors which should be ignored
|
wolffd@0
|
46 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
|
wolffd@0
|
47 % be used instead of euclidian distances
|
wolffd@0
|
48 % (vector) as the output of PDIST function
|
wolffd@0
|
49 % (scalar) distance norm to use (euclidian = 2)
|
wolffd@0
|
50 % 'mask' (vector) size dim x 1, the search mask used to
|
wolffd@0
|
51 % weight distance calculation. By default
|
wolffd@0
|
52 % sM.mask or a vector of ones is used.
|
wolffd@0
|
53 %
|
wolffd@0
|
54 % Note that together 'connect'='neighbors' and 'ignore' may form
|
wolffd@0
|
55 % areas on the map which will never be connected: connections
|
wolffd@0
|
56 % across the ignored map units simply do not exist.
|
wolffd@0
|
57 %
|
wolffd@0
|
58 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
|
wolffd@0
|
59
|
wolffd@0
|
60 % Copyright (c) 2000 by Juha Vesanto
|
wolffd@0
|
61 % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto
|
wolffd@0
|
62 % http://www.cis.hut.fi/projects/somtoolbox/
|
wolffd@0
|
63
|
wolffd@0
|
64 % Version 2.0beta juuso 160600
|
wolffd@0
|
65
|
wolffd@0
|
66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
67 %% input arguments
|
wolffd@0
|
68
|
wolffd@0
|
69 % the data
|
wolffd@0
|
70 if isstruct(sM),
|
wolffd@0
|
71 if isfield(sM,'data'), D = sM.data; sT = []; mask = [];
|
wolffd@0
|
72 else D = sM.codebook; sT = sM.topol; mask = sM.mask;
|
wolffd@0
|
73 end
|
wolffd@0
|
74 else
|
wolffd@0
|
75 D = sM; sT = []; mask = [];
|
wolffd@0
|
76 end
|
wolffd@0
|
77 [dlen dim] = size(D);
|
wolffd@0
|
78 if isempty(mask), mask = ones(dim,1); end
|
wolffd@0
|
79 if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end
|
wolffd@0
|
80
|
wolffd@0
|
81 % varargin
|
wolffd@0
|
82 Md = 2;
|
wolffd@0
|
83 linkage = 'single';
|
wolffd@0
|
84 ignore_units = [];
|
wolffd@0
|
85 constrained = 0;
|
wolffd@0
|
86 i=1;
|
wolffd@0
|
87 while i<=length(varargin),
|
wolffd@0
|
88 argok = 1;
|
wolffd@0
|
89 if ischar(varargin{i}),
|
wolffd@0
|
90 switch varargin{i},
|
wolffd@0
|
91 % argument IDs
|
wolffd@0
|
92 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
|
wolffd@0
|
93 case 'connect', i=i+1;
|
wolffd@0
|
94 if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any');
|
wolffd@0
|
95 else constrained = varargin{i}; end
|
wolffd@0
|
96 case 'ignore', i=i+1; ignore_units = varargin{i};
|
wolffd@0
|
97 case 'dist', i=i+1; Md = varargin{i};
|
wolffd@0
|
98 case 'linkage', i=i+1; linkage = varargin{i};
|
wolffd@0
|
99 case 'mask', i=i+1; mask = varargin{i};
|
wolffd@0
|
100 case 'tracking',i=i+1; tracking = varargin{i};
|
wolffd@0
|
101 % unambiguous values
|
wolffd@0
|
102 case 'neighbors', constrained = 1;
|
wolffd@0
|
103 case 'any', constrained = 0;
|
wolffd@0
|
104 case {'single','average','complete'}, linkage = varargin{i};
|
wolffd@0
|
105 otherwise argok=0;
|
wolffd@0
|
106 end
|
wolffd@0
|
107 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
|
wolffd@0
|
108 switch varargin{i}(1).type,
|
wolffd@0
|
109 case 'som_topol', sTopol = varargin{i};
|
wolffd@0
|
110 otherwise argok=0;
|
wolffd@0
|
111 end
|
wolffd@0
|
112 else
|
wolffd@0
|
113 argok = 0;
|
wolffd@0
|
114 end
|
wolffd@0
|
115 if ~argok,
|
wolffd@0
|
116 disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]);
|
wolffd@0
|
117 end
|
wolffd@0
|
118 i = i+1;
|
wolffd@0
|
119 end
|
wolffd@0
|
120
|
wolffd@0
|
121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
122 %% distance matrix
|
wolffd@0
|
123
|
wolffd@0
|
124 % given distance matrix % jh corrected this place totally 27.3. 03
|
wolffd@0
|
125 if (prod(size(Md))==1), % no explicit distance matrix, set flag
|
wolffd@0
|
126 q=2; % 17.2.03 kr added some brackets
|
wolffd@0
|
127 else
|
wolffd@0
|
128 if (prod(size(Md))<dlen^2), % check pdist form
|
wolffd@0
|
129 Md = squareform(Md); % transform to ordinary square diastance matrix
|
wolffd@0
|
130 end
|
wolffd@0
|
131 % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below
|
wolffd@0
|
132 % sets self-distance to Inf! This must be set here also,
|
wolffd@0
|
133 % otherwise clustering fails for user defined dist. matrix!
|
wolffd@0
|
134 Md(eye(dlen)==1)=Inf;
|
wolffd@0
|
135 end
|
wolffd@0
|
136
|
wolffd@0
|
137 % neighborhood constraint
|
wolffd@0
|
138 if length(constrained)==1 & constrained>0,
|
wolffd@0
|
139 Ne1 = som_unit_neighs(sT);
|
wolffd@0
|
140 Conn = som_neighborhood(Ne1,constrained);
|
wolffd@0
|
141 Conn(~isfinite(Conn(:))) = 0;
|
wolffd@0
|
142 else Conn = constrained; end
|
wolffd@0
|
143 if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end
|
wolffd@0
|
144
|
wolffd@0
|
145 % pairwise distance matrix across connected units
|
wolffd@0
|
146 n = size(D,1);
|
wolffd@0
|
147 if prod(size(Md))>1,
|
wolffd@0
|
148 % remove distances between non-neighbors
|
wolffd@0
|
149 if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end
|
wolffd@0
|
150 else
|
wolffd@0
|
151 % calculate pairwise distance matrix
|
wolffd@0
|
152 q = Md;
|
wolffd@0
|
153 Md = zeros(n,n)+Inf;
|
wolffd@0
|
154 if ~constrained & q==2, % fast for the default case
|
wolffd@0
|
155 for i = 1:n-1,
|
wolffd@0
|
156 x = D(i,:);
|
wolffd@0
|
157 inds = [(i+1):n];
|
wolffd@0
|
158 Diff = D(inds,:) - x(ones(n-i,1),:);
|
wolffd@0
|
159 Md(inds,i) = sqrt((Diff.^2)*mask);
|
wolffd@0
|
160 Md(i,inds) = Md(inds,i)';
|
wolffd@0
|
161 end
|
wolffd@0
|
162 else
|
wolffd@0
|
163 for i = 1:n-1,
|
wolffd@0
|
164 inds = find(Conn(i,:)==1);
|
wolffd@0
|
165 inds = inds(find(inds>i));
|
wolffd@0
|
166 Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:));
|
wolffd@0
|
167 switch q,
|
wolffd@0
|
168 case 1, dist = Diff*mask;
|
wolffd@0
|
169 case 2, dist = sqrt((Diff.^2)*mask);
|
wolffd@0
|
170 case Inf, dist = max(Diff,[],2);
|
wolffd@0
|
171 otherwise, dist = ((Diff.^q)*mask).^(1/q);
|
wolffd@0
|
172 end
|
wolffd@0
|
173 Md(inds,i) = dist;
|
wolffd@0
|
174 Md(i,inds) = dist';
|
wolffd@0
|
175 end
|
wolffd@0
|
176 end
|
wolffd@0
|
177 end
|
wolffd@0
|
178
|
wolffd@0
|
179 % set distances to ignored units to Inf
|
wolffd@0
|
180 if ~isempty(ignore_units),
|
wolffd@0
|
181 Md(ignore_units,:) = Inf;
|
wolffd@0
|
182 Md(:,ignore_units) = Inf;
|
wolffd@0
|
183 end
|
wolffd@0
|
184
|
wolffd@0
|
185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
186 %% construct dendrogram
|
wolffd@0
|
187
|
wolffd@0
|
188 Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step
|
wolffd@0
|
189 clusters = 1:dlen; % each vector is at first in its own cluster
|
wolffd@0
|
190 Cd = Md; % distances between clusters
|
wolffd@0
|
191
|
wolffd@0
|
192 h = waitbar(0,'Constructing hierarchical clustering');
|
wolffd@0
|
193
|
wolffd@0
|
194 for i=1:n-1,
|
wolffd@0
|
195
|
wolffd@0
|
196 % tracking
|
wolffd@0
|
197 waitbar(i/(n-1),h);
|
wolffd@0
|
198
|
wolffd@0
|
199 %% combine two closest clusters
|
wolffd@0
|
200 % find the clusters which are closest to each other (c1 and c2)
|
wolffd@0
|
201 [d,ind] = min(min(Cd));
|
wolffd@0
|
202 if ~isfinite(d), break; end % no more connected clusters
|
wolffd@0
|
203 [d,c1] = min(Cd(:,ind)); % cluster1
|
wolffd@0
|
204 c2 = clusters(ind); % cluster2
|
wolffd@0
|
205 % combine the two clusters
|
wolffd@0
|
206 c1_inds = find(clusters==c1); % vectors belonging to c1
|
wolffd@0
|
207 c2_inds = find(clusters==c2); % vectors belonging to c2
|
wolffd@0
|
208 c_inds = [c1_inds, c2_inds]; % members of the new cluster
|
wolffd@0
|
209 % new cluster index = bigger cluster
|
wolffd@0
|
210 if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end
|
wolffd@0
|
211 clusters(c_inds) = c; % update cluster info
|
wolffd@0
|
212 Z(i,:) = [c, k, d]; % save info into Z
|
wolffd@0
|
213
|
wolffd@0
|
214 %% update cluster distances
|
wolffd@0
|
215 % remove the subclusters from the Cd table
|
wolffd@0
|
216 Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf
|
wolffd@0
|
217 k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster
|
wolffd@0
|
218 Cd(k_inds,:) = Inf; % distance of the subclusters to
|
wolffd@0
|
219 Cd(:,k_inds) = Inf; % other clusters = Inf
|
wolffd@0
|
220 % update the distance of this cluster to the other clusters
|
wolffd@0
|
221 cl = unique(clusters(clusters ~= c)); % indeces of all other clusters
|
wolffd@0
|
222 if ~isempty(cl), % added since v6.5 works differently than 6.1
|
wolffd@0
|
223 for l=cl,
|
wolffd@0
|
224 o_inds = find(clusters==l); % indeces belonging to cluster k
|
wolffd@0
|
225 vd = Md(c_inds,o_inds); % distances between vectors in c and k
|
wolffd@0
|
226 vd = vd(isfinite(vd(:))); % remove infinite distances (no connection)
|
wolffd@0
|
227 len = length(vd);
|
wolffd@0
|
228 if ~len, % if the two clusters are not connected, their distance in Inf
|
wolffd@0
|
229 sd = Inf;
|
wolffd@0
|
230 else % otherwise, calculate the distance between them
|
wolffd@0
|
231 switch linkage,
|
wolffd@0
|
232 case 'single', sd = min(vd);
|
wolffd@0
|
233 case 'average', sd = sum(vd)/len;
|
wolffd@0
|
234 case 'complete', sd = max(vd);
|
wolffd@0
|
235 otherwise, error(['Unknown set distance: ' linkage]);
|
wolffd@0
|
236 end
|
wolffd@0
|
237 end
|
wolffd@0
|
238 Cd(c,l) = sd; Cd(l,c) = sd;
|
wolffd@0
|
239 end
|
wolffd@0
|
240 end
|
wolffd@0
|
241 end
|
wolffd@0
|
242 close(h);
|
wolffd@0
|
243
|
wolffd@0
|
244 last = Z(i,1);
|
wolffd@0
|
245 if isnan(last),
|
wolffd@0
|
246 last = Z(i-1,1);
|
wolffd@0
|
247 rest = setdiff(unique(clusters),last);
|
wolffd@0
|
248 Z(i:n-1,1) = rest';
|
wolffd@0
|
249 Z(i:n-1,2) = last;
|
wolffd@0
|
250 Z(i:n-1,3) = Inf;
|
wolffd@0
|
251 i = i - 1;
|
wolffd@0
|
252 else
|
wolffd@0
|
253 rest = [];
|
wolffd@0
|
254 end
|
wolffd@0
|
255
|
wolffd@0
|
256 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
257 %% return values
|
wolffd@0
|
258
|
wolffd@0
|
259 % calculate the order of samples
|
wolffd@0
|
260 order = last;
|
wolffd@0
|
261 % go through the combination matrix from top to down
|
wolffd@0
|
262 for k=i:-1:1,
|
wolffd@0
|
263 c = Z(k,1); k = Z(k,2); % what (k) change to what (c)
|
wolffd@0
|
264 j = find(order==c); % the occurance of c in order
|
wolffd@0
|
265 if j == length(order), order = [order k]; % put k behind c
|
wolffd@0
|
266 else order = [order(1:j) k order(j+1:end)];
|
wolffd@0
|
267 end
|
wolffd@0
|
268 end
|
wolffd@0
|
269 order = [rest, order];
|
wolffd@0
|
270
|
wolffd@0
|
271 % to maintain compatibility with Statistics Toolbox, the values in
|
wolffd@0
|
272 % Z must be yet transformed so that they are similar to the output
|
wolffd@0
|
273 % of LINKAGE function
|
wolffd@0
|
274
|
wolffd@0
|
275 Zs = Z;
|
wolffd@0
|
276 current_cluster = 1:dlen;
|
wolffd@0
|
277 for i=1:size(Z,1),
|
wolffd@0
|
278 Zs(i,1) = current_cluster(Z(i,1));
|
wolffd@0
|
279 Zs(i,2) = current_cluster(Z(i,2));
|
wolffd@0
|
280 current_cluster(Z(i,[1 2])) = dlen+i;
|
wolffd@0
|
281 end
|
wolffd@0
|
282
|
wolffd@0
|
283 Z = Zs;
|
wolffd@0
|
284
|
wolffd@0
|
285 return;
|
wolffd@0
|
286
|
wolffd@0
|
287 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
288
|
wolffd@0
|
289
|
wolffd@0
|
290
|
wolffd@0
|
291
|