Mercurial > hg > camir-aes2014
comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
1 function [Z,order,Md] = som_linkage(sM,varargin) | |
2 | |
3 %SOM_LINKAGE Make a hierarchical linkage of the SOM map units. | |
4 % | |
5 % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...]) | |
6 % | |
7 % Z = som_linkage(sM); | |
8 % Z = som_linkage(D,'complete'); | |
9 % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D))); | |
10 % Z = som_linkage(sM,pdist(sM.codebook,'mahal')); | |
11 % som_dendrogram(Z); | |
12 % | |
13 % Input and output arguments ([]'s are optional): | |
14 % sM (struct) map or data struct to be clustered | |
15 % (matrix) size dlen x dim, a data set: the matrix must not | |
16 % contain any NaN's! | |
17 % [argID, (string) See below. The values which are unambiguous can | |
18 % value] (varies) be given without the preceeding argID. | |
19 % | |
20 % Z (matrix) size dlen-1 x 3, the linkage info | |
21 % Z(i,1) and Z(i,2) hold the indeces of clusters | |
22 % combined on level i (starting from bottom). The new | |
23 % cluster has index dlen+i. The initial cluster | |
24 % index of each unit is its linear index in the | |
25 % original data matrix. Z(i,3) is the distance | |
26 % between the combined clusters. See LINKAGE | |
27 % function in the Statistics Toolbox. | |
28 % The ignored samples are listed at the | |
29 % end of the Z-matrix and have Z(*,3) == Inf | |
30 % Dist (matrix) size dlen x dlen, pairwise distance matrix | |
31 % | |
32 % Here are the valid argument IDs and corresponding values. The values | |
33 % which are unambiguous (marked with '*') can be given without the | |
34 % preceeding argID. | |
35 % 'linkage' *(string) the linkage criteria to use: 'single' (the | |
36 % default), 'average' or 'complete' | |
37 % 'topol' *(struct) topology struct | |
38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the | |
39 % connections should be allowed only between | |
40 % neighbors or between any vectors | |
41 % (matrix) size dlen x dlen indicating the connections | |
42 % between vectors | |
43 % (scalar) the N-neighborhood upto which the connections | |
44 % should be formed (implies 'neighbors') | |
45 % 'ignore' (vector) the units/vectors which should be ignored | |
46 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to | |
47 % be used instead of euclidian distances | |
48 % (vector) as the output of PDIST function | |
49 % (scalar) distance norm to use (euclidian = 2) | |
50 % 'mask' (vector) size dim x 1, the search mask used to | |
51 % weight distance calculation. By default | |
52 % sM.mask or a vector of ones is used. | |
53 % | |
54 % Note that together 'connect'='neighbors' and 'ignore' may form | |
55 % areas on the map which will never be connected: connections | |
56 % across the ignored map units simply do not exist. | |
57 % | |
58 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. | |
59 | |
60 % Copyright (c) 2000 by Juha Vesanto | |
61 % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto | |
62 % http://www.cis.hut.fi/projects/somtoolbox/ | |
63 | |
64 % Version 2.0beta juuso 160600 | |
65 | |
66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
67 %% input arguments | |
68 | |
69 % the data | |
70 if isstruct(sM), | |
71 if isfield(sM,'data'), D = sM.data; sT = []; mask = []; | |
72 else D = sM.codebook; sT = sM.topol; mask = sM.mask; | |
73 end | |
74 else | |
75 D = sM; sT = []; mask = []; | |
76 end | |
77 [dlen dim] = size(D); | |
78 if isempty(mask), mask = ones(dim,1); end | |
79 if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end | |
80 | |
81 % varargin | |
82 Md = 2; | |
83 linkage = 'single'; | |
84 ignore_units = []; | |
85 constrained = 0; | |
86 i=1; | |
87 while i<=length(varargin), | |
88 argok = 1; | |
89 if ischar(varargin{i}), | |
90 switch varargin{i}, | |
91 % argument IDs | |
92 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; | |
93 case 'connect', i=i+1; | |
94 if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any'); | |
95 else constrained = varargin{i}; end | |
96 case 'ignore', i=i+1; ignore_units = varargin{i}; | |
97 case 'dist', i=i+1; Md = varargin{i}; | |
98 case 'linkage', i=i+1; linkage = varargin{i}; | |
99 case 'mask', i=i+1; mask = varargin{i}; | |
100 case 'tracking',i=i+1; tracking = varargin{i}; | |
101 % unambiguous values | |
102 case 'neighbors', constrained = 1; | |
103 case 'any', constrained = 0; | |
104 case {'single','average','complete'}, linkage = varargin{i}; | |
105 otherwise argok=0; | |
106 end | |
107 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), | |
108 switch varargin{i}(1).type, | |
109 case 'som_topol', sTopol = varargin{i}; | |
110 otherwise argok=0; | |
111 end | |
112 else | |
113 argok = 0; | |
114 end | |
115 if ~argok, | |
116 disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]); | |
117 end | |
118 i = i+1; | |
119 end | |
120 | |
121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
122 %% distance matrix | |
123 | |
124 % given distance matrix % jh corrected this place totally 27.3. 03 | |
125 if (prod(size(Md))==1), % no explicit distance matrix, set flag | |
126 q=2; % 17.2.03 kr added some brackets | |
127 else | |
128 if (prod(size(Md))<dlen^2), % check pdist form | |
129 Md = squareform(Md); % transform to ordinary square diastance matrix | |
130 end | |
131 % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below | |
132 % sets self-distance to Inf! This must be set here also, | |
133 % otherwise clustering fails for user defined dist. matrix! | |
134 Md(eye(dlen)==1)=Inf; | |
135 end | |
136 | |
137 % neighborhood constraint | |
138 if length(constrained)==1 & constrained>0, | |
139 Ne1 = som_unit_neighs(sT); | |
140 Conn = som_neighborhood(Ne1,constrained); | |
141 Conn(~isfinite(Conn(:))) = 0; | |
142 else Conn = constrained; end | |
143 if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end | |
144 | |
145 % pairwise distance matrix across connected units | |
146 n = size(D,1); | |
147 if prod(size(Md))>1, | |
148 % remove distances between non-neighbors | |
149 if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end | |
150 else | |
151 % calculate pairwise distance matrix | |
152 q = Md; | |
153 Md = zeros(n,n)+Inf; | |
154 if ~constrained & q==2, % fast for the default case | |
155 for i = 1:n-1, | |
156 x = D(i,:); | |
157 inds = [(i+1):n]; | |
158 Diff = D(inds,:) - x(ones(n-i,1),:); | |
159 Md(inds,i) = sqrt((Diff.^2)*mask); | |
160 Md(i,inds) = Md(inds,i)'; | |
161 end | |
162 else | |
163 for i = 1:n-1, | |
164 inds = find(Conn(i,:)==1); | |
165 inds = inds(find(inds>i)); | |
166 Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:)); | |
167 switch q, | |
168 case 1, dist = Diff*mask; | |
169 case 2, dist = sqrt((Diff.^2)*mask); | |
170 case Inf, dist = max(Diff,[],2); | |
171 otherwise, dist = ((Diff.^q)*mask).^(1/q); | |
172 end | |
173 Md(inds,i) = dist; | |
174 Md(i,inds) = dist'; | |
175 end | |
176 end | |
177 end | |
178 | |
179 % set distances to ignored units to Inf | |
180 if ~isempty(ignore_units), | |
181 Md(ignore_units,:) = Inf; | |
182 Md(:,ignore_units) = Inf; | |
183 end | |
184 | |
185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
186 %% construct dendrogram | |
187 | |
188 Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step | |
189 clusters = 1:dlen; % each vector is at first in its own cluster | |
190 Cd = Md; % distances between clusters | |
191 | |
192 h = waitbar(0,'Constructing hierarchical clustering'); | |
193 | |
194 for i=1:n-1, | |
195 | |
196 % tracking | |
197 waitbar(i/(n-1),h); | |
198 | |
199 %% combine two closest clusters | |
200 % find the clusters which are closest to each other (c1 and c2) | |
201 [d,ind] = min(min(Cd)); | |
202 if ~isfinite(d), break; end % no more connected clusters | |
203 [d,c1] = min(Cd(:,ind)); % cluster1 | |
204 c2 = clusters(ind); % cluster2 | |
205 % combine the two clusters | |
206 c1_inds = find(clusters==c1); % vectors belonging to c1 | |
207 c2_inds = find(clusters==c2); % vectors belonging to c2 | |
208 c_inds = [c1_inds, c2_inds]; % members of the new cluster | |
209 % new cluster index = bigger cluster | |
210 if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end | |
211 clusters(c_inds) = c; % update cluster info | |
212 Z(i,:) = [c, k, d]; % save info into Z | |
213 | |
214 %% update cluster distances | |
215 % remove the subclusters from the Cd table | |
216 Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf | |
217 k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster | |
218 Cd(k_inds,:) = Inf; % distance of the subclusters to | |
219 Cd(:,k_inds) = Inf; % other clusters = Inf | |
220 % update the distance of this cluster to the other clusters | |
221 cl = unique(clusters(clusters ~= c)); % indeces of all other clusters | |
222 if ~isempty(cl), % added since v6.5 works differently than 6.1 | |
223 for l=cl, | |
224 o_inds = find(clusters==l); % indeces belonging to cluster k | |
225 vd = Md(c_inds,o_inds); % distances between vectors in c and k | |
226 vd = vd(isfinite(vd(:))); % remove infinite distances (no connection) | |
227 len = length(vd); | |
228 if ~len, % if the two clusters are not connected, their distance in Inf | |
229 sd = Inf; | |
230 else % otherwise, calculate the distance between them | |
231 switch linkage, | |
232 case 'single', sd = min(vd); | |
233 case 'average', sd = sum(vd)/len; | |
234 case 'complete', sd = max(vd); | |
235 otherwise, error(['Unknown set distance: ' linkage]); | |
236 end | |
237 end | |
238 Cd(c,l) = sd; Cd(l,c) = sd; | |
239 end | |
240 end | |
241 end | |
242 close(h); | |
243 | |
244 last = Z(i,1); | |
245 if isnan(last), | |
246 last = Z(i-1,1); | |
247 rest = setdiff(unique(clusters),last); | |
248 Z(i:n-1,1) = rest'; | |
249 Z(i:n-1,2) = last; | |
250 Z(i:n-1,3) = Inf; | |
251 i = i - 1; | |
252 else | |
253 rest = []; | |
254 end | |
255 | |
256 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
257 %% return values | |
258 | |
259 % calculate the order of samples | |
260 order = last; | |
261 % go through the combination matrix from top to down | |
262 for k=i:-1:1, | |
263 c = Z(k,1); k = Z(k,2); % what (k) change to what (c) | |
264 j = find(order==c); % the occurance of c in order | |
265 if j == length(order), order = [order k]; % put k behind c | |
266 else order = [order(1:j) k order(j+1:end)]; | |
267 end | |
268 end | |
269 order = [rest, order]; | |
270 | |
271 % to maintain compatibility with Statistics Toolbox, the values in | |
272 % Z must be yet transformed so that they are similar to the output | |
273 % of LINKAGE function | |
274 | |
275 Zs = Z; | |
276 current_cluster = 1:dlen; | |
277 for i=1:size(Z,1), | |
278 Zs(i,1) = current_cluster(Z(i,1)); | |
279 Zs(i,2) = current_cluster(Z(i,2)); | |
280 current_cluster(Z(i,[1 2])) = dlen+i; | |
281 end | |
282 | |
283 Z = Zs; | |
284 | |
285 return; | |
286 | |
287 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
288 | |
289 | |
290 | |
291 |