annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
wolffd@0 1 function [Z,order,Md] = som_linkage(sM,varargin)
wolffd@0 2
wolffd@0 3 %SOM_LINKAGE Make a hierarchical linkage of the SOM map units.
wolffd@0 4 %
wolffd@0 5 % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...])
wolffd@0 6 %
wolffd@0 7 % Z = som_linkage(sM);
wolffd@0 8 % Z = som_linkage(D,'complete');
wolffd@0 9 % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D)));
wolffd@0 10 % Z = som_linkage(sM,pdist(sM.codebook,'mahal'));
wolffd@0 11 % som_dendrogram(Z);
wolffd@0 12 %
wolffd@0 13 % Input and output arguments ([]'s are optional):
wolffd@0 14 % sM (struct) map or data struct to be clustered
wolffd@0 15 % (matrix) size dlen x dim, a data set: the matrix must not
wolffd@0 16 % contain any NaN's!
wolffd@0 17 % [argID, (string) See below. The values which are unambiguous can
wolffd@0 18 % value] (varies) be given without the preceeding argID.
wolffd@0 19 %
wolffd@0 20 % Z (matrix) size dlen-1 x 3, the linkage info
wolffd@0 21 % Z(i,1) and Z(i,2) hold the indeces of clusters
wolffd@0 22 % combined on level i (starting from bottom). The new
wolffd@0 23 % cluster has index dlen+i. The initial cluster
wolffd@0 24 % index of each unit is its linear index in the
wolffd@0 25 % original data matrix. Z(i,3) is the distance
wolffd@0 26 % between the combined clusters. See LINKAGE
wolffd@0 27 % function in the Statistics Toolbox.
wolffd@0 28 % The ignored samples are listed at the
wolffd@0 29 % end of the Z-matrix and have Z(*,3) == Inf
wolffd@0 30 % Dist (matrix) size dlen x dlen, pairwise distance matrix
wolffd@0 31 %
wolffd@0 32 % Here are the valid argument IDs and corresponding values. The values
wolffd@0 33 % which are unambiguous (marked with '*') can be given without the
wolffd@0 34 % preceeding argID.
wolffd@0 35 % 'linkage' *(string) the linkage criteria to use: 'single' (the
wolffd@0 36 % default), 'average' or 'complete'
wolffd@0 37 % 'topol' *(struct) topology struct
wolffd@0 38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
wolffd@0 39 % connections should be allowed only between
wolffd@0 40 % neighbors or between any vectors
wolffd@0 41 % (matrix) size dlen x dlen indicating the connections
wolffd@0 42 % between vectors
wolffd@0 43 % (scalar) the N-neighborhood upto which the connections
wolffd@0 44 % should be formed (implies 'neighbors')
wolffd@0 45 % 'ignore' (vector) the units/vectors which should be ignored
wolffd@0 46 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
wolffd@0 47 % be used instead of euclidian distances
wolffd@0 48 % (vector) as the output of PDIST function
wolffd@0 49 % (scalar) distance norm to use (euclidian = 2)
wolffd@0 50 % 'mask' (vector) size dim x 1, the search mask used to
wolffd@0 51 % weight distance calculation. By default
wolffd@0 52 % sM.mask or a vector of ones is used.
wolffd@0 53 %
wolffd@0 54 % Note that together 'connect'='neighbors' and 'ignore' may form
wolffd@0 55 % areas on the map which will never be connected: connections
wolffd@0 56 % across the ignored map units simply do not exist.
wolffd@0 57 %
wolffd@0 58 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
wolffd@0 59
wolffd@0 60 % Copyright (c) 2000 by Juha Vesanto
wolffd@0 61 % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto
wolffd@0 62 % http://www.cis.hut.fi/projects/somtoolbox/
wolffd@0 63
wolffd@0 64 % Version 2.0beta juuso 160600
wolffd@0 65
wolffd@0 66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 67 %% input arguments
wolffd@0 68
wolffd@0 69 % the data
wolffd@0 70 if isstruct(sM),
wolffd@0 71 if isfield(sM,'data'), D = sM.data; sT = []; mask = [];
wolffd@0 72 else D = sM.codebook; sT = sM.topol; mask = sM.mask;
wolffd@0 73 end
wolffd@0 74 else
wolffd@0 75 D = sM; sT = []; mask = [];
wolffd@0 76 end
wolffd@0 77 [dlen dim] = size(D);
wolffd@0 78 if isempty(mask), mask = ones(dim,1); end
wolffd@0 79 if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end
wolffd@0 80
wolffd@0 81 % varargin
wolffd@0 82 Md = 2;
wolffd@0 83 linkage = 'single';
wolffd@0 84 ignore_units = [];
wolffd@0 85 constrained = 0;
wolffd@0 86 i=1;
wolffd@0 87 while i<=length(varargin),
wolffd@0 88 argok = 1;
wolffd@0 89 if ischar(varargin{i}),
wolffd@0 90 switch varargin{i},
wolffd@0 91 % argument IDs
wolffd@0 92 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
wolffd@0 93 case 'connect', i=i+1;
wolffd@0 94 if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any');
wolffd@0 95 else constrained = varargin{i}; end
wolffd@0 96 case 'ignore', i=i+1; ignore_units = varargin{i};
wolffd@0 97 case 'dist', i=i+1; Md = varargin{i};
wolffd@0 98 case 'linkage', i=i+1; linkage = varargin{i};
wolffd@0 99 case 'mask', i=i+1; mask = varargin{i};
wolffd@0 100 case 'tracking',i=i+1; tracking = varargin{i};
wolffd@0 101 % unambiguous values
wolffd@0 102 case 'neighbors', constrained = 1;
wolffd@0 103 case 'any', constrained = 0;
wolffd@0 104 case {'single','average','complete'}, linkage = varargin{i};
wolffd@0 105 otherwise argok=0;
wolffd@0 106 end
wolffd@0 107 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
wolffd@0 108 switch varargin{i}(1).type,
wolffd@0 109 case 'som_topol', sTopol = varargin{i};
wolffd@0 110 otherwise argok=0;
wolffd@0 111 end
wolffd@0 112 else
wolffd@0 113 argok = 0;
wolffd@0 114 end
wolffd@0 115 if ~argok,
wolffd@0 116 disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]);
wolffd@0 117 end
wolffd@0 118 i = i+1;
wolffd@0 119 end
wolffd@0 120
wolffd@0 121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 122 %% distance matrix
wolffd@0 123
wolffd@0 124 % given distance matrix % jh corrected this place totally 27.3. 03
wolffd@0 125 if (prod(size(Md))==1), % no explicit distance matrix, set flag
wolffd@0 126 q=2; % 17.2.03 kr added some brackets
wolffd@0 127 else
wolffd@0 128 if (prod(size(Md))<dlen^2), % check pdist form
wolffd@0 129 Md = squareform(Md); % transform to ordinary square diastance matrix
wolffd@0 130 end
wolffd@0 131 % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below
wolffd@0 132 % sets self-distance to Inf! This must be set here also,
wolffd@0 133 % otherwise clustering fails for user defined dist. matrix!
wolffd@0 134 Md(eye(dlen)==1)=Inf;
wolffd@0 135 end
wolffd@0 136
wolffd@0 137 % neighborhood constraint
wolffd@0 138 if length(constrained)==1 & constrained>0,
wolffd@0 139 Ne1 = som_unit_neighs(sT);
wolffd@0 140 Conn = som_neighborhood(Ne1,constrained);
wolffd@0 141 Conn(~isfinite(Conn(:))) = 0;
wolffd@0 142 else Conn = constrained; end
wolffd@0 143 if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end
wolffd@0 144
wolffd@0 145 % pairwise distance matrix across connected units
wolffd@0 146 n = size(D,1);
wolffd@0 147 if prod(size(Md))>1,
wolffd@0 148 % remove distances between non-neighbors
wolffd@0 149 if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end
wolffd@0 150 else
wolffd@0 151 % calculate pairwise distance matrix
wolffd@0 152 q = Md;
wolffd@0 153 Md = zeros(n,n)+Inf;
wolffd@0 154 if ~constrained & q==2, % fast for the default case
wolffd@0 155 for i = 1:n-1,
wolffd@0 156 x = D(i,:);
wolffd@0 157 inds = [(i+1):n];
wolffd@0 158 Diff = D(inds,:) - x(ones(n-i,1),:);
wolffd@0 159 Md(inds,i) = sqrt((Diff.^2)*mask);
wolffd@0 160 Md(i,inds) = Md(inds,i)';
wolffd@0 161 end
wolffd@0 162 else
wolffd@0 163 for i = 1:n-1,
wolffd@0 164 inds = find(Conn(i,:)==1);
wolffd@0 165 inds = inds(find(inds>i));
wolffd@0 166 Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:));
wolffd@0 167 switch q,
wolffd@0 168 case 1, dist = Diff*mask;
wolffd@0 169 case 2, dist = sqrt((Diff.^2)*mask);
wolffd@0 170 case Inf, dist = max(Diff,[],2);
wolffd@0 171 otherwise, dist = ((Diff.^q)*mask).^(1/q);
wolffd@0 172 end
wolffd@0 173 Md(inds,i) = dist;
wolffd@0 174 Md(i,inds) = dist';
wolffd@0 175 end
wolffd@0 176 end
wolffd@0 177 end
wolffd@0 178
wolffd@0 179 % set distances to ignored units to Inf
wolffd@0 180 if ~isempty(ignore_units),
wolffd@0 181 Md(ignore_units,:) = Inf;
wolffd@0 182 Md(:,ignore_units) = Inf;
wolffd@0 183 end
wolffd@0 184
wolffd@0 185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 186 %% construct dendrogram
wolffd@0 187
wolffd@0 188 Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step
wolffd@0 189 clusters = 1:dlen; % each vector is at first in its own cluster
wolffd@0 190 Cd = Md; % distances between clusters
wolffd@0 191
wolffd@0 192 h = waitbar(0,'Constructing hierarchical clustering');
wolffd@0 193
wolffd@0 194 for i=1:n-1,
wolffd@0 195
wolffd@0 196 % tracking
wolffd@0 197 waitbar(i/(n-1),h);
wolffd@0 198
wolffd@0 199 %% combine two closest clusters
wolffd@0 200 % find the clusters which are closest to each other (c1 and c2)
wolffd@0 201 [d,ind] = min(min(Cd));
wolffd@0 202 if ~isfinite(d), break; end % no more connected clusters
wolffd@0 203 [d,c1] = min(Cd(:,ind)); % cluster1
wolffd@0 204 c2 = clusters(ind); % cluster2
wolffd@0 205 % combine the two clusters
wolffd@0 206 c1_inds = find(clusters==c1); % vectors belonging to c1
wolffd@0 207 c2_inds = find(clusters==c2); % vectors belonging to c2
wolffd@0 208 c_inds = [c1_inds, c2_inds]; % members of the new cluster
wolffd@0 209 % new cluster index = bigger cluster
wolffd@0 210 if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end
wolffd@0 211 clusters(c_inds) = c; % update cluster info
wolffd@0 212 Z(i,:) = [c, k, d]; % save info into Z
wolffd@0 213
wolffd@0 214 %% update cluster distances
wolffd@0 215 % remove the subclusters from the Cd table
wolffd@0 216 Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf
wolffd@0 217 k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster
wolffd@0 218 Cd(k_inds,:) = Inf; % distance of the subclusters to
wolffd@0 219 Cd(:,k_inds) = Inf; % other clusters = Inf
wolffd@0 220 % update the distance of this cluster to the other clusters
wolffd@0 221 cl = unique(clusters(clusters ~= c)); % indeces of all other clusters
wolffd@0 222 if ~isempty(cl), % added since v6.5 works differently than 6.1
wolffd@0 223 for l=cl,
wolffd@0 224 o_inds = find(clusters==l); % indeces belonging to cluster k
wolffd@0 225 vd = Md(c_inds,o_inds); % distances between vectors in c and k
wolffd@0 226 vd = vd(isfinite(vd(:))); % remove infinite distances (no connection)
wolffd@0 227 len = length(vd);
wolffd@0 228 if ~len, % if the two clusters are not connected, their distance in Inf
wolffd@0 229 sd = Inf;
wolffd@0 230 else % otherwise, calculate the distance between them
wolffd@0 231 switch linkage,
wolffd@0 232 case 'single', sd = min(vd);
wolffd@0 233 case 'average', sd = sum(vd)/len;
wolffd@0 234 case 'complete', sd = max(vd);
wolffd@0 235 otherwise, error(['Unknown set distance: ' linkage]);
wolffd@0 236 end
wolffd@0 237 end
wolffd@0 238 Cd(c,l) = sd; Cd(l,c) = sd;
wolffd@0 239 end
wolffd@0 240 end
wolffd@0 241 end
wolffd@0 242 close(h);
wolffd@0 243
wolffd@0 244 last = Z(i,1);
wolffd@0 245 if isnan(last),
wolffd@0 246 last = Z(i-1,1);
wolffd@0 247 rest = setdiff(unique(clusters),last);
wolffd@0 248 Z(i:n-1,1) = rest';
wolffd@0 249 Z(i:n-1,2) = last;
wolffd@0 250 Z(i:n-1,3) = Inf;
wolffd@0 251 i = i - 1;
wolffd@0 252 else
wolffd@0 253 rest = [];
wolffd@0 254 end
wolffd@0 255
wolffd@0 256 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 257 %% return values
wolffd@0 258
wolffd@0 259 % calculate the order of samples
wolffd@0 260 order = last;
wolffd@0 261 % go through the combination matrix from top to down
wolffd@0 262 for k=i:-1:1,
wolffd@0 263 c = Z(k,1); k = Z(k,2); % what (k) change to what (c)
wolffd@0 264 j = find(order==c); % the occurance of c in order
wolffd@0 265 if j == length(order), order = [order k]; % put k behind c
wolffd@0 266 else order = [order(1:j) k order(j+1:end)];
wolffd@0 267 end
wolffd@0 268 end
wolffd@0 269 order = [rest, order];
wolffd@0 270
wolffd@0 271 % to maintain compatibility with Statistics Toolbox, the values in
wolffd@0 272 % Z must be yet transformed so that they are similar to the output
wolffd@0 273 % of LINKAGE function
wolffd@0 274
wolffd@0 275 Zs = Z;
wolffd@0 276 current_cluster = 1:dlen;
wolffd@0 277 for i=1:size(Z,1),
wolffd@0 278 Zs(i,1) = current_cluster(Z(i,1));
wolffd@0 279 Zs(i,2) = current_cluster(Z(i,2));
wolffd@0 280 current_cluster(Z(i,[1 2])) = dlen+i;
wolffd@0 281 end
wolffd@0 282
wolffd@0 283 Z = Zs;
wolffd@0 284
wolffd@0 285 return;
wolffd@0 286
wolffd@0 287 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 288
wolffd@0 289
wolffd@0 290
wolffd@0 291