annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m @ 0:cc4b1211e677 tip

initial commit to HG from Changeset: 646 (e263d8a21543) added further path and more save "camirversion.m"
author Daniel Wolff
date Fri, 19 Aug 2016 13:07:06 +0200
parents
children
rev   line source
Daniel@0 1 function [Z,order,Md] = som_linkage(sM,varargin)
Daniel@0 2
Daniel@0 3 %SOM_LINKAGE Make a hierarchical linkage of the SOM map units.
Daniel@0 4 %
Daniel@0 5 % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...])
Daniel@0 6 %
Daniel@0 7 % Z = som_linkage(sM);
Daniel@0 8 % Z = som_linkage(D,'complete');
Daniel@0 9 % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D)));
Daniel@0 10 % Z = som_linkage(sM,pdist(sM.codebook,'mahal'));
Daniel@0 11 % som_dendrogram(Z);
Daniel@0 12 %
Daniel@0 13 % Input and output arguments ([]'s are optional):
Daniel@0 14 % sM (struct) map or data struct to be clustered
Daniel@0 15 % (matrix) size dlen x dim, a data set: the matrix must not
Daniel@0 16 % contain any NaN's!
Daniel@0 17 % [argID, (string) See below. The values which are unambiguous can
Daniel@0 18 % value] (varies) be given without the preceeding argID.
Daniel@0 19 %
Daniel@0 20 % Z (matrix) size dlen-1 x 3, the linkage info
Daniel@0 21 % Z(i,1) and Z(i,2) hold the indeces of clusters
Daniel@0 22 % combined on level i (starting from bottom). The new
Daniel@0 23 % cluster has index dlen+i. The initial cluster
Daniel@0 24 % index of each unit is its linear index in the
Daniel@0 25 % original data matrix. Z(i,3) is the distance
Daniel@0 26 % between the combined clusters. See LINKAGE
Daniel@0 27 % function in the Statistics Toolbox.
Daniel@0 28 % The ignored samples are listed at the
Daniel@0 29 % end of the Z-matrix and have Z(*,3) == Inf
Daniel@0 30 % Dist (matrix) size dlen x dlen, pairwise distance matrix
Daniel@0 31 %
Daniel@0 32 % Here are the valid argument IDs and corresponding values. The values
Daniel@0 33 % which are unambiguous (marked with '*') can be given without the
Daniel@0 34 % preceeding argID.
Daniel@0 35 % 'linkage' *(string) the linkage criteria to use: 'single' (the
Daniel@0 36 % default), 'average' or 'complete'
Daniel@0 37 % 'topol' *(struct) topology struct
Daniel@0 38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
Daniel@0 39 % connections should be allowed only between
Daniel@0 40 % neighbors or between any vectors
Daniel@0 41 % (matrix) size dlen x dlen indicating the connections
Daniel@0 42 % between vectors
Daniel@0 43 % (scalar) the N-neighborhood upto which the connections
Daniel@0 44 % should be formed (implies 'neighbors')
Daniel@0 45 % 'ignore' (vector) the units/vectors which should be ignored
Daniel@0 46 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
Daniel@0 47 % be used instead of euclidian distances
Daniel@0 48 % (vector) as the output of PDIST function
Daniel@0 49 % (scalar) distance norm to use (euclidian = 2)
Daniel@0 50 % 'mask' (vector) size dim x 1, the search mask used to
Daniel@0 51 % weight distance calculation. By default
Daniel@0 52 % sM.mask or a vector of ones is used.
Daniel@0 53 %
Daniel@0 54 % Note that together 'connect'='neighbors' and 'ignore' may form
Daniel@0 55 % areas on the map which will never be connected: connections
Daniel@0 56 % across the ignored map units simply do not exist.
Daniel@0 57 %
Daniel@0 58 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
Daniel@0 59
Daniel@0 60 % Copyright (c) 2000 by Juha Vesanto
Daniel@0 61 % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto
Daniel@0 62 % http://www.cis.hut.fi/projects/somtoolbox/
Daniel@0 63
Daniel@0 64 % Version 2.0beta juuso 160600
Daniel@0 65
Daniel@0 66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 67 %% input arguments
Daniel@0 68
Daniel@0 69 % the data
Daniel@0 70 if isstruct(sM),
Daniel@0 71 if isfield(sM,'data'), D = sM.data; sT = []; mask = [];
Daniel@0 72 else D = sM.codebook; sT = sM.topol; mask = sM.mask;
Daniel@0 73 end
Daniel@0 74 else
Daniel@0 75 D = sM; sT = []; mask = [];
Daniel@0 76 end
Daniel@0 77 [dlen dim] = size(D);
Daniel@0 78 if isempty(mask), mask = ones(dim,1); end
Daniel@0 79 if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end
Daniel@0 80
Daniel@0 81 % varargin
Daniel@0 82 Md = 2;
Daniel@0 83 linkage = 'single';
Daniel@0 84 ignore_units = [];
Daniel@0 85 constrained = 0;
Daniel@0 86 i=1;
Daniel@0 87 while i<=length(varargin),
Daniel@0 88 argok = 1;
Daniel@0 89 if ischar(varargin{i}),
Daniel@0 90 switch varargin{i},
Daniel@0 91 % argument IDs
Daniel@0 92 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
Daniel@0 93 case 'connect', i=i+1;
Daniel@0 94 if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any');
Daniel@0 95 else constrained = varargin{i}; end
Daniel@0 96 case 'ignore', i=i+1; ignore_units = varargin{i};
Daniel@0 97 case 'dist', i=i+1; Md = varargin{i};
Daniel@0 98 case 'linkage', i=i+1; linkage = varargin{i};
Daniel@0 99 case 'mask', i=i+1; mask = varargin{i};
Daniel@0 100 case 'tracking',i=i+1; tracking = varargin{i};
Daniel@0 101 % unambiguous values
Daniel@0 102 case 'neighbors', constrained = 1;
Daniel@0 103 case 'any', constrained = 0;
Daniel@0 104 case {'single','average','complete'}, linkage = varargin{i};
Daniel@0 105 otherwise argok=0;
Daniel@0 106 end
Daniel@0 107 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
Daniel@0 108 switch varargin{i}(1).type,
Daniel@0 109 case 'som_topol', sTopol = varargin{i};
Daniel@0 110 otherwise argok=0;
Daniel@0 111 end
Daniel@0 112 else
Daniel@0 113 argok = 0;
Daniel@0 114 end
Daniel@0 115 if ~argok,
Daniel@0 116 disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]);
Daniel@0 117 end
Daniel@0 118 i = i+1;
Daniel@0 119 end
Daniel@0 120
Daniel@0 121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 122 %% distance matrix
Daniel@0 123
Daniel@0 124 % given distance matrix % jh corrected this place totally 27.3. 03
Daniel@0 125 if (prod(size(Md))==1), % no explicit distance matrix, set flag
Daniel@0 126 q=2; % 17.2.03 kr added some brackets
Daniel@0 127 else
Daniel@0 128 if (prod(size(Md))<dlen^2), % check pdist form
Daniel@0 129 Md = squareform(Md); % transform to ordinary square diastance matrix
Daniel@0 130 end
Daniel@0 131 % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below
Daniel@0 132 % sets self-distance to Inf! This must be set here also,
Daniel@0 133 % otherwise clustering fails for user defined dist. matrix!
Daniel@0 134 Md(eye(dlen)==1)=Inf;
Daniel@0 135 end
Daniel@0 136
Daniel@0 137 % neighborhood constraint
Daniel@0 138 if length(constrained)==1 & constrained>0,
Daniel@0 139 Ne1 = som_unit_neighs(sT);
Daniel@0 140 Conn = som_neighborhood(Ne1,constrained);
Daniel@0 141 Conn(~isfinite(Conn(:))) = 0;
Daniel@0 142 else Conn = constrained; end
Daniel@0 143 if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end
Daniel@0 144
Daniel@0 145 % pairwise distance matrix across connected units
Daniel@0 146 n = size(D,1);
Daniel@0 147 if prod(size(Md))>1,
Daniel@0 148 % remove distances between non-neighbors
Daniel@0 149 if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end
Daniel@0 150 else
Daniel@0 151 % calculate pairwise distance matrix
Daniel@0 152 q = Md;
Daniel@0 153 Md = zeros(n,n)+Inf;
Daniel@0 154 if ~constrained & q==2, % fast for the default case
Daniel@0 155 for i = 1:n-1,
Daniel@0 156 x = D(i,:);
Daniel@0 157 inds = [(i+1):n];
Daniel@0 158 Diff = D(inds,:) - x(ones(n-i,1),:);
Daniel@0 159 Md(inds,i) = sqrt((Diff.^2)*mask);
Daniel@0 160 Md(i,inds) = Md(inds,i)';
Daniel@0 161 end
Daniel@0 162 else
Daniel@0 163 for i = 1:n-1,
Daniel@0 164 inds = find(Conn(i,:)==1);
Daniel@0 165 inds = inds(find(inds>i));
Daniel@0 166 Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:));
Daniel@0 167 switch q,
Daniel@0 168 case 1, dist = Diff*mask;
Daniel@0 169 case 2, dist = sqrt((Diff.^2)*mask);
Daniel@0 170 case Inf, dist = max(Diff,[],2);
Daniel@0 171 otherwise, dist = ((Diff.^q)*mask).^(1/q);
Daniel@0 172 end
Daniel@0 173 Md(inds,i) = dist;
Daniel@0 174 Md(i,inds) = dist';
Daniel@0 175 end
Daniel@0 176 end
Daniel@0 177 end
Daniel@0 178
Daniel@0 179 % set distances to ignored units to Inf
Daniel@0 180 if ~isempty(ignore_units),
Daniel@0 181 Md(ignore_units,:) = Inf;
Daniel@0 182 Md(:,ignore_units) = Inf;
Daniel@0 183 end
Daniel@0 184
Daniel@0 185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 186 %% construct dendrogram
Daniel@0 187
Daniel@0 188 Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step
Daniel@0 189 clusters = 1:dlen; % each vector is at first in its own cluster
Daniel@0 190 Cd = Md; % distances between clusters
Daniel@0 191
Daniel@0 192 h = waitbar(0,'Constructing hierarchical clustering');
Daniel@0 193
Daniel@0 194 for i=1:n-1,
Daniel@0 195
Daniel@0 196 % tracking
Daniel@0 197 waitbar(i/(n-1),h);
Daniel@0 198
Daniel@0 199 %% combine two closest clusters
Daniel@0 200 % find the clusters which are closest to each other (c1 and c2)
Daniel@0 201 [d,ind] = min(min(Cd));
Daniel@0 202 if ~isfinite(d), break; end % no more connected clusters
Daniel@0 203 [d,c1] = min(Cd(:,ind)); % cluster1
Daniel@0 204 c2 = clusters(ind); % cluster2
Daniel@0 205 % combine the two clusters
Daniel@0 206 c1_inds = find(clusters==c1); % vectors belonging to c1
Daniel@0 207 c2_inds = find(clusters==c2); % vectors belonging to c2
Daniel@0 208 c_inds = [c1_inds, c2_inds]; % members of the new cluster
Daniel@0 209 % new cluster index = bigger cluster
Daniel@0 210 if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end
Daniel@0 211 clusters(c_inds) = c; % update cluster info
Daniel@0 212 Z(i,:) = [c, k, d]; % save info into Z
Daniel@0 213
Daniel@0 214 %% update cluster distances
Daniel@0 215 % remove the subclusters from the Cd table
Daniel@0 216 Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf
Daniel@0 217 k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster
Daniel@0 218 Cd(k_inds,:) = Inf; % distance of the subclusters to
Daniel@0 219 Cd(:,k_inds) = Inf; % other clusters = Inf
Daniel@0 220 % update the distance of this cluster to the other clusters
Daniel@0 221 cl = unique(clusters(clusters ~= c)); % indeces of all other clusters
Daniel@0 222 if ~isempty(cl), % added since v6.5 works differently than 6.1
Daniel@0 223 for l=cl,
Daniel@0 224 o_inds = find(clusters==l); % indeces belonging to cluster k
Daniel@0 225 vd = Md(c_inds,o_inds); % distances between vectors in c and k
Daniel@0 226 vd = vd(isfinite(vd(:))); % remove infinite distances (no connection)
Daniel@0 227 len = length(vd);
Daniel@0 228 if ~len, % if the two clusters are not connected, their distance in Inf
Daniel@0 229 sd = Inf;
Daniel@0 230 else % otherwise, calculate the distance between them
Daniel@0 231 switch linkage,
Daniel@0 232 case 'single', sd = min(vd);
Daniel@0 233 case 'average', sd = sum(vd)/len;
Daniel@0 234 case 'complete', sd = max(vd);
Daniel@0 235 otherwise, error(['Unknown set distance: ' linkage]);
Daniel@0 236 end
Daniel@0 237 end
Daniel@0 238 Cd(c,l) = sd; Cd(l,c) = sd;
Daniel@0 239 end
Daniel@0 240 end
Daniel@0 241 end
Daniel@0 242 close(h);
Daniel@0 243
Daniel@0 244 last = Z(i,1);
Daniel@0 245 if isnan(last),
Daniel@0 246 last = Z(i-1,1);
Daniel@0 247 rest = setdiff(unique(clusters),last);
Daniel@0 248 Z(i:n-1,1) = rest';
Daniel@0 249 Z(i:n-1,2) = last;
Daniel@0 250 Z(i:n-1,3) = Inf;
Daniel@0 251 i = i - 1;
Daniel@0 252 else
Daniel@0 253 rest = [];
Daniel@0 254 end
Daniel@0 255
Daniel@0 256 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 257 %% return values
Daniel@0 258
Daniel@0 259 % calculate the order of samples
Daniel@0 260 order = last;
Daniel@0 261 % go through the combination matrix from top to down
Daniel@0 262 for k=i:-1:1,
Daniel@0 263 c = Z(k,1); k = Z(k,2); % what (k) change to what (c)
Daniel@0 264 j = find(order==c); % the occurance of c in order
Daniel@0 265 if j == length(order), order = [order k]; % put k behind c
Daniel@0 266 else order = [order(1:j) k order(j+1:end)];
Daniel@0 267 end
Daniel@0 268 end
Daniel@0 269 order = [rest, order];
Daniel@0 270
Daniel@0 271 % to maintain compatibility with Statistics Toolbox, the values in
Daniel@0 272 % Z must be yet transformed so that they are similar to the output
Daniel@0 273 % of LINKAGE function
Daniel@0 274
Daniel@0 275 Zs = Z;
Daniel@0 276 current_cluster = 1:dlen;
Daniel@0 277 for i=1:size(Z,1),
Daniel@0 278 Zs(i,1) = current_cluster(Z(i,1));
Daniel@0 279 Zs(i,2) = current_cluster(Z(i,2));
Daniel@0 280 current_cluster(Z(i,[1 2])) = dlen+i;
Daniel@0 281 end
Daniel@0 282
Daniel@0 283 Z = Zs;
Daniel@0 284
Daniel@0 285 return;
Daniel@0 286
Daniel@0 287 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 288
Daniel@0 289
Daniel@0 290
Daniel@0 291