annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/som_cllinkage.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
wolffd@0 1 function sC = som_cllinkage(sM,varargin)
wolffd@0 2
wolffd@0 3 %SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units.
wolffd@0 4 %
wolffd@0 5 % sC = som_cllinkage(sM, [[argID,] value, ...])
wolffd@0 6 %
wolffd@0 7 % sC = som_cllinkage(sM);
wolffd@0 8 % sC = som_cllinkage(D,'complete');
wolffd@0 9 % sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D)));
wolffd@0 10 % sC = som_cllinkage(sM,pdist(sM.codebook,'mahal'));
wolffd@0 11 % som_clplot(sC);
wolffd@0 12 %
wolffd@0 13 % Input and output arguments ([]'s are optional):
wolffd@0 14 % sM (struct) map or data struct to be clustered
wolffd@0 15 % (matrix) size dlen x dim, a data set: the matrix must not
wolffd@0 16 % contain any NaN's!
wolffd@0 17 % [argID, (string) See below. The values which are unambiguous can
wolffd@0 18 % value] (varies) be given without the preceeding argID.
wolffd@0 19 %
wolffd@0 20 % sC (struct) a clustering struct with e.g. the following fields
wolffd@0 21 % (for more information see SOMCL_STRUCT)
wolffd@0 22 % .base (vector) if base partitioning is given, this is a newly
wolffd@0 23 % coded version of it so that the cluster indices
wolffd@0 24 % go from 1 to the number of clusters.
wolffd@0 25 % .tree (matrix) size clen-1 x 3, the linkage info
wolffd@0 26 % Z(i,1) and Z(i,2) hold the indeces of clusters
wolffd@0 27 % combined on level i (starting from bottom). The new
wolffd@0 28 % cluster has index dlen+i. The initial cluster
wolffd@0 29 % index of each unit is its linear index in the
wolffd@0 30 % original data matrix. Z(i,3) is the distance
wolffd@0 31 % between the combined clusters. See LINKAGE
wolffd@0 32 % function in the Statistics Toolbox.
wolffd@0 33 %
wolffd@0 34 % Here are the valid argument IDs and corresponding values. The values
wolffd@0 35 % which are unambiguous (marked with '*') can be given without the
wolffd@0 36 % preceeding argID.
wolffd@0 37 % 'topol' *(struct) topology struct
wolffd@0 38 % 'connect' *(string) 'neighbors' or 'any' (default), whether the
wolffd@0 39 % connections should be allowed only between
wolffd@0 40 % neighbors or between any vectors
wolffd@0 41 % (matrix) size dlen x dlen indicating the connections
wolffd@0 42 % between vectors
wolffd@0 43 % 'linkage' *(string) the linkage criteria to use: 'single' (the
wolffd@0 44 % default), 'average', 'complete', 'centroid', or 'ward'
wolffd@0 45 % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to
wolffd@0 46 % be used instead of euclidian distances
wolffd@0 47 % (vector) as the output of PDIST function
wolffd@0 48 % (scalar) distance norm to use (default is euclidian = 2)
wolffd@0 49 % 'mask' (vector) size dim x 1, the search mask used to
wolffd@0 50 % weight distance calculation. By default
wolffd@0 51 % sM.mask or a vector of ones is used.
wolffd@0 52 % 'base' (vector) giving the base partitioning of the data:
wolffd@0 53 % base(i) = j denotes that vector i belongs to
wolffd@0 54 % base cluster j, and base(i) = NaN that vector
wolffd@0 55 % i does not belong to any cluster, but should be
wolffd@0 56 % ignored. At the beginning of the clustering, the
wolffd@0 57 % vector of each cluster are averaged, and these
wolffd@0 58 % averaged vectors are then clustered using
wolffd@0 59 % hierarchical clustering.
wolffd@0 60 % 'ignore' (vector) units to be ignored (in addition to those listed
wolffd@0 61 % in base argument)
wolffd@0 62 % 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0)
wolffd@0 63 %
wolffd@0 64 % Note that if 'connect'='neighbors' and some vector are ignored (as denoted
wolffd@0 65 % by NaNs in the base vector), there may be areas on the map which will
wolffd@0 66 % never be connected: connections across the ignored map units simply do not
wolffd@0 67 % exist. In such a case, the neighborhood is gradually increased until
wolffd@0 68 % the areas can be connected.
wolffd@0 69 %
wolffd@0 70 % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM.
wolffd@0 71
wolffd@0 72 % Copyright (c) 2000 by Juha Vesanto
wolffd@0 73 % Contributed to SOM Toolbox on XXX by Juha Vesanto
wolffd@0 74 % http://www.cis.hut.fi/projects/somtoolbox/
wolffd@0 75
wolffd@0 76 % Version 2.0beta juuso 160600 250800
wolffd@0 77
wolffd@0 78 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 79 %% input arguments
wolffd@0 80
wolffd@0 81 % the data
wolffd@0 82 if isstruct(sM),
wolffd@0 83 switch sM.type,
wolffd@0 84 case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end);
wolffd@0 85 case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = [];
wolffd@0 86 case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1);
wolffd@0 87 sTr = som_set('som_train','neigh','gaussian','radius_fin',1);
wolffd@0 88 otherwise, error('Bad first argument');
wolffd@0 89 end
wolffd@0 90 else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = [];
wolffd@0 91 end
wolffd@0 92 [dlen dim] = size(M);
wolffd@0 93 if isempty(mask), mask = ones(dim,1); end
wolffd@0 94 if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end
wolffd@0 95
wolffd@0 96 % varargin
wolffd@0 97 q = 2;
wolffd@0 98 Md = [];
wolffd@0 99 linkage = 'single';
wolffd@0 100 ignore = [];
wolffd@0 101 Ne = 'any';
wolffd@0 102 base = [];
wolffd@0 103 tracking = 0;
wolffd@0 104 i=1;
wolffd@0 105 while i<=length(varargin),
wolffd@0 106 argok = 1;
wolffd@0 107 if ischar(varargin{i}),
wolffd@0 108 switch varargin{i},
wolffd@0 109 % argument IDs
wolffd@0 110 case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i};
wolffd@0 111 case 'connect', i=i+1; Ne = varargin{i};
wolffd@0 112 case 'ignore', i=i+1; ignore = varargin{i};
wolffd@0 113 case 'dist', i=i+1; Md = varargin{i};
wolffd@0 114 case 'linkage', i=i+1; linkage = varargin{i};
wolffd@0 115 case 'mask', i=i+1; mask = varargin{i};
wolffd@0 116 case 'tracking',i=i+1; tracking = varargin{i};
wolffd@0 117 case 'base', i=i+1; base = varargin{i};
wolffd@0 118 % unambiguous values
wolffd@0 119 case 'neighbors', Ne = varargin{i};
wolffd@0 120 case 'any', Ne = varargin{i};
wolffd@0 121 case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i};
wolffd@0 122 otherwise argok=0;
wolffd@0 123 end
wolffd@0 124 elseif isstruct(varargin{i}) & isfield(varargin{i},'type'),
wolffd@0 125 switch varargin{i}(1).type,
wolffd@0 126 case 'som_topol', sT = varargin{i};
wolffd@0 127 otherwise argok=0;
wolffd@0 128 end
wolffd@0 129 else
wolffd@0 130 argok = 0;
wolffd@0 131 end
wolffd@0 132 if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end
wolffd@0 133 i = i+1;
wolffd@0 134 end
wolffd@0 135
wolffd@0 136 % check distance metric
wolffd@0 137 if prod(size(Md))==1, q = Md; Md = []; end
wolffd@0 138 if ~isempty(Md) & prod(size(Md))<dlen^2, Md = squareform(Md); end
wolffd@0 139 if prod(size(Md))>0 & any(strcmp(linkage,{'ward','centroid'})),
wolffd@0 140 warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']);
wolffd@0 141 end
wolffd@0 142
wolffd@0 143 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 144 %% distance matrix and connections between units
wolffd@0 145
wolffd@0 146 % base partition
wolffd@0 147 if isempty(base), base = 1:dlen; end
wolffd@0 148 if ~isempty(ignore), base(ignore) = NaN; end
wolffd@0 149 cid = unique(base(isfinite(base)));
wolffd@0 150 nc = length(cid);
wolffd@0 151 if max(cid)>nc | min(cid)<1,
wolffd@0 152 b = base; for i=1:nc, base(find(b==cid(i))) = i; end
wolffd@0 153 end
wolffd@0 154
wolffd@0 155 % initial clusters
wolffd@0 156 clinds = cell(nc,1);
wolffd@0 157 for i=1:nc, clinds{i} = find(base==i); end
wolffd@0 158
wolffd@0 159 % neighborhood constraint (calculate connection matrix Ne)
wolffd@0 160 if ischar(Ne),
wolffd@0 161 switch Ne,
wolffd@0 162 case 'any', Ne = [];
wolffd@0 163 case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end
wolffd@0 164 otherwise, error(['Unrecognized connection mode ' Ne]);
wolffd@0 165 end
wolffd@0 166 end
wolffd@0 167 if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1
wolffd@0 168 if all(Ne(:)>0), Ne = []; end
wolffd@0 169
wolffd@0 170 % neighborhood function values
wolffd@0 171 if strcmp(linkage,'neighf')
wolffd@0 172 if isempty(sTr), error('Cannot use neighf linkage.'); end
wolffd@0 173 q = som_unit_dists(sT).^2;
wolffd@0 174 r = sTr.radius_fin^2;
wolffd@0 175 if isnan(r) | isempty(r), r = 1; end
wolffd@0 176 switch sTr.neigh,
wolffd@0 177 case 'bubble', q = (q <= r);
wolffd@0 178 case 'gaussian', q = exp(-q/(2*r));
wolffd@0 179 case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r);
wolffd@0 180 case 'ep', q = (1-q/r) .* (q <= r);
wolffd@0 181 end
wolffd@0 182 end
wolffd@0 183
wolffd@0 184 % mutual distances and initial cluster distances
wolffd@0 185 Cd = [];
wolffd@0 186 if any(strcmp(linkage,{'single','average','complete','neighf'})),
wolffd@0 187 M = som_mdist(M,2,mask,Ne);
wolffd@0 188 if (nc == dlen & all(base==[1:dlen])), Cd = M; end
wolffd@0 189 end
wolffd@0 190 if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end
wolffd@0 191 Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal
wolffd@0 192
wolffd@0 193 % check out from Ne which of the clusters are not connected
wolffd@0 194 if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})),
wolffd@0 195 Clconn = sparse(nc);
wolffd@0 196 for i=1:nc-1,
wolffd@0 197 for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end
wolffd@0 198 Clconn(i+1:nc,i) = Clconn(i,i+1:nc)';
wolffd@0 199 end
wolffd@0 200 Cd(Clconn==0) = Inf;
wolffd@0 201 else
wolffd@0 202 Clconn = [];
wolffd@0 203 end
wolffd@0 204
wolffd@0 205
wolffd@0 206 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 207 %% construct dendrogram
wolffd@0 208
wolffd@0 209 clen = nc;
wolffd@0 210 cid = 1:clen;
wolffd@0 211 Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step
wolffd@0 212 if tracking, h = waitbar(0,'Making hierarchical clustering'); end
wolffd@0 213
wolffd@0 214 for i=1:clen-1,
wolffd@0 215 if tracking, waitbar(i/clen,h); end
wolffd@0 216
wolffd@0 217 % find two closest clusters and combine them
wolffd@0 218 [d,c1] = min(min(Cd)); % cluster1
wolffd@0 219 [d,c2] = min(Cd(:,c1)); % cluster2
wolffd@0 220 i1 = clinds{c1}; % vectors belonging to c1
wolffd@0 221 i2 = clinds{c2}; % vectors belonging to c2
wolffd@0 222 clinds{c1} = [i1; i2]; % insert clusters to c1
wolffd@0 223 Z(i,:) = [cid(c1), cid(c2), d]; % update tree info
wolffd@0 224
wolffd@0 225 % remove cluster c2
wolffd@0 226 notc2 = [1:c2-1,c2+1:nc];
wolffd@0 227 nc = nc-1; if nc<=1, break; end
wolffd@0 228 if c1>c2, c1=c1-1; end
wolffd@0 229 clinds = clinds(notc2);
wolffd@0 230 Cd = Cd(notc2,notc2);
wolffd@0 231 cid = cid(notc2);
wolffd@0 232 if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end
wolffd@0 233
wolffd@0 234 % update cluster distances
wolffd@0 235 notc1 = [1:c1-1,c1+1:nc];
wolffd@0 236 Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask);
wolffd@0 237 Cd(notc1,c1) = Cd(c1,notc1)';
wolffd@0 238 if ~isempty(Clconn),
wolffd@0 239 for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end
wolffd@0 240 Clconn(notc1,c1) = Clconn(c1,notc1)';
wolffd@0 241 Cd(Clconn==0) = Inf;
wolffd@0 242 end
wolffd@0 243
wolffd@0 244 end
wolffd@0 245
wolffd@0 246 if tracking, close(h); end
wolffd@0 247
wolffd@0 248 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 249 %% return values
wolffd@0 250
wolffd@0 251 % to maintain compatibility with Statistics Toolbox, the values in
wolffd@0 252 % Z must be yet transformed so that they are similar to the output
wolffd@0 253 % of LINKAGE function
wolffd@0 254
wolffd@0 255 clen = size(Z,1)+1;
wolffd@0 256 Zs = Z;
wolffd@0 257 current_cluster = 1:clen;
wolffd@0 258 for i=1:size(Z,1),
wolffd@0 259 Zs(i,1) = current_cluster(Z(i,1));
wolffd@0 260 Zs(i,2) = current_cluster(Z(i,2));
wolffd@0 261 current_cluster(Z(i,[1 2])) = clen+i;
wolffd@0 262 end
wolffd@0 263 Z = Zs;
wolffd@0 264
wolffd@0 265 % make a clustering struct
wolffd@0 266 name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0));
wolffd@0 267 sC = som_clstruct(Z,'base',base,'name',name);
wolffd@0 268
wolffd@0 269 return;
wolffd@0 270
wolffd@0 271 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 272