wolffd@0: function sC = som_cllinkage(sM,varargin) wolffd@0: wolffd@0: %SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units. wolffd@0: % wolffd@0: % sC = som_cllinkage(sM, [[argID,] value, ...]) wolffd@0: % wolffd@0: % sC = som_cllinkage(sM); wolffd@0: % sC = som_cllinkage(D,'complete'); wolffd@0: % sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D))); wolffd@0: % sC = som_cllinkage(sM,pdist(sM.codebook,'mahal')); wolffd@0: % som_clplot(sC); wolffd@0: % wolffd@0: % Input and output arguments ([]'s are optional): wolffd@0: % sM (struct) map or data struct to be clustered wolffd@0: % (matrix) size dlen x dim, a data set: the matrix must not wolffd@0: % contain any NaN's! wolffd@0: % [argID, (string) See below. The values which are unambiguous can wolffd@0: % value] (varies) be given without the preceeding argID. wolffd@0: % wolffd@0: % sC (struct) a clustering struct with e.g. the following fields wolffd@0: % (for more information see SOMCL_STRUCT) wolffd@0: % .base (vector) if base partitioning is given, this is a newly wolffd@0: % coded version of it so that the cluster indices wolffd@0: % go from 1 to the number of clusters. wolffd@0: % .tree (matrix) size clen-1 x 3, the linkage info wolffd@0: % Z(i,1) and Z(i,2) hold the indeces of clusters wolffd@0: % combined on level i (starting from bottom). The new wolffd@0: % cluster has index dlen+i. The initial cluster wolffd@0: % index of each unit is its linear index in the wolffd@0: % original data matrix. Z(i,3) is the distance wolffd@0: % between the combined clusters. See LINKAGE wolffd@0: % function in the Statistics Toolbox. wolffd@0: % wolffd@0: % Here are the valid argument IDs and corresponding values. The values wolffd@0: % which are unambiguous (marked with '*') can be given without the wolffd@0: % preceeding argID. wolffd@0: % 'topol' *(struct) topology struct wolffd@0: % 'connect' *(string) 'neighbors' or 'any' (default), whether the wolffd@0: % connections should be allowed only between wolffd@0: % neighbors or between any vectors wolffd@0: % (matrix) size dlen x dlen indicating the connections wolffd@0: % between vectors wolffd@0: % 'linkage' *(string) the linkage criteria to use: 'single' (the wolffd@0: % default), 'average', 'complete', 'centroid', or 'ward' wolffd@0: % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to wolffd@0: % be used instead of euclidian distances wolffd@0: % (vector) as the output of PDIST function wolffd@0: % (scalar) distance norm to use (default is euclidian = 2) wolffd@0: % 'mask' (vector) size dim x 1, the search mask used to wolffd@0: % weight distance calculation. By default wolffd@0: % sM.mask or a vector of ones is used. wolffd@0: % 'base' (vector) giving the base partitioning of the data: wolffd@0: % base(i) = j denotes that vector i belongs to wolffd@0: % base cluster j, and base(i) = NaN that vector wolffd@0: % i does not belong to any cluster, but should be wolffd@0: % ignored. At the beginning of the clustering, the wolffd@0: % vector of each cluster are averaged, and these wolffd@0: % averaged vectors are then clustered using wolffd@0: % hierarchical clustering. wolffd@0: % 'ignore' (vector) units to be ignored (in addition to those listed wolffd@0: % in base argument) wolffd@0: % 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0) wolffd@0: % wolffd@0: % Note that if 'connect'='neighbors' and some vector are ignored (as denoted wolffd@0: % by NaNs in the base vector), there may be areas on the map which will wolffd@0: % never be connected: connections across the ignored map units simply do not wolffd@0: % exist. In such a case, the neighborhood is gradually increased until wolffd@0: % the areas can be connected. wolffd@0: % wolffd@0: % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. wolffd@0: wolffd@0: % Copyright (c) 2000 by Juha Vesanto wolffd@0: % Contributed to SOM Toolbox on XXX by Juha Vesanto wolffd@0: % http://www.cis.hut.fi/projects/somtoolbox/ wolffd@0: wolffd@0: % Version 2.0beta juuso 160600 250800 wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% input arguments wolffd@0: wolffd@0: % the data wolffd@0: if isstruct(sM), wolffd@0: switch sM.type, wolffd@0: case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end); wolffd@0: case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = []; wolffd@0: case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1); wolffd@0: sTr = som_set('som_train','neigh','gaussian','radius_fin',1); wolffd@0: otherwise, error('Bad first argument'); wolffd@0: end wolffd@0: else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = []; wolffd@0: end wolffd@0: [dlen dim] = size(M); wolffd@0: if isempty(mask), mask = ones(dim,1); end wolffd@0: if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end wolffd@0: wolffd@0: % varargin wolffd@0: q = 2; wolffd@0: Md = []; wolffd@0: linkage = 'single'; wolffd@0: ignore = []; wolffd@0: Ne = 'any'; wolffd@0: base = []; wolffd@0: tracking = 0; wolffd@0: i=1; wolffd@0: while i<=length(varargin), wolffd@0: argok = 1; wolffd@0: if ischar(varargin{i}), wolffd@0: switch varargin{i}, wolffd@0: % argument IDs wolffd@0: case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; wolffd@0: case 'connect', i=i+1; Ne = varargin{i}; wolffd@0: case 'ignore', i=i+1; ignore = varargin{i}; wolffd@0: case 'dist', i=i+1; Md = varargin{i}; wolffd@0: case 'linkage', i=i+1; linkage = varargin{i}; wolffd@0: case 'mask', i=i+1; mask = varargin{i}; wolffd@0: case 'tracking',i=i+1; tracking = varargin{i}; wolffd@0: case 'base', i=i+1; base = varargin{i}; wolffd@0: % unambiguous values wolffd@0: case 'neighbors', Ne = varargin{i}; wolffd@0: case 'any', Ne = varargin{i}; wolffd@0: case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i}; wolffd@0: otherwise argok=0; wolffd@0: end wolffd@0: elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), wolffd@0: switch varargin{i}(1).type, wolffd@0: case 'som_topol', sT = varargin{i}; wolffd@0: otherwise argok=0; wolffd@0: end wolffd@0: else wolffd@0: argok = 0; wolffd@0: end wolffd@0: if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end wolffd@0: i = i+1; wolffd@0: end wolffd@0: wolffd@0: % check distance metric wolffd@0: if prod(size(Md))==1, q = Md; Md = []; end wolffd@0: if ~isempty(Md) & prod(size(Md))0 & any(strcmp(linkage,{'ward','centroid'})), wolffd@0: warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']); wolffd@0: end wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% distance matrix and connections between units wolffd@0: wolffd@0: % base partition wolffd@0: if isempty(base), base = 1:dlen; end wolffd@0: if ~isempty(ignore), base(ignore) = NaN; end wolffd@0: cid = unique(base(isfinite(base))); wolffd@0: nc = length(cid); wolffd@0: if max(cid)>nc | min(cid)<1, wolffd@0: b = base; for i=1:nc, base(find(b==cid(i))) = i; end wolffd@0: end wolffd@0: wolffd@0: % initial clusters wolffd@0: clinds = cell(nc,1); wolffd@0: for i=1:nc, clinds{i} = find(base==i); end wolffd@0: wolffd@0: % neighborhood constraint (calculate connection matrix Ne) wolffd@0: if ischar(Ne), wolffd@0: switch Ne, wolffd@0: case 'any', Ne = []; wolffd@0: case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end wolffd@0: otherwise, error(['Unrecognized connection mode ' Ne]); wolffd@0: end wolffd@0: end wolffd@0: if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1 wolffd@0: if all(Ne(:)>0), Ne = []; end wolffd@0: wolffd@0: % neighborhood function values wolffd@0: if strcmp(linkage,'neighf') wolffd@0: if isempty(sTr), error('Cannot use neighf linkage.'); end wolffd@0: q = som_unit_dists(sT).^2; wolffd@0: r = sTr.radius_fin^2; wolffd@0: if isnan(r) | isempty(r), r = 1; end wolffd@0: switch sTr.neigh, wolffd@0: case 'bubble', q = (q <= r); wolffd@0: case 'gaussian', q = exp(-q/(2*r)); wolffd@0: case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r); wolffd@0: case 'ep', q = (1-q/r) .* (q <= r); wolffd@0: end wolffd@0: end wolffd@0: wolffd@0: % mutual distances and initial cluster distances wolffd@0: Cd = []; wolffd@0: if any(strcmp(linkage,{'single','average','complete','neighf'})), wolffd@0: M = som_mdist(M,2,mask,Ne); wolffd@0: if (nc == dlen & all(base==[1:dlen])), Cd = M; end wolffd@0: end wolffd@0: if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end wolffd@0: Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal wolffd@0: wolffd@0: % check out from Ne which of the clusters are not connected wolffd@0: if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})), wolffd@0: Clconn = sparse(nc); wolffd@0: for i=1:nc-1, wolffd@0: for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end wolffd@0: Clconn(i+1:nc,i) = Clconn(i,i+1:nc)'; wolffd@0: end wolffd@0: Cd(Clconn==0) = Inf; wolffd@0: else wolffd@0: Clconn = []; wolffd@0: end wolffd@0: wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% construct dendrogram wolffd@0: wolffd@0: clen = nc; wolffd@0: cid = 1:clen; wolffd@0: Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step wolffd@0: if tracking, h = waitbar(0,'Making hierarchical clustering'); end wolffd@0: wolffd@0: for i=1:clen-1, wolffd@0: if tracking, waitbar(i/clen,h); end wolffd@0: wolffd@0: % find two closest clusters and combine them wolffd@0: [d,c1] = min(min(Cd)); % cluster1 wolffd@0: [d,c2] = min(Cd(:,c1)); % cluster2 wolffd@0: i1 = clinds{c1}; % vectors belonging to c1 wolffd@0: i2 = clinds{c2}; % vectors belonging to c2 wolffd@0: clinds{c1} = [i1; i2]; % insert clusters to c1 wolffd@0: Z(i,:) = [cid(c1), cid(c2), d]; % update tree info wolffd@0: wolffd@0: % remove cluster c2 wolffd@0: notc2 = [1:c2-1,c2+1:nc]; wolffd@0: nc = nc-1; if nc<=1, break; end wolffd@0: if c1>c2, c1=c1-1; end wolffd@0: clinds = clinds(notc2); wolffd@0: Cd = Cd(notc2,notc2); wolffd@0: cid = cid(notc2); wolffd@0: if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end wolffd@0: wolffd@0: % update cluster distances wolffd@0: notc1 = [1:c1-1,c1+1:nc]; wolffd@0: Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask); wolffd@0: Cd(notc1,c1) = Cd(c1,notc1)'; wolffd@0: if ~isempty(Clconn), wolffd@0: for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end wolffd@0: Clconn(notc1,c1) = Clconn(c1,notc1)'; wolffd@0: Cd(Clconn==0) = Inf; wolffd@0: end wolffd@0: wolffd@0: end wolffd@0: wolffd@0: if tracking, close(h); end wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% return values wolffd@0: wolffd@0: % to maintain compatibility with Statistics Toolbox, the values in wolffd@0: % Z must be yet transformed so that they are similar to the output wolffd@0: % of LINKAGE function wolffd@0: wolffd@0: clen = size(Z,1)+1; wolffd@0: Zs = Z; wolffd@0: current_cluster = 1:clen; wolffd@0: for i=1:size(Z,1), wolffd@0: Zs(i,1) = current_cluster(Z(i,1)); wolffd@0: Zs(i,2) = current_cluster(Z(i,2)); wolffd@0: current_cluster(Z(i,[1 2])) = clen+i; wolffd@0: end wolffd@0: Z = Zs; wolffd@0: wolffd@0: % make a clustering struct wolffd@0: name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0)); wolffd@0: sC = som_clstruct(Z,'base',base,'name',name); wolffd@0: wolffd@0: return; wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: