Mercurial > hg > camir-aes2014
diff toolboxes/MIRtoolbox1.3.2/somtoolbox/som_cllinkage.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolboxes/MIRtoolbox1.3.2/somtoolbox/som_cllinkage.m Tue Feb 10 15:05:51 2015 +0000 @@ -0,0 +1,272 @@ +function sC = som_cllinkage(sM,varargin) + +%SOM_CLLINKAGE Make a hierarchical linkage of the SOM map units. +% +% sC = som_cllinkage(sM, [[argID,] value, ...]) +% +% sC = som_cllinkage(sM); +% sC = som_cllinkage(D,'complete'); +% sC = som_cllinkage(sM,'single','ignore',find(~som_hits(sM,D))); +% sC = som_cllinkage(sM,pdist(sM.codebook,'mahal')); +% som_clplot(sC); +% +% Input and output arguments ([]'s are optional): +% sM (struct) map or data struct to be clustered +% (matrix) size dlen x dim, a data set: the matrix must not +% contain any NaN's! +% [argID, (string) See below. The values which are unambiguous can +% value] (varies) be given without the preceeding argID. +% +% sC (struct) a clustering struct with e.g. the following fields +% (for more information see SOMCL_STRUCT) +% .base (vector) if base partitioning is given, this is a newly +% coded version of it so that the cluster indices +% go from 1 to the number of clusters. +% .tree (matrix) size clen-1 x 3, the linkage info +% Z(i,1) and Z(i,2) hold the indeces of clusters +% combined on level i (starting from bottom). The new +% cluster has index dlen+i. The initial cluster +% index of each unit is its linear index in the +% original data matrix. Z(i,3) is the distance +% between the combined clusters. See LINKAGE +% function in the Statistics Toolbox. +% +% Here are the valid argument IDs and corresponding values. The values +% which are unambiguous (marked with '*') can be given without the +% preceeding argID. +% 'topol' *(struct) topology struct +% 'connect' *(string) 'neighbors' or 'any' (default), whether the +% connections should be allowed only between +% neighbors or between any vectors +% (matrix) size dlen x dlen indicating the connections +% between vectors +% 'linkage' *(string) the linkage criteria to use: 'single' (the +% default), 'average', 'complete', 'centroid', or 'ward' +% 'dist' (matrix) size dlen x dlen, pairwise distance matrix to +% be used instead of euclidian distances +% (vector) as the output of PDIST function +% (scalar) distance norm to use (default is euclidian = 2) +% 'mask' (vector) size dim x 1, the search mask used to +% weight distance calculation. By default +% sM.mask or a vector of ones is used. +% 'base' (vector) giving the base partitioning of the data: +% base(i) = j denotes that vector i belongs to +% base cluster j, and base(i) = NaN that vector +% i does not belong to any cluster, but should be +% ignored. At the beginning of the clustering, the +% vector of each cluster are averaged, and these +% averaged vectors are then clustered using +% hierarchical clustering. +% 'ignore' (vector) units to be ignored (in addition to those listed +% in base argument) +% 'tracking' (scalar) 1 or 0: whether to show tracking bar or not (default = 0) +% +% Note that if 'connect'='neighbors' and some vector are ignored (as denoted +% by NaNs in the base vector), there may be areas on the map which will +% never be connected: connections across the ignored map units simply do not +% exist. In such a case, the neighborhood is gradually increased until +% the areas can be connected. +% +% See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. + +% Copyright (c) 2000 by Juha Vesanto +% Contributed to SOM Toolbox on XXX by Juha Vesanto +% http://www.cis.hut.fi/projects/somtoolbox/ + +% Version 2.0beta juuso 160600 250800 + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% input arguments + +% the data +if isstruct(sM), + switch sM.type, + case 'som_map', M = sM.codebook; sT = sM.topol; mask = sM.mask; data_name = sM.name; sTr = sM.trainhist(end); + case 'som_data', M = sM.data; sT = []; mask = []; data_name = sM.name; sTr = []; + case 'som_topol', M = []; sT = sM; mask = []; data_name = inputname(1); + sTr = som_set('som_train','neigh','gaussian','radius_fin',1); + otherwise, error('Bad first argument'); + end +else M = sM; sT = []; mask = []; data_name = inputname(1); sTr = []; +end +[dlen dim] = size(M); +if isempty(mask), mask = ones(dim,1); end +if any(isnan(M(:))), error('Data matrix must not have any NaNs.'); end + +% varargin +q = 2; +Md = []; +linkage = 'single'; +ignore = []; +Ne = 'any'; +base = []; +tracking = 0; +i=1; +while i<=length(varargin), + argok = 1; + if ischar(varargin{i}), + switch varargin{i}, + % argument IDs + case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; + case 'connect', i=i+1; Ne = varargin{i}; + case 'ignore', i=i+1; ignore = varargin{i}; + case 'dist', i=i+1; Md = varargin{i}; + case 'linkage', i=i+1; linkage = varargin{i}; + case 'mask', i=i+1; mask = varargin{i}; + case 'tracking',i=i+1; tracking = varargin{i}; + case 'base', i=i+1; base = varargin{i}; + % unambiguous values + case 'neighbors', Ne = varargin{i}; + case 'any', Ne = varargin{i}; + case {'single','average','complete','neighf','centroid','ward'}, linkage = varargin{i}; + otherwise argok=0; + end + elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), + switch varargin{i}(1).type, + case 'som_topol', sT = varargin{i}; + otherwise argok=0; + end + else + argok = 0; + end + if ~argok, disp(['(som_cllinkage) Ignoring invalid argument #' num2str(i+1)]); end + i = i+1; +end + +% check distance metric +if prod(size(Md))==1, q = Md; Md = []; end +if ~isempty(Md) & prod(size(Md))<dlen^2, Md = squareform(Md); end +if prod(size(Md))>0 & any(strcmp(linkage,{'ward','centroid'})), + warning(['The linkage method ' linkage ' cannot be performed with precalculated distance matrix.']); +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% distance matrix and connections between units + +% base partition +if isempty(base), base = 1:dlen; end +if ~isempty(ignore), base(ignore) = NaN; end +cid = unique(base(isfinite(base))); +nc = length(cid); +if max(cid)>nc | min(cid)<1, + b = base; for i=1:nc, base(find(b==cid(i))) = i; end +end + +% initial clusters +clinds = cell(nc,1); +for i=1:nc, clinds{i} = find(base==i); end + +% neighborhood constraint (calculate connection matrix Ne) +if ischar(Ne), + switch Ne, + case 'any', Ne = []; + case 'neighbors', if ischar(Ne), Ne = som_unit_neighs(sT); end + otherwise, error(['Unrecognized connection mode ' Ne]); + end +end +if ~isempty(Ne), l = size(Ne,1); Ne([0:l-1]*l+[1:l]) = 1; end % diagonal=1 +if all(Ne(:)>0), Ne = []; end + +% neighborhood function values +if strcmp(linkage,'neighf') + if isempty(sTr), error('Cannot use neighf linkage.'); end + q = som_unit_dists(sT).^2; + r = sTr.radius_fin^2; + if isnan(r) | isempty(r), r = 1; end + switch sTr.neigh, + case 'bubble', q = (q <= r); + case 'gaussian', q = exp(-q/(2*r)); + case 'cutgauss', q = exp(-q/(2*r)) .* (q <= r); + case 'ep', q = (1-q/r) .* (q <= r); + end +end + +% mutual distances and initial cluster distances +Cd = []; +if any(strcmp(linkage,{'single','average','complete','neighf'})), + M = som_mdist(M,2,mask,Ne); + if (nc == dlen & all(base==[1:dlen])), Cd = M; end +end +if isempty(Cd), Cd = som_cldist(M,clinds,[],linkage,q,mask); end +Cd([0:nc-1]*nc+[1:nc]) = NaN; % NaNs on the diagonal + +% check out from Ne which of the clusters are not connected +if ~isempty(Ne) & any(strcmp(linkage,{'centroid','ward'})), + Clconn = sparse(nc); + for i=1:nc-1, + for j=i+1:nc, Clconn(i,j) = any(any(Ne(clinds{i},clinds{j}))); end + Clconn(i+1:nc,i) = Clconn(i,i+1:nc)'; + end + Cd(Clconn==0) = Inf; +else + Clconn = []; +end + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% construct dendrogram + +clen = nc; +cid = 1:clen; +Z = zeros(nc-1,3)+NaN; % merged clusters and distance for each step +if tracking, h = waitbar(0,'Making hierarchical clustering'); end + +for i=1:clen-1, + if tracking, waitbar(i/clen,h); end + + % find two closest clusters and combine them + [d,c1] = min(min(Cd)); % cluster1 + [d,c2] = min(Cd(:,c1)); % cluster2 + i1 = clinds{c1}; % vectors belonging to c1 + i2 = clinds{c2}; % vectors belonging to c2 + clinds{c1} = [i1; i2]; % insert clusters to c1 + Z(i,:) = [cid(c1), cid(c2), d]; % update tree info + + % remove cluster c2 + notc2 = [1:c2-1,c2+1:nc]; + nc = nc-1; if nc<=1, break; end + if c1>c2, c1=c1-1; end + clinds = clinds(notc2); + Cd = Cd(notc2,notc2); + cid = cid(notc2); + if ~isempty(Clconn), Clconn = Clconn(notc2,notc2); end + + % update cluster distances + notc1 = [1:c1-1,c1+1:nc]; + Cd(c1,notc1) = som_cldist(M,clinds(c1),clinds(notc1),linkage,q,mask); + Cd(notc1,c1) = Cd(c1,notc1)'; + if ~isempty(Clconn), + for j=notc1, Clconn(c1,j) = any(any(Ne(clinds{c1},clinds{j}))); end + Clconn(notc1,c1) = Clconn(c1,notc1)'; + Cd(Clconn==0) = Inf; + end + +end + +if tracking, close(h); end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% return values + +% to maintain compatibility with Statistics Toolbox, the values in +% Z must be yet transformed so that they are similar to the output +% of LINKAGE function + +clen = size(Z,1)+1; +Zs = Z; +current_cluster = 1:clen; +for i=1:size(Z,1), + Zs(i,1) = current_cluster(Z(i,1)); + Zs(i,2) = current_cluster(Z(i,2)); + current_cluster(Z(i,[1 2])) = clen+i; +end +Z = Zs; + +% make a clustering struct +name = sprintf('Clustering of %s at %s',data_name,datestr(datenum(now),0)); +sC = som_clstruct(Z,'base',base,'name',name); + +return; + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +