Mercurial > hg > camir-aes2014
diff toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolboxes/MIRtoolbox1.3.2/somtoolbox/som_linkage.m Tue Feb 10 15:05:51 2015 +0000 @@ -0,0 +1,291 @@ +function [Z,order,Md] = som_linkage(sM,varargin) + +%SOM_LINKAGE Make a hierarchical linkage of the SOM map units. +% +% [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...]) +% +% Z = som_linkage(sM); +% Z = som_linkage(D,'complete'); +% Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D))); +% Z = som_linkage(sM,pdist(sM.codebook,'mahal')); +% som_dendrogram(Z); +% +% Input and output arguments ([]'s are optional): +% sM (struct) map or data struct to be clustered +% (matrix) size dlen x dim, a data set: the matrix must not +% contain any NaN's! +% [argID, (string) See below. The values which are unambiguous can +% value] (varies) be given without the preceeding argID. +% +% Z (matrix) size dlen-1 x 3, the linkage info +% Z(i,1) and Z(i,2) hold the indeces of clusters +% combined on level i (starting from bottom). The new +% cluster has index dlen+i. The initial cluster +% index of each unit is its linear index in the +% original data matrix. Z(i,3) is the distance +% between the combined clusters. See LINKAGE +% function in the Statistics Toolbox. +% The ignored samples are listed at the +% end of the Z-matrix and have Z(*,3) == Inf +% Dist (matrix) size dlen x dlen, pairwise distance matrix +% +% Here are the valid argument IDs and corresponding values. The values +% which are unambiguous (marked with '*') can be given without the +% preceeding argID. +% 'linkage' *(string) the linkage criteria to use: 'single' (the +% default), 'average' or 'complete' +% 'topol' *(struct) topology struct +% 'connect' *(string) 'neighbors' or 'any' (default), whether the +% connections should be allowed only between +% neighbors or between any vectors +% (matrix) size dlen x dlen indicating the connections +% between vectors +% (scalar) the N-neighborhood upto which the connections +% should be formed (implies 'neighbors') +% 'ignore' (vector) the units/vectors which should be ignored +% 'dist' (matrix) size dlen x dlen, pairwise distance matrix to +% be used instead of euclidian distances +% (vector) as the output of PDIST function +% (scalar) distance norm to use (euclidian = 2) +% 'mask' (vector) size dim x 1, the search mask used to +% weight distance calculation. By default +% sM.mask or a vector of ones is used. +% +% Note that together 'connect'='neighbors' and 'ignore' may form +% areas on the map which will never be connected: connections +% across the ignored map units simply do not exist. +% +% See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. + +% Copyright (c) 2000 by Juha Vesanto +% Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto +% http://www.cis.hut.fi/projects/somtoolbox/ + +% Version 2.0beta juuso 160600 + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% input arguments + +% the data +if isstruct(sM), + if isfield(sM,'data'), D = sM.data; sT = []; mask = []; + else D = sM.codebook; sT = sM.topol; mask = sM.mask; + end +else + D = sM; sT = []; mask = []; +end +[dlen dim] = size(D); +if isempty(mask), mask = ones(dim,1); end +if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end + +% varargin +Md = 2; +linkage = 'single'; +ignore_units = []; +constrained = 0; +i=1; +while i<=length(varargin), + argok = 1; + if ischar(varargin{i}), + switch varargin{i}, + % argument IDs + case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; + case 'connect', i=i+1; + if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any'); + else constrained = varargin{i}; end + case 'ignore', i=i+1; ignore_units = varargin{i}; + case 'dist', i=i+1; Md = varargin{i}; + case 'linkage', i=i+1; linkage = varargin{i}; + case 'mask', i=i+1; mask = varargin{i}; + case 'tracking',i=i+1; tracking = varargin{i}; + % unambiguous values + case 'neighbors', constrained = 1; + case 'any', constrained = 0; + case {'single','average','complete'}, linkage = varargin{i}; + otherwise argok=0; + end + elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), + switch varargin{i}(1).type, + case 'som_topol', sTopol = varargin{i}; + otherwise argok=0; + end + else + argok = 0; + end + if ~argok, + disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]); + end + i = i+1; +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% distance matrix + +% given distance matrix % jh corrected this place totally 27.3. 03 +if (prod(size(Md))==1), % no explicit distance matrix, set flag + q=2; % 17.2.03 kr added some brackets +else + if (prod(size(Md))<dlen^2), % check pdist form + Md = squareform(Md); % transform to ordinary square diastance matrix + end + % jh: 27.3. 03 "calculate pairwise dist. matrix" see approx. 20 lines below + % sets self-distance to Inf! This must be set here also, + % otherwise clustering fails for user defined dist. matrix! + Md(eye(dlen)==1)=Inf; +end + +% neighborhood constraint +if length(constrained)==1 & constrained>0, + Ne1 = som_unit_neighs(sT); + Conn = som_neighborhood(Ne1,constrained); + Conn(~isfinite(Conn(:))) = 0; +else Conn = constrained; end +if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end + +% pairwise distance matrix across connected units +n = size(D,1); +if prod(size(Md))>1, + % remove distances between non-neighbors + if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end +else + % calculate pairwise distance matrix + q = Md; + Md = zeros(n,n)+Inf; + if ~constrained & q==2, % fast for the default case + for i = 1:n-1, + x = D(i,:); + inds = [(i+1):n]; + Diff = D(inds,:) - x(ones(n-i,1),:); + Md(inds,i) = sqrt((Diff.^2)*mask); + Md(i,inds) = Md(inds,i)'; + end + else + for i = 1:n-1, + inds = find(Conn(i,:)==1); + inds = inds(find(inds>i)); + Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:)); + switch q, + case 1, dist = Diff*mask; + case 2, dist = sqrt((Diff.^2)*mask); + case Inf, dist = max(Diff,[],2); + otherwise, dist = ((Diff.^q)*mask).^(1/q); + end + Md(inds,i) = dist; + Md(i,inds) = dist'; + end + end +end + +% set distances to ignored units to Inf +if ~isempty(ignore_units), + Md(ignore_units,:) = Inf; + Md(:,ignore_units) = Inf; +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% construct dendrogram + +Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step +clusters = 1:dlen; % each vector is at first in its own cluster +Cd = Md; % distances between clusters + +h = waitbar(0,'Constructing hierarchical clustering'); + +for i=1:n-1, + + % tracking + waitbar(i/(n-1),h); + + %% combine two closest clusters + % find the clusters which are closest to each other (c1 and c2) + [d,ind] = min(min(Cd)); + if ~isfinite(d), break; end % no more connected clusters + [d,c1] = min(Cd(:,ind)); % cluster1 + c2 = clusters(ind); % cluster2 + % combine the two clusters + c1_inds = find(clusters==c1); % vectors belonging to c1 + c2_inds = find(clusters==c2); % vectors belonging to c2 + c_inds = [c1_inds, c2_inds]; % members of the new cluster + % new cluster index = bigger cluster + if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end + clusters(c_inds) = c; % update cluster info + Z(i,:) = [c, k, d]; % save info into Z + + %% update cluster distances + % remove the subclusters from the Cd table + Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf + k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster + Cd(k_inds,:) = Inf; % distance of the subclusters to + Cd(:,k_inds) = Inf; % other clusters = Inf + % update the distance of this cluster to the other clusters + cl = unique(clusters(clusters ~= c)); % indeces of all other clusters + if ~isempty(cl), % added since v6.5 works differently than 6.1 + for l=cl, + o_inds = find(clusters==l); % indeces belonging to cluster k + vd = Md(c_inds,o_inds); % distances between vectors in c and k + vd = vd(isfinite(vd(:))); % remove infinite distances (no connection) + len = length(vd); + if ~len, % if the two clusters are not connected, their distance in Inf + sd = Inf; + else % otherwise, calculate the distance between them + switch linkage, + case 'single', sd = min(vd); + case 'average', sd = sum(vd)/len; + case 'complete', sd = max(vd); + otherwise, error(['Unknown set distance: ' linkage]); + end + end + Cd(c,l) = sd; Cd(l,c) = sd; + end + end +end +close(h); + +last = Z(i,1); +if isnan(last), + last = Z(i-1,1); + rest = setdiff(unique(clusters),last); + Z(i:n-1,1) = rest'; + Z(i:n-1,2) = last; + Z(i:n-1,3) = Inf; + i = i - 1; +else + rest = []; +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% return values + +% calculate the order of samples +order = last; +% go through the combination matrix from top to down +for k=i:-1:1, + c = Z(k,1); k = Z(k,2); % what (k) change to what (c) + j = find(order==c); % the occurance of c in order + if j == length(order), order = [order k]; % put k behind c + else order = [order(1:j) k order(j+1:end)]; + end +end +order = [rest, order]; + +% to maintain compatibility with Statistics Toolbox, the values in +% Z must be yet transformed so that they are similar to the output +% of LINKAGE function + +Zs = Z; +current_cluster = 1:dlen; +for i=1:size(Z,1), + Zs(i,1) = current_cluster(Z(i,1)); + Zs(i,2) = current_cluster(Z(i,2)); + current_cluster(Z(i,[1 2])) = dlen+i; +end + +Z = Zs; + +return; + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + +