Daniel@0: function [Z,order,Md] = som_linkage(sM,varargin) Daniel@0: Daniel@0: %SOM_LINKAGE Make a hierarchical linkage of the SOM map units. Daniel@0: % Daniel@0: % [Z,order,Dist] = som_linkage(sM, [[argID,] value, ...]) Daniel@0: % Daniel@0: % Z = som_linkage(sM); Daniel@0: % Z = som_linkage(D,'complete'); Daniel@0: % Z = som_linkage(sM,'single','ignore',find(~som_hits(sM,D))); Daniel@0: % Z = som_linkage(sM,pdist(sM.codebook,'mahal')); Daniel@0: % som_dendrogram(Z); Daniel@0: % Daniel@0: % Input and output arguments ([]'s are optional): Daniel@0: % sM (struct) map or data struct to be clustered Daniel@0: % (matrix) size dlen x dim, a data set: the matrix must not Daniel@0: % contain any NaN's! Daniel@0: % [argID, (string) See below. The values which are unambiguous can Daniel@0: % value] (varies) be given without the preceeding argID. Daniel@0: % Daniel@0: % Z (matrix) size dlen-1 x 3, the linkage info Daniel@0: % Z(i,1) and Z(i,2) hold the indeces of clusters Daniel@0: % combined on level i (starting from bottom). The new Daniel@0: % cluster has index dlen+i. The initial cluster Daniel@0: % index of each unit is its linear index in the Daniel@0: % original data matrix. Z(i,3) is the distance Daniel@0: % between the combined clusters. See LINKAGE Daniel@0: % function in the Statistics Toolbox. Daniel@0: % The ignored samples are listed at the Daniel@0: % end of the Z-matrix and have Z(*,3) == Inf Daniel@0: % Dist (matrix) size dlen x dlen, pairwise distance matrix Daniel@0: % Daniel@0: % Here are the valid argument IDs and corresponding values. The values Daniel@0: % which are unambiguous (marked with '*') can be given without the Daniel@0: % preceeding argID. Daniel@0: % 'linkage' *(string) the linkage criteria to use: 'single' (the Daniel@0: % default), 'average' or 'complete' Daniel@0: % 'topol' *(struct) topology struct Daniel@0: % 'connect' *(string) 'neighbors' or 'any' (default), whether the Daniel@0: % connections should be allowed only between Daniel@0: % neighbors or between any vectors Daniel@0: % (matrix) size dlen x dlen indicating the connections Daniel@0: % between vectors Daniel@0: % (scalar) the N-neighborhood upto which the connections Daniel@0: % should be formed (implies 'neighbors') Daniel@0: % 'ignore' (vector) the units/vectors which should be ignored Daniel@0: % 'dist' (matrix) size dlen x dlen, pairwise distance matrix to Daniel@0: % be used instead of euclidian distances Daniel@0: % (vector) as the output of PDIST function Daniel@0: % (scalar) distance norm to use (euclidian = 2) Daniel@0: % 'mask' (vector) size dim x 1, the search mask used to Daniel@0: % weight distance calculation. By default Daniel@0: % sM.mask or a vector of ones is used. Daniel@0: % Daniel@0: % Note that together 'connect'='neighbors' and 'ignore' may form Daniel@0: % areas on the map which will never be connected: connections Daniel@0: % across the ignored map units simply do not exist. Daniel@0: % Daniel@0: % See also KMEANS_CLUSTERS, LINKAGE, PDIST, DENDROGRAM. Daniel@0: Daniel@0: % Copyright (c) 2000 by Juha Vesanto Daniel@0: % Contributed to SOM Toolbox on June 16th, 2000 by Juha Vesanto Daniel@0: % http://www.cis.hut.fi/projects/somtoolbox/ Daniel@0: Daniel@0: % Version 2.0beta juuso 160600 Daniel@0: Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Daniel@0: %% input arguments Daniel@0: Daniel@0: % the data Daniel@0: if isstruct(sM), Daniel@0: if isfield(sM,'data'), D = sM.data; sT = []; mask = []; Daniel@0: else D = sM.codebook; sT = sM.topol; mask = sM.mask; Daniel@0: end Daniel@0: else Daniel@0: D = sM; sT = []; mask = []; Daniel@0: end Daniel@0: [dlen dim] = size(D); Daniel@0: if isempty(mask), mask = ones(dim,1); end Daniel@0: if any(isnan(D(:))), error('Data matrix must not have any NaNs.'); end Daniel@0: Daniel@0: % varargin Daniel@0: Md = 2; Daniel@0: linkage = 'single'; Daniel@0: ignore_units = []; Daniel@0: constrained = 0; Daniel@0: i=1; Daniel@0: while i<=length(varargin), Daniel@0: argok = 1; Daniel@0: if ischar(varargin{i}), Daniel@0: switch varargin{i}, Daniel@0: % argument IDs Daniel@0: case {'topol','som_topol','sTopol'}, i=i+1; sT = varargin{i}; Daniel@0: case 'connect', i=i+1; Daniel@0: if ischar(varargin{i}), constrained = ~strcmp(varargin{i},'any'); Daniel@0: else constrained = varargin{i}; end Daniel@0: case 'ignore', i=i+1; ignore_units = varargin{i}; Daniel@0: case 'dist', i=i+1; Md = varargin{i}; Daniel@0: case 'linkage', i=i+1; linkage = varargin{i}; Daniel@0: case 'mask', i=i+1; mask = varargin{i}; Daniel@0: case 'tracking',i=i+1; tracking = varargin{i}; Daniel@0: % unambiguous values Daniel@0: case 'neighbors', constrained = 1; Daniel@0: case 'any', constrained = 0; Daniel@0: case {'single','average','complete'}, linkage = varargin{i}; Daniel@0: otherwise argok=0; Daniel@0: end Daniel@0: elseif isstruct(varargin{i}) & isfield(varargin{i},'type'), Daniel@0: switch varargin{i}(1).type, Daniel@0: case 'som_topol', sTopol = varargin{i}; Daniel@0: otherwise argok=0; Daniel@0: end Daniel@0: else Daniel@0: argok = 0; Daniel@0: end Daniel@0: if ~argok, Daniel@0: disp(['(som_linkage) Ignoring invalid argument #' num2str(i+1)]); Daniel@0: end Daniel@0: i = i+1; Daniel@0: end Daniel@0: Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Daniel@0: %% distance matrix Daniel@0: Daniel@0: % given distance matrix % jh corrected this place totally 27.3. 03 Daniel@0: if (prod(size(Md))==1), % no explicit distance matrix, set flag Daniel@0: q=2; % 17.2.03 kr added some brackets Daniel@0: else Daniel@0: if (prod(size(Md))0, Daniel@0: Ne1 = som_unit_neighs(sT); Daniel@0: Conn = som_neighborhood(Ne1,constrained); Daniel@0: Conn(~isfinite(Conn(:))) = 0; Daniel@0: else Conn = constrained; end Daniel@0: if ~isempty(Conn), for i=1:dlen, C(i,i) = 1; end, end Daniel@0: Daniel@0: % pairwise distance matrix across connected units Daniel@0: n = size(D,1); Daniel@0: if prod(size(Md))>1, Daniel@0: % remove distances between non-neighbors Daniel@0: if constrained, for i = 1:n, Md(i,find(Conn(i,:)==0)) = Inf; end, end Daniel@0: else Daniel@0: % calculate pairwise distance matrix Daniel@0: q = Md; Daniel@0: Md = zeros(n,n)+Inf; Daniel@0: if ~constrained & q==2, % fast for the default case Daniel@0: for i = 1:n-1, Daniel@0: x = D(i,:); Daniel@0: inds = [(i+1):n]; Daniel@0: Diff = D(inds,:) - x(ones(n-i,1),:); Daniel@0: Md(inds,i) = sqrt((Diff.^2)*mask); Daniel@0: Md(i,inds) = Md(inds,i)'; Daniel@0: end Daniel@0: else Daniel@0: for i = 1:n-1, Daniel@0: inds = find(Conn(i,:)==1); Daniel@0: inds = inds(find(inds>i)); Daniel@0: Diff = abs(D(inds,:) - D(i*ones(length(inds),1),:)); Daniel@0: switch q, Daniel@0: case 1, dist = Diff*mask; Daniel@0: case 2, dist = sqrt((Diff.^2)*mask); Daniel@0: case Inf, dist = max(Diff,[],2); Daniel@0: otherwise, dist = ((Diff.^q)*mask).^(1/q); Daniel@0: end Daniel@0: Md(inds,i) = dist; Daniel@0: Md(i,inds) = dist'; Daniel@0: end Daniel@0: end Daniel@0: end Daniel@0: Daniel@0: % set distances to ignored units to Inf Daniel@0: if ~isempty(ignore_units), Daniel@0: Md(ignore_units,:) = Inf; Daniel@0: Md(:,ignore_units) = Inf; Daniel@0: end Daniel@0: Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Daniel@0: %% construct dendrogram Daniel@0: Daniel@0: Z = zeros(n-1,3)+NaN; % merged clusters and distance for each step Daniel@0: clusters = 1:dlen; % each vector is at first in its own cluster Daniel@0: Cd = Md; % distances between clusters Daniel@0: Daniel@0: h = waitbar(0,'Constructing hierarchical clustering'); Daniel@0: Daniel@0: for i=1:n-1, Daniel@0: Daniel@0: % tracking Daniel@0: waitbar(i/(n-1),h); Daniel@0: Daniel@0: %% combine two closest clusters Daniel@0: % find the clusters which are closest to each other (c1 and c2) Daniel@0: [d,ind] = min(min(Cd)); Daniel@0: if ~isfinite(d), break; end % no more connected clusters Daniel@0: [d,c1] = min(Cd(:,ind)); % cluster1 Daniel@0: c2 = clusters(ind); % cluster2 Daniel@0: % combine the two clusters Daniel@0: c1_inds = find(clusters==c1); % vectors belonging to c1 Daniel@0: c2_inds = find(clusters==c2); % vectors belonging to c2 Daniel@0: c_inds = [c1_inds, c2_inds]; % members of the new cluster Daniel@0: % new cluster index = bigger cluster Daniel@0: if length(c2_inds)>length(c1_inds), c=c2; k=c1; else c=c1; k=c2; end Daniel@0: clusters(c_inds) = c; % update cluster info Daniel@0: Z(i,:) = [c, k, d]; % save info into Z Daniel@0: Daniel@0: %% update cluster distances Daniel@0: % remove the subclusters from the Cd table Daniel@0: Cd(c_inds,c_inds) = Inf; % distance of the cluster to its members = Inf Daniel@0: k_inds = c_inds(c_inds ~= c); % vectors of the smaller cluster Daniel@0: Cd(k_inds,:) = Inf; % distance of the subclusters to Daniel@0: Cd(:,k_inds) = Inf; % other clusters = Inf Daniel@0: % update the distance of this cluster to the other clusters Daniel@0: cl = unique(clusters(clusters ~= c)); % indeces of all other clusters Daniel@0: if ~isempty(cl), % added since v6.5 works differently than 6.1 Daniel@0: for l=cl, Daniel@0: o_inds = find(clusters==l); % indeces belonging to cluster k Daniel@0: vd = Md(c_inds,o_inds); % distances between vectors in c and k Daniel@0: vd = vd(isfinite(vd(:))); % remove infinite distances (no connection) Daniel@0: len = length(vd); Daniel@0: if ~len, % if the two clusters are not connected, their distance in Inf Daniel@0: sd = Inf; Daniel@0: else % otherwise, calculate the distance between them Daniel@0: switch linkage, Daniel@0: case 'single', sd = min(vd); Daniel@0: case 'average', sd = sum(vd)/len; Daniel@0: case 'complete', sd = max(vd); Daniel@0: otherwise, error(['Unknown set distance: ' linkage]); Daniel@0: end Daniel@0: end Daniel@0: Cd(c,l) = sd; Cd(l,c) = sd; Daniel@0: end Daniel@0: end Daniel@0: end Daniel@0: close(h); Daniel@0: Daniel@0: last = Z(i,1); Daniel@0: if isnan(last), Daniel@0: last = Z(i-1,1); Daniel@0: rest = setdiff(unique(clusters),last); Daniel@0: Z(i:n-1,1) = rest'; Daniel@0: Z(i:n-1,2) = last; Daniel@0: Z(i:n-1,3) = Inf; Daniel@0: i = i - 1; Daniel@0: else Daniel@0: rest = []; Daniel@0: end Daniel@0: Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Daniel@0: %% return values Daniel@0: Daniel@0: % calculate the order of samples Daniel@0: order = last; Daniel@0: % go through the combination matrix from top to down Daniel@0: for k=i:-1:1, Daniel@0: c = Z(k,1); k = Z(k,2); % what (k) change to what (c) Daniel@0: j = find(order==c); % the occurance of c in order Daniel@0: if j == length(order), order = [order k]; % put k behind c Daniel@0: else order = [order(1:j) k order(j+1:end)]; Daniel@0: end Daniel@0: end Daniel@0: order = [rest, order]; Daniel@0: Daniel@0: % to maintain compatibility with Statistics Toolbox, the values in Daniel@0: % Z must be yet transformed so that they are similar to the output Daniel@0: % of LINKAGE function Daniel@0: Daniel@0: Zs = Z; Daniel@0: current_cluster = 1:dlen; Daniel@0: for i=1:size(Z,1), Daniel@0: Zs(i,1) = current_cluster(Z(i,1)); Daniel@0: Zs(i,2) = current_cluster(Z(i,2)); Daniel@0: current_cluster(Z(i,[1 2])) = dlen+i; Daniel@0: end Daniel@0: Daniel@0: Z = Zs; Daniel@0: Daniel@0: return; Daniel@0: Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Daniel@0: Daniel@0: Daniel@0: Daniel@0: