wolffd@0: function [sR,best,sig,Cm] = som_drmake(D,inds1,inds2,sigmea,nanis) wolffd@0: wolffd@0: % SOM_DRMAKE Make descriptive rules for given group within the given data. wolffd@0: % wolffd@0: % sR = som_drmake(D,[inds1],[inds2],[sigmea],[nanis]) wolffd@0: % wolffd@0: % D (struct) map or data struct wolffd@0: % (matrix) the data, of size [dlen x dim] wolffd@0: % [inds1] (vector) indeces belonging to the group wolffd@0: % (the whole data set by default) wolffd@0: % [inds2] (vector) indeces belonging to the contrast group wolffd@0: % (the rest of the data set by default) wolffd@0: % [sigmea] (string) significance measure: 'accuracy', wolffd@0: % 'mutuconf' (default), or 'accuracyI'. wolffd@0: % (See definitions below). wolffd@0: % [nanis] (scalar) value given for NaNs: 0 (=FALSE, default), wolffd@0: % 1 (=TRUE) or NaN (=ignored) wolffd@0: % wolffd@0: % sR (struct array) best rule for each component. Each wolffd@0: % struct has the following fields: wolffd@0: % .type (string) 'som_rule' wolffd@0: % .name (string) name of the component wolffd@0: % .low (scalar) the low end of the rule range wolffd@0: % .high (scalar) the high end of the rule range wolffd@0: % .nanis (scalar) how NaNs are handled: NaN, 0 or 1 wolffd@0: % wolffd@0: % best (vector) indeces of rules which make the best combined rule wolffd@0: % sig (vector) significance measure values for each rule, and for the combined rule wolffd@0: % Cm (matrix) A matrix of vectorized confusion matrices for each rule, wolffd@0: % and for the combined rule: [a, c, b, d] (see below). wolffd@0: % wolffd@0: % For each rule, such rules sR.low <= x < sR.high are found wolffd@0: % which optimize the given significance measure. The confusion wolffd@0: % matrix below between the given grouping (G: group - not G: contrast group) wolffd@0: % and rule (R: true or false) is used to determine the significance values: wolffd@0: % wolffd@0: % G not G wolffd@0: % --------------- accuracy = (a+d) / (a+b+c+d) wolffd@0: % true | a | b | wolffd@0: % |-------------- mutuconf = a*a / ((a+b)(a+c)) wolffd@0: % false | c | d | wolffd@0: % --------------- accuracyI = a / (a+b+c) wolffd@0: % wolffd@0: % See also SOM_DREVAL, SOM_DRTABLE. wolffd@0: wolffd@0: % Contributed to SOM Toolbox 2.0, January 7th, 2002 by Juha Vesanto wolffd@0: % Copyright (c) by Juha Vesanto wolffd@0: % http://www.cis.hut.fi/projects/somtoolbox/ wolffd@0: wolffd@0: % Version 2.0beta juuso 070102 wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% input arguments wolffd@0: wolffd@0: if isstruct(D), wolffd@0: switch D.type, wolffd@0: case 'som_data', cn = D.comp_names; D = D.data; wolffd@0: case 'som_map', cn = D.comp_names; D = D.codebook; wolffd@0: end wolffd@0: else wolffd@0: cn = cell(size(D,2),1); wolffd@0: for i=1:size(D,2), cn{i} = sprintf('Variable%d',i); end wolffd@0: end wolffd@0: wolffd@0: [dlen,dim] = size(D); wolffd@0: if nargin<2 | isempty(inds1), inds1 = 1:dlen; end wolffd@0: if nargin<3 | isempty(inds2), i = ones(dlen,1); i(inds1) = 0; inds2 = find(i); end wolffd@0: if nargin<4, sigmea = 'mutuconf'; end wolffd@0: if nargin<5, nanis = 0; end wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% wolffd@0: %% input arguments wolffd@0: wolffd@0: sig = zeros(dim+1,1); wolffd@0: Cm = zeros(dim+1,4); wolffd@0: wolffd@0: sR1tmp = struct('type','som_rule','name','','low',-Inf,'high',Inf,'nanis',nanis,'lowstr','','highstr',''); wolffd@0: sR = sR1tmp; wolffd@0: wolffd@0: % single variable rules wolffd@0: for i=1:dim, wolffd@0: wolffd@0: % bin edges wolffd@0: mi = min(D(:,i)); wolffd@0: ma = max(D(:,i)); wolffd@0: [histcount,bins] = hist([mi,ma],10); wolffd@0: if size(bins,1)>1, bins = bins'; end wolffd@0: edges = [-Inf, (bins(1:end-1)+bins(2:end))/2, Inf]; wolffd@0: wolffd@0: % find the rule for this variable wolffd@0: [low,high,s,cm] = onevar_descrule(D(inds1,i),D(inds2,i),sigmea,nanis,edges); wolffd@0: sR1 = sR1tmp; wolffd@0: sR1.name = cn{i}; wolffd@0: sR1.low = low; wolffd@0: sR1.high = high; wolffd@0: sR(i) = sR1; wolffd@0: sig(i) = s; wolffd@0: Cm(i,:) = cm; wolffd@0: wolffd@0: end wolffd@0: wolffd@0: % find combined rule wolffd@0: [dummy,order] = sort(-sig); wolffd@0: maxsig = sig(order(1)); bestcm = Cm(order(1),:); wolffd@0: best = order(1); wolffd@0: for i=2:dim, wolffd@0: com = [best, order(i)]; wolffd@0: [s,cm,truex,truey] = som_dreval(sR(com),D(:,com),sigmea,inds1,inds2,'and'); wolffd@0: if s>maxsig, best = com; maxsig = s; bestcm = cm; end wolffd@0: end wolffd@0: sig(end) = maxsig; wolffd@0: Cm(end,:) = cm; wolffd@0: wolffd@0: return; wolffd@0: wolffd@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55 wolffd@0: %% descriptive rules wolffd@0: wolffd@0: function [low,high,sig,cm] = onevar_descrule(x,y,sigmea,nanis,edges) wolffd@0: wolffd@0: % Given a set of bin edges, find the range of bins with best significance. wolffd@0: % wolffd@0: % x data values in cluster wolffd@0: % y data values not in cluster wolffd@0: % sigmea significance measure wolffd@0: % bins bin centers wolffd@0: % nanis how to handle NaNs wolffd@0: wolffd@0: % histogram counts wolffd@0: if isnan(nanis), x = x(~isnan(x)); y = y(~isnan(y)); end wolffd@0: [xcount,xbin] = histc(x,edges); wolffd@0: [ycount,ybin] = histc(y,edges); wolffd@0: xcount = xcount(1:end-1); wolffd@0: ycount = ycount(1:end-1); wolffd@0: xnan=sum(isnan(x)); wolffd@0: ynan=sum(isnan(y)); wolffd@0: wolffd@0: % find number of true items in both groups in all possible ranges wolffd@0: n = length(xcount); wolffd@0: V = zeros(n*(n+1)/2,4); wolffd@0: s1 = cumsum(xcount); wolffd@0: s2 = cumsum(xcount(end:-1:1)); s2 = s2(end:-1:1); wolffd@0: m = s1(end); wolffd@0: Tx = triu(s1(end)-m*log(exp(s1/m)*exp(s2/m)')+repmat(xcount',[n 1])+repmat(xcount,[1 n]),0); wolffd@0: s1 = cumsum(ycount); wolffd@0: s2 = cumsum(ycount(end:-1:1)); s2 = s2(end:-1:1); wolffd@0: Ty = triu(s1(end)-m*log(exp(s1/m)*exp(s2/m)')+repmat(ycount',[n 1])+repmat(ycount,[1 n]),0); wolffd@0: [i,j] = find(Tx+Ty); wolffd@0: k = sub2ind(size(Tx),i,j); wolffd@0: V = [i, j, Tx(k), Ty(k)]; wolffd@0: tix = V(:,3) + nanis*xnan; wolffd@0: tiy = V(:,4) + nanis*ynan; wolffd@0: wolffd@0: % select the best range wolffd@0: nix = length(x); wolffd@0: niy = length(y); wolffd@0: Cm = [tix,nix-tix,tiy,niy-tiy]; wolffd@0: [s,k] = max(som_drsignif(sigmea,Cm)); wolffd@0: wolffd@0: % output wolffd@0: low = edges(V(k,1)); wolffd@0: high = edges(V(k,2)+1); wolffd@0: sig = s; wolffd@0: cm = Cm(k,:); wolffd@0: wolffd@0: return; wolffd@0: