Daniel@0: function [sR,best,sig,Cm] = som_drmake(D,inds1,inds2,sigmea,nanis)
Daniel@0: 
Daniel@0: % SOM_DRMAKE Make descriptive rules for given group within the given data. 
Daniel@0: %
Daniel@0: % [sR,best,sig,Cm] = som_drmake(D,[inds1],[inds2],[sigmea],[nanis]) 
Daniel@0: % 
Daniel@0: %  D        (struct) map or data struct
Daniel@0: %           (matrix) the data, of size [dlen x dim]
Daniel@0: %  [inds1]  (vector) indices belonging to the group
Daniel@0: %                    (the whole data set by default)
Daniel@0: %  [inds2]  (vector) indices belonging to the contrast group
Daniel@0: %                    (the rest of the data set by default)
Daniel@0: %  [sigmea] (string) significance measure: 'accuracy', 
Daniel@0: %                    'mutuconf' (default), or 'accuracyI'.
Daniel@0: %                    (See definitions below).
Daniel@0: %  [nanis]  (scalar) value given for NaNs: 0 (=FALSE, default),
Daniel@0: %                    1 (=TRUE) or NaN (=ignored)
Daniel@0: %
Daniel@0: %  sR      (struct array) best rule for each component. Each 
Daniel@0: %                   struct has the following fields:
Daniel@0: %    .type     (string) 'som_rule'
Daniel@0: %    .name     (string) name of the component
Daniel@0: %    .low      (scalar) the low end of the rule range
Daniel@0: %    .high     (scalar) the high end of the rule range
Daniel@0: %    .nanis    (scalar) how NaNs are handled: NaN, 0 or 1
Daniel@0: %    .lowstr   (string) left empty by this function
Daniel@0: %    .highstr  (string) left empty by this function
Daniel@0: %
Daniel@0: %  best    (vector) indices of rules which make the best combined rule
Daniel@0: %  sig     (vector) significance measure values for each rule, and, 
Daniel@0: %                   as the last element, for the combined rule
Daniel@0: %  Cm      (matrix) vectorized confusion matrices for each rule, and, 
Daniel@0: %                   as the last row, for the combined rule. Each row
Daniel@0: %                   is [a, c, b, d] (see below). 
Daniel@0: % 
Daniel@0: % For each component, the rule sR.low <= x < sR.high is found 
Daniel@0: % which optimizes the given significance measure. The confusion
Daniel@0: % matrix below between the given grouping (G: group - not G: contrast group) 
Daniel@0: % and rule (R: true or false) is used to determine the significance values:
Daniel@0: %
Daniel@0: %          G    not G    
Daniel@0: %       ---------------    accuracy  = (a+d) / (a+b+c+d)
Daniel@0: % true  |  a  |   b   |    
Daniel@0: %       |--------------    mutuconf  =  a*a  / ((a+b)(a+c)) 
Daniel@0: % false |  c  |   d   | 
Daniel@0: %       ---------------    accuracyI =   a   / (a+b+c)
Daniel@0: %
Daniel@0: % See also  SOM_DREVAL, SOM_DRTABLE.
Daniel@0: 
Daniel@0: % Contributed to SOM Toolbox 2.0, January 7th, 2002 by Juha Vesanto
Daniel@0: % Copyright (c) by Juha Vesanto
Daniel@0: % http://www.cis.hut.fi/projects/somtoolbox/
Daniel@0: 
Daniel@0: % Version 2.0beta juuso 070102
Daniel@0: 
Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0: %% input arguments
Daniel@0: 
Daniel@0: if isstruct(D), 
Daniel@0:   switch D.type, 
Daniel@0:    case 'som_data', cn = D.comp_names; D = D.data; 
Daniel@0:    case 'som_map',  cn = D.comp_names; D = D.codebook; 
Daniel@0:    otherwise, 
Daniel@0:     % fail early instead of running into an undefined 'cn' below
Daniel@0:     error('Argument D must be a ''som_data'' or ''som_map'' struct, or a data matrix.');
Daniel@0:   end  
Daniel@0: else
Daniel@0:   % plain matrix: generate default component names
Daniel@0:   cn = cell(size(D,2),1);
Daniel@0:   for i=1:size(D,2), cn{i} = sprintf('Variable%d',i); end
Daniel@0: end
Daniel@0: 
Daniel@0: [dlen,dim] = size(D);
Daniel@0: if nargin<2 | isempty(inds1), inds1 = 1:dlen; end
Daniel@0: % default contrast group = everything that is not in the group
Daniel@0: if nargin<3 | isempty(inds2), i = ones(dlen,1); i(inds1) = 0; inds2 = find(i); end
Daniel@0: if nargin<4 | isempty(sigmea), sigmea = 'mutuconf'; end
Daniel@0: if nargin<5 | isempty(nanis), nanis = 0; end
Daniel@0: 
Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0: %% make the rules
Daniel@0: 
Daniel@0: % one entry per component, plus a last entry for the combined rule
Daniel@0: sig = zeros(dim+1,1); 
Daniel@0: Cm  = zeros(dim+1,4); 
Daniel@0: 
Daniel@0: % template rule: unbounded range
Daniel@0: sR1tmp = struct('type','som_rule','name','','low',-Inf,'high',Inf,'nanis',nanis,'lowstr','','highstr','');
Daniel@0: sR = sR1tmp;  
Daniel@0: 
Daniel@0: % single variable rules
Daniel@0: for i=1:dim,
Daniel@0:     
Daniel@0:     % bin edges: midpoints between the centers of 10 equal-width bins 
Daniel@0:     % spanning the range of the component, with open ends
Daniel@0:     mi = min(D(:,i)); 
Daniel@0:     ma = max(D(:,i)); 
Daniel@0:     [dummy,bins] = hist([mi,ma],10); 
Daniel@0:     if size(bins,1)>1, bins = bins'; end
Daniel@0:     edges = [-Inf, (bins(1:end-1)+bins(2:end))/2, Inf];
Daniel@0:     
Daniel@0:     % find the rule for this variable
Daniel@0:     [low,high,s,cm] = onevar_descrule(D(inds1,i),D(inds2,i),sigmea,nanis,edges);
Daniel@0:     sR1 = sR1tmp;      
Daniel@0:     sR1.name = cn{i}; 
Daniel@0:     sR1.low = low; 
Daniel@0:     sR1.high = high; 
Daniel@0:     sR(i) = sR1; 
Daniel@0:     sig(i) = s; 
Daniel@0:     Cm(i,:) = cm; 
Daniel@0:     
Daniel@0: end  
Daniel@0: 
Daniel@0: % find combined rule: start from the most significant single rule and 
Daniel@0: % try adding the others in order of decreasing significance, keeping 
Daniel@0: % an addition only if it strictly improves the significance.
Daniel@0: % Only the single-rule entries are sorted: the last element of sig is 
Daniel@0: % the (still unset) slot of the combined rule and must not end up in 'order'.
Daniel@0: [dummy,order] = sort(-sig(1:dim));
Daniel@0: maxsig = sig(order(1)); bestcm = Cm(order(1),:);
Daniel@0: best  = order(1);
Daniel@0: for i=2:dim,    
Daniel@0:     com = [best, order(i)];
Daniel@0:     [s,cm] = som_dreval(sR(com),D(:,com),sigmea,inds1,inds2,'and');
Daniel@0:     if s>maxsig, best = com; maxsig = s; bestcm = cm; end
Daniel@0: end   
Daniel@0: sig(end) = maxsig;
Daniel@0: % store the confusion matrix of the best combination, not of the last one tried
Daniel@0: Cm(end,:) = bestcm; 
Daniel@0: 
Daniel@0: return;
Daniel@0:     
Daniel@0: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55
Daniel@0: %% descriptive rules
Daniel@0: 
Daniel@0: function [low,high,sig,cm] = onevar_descrule(x,y,sigmea,nanis,edges)
Daniel@0: 
Daniel@0:   % Given a set of bin edges, find the range of bins with best significance.
Daniel@0:   %
Daniel@0:   %  x          data values in the group
Daniel@0:   %  y          data values in the contrast group
Daniel@0:   %  sigmea     significance measure (passed on to SOM_DRSIGNIF)
Daniel@0:   %  nanis      how to handle NaNs: 0 (rule is false for NaNs), 
Daniel@0:   %             1 (rule is true for NaNs) or NaN (NaNs are ignored)
Daniel@0:   %  edges      bin edges, in increasing order
Daniel@0:   %
Daniel@0:   %  low,high   the edges limiting the best range of bins
Daniel@0:   %  sig        significance of the best range
Daniel@0:   %  cm         its confusion matrix as a row vector: 
Daniel@0:   %             [x true, x false, y true, y false]
Daniel@0: 
Daniel@0:   % NaN handling: when NaNs are ignored they are removed altogether, and 
Daniel@0:   % must not contribute to the counts (NaN*0 would turn every count into NaN)
Daniel@0:   if isnan(nanis), 
Daniel@0:     x = x(~isnan(x)); y = y(~isnan(y)); 
Daniel@0:     nanadd = 0; 
Daniel@0:   else
Daniel@0:     nanadd = nanis; 
Daniel@0:   end
Daniel@0: 
Daniel@0:   % histogram counts; the last element returned by histc counts only values 
Daniel@0:   % equal to the last edge and is dropped. (:) forces column vectors 
Daniel@0:   % whatever orientation histc returns.
Daniel@0:   xcount = histc(x,edges); xcount = xcount(:); 
Daniel@0:   ycount = histc(y,edges); ycount = ycount(:); 
Daniel@0:   xcount = xcount(1:end-1);
Daniel@0:   ycount = ycount(1:end-1); 
Daniel@0:   xnan = sum(isnan(x));
Daniel@0:   ynan = sum(isnan(y));
Daniel@0:   nix  = length(x);
Daniel@0:   niy  = length(y);
Daniel@0:     
Daniel@0:   % find number of true items in both groups in all possible ranges: 
Daniel@0:   % T(i,j), i<=j, is the number of items in bins i..j, 
Daniel@0:   % i.e. c(j) - c(i) + count(i) where c is the cumulative sum of counts.
Daniel@0:   % This is exact integer arithmetic.
Daniel@0:   n  = length(xcount);
Daniel@0:   c  = cumsum(xcount);
Daniel@0:   Tx = triu(repmat(c',[n 1]) - repmat(c,[1 n]) + repmat(xcount,[1 n]));
Daniel@0:   c  = cumsum(ycount); 
Daniel@0:   Ty = triu(repmat(c',[n 1]) - repmat(c,[1 n]) + repmat(ycount,[1 n]));
Daniel@0: 
Daniel@0:   % only ranges which contain at least one item are candidates
Daniel@0:   [i,j] = find(Tx+Ty);
Daniel@0:   if isempty(i), 
Daniel@0:     % no range contains any item: return the unbounded rule
Daniel@0:     low  = -Inf; 
Daniel@0:     high = Inf; 
Daniel@0:     tix  = nanadd*xnan; 
Daniel@0:     tiy  = nanadd*ynan; 
Daniel@0:     cm   = [tix,nix-tix,tiy,niy-tiy];
Daniel@0:     sig  = som_drsignif(sigmea,cm);
Daniel@0:     return;
Daniel@0:   end
Daniel@0:   i = i(:); j = j(:); 
Daniel@0:   k = sub2ind(size(Tx),i,j);
Daniel@0:   V = [i, j, Tx(k), Ty(k)];
Daniel@0:   tix = V(:,3) + nanadd*xnan; 
Daniel@0:   tiy = V(:,4) + nanadd*ynan; 
Daniel@0:   
Daniel@0:   % select the best range
Daniel@0:   Cm    = [tix,nix-tix,tiy,niy-tiy];
Daniel@0:   [s,k] = max(som_drsignif(sigmea,Cm));
Daniel@0: 
Daniel@0:   % output
Daniel@0:   low  = edges(V(k,1));
Daniel@0:   high = edges(V(k,2)+1);
Daniel@0:   sig  = s;   
Daniel@0:   cm   = Cm(k,:);
Daniel@0: 
Daniel@0:   return;
Daniel@0: