annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/som_drmake.m @ 0:cc4b1211e677 tip

initial commit to HG from Changeset: 646 (e263d8a21543) added further path and more save "camirversion.m"
author Daniel Wolff
date Fri, 19 Aug 2016 13:07:06 +0200
parents
children
rev   line source
Daniel@0 1 function [sR,best,sig,Cm] = som_drmake(D,inds1,inds2,sigmea,nanis)
Daniel@0 2
Daniel@0 3 % SOM_DRMAKE Make descriptive rules for given group within the given data.
Daniel@0 4 %
Daniel@0 5 % sR = som_drmake(D,[inds1],[inds2],[sigmea],[nanis])
Daniel@0 6 %
Daniel@0 7 % D (struct) map or data struct
Daniel@0 8 % (matrix) the data, of size [dlen x dim]
Daniel@0 9 % [inds1] (vector) indeces belonging to the group
Daniel@0 10 % (the whole data set by default)
Daniel@0 11 % [inds2] (vector) indeces belonging to the contrast group
Daniel@0 12 % (the rest of the data set by default)
Daniel@0 13 % [sigmea] (string) significance measure: 'accuracy',
Daniel@0 14 % 'mutuconf' (default), or 'accuracyI'.
Daniel@0 15 % (See definitions below).
Daniel@0 16 % [nanis] (scalar) value given for NaNs: 0 (=FALSE, default),
Daniel@0 17 % 1 (=TRUE) or NaN (=ignored)
Daniel@0 18 %
Daniel@0 19 % sR (struct array) best rule for each component. Each
Daniel@0 20 % struct has the following fields:
Daniel@0 21 % .type (string) 'som_rule'
Daniel@0 22 % .name (string) name of the component
Daniel@0 23 % .low (scalar) the low end of the rule range
Daniel@0 24 % .high (scalar) the high end of the rule range
Daniel@0 25 % .nanis (scalar) how NaNs are handled: NaN, 0 or 1
Daniel@0 26 %
Daniel@0 27 % best (vector) indeces of rules which make the best combined rule
Daniel@0 28 % sig (vector) significance measure values for each rule, and for the combined rule
Daniel@0 29 % Cm (matrix) A matrix of vectorized confusion matrices for each rule,
Daniel@0 30 % and for the combined rule: [a, c, b, d] (see below).
Daniel@0 31 %
Daniel@0 32 % For each rule, such rules sR.low <= x < sR.high are found
Daniel@0 33 % which optimize the given significance measure. The confusion
Daniel@0 34 % matrix below between the given grouping (G: group - not G: contrast group)
Daniel@0 35 % and rule (R: true or false) is used to determine the significance values:
Daniel@0 36 %
Daniel@0 37 % G not G
Daniel@0 38 % --------------- accuracy = (a+d) / (a+b+c+d)
Daniel@0 39 % true | a | b |
Daniel@0 40 % |-------------- mutuconf = a*a / ((a+b)(a+c))
Daniel@0 41 % false | c | d |
Daniel@0 42 % --------------- accuracyI = a / (a+b+c)
Daniel@0 43 %
Daniel@0 44 % See also SOM_DREVAL, SOM_DRTABLE.
Daniel@0 45
Daniel@0 46 % Contributed to SOM Toolbox 2.0, January 7th, 2002 by Juha Vesanto
Daniel@0 47 % Copyright (c) by Juha Vesanto
Daniel@0 48 % http://www.cis.hut.fi/projects/somtoolbox/
Daniel@0 49
Daniel@0 50 % Version 2.0beta juuso 070102
Daniel@0 51
Daniel@0 52 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 53 %% input arguments
Daniel@0 54
Daniel@0 55 if isstruct(D),
Daniel@0 56 switch D.type,
Daniel@0 57 case 'som_data', cn = D.comp_names; D = D.data;
Daniel@0 58 case 'som_map', cn = D.comp_names; D = D.codebook;
Daniel@0 59 end
Daniel@0 60 else
Daniel@0 61 cn = cell(size(D,2),1);
Daniel@0 62 for i=1:size(D,2), cn{i} = sprintf('Variable%d',i); end
Daniel@0 63 end
Daniel@0 64
Daniel@0 65 [dlen,dim] = size(D);
Daniel@0 66 if nargin<2 | isempty(inds1), inds1 = 1:dlen; end
Daniel@0 67 if nargin<3 | isempty(inds2), i = ones(dlen,1); i(inds1) = 0; inds2 = find(i); end
Daniel@0 68 if nargin<4, sigmea = 'mutuconf'; end
Daniel@0 69 if nargin<5, nanis = 0; end
Daniel@0 70
Daniel@0 71 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Daniel@0 72 %% input arguments
Daniel@0 73
Daniel@0 74 sig = zeros(dim+1,1);
Daniel@0 75 Cm = zeros(dim+1,4);
Daniel@0 76
Daniel@0 77 sR1tmp = struct('type','som_rule','name','','low',-Inf,'high',Inf,'nanis',nanis,'lowstr','','highstr','');
Daniel@0 78 sR = sR1tmp;
Daniel@0 79
Daniel@0 80 % single variable rules
Daniel@0 81 for i=1:dim,
Daniel@0 82
Daniel@0 83 % bin edges
Daniel@0 84 mi = min(D(:,i));
Daniel@0 85 ma = max(D(:,i));
Daniel@0 86 [histcount,bins] = hist([mi,ma],10);
Daniel@0 87 if size(bins,1)>1, bins = bins'; end
Daniel@0 88 edges = [-Inf, (bins(1:end-1)+bins(2:end))/2, Inf];
Daniel@0 89
Daniel@0 90 % find the rule for this variable
Daniel@0 91 [low,high,s,cm] = onevar_descrule(D(inds1,i),D(inds2,i),sigmea,nanis,edges);
Daniel@0 92 sR1 = sR1tmp;
Daniel@0 93 sR1.name = cn{i};
Daniel@0 94 sR1.low = low;
Daniel@0 95 sR1.high = high;
Daniel@0 96 sR(i) = sR1;
Daniel@0 97 sig(i) = s;
Daniel@0 98 Cm(i,:) = cm;
Daniel@0 99
Daniel@0 100 end
Daniel@0 101
Daniel@0 102 % find combined rule
Daniel@0 103 [dummy,order] = sort(-sig);
Daniel@0 104 maxsig = sig(order(1)); bestcm = Cm(order(1),:);
Daniel@0 105 best = order(1);
Daniel@0 106 for i=2:dim,
Daniel@0 107 com = [best, order(i)];
Daniel@0 108 [s,cm,truex,truey] = som_dreval(sR(com),D(:,com),sigmea,inds1,inds2,'and');
Daniel@0 109 if s>maxsig, best = com; maxsig = s; bestcm = cm; end
Daniel@0 110 end
Daniel@0 111 sig(end) = maxsig;
Daniel@0 112 Cm(end,:) = cm;
Daniel@0 113
Daniel@0 114 return;
Daniel@0 115
Daniel@0 116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55
Daniel@0 117 %% descriptive rules
Daniel@0 118
Daniel@0 119 function [low,high,sig,cm] = onevar_descrule(x,y,sigmea,nanis,edges)
Daniel@0 120
Daniel@0 121 % Given a set of bin edges, find the range of bins with best significance.
Daniel@0 122 %
Daniel@0 123 % x data values in cluster
Daniel@0 124 % y data values not in cluster
Daniel@0 125 % sigmea significance measure
Daniel@0 126 % bins bin centers
Daniel@0 127 % nanis how to handle NaNs
Daniel@0 128
Daniel@0 129 % histogram counts
Daniel@0 130 if isnan(nanis), x = x(~isnan(x)); y = y(~isnan(y)); end
Daniel@0 131 [xcount,xbin] = histc(x,edges);
Daniel@0 132 [ycount,ybin] = histc(y,edges);
Daniel@0 133 xcount = xcount(1:end-1);
Daniel@0 134 ycount = ycount(1:end-1);
Daniel@0 135 xnan=sum(isnan(x));
Daniel@0 136 ynan=sum(isnan(y));
Daniel@0 137
Daniel@0 138 % find number of true items in both groups in all possible ranges
Daniel@0 139 n = length(xcount);
Daniel@0 140 V = zeros(n*(n+1)/2,4);
Daniel@0 141 s1 = cumsum(xcount);
Daniel@0 142 s2 = cumsum(xcount(end:-1:1)); s2 = s2(end:-1:1);
Daniel@0 143 m = s1(end);
Daniel@0 144 Tx = triu(s1(end)-m*log(exp(s1/m)*exp(s2/m)')+repmat(xcount',[n 1])+repmat(xcount,[1 n]),0);
Daniel@0 145 s1 = cumsum(ycount);
Daniel@0 146 s2 = cumsum(ycount(end:-1:1)); s2 = s2(end:-1:1);
Daniel@0 147 Ty = triu(s1(end)-m*log(exp(s1/m)*exp(s2/m)')+repmat(ycount',[n 1])+repmat(ycount,[1 n]),0);
Daniel@0 148 [i,j] = find(Tx+Ty);
Daniel@0 149 k = sub2ind(size(Tx),i,j);
Daniel@0 150 V = [i, j, Tx(k), Ty(k)];
Daniel@0 151 tix = V(:,3) + nanis*xnan;
Daniel@0 152 tiy = V(:,4) + nanis*ynan;
Daniel@0 153
Daniel@0 154 % select the best range
Daniel@0 155 nix = length(x);
Daniel@0 156 niy = length(y);
Daniel@0 157 Cm = [tix,nix-tix,tiy,niy-tiy];
Daniel@0 158 [s,k] = max(som_drsignif(sigmea,Cm));
Daniel@0 159
Daniel@0 160 % output
Daniel@0 161 low = edges(V(k,1));
Daniel@0 162 high = edges(V(k,2)+1);
Daniel@0 163 sig = s;
Daniel@0 164 cm = Cm(k,:);
Daniel@0 165
Daniel@0 166 return;
Daniel@0 167