function [sR,best,sig,Cm] = som_drmake(D,inds1,inds2,sigmea,nanis)

% SOM_DRMAKE Make descriptive rules for given group within the given data.
%
% [sR,best,sig,Cm] = som_drmake(D,[inds1],[inds2],[sigmea],[nanis])
%
%  D        (struct) map or data struct
%           (matrix) the data, of size [dlen x dim]
%  [inds1]  (vector) indices belonging to the group
%                    (the whole data set by default)
%  [inds2]  (vector) indices belonging to the contrast group
%                    (the rest of the data set by default)
%  [sigmea] (string) significance measure: 'accuracy',
%                    'mutuconf' (default), or 'accuracyI'.
%                    (See definitions below).
%  [nanis]  (scalar) value given for NaNs: 0 (=FALSE, default),
%                    1 (=TRUE) or NaN (=ignored)
%
%  sR      (struct array) best rule for each component. Each
%                   struct has the following fields:
%    .type     (string) 'som_rule'
%    .name     (string) name of the component
%    .low      (scalar) the low end of the rule range
%    .high     (scalar) the high end of the rule range
%    .nanis    (scalar) how NaNs are handled: NaN, 0 or 1
%
%  best    (vector) indices of rules which make the best combined rule
%  sig     (vector) significance measure values for each rule, and for
%                   the combined rule (last element)
%  Cm      (matrix) A matrix of vectorized confusion matrices for each rule,
%                   and for the combined rule (last row): [a, c, b, d]
%                   (see below).
%
% For each component, such a rule sR.low <= x < sR.high is found
% which optimizes the given significance measure. The confusion
% matrix below between the given grouping (G: group - not G: contrast group)
% and rule (R: true or false) is used to determine the significance values:
%
%             G    not G
%          ---------------         accuracy  = (a+d) / (a+b+c+d)
%   true   |  a  |   b   |
%          |--------------         mutuconf  = a*a / ((a+b)(a+c))
%   false  |  c  |   d   |
%          ---------------         accuracyI = a / (a+b+c)
%
% See also  SOM_DREVAL, SOM_DRTABLE.

% Contributed to SOM Toolbox 2.0, January 7th, 2002 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 070102

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% input arguments

if isstruct(D),
  switch D.type,
   case 'som_data', cn = D.comp_names; D = D.data;
   case 'som_map',  cn = D.comp_names; D = D.codebook;
   otherwise
    % without this branch cn stays undefined and D stays a struct, which
    % only fails later with an unrelated-looking error
    error('som_drmake: unsupported struct type ''%s''.',D.type);
  end
else
  cn = cell(size(D,2),1);
  for i=1:size(D,2), cn{i} = sprintf('Variable%d',i); end
end

[dlen,dim] = size(D);
if dim<1, error('som_drmake: the data has no components.'); end

if nargin<2 || isempty(inds1), inds1 = 1:dlen; end
if nargin<3 || isempty(inds2),
  % contrast group = everything that is not in the group
  rest = ones(dlen,1); rest(inds1) = 0; inds2 = find(rest);
end
if nargin<4 || isempty(sigmea), sigmea = 'mutuconf'; end
if nargin<5 || isempty(nanis), nanis = 0; end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% make the rules

% the last element / row is reserved for the combined rule
sig = zeros(dim+1,1);
Cm  = zeros(dim+1,4);

sR1tmp = struct('type','som_rule','name','','low',-Inf,'high',Inf,'nanis',nanis,'lowstr','','highstr','');
sR = repmat(sR1tmp,[1 dim]);

% single variable rules
for i=1:dim,

  % bin edges: hist is only used to get 10 bin centers between the minimum
  % and the maximum; the edges are the midpoints between consecutive
  % centers, with the outermost bins left open-ended
  mi = min(D(:,i));
  ma = max(D(:,i));
  [histcount,bins] = hist([mi,ma],10);
  if size(bins,1)>1, bins = bins'; end
  edges = [-Inf, (bins(1:end-1)+bins(2:end))/2, Inf];

  % find the rule for this variable
  [low,high,s,cm] = onevar_descrule(D(inds1,i),D(inds2,i),sigmea,nanis,edges);
  sR(i).name = cn{i};
  sR(i).low  = low;
  sR(i).high = high;
  sig(i)  = s;
  Cm(i,:) = cm;

end

% find combined rule: starting from the most significant single rule, try
% adding the other rules in order of decreasing significance, and keep an
% addition only if it improves the significance of the conjunction

% only the dim single-variable rules are ranked; the reserved slot dim+1
% must never appear in 'order', or sR(com) below would be out of range
[dummy,order] = sort(-sig(1:dim));
best   = order(1);
maxsig = sig(best);
bestcm = Cm(best,:);
for i=2:dim,
  com = [best, order(i)];
  % truex and truey are not used here
  [s,cm,truex,truey] = som_dreval(sR(com),D(:,com),sigmea,inds1,inds2,'and');
  if s>maxsig, best = com; maxsig = s; bestcm = cm; end
end
sig(end)  = maxsig;
% store the confusion matrix of the best combination, not of the
% combination that happened to be evaluated last
Cm(end,:) = bestcm;

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% subfunctions: descriptive rules for a single variable

function [low,high,sig,cm] = onevar_descrule(x,y,sigmea,nanis,edges)

% ONEVAR_DESCRULE Given a set of bin edges, find the range of consecutive
% bins with the best significance.
%
%  x       (vector) data values in the group
%  y       (vector) data values in the contrast group
%  sigmea  (string) significance measure, passed on to SOM_DRSIGNIF
%  nanis   (scalar) how to handle NaNs: 0 (rule is false for NaNs),
%                   1 (rule is true for NaNs) or NaN (NaNs are ignored)
%  edges   (vector) bin edges; the last count returned by histc (values
%                   equal to edges(end)) is not used
%
%  low     (scalar) low end of the selected range, an element of edges
%  high    (scalar) high end of the selected range, an element of edges
%  sig     (scalar) significance of the selected range
%  cm      (vector) confusion matrix of the selected range:
%                   [true in x, false in x, true in y, false in y]

% NaN handling. When NaNs are ignored they are removed from the data and
% must not contribute to the counts: multiplying with nanis itself would
% give NaN*0 = NaN and turn every count into NaN.
if isnan(nanis),
  x = x(~isnan(x)); y = y(~isnan(y));
  nanval = 0;
else
  nanval = nanis;
end
xnan = sum(isnan(x));
ynan = sum(isnan(y));

% histogram counts
n = length(edges)-1;
xcount = onevar_bincounts(x,edges);
ycount = onevar_bincounts(y,edges);

% number of true items in both groups in all possible ranges:
% element (i,j), i<=j, is the number of items in bins i...j
Tx = onevar_rangecounts(xcount);
Ty = onevar_rangecounts(ycount);

% candidate ranges are those with at least one item in them
[i,j] = find(Tx+Ty);
if isempty(i), i = 1; j = n; end   % no items at all: use the full range
k = sub2ind(size(Tx),i,j);
tix = Tx(k) + nanval*xnan;
tiy = Ty(k) + nanval*ynan;

% select the best range
nix = length(x);
niy = length(y);
Cm = [tix,nix-tix,tiy,niy-tiy];
[s,kbest] = max(som_drsignif(sigmea,Cm));

% output
low  = edges(i(kbest));
high = edges(j(kbest)+1);
sig  = s;
cm   = Cm(kbest,:);

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function c = onevar_bincounts(v,edges)

% Number of values of v in each of the length(edges)-1 bins, always as a
% column vector, whatever the size or orientation of v.

n = length(edges)-1;
c = zeros(n,1);
if isempty(v), return; end
h = histc(v(:),edges(:));
c(:) = h(1:n);   % drop the last count (values equal to edges(end))

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function T = onevar_rangecounts(count)

% Upper triangular matrix T with T(i,j) = sum(count(i:j)) for i<=j.
% Computed as a difference of cumulative sums, which is exact for integer
% counts. The earlier formulation went through m*log(exp(.)*exp(.)), which
% divides by zero for an empty group, overflows when one group is much
% larger than the other and leaves rounding noise in the counts.

n  = length(count);
c  = cumsum(count(:));         % c(j)  = sum(count(1:j))
c0 = [0; c(1:end-1)];          % c0(i) = sum(count(1:i-1))
T  = triu(repmat(c',[n 1]) - repmat(c0,[1 n]),0);

return;