wolffd@0
|
function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the values of the
%                        nunique, values, counts, mfvalue, mfcount
%                        and tiles fields are not calculated. This
%                        may be useful if there is a very large
%                        amount of data, and one wants to reduce
%                        calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with
%                        the following fields
%      .type          (string) 'som_stat'
%      .name          (string) name of the variable
%      .normalization (struct array) variable normalization (see SOM_NORMALIZE)
%      .ntotal        (scalar) total number of values
%      .nvalid        (scalar) number of valid values (not Inf or NaN)
%      .min           (scalar) minimum value
%      .max           (scalar) maximum value
%      .mean          (scalar) mean value (not Inf or NaN)
%      .std           (scalar) standard deviation (not Inf or NaN)
%      .nunique       (scalar) number of unique values
%      .mfvalue       (scalar) most frequent value
%      .mfcount       (scalar) number of occurrences of most frequent value
%      .values        (vector) all unique values if at most half of the
%                              valid values are unique, otherwise at most
%                              MAXDISCRETE (see below) evenly picked
%                              unique values
%      .counts        (vector) number of occurrences for each value in .values
%      .tiles         (vector) NT-tile values, for example
%                                 NT=4   for quartiles: 25%, 50% and 75%
%                                 NT=100 for percentiles: 1%, 2%, ... and 99%
%      .hist          (struct) histogram struct with the following fields
%           .type     (string) 'som_hist'
%           .bins     (vector) histogram bin centers
%           .counts   (vector) count of values in each bin
%           .binlabels  (cellstr) labels for the bins (denormalized bin
%                                 center values)
%           .binlabels2 (cellstr) labels for the bins (denormalized bin
%                                 edge values, e.g. '[1.4,2.5['
%
%  Constants:
%   MAXDISCRETE = 20
%   NT          = 10
%
% See also SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;   % number of histogram bins when bins are not the unique values
maxdiscrete = 20;   % upper limit for the number of sampled values
ntiles      = 10;   % NT: deciles

% first argument
if isstruct(D),
  % a struct without a .type field cannot be a map or data struct
  if ~isfield(D,'type'), error('Invalid first argument'); end
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  % plain matrix: generate names Variable1, Variable2, ... and no normalization
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen dim] = size(D);

% other arguments

if ~isempty(varargin),
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct; fields that cannot be calculated stay NaN / []
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(1,dim);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};

  % keep only the finite values (drop NaN, Inf and -Inf)
  x = D(:,i);
  x = x(isfinite(x));

  % basic descriptive statistics
  sS.nvalid = length(x);
  if sS.nvalid > 0,
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted = sort(x);
      % number of unique values: j indexes the first element of each
      % run of equal values in the sorted data
      repeated = (xsorted(1:end-1)==xsorted(2:end));
      j = [1; find(~repeated(:))+1];
      xunique = xsorted(j);
      sS.nunique = length(xunique);
      ucount = diff([j; length(xsorted)+1]);   % length of each run
      % most frequent value
      [fcount,j] = max(ucount);
      sS.mfvalue = xunique(j);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..)
      pickind = round(linspace(1,sS.nvalid,ntiles+1));
      pickind = pickind(2:end-1);
      sS.tiles = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % unique values; they are also used as the histogram bin centers
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % at most maxdiscrete values, evenly picked. The number of picks
        % is capped at nunique: otherwise the rounded indices repeat and
        % the same value (and its count) would be listed several times.
        npick = min(maxdiscrete,sS.nunique);
        pickind = round(linspace(1,sS.nunique,npick));
        sS.values = xunique(pickind);
        sS.counts = ucount(pickind);

        %% OPTION 2: maxdiscrete most frequent values
        %[v,j] = sort(ucount);
        %pickind = j(1:maxdiscrete);
        %sS.values = xunique(pickind);
        %sS.counts = ucount(pickind);

        % OPTION 3: representative values - calculated using k-means
        %[y,bm,qe] = kmeans(x,maxdiscrete);
        %sS.values = y;
        %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
      end
    end
    if isempty(bins),
      % nbins equally wide bins over [min,max]; use their centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values: a single bin at 0 with zero count
    sS.hist = som_hist(x,0);
  end
  csS{i} = sS;
end

return;
|
wolffd@0
|
166
|
wolffd@0
|
167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
|
wolffd@0
|
168 %% subfunctions
|
wolffd@0
|
169
|
wolffd@0
|
function sH = som_hist(x,bins,sN)

%SOM_HIST Build a histogram struct for a vector of values.
%
% sH = som_hist(x,[bins],[sN])
%
%  x       (vector) the values to be binned
%  [bins]  (vector) bin centers; if omitted, empty or all NaN, 10 evenly
%                   spaced centers between min(x) and max(x) are used
%                   (a single bin at 0 if x is empty)
%          (struct) a histogram struct whose .bins, .binlabels and
%                   .binlabels2 fields are reused
%  [sN]    normalization passed on to SOM_DENORMALIZE when the bin
%          labels are created, default = []
%
%  sH      (struct) with fields .type ('som_hist'), .bins, .counts,
%                   .binlabels and .binlabels2

binlabels  = [];
binlabels2 = [];
if nargin<2, bins = []; end
if nargin<3, sN = []; end

if isstruct(bins),
  % reuse the binning and the labels of an existing histogram struct
  sOld = bins;
  bins = sOld.bins;
  binlabels  = sOld.binlabels;
  binlabels2 = sOld.binlabels2;
elseif isempty(bins) | all(isnan(bins(:))),
  if isempty(x), bins = 0;
  else bins = linspace(min(x),max(x),10);
  end
end

% The label fields are assigned after construction: giving cell arrays
% directly to STRUCT would create a struct array instead of a scalar struct.
sH = struct('type','som_hist','bins',bins,'counts',[],...
            'binlabels',[],'binlabels2',[]);
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  % a single bin holds everything
  sH.counts = length(x);
  edges = bins;
else
  % bin edges are the midpoints between neighbouring bin centers; the
  % first and last bins are open-ended
  edges = (bins(1:end-1)+bins(2:end))/2;
  if length(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    % the last element of HISTC output counts values equal to Inf: drop it
    sH.counts = counts(1:end-1);
  else
    sH.counts = zeros(length(bins),1);
  end
end

if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

if isempty(sH.binlabels2),
  if length(edges)==1,
    lab = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins separated by a single edge
      % NOTE(review): the first label comes out as ']e[' whereas the
      % general case below uses 'e[' for the first bin - confirm intent
      lab = lab([1 1]);
      lab{1} = [']' lab{1} '['];
      lab{2} = ['[' lab{2} '['];
    end
    sH.binlabels2 = lab;
  else
    bstr = numtostring(som_denormalize(edges(:),sN),4);
    lab = bstr([1:end end]);           % one label more than there are edges
    lab{1} = [bstr{1} '['];            % open-ended first bin
    for i=2:length(lab)-1,
      lab{i} = ['[' bstr{i-1} ',' bstr{i} '['];
    end
    lab{end} = ['[' bstr{end}];        % open-ended last bin
    sH.binlabels2 = lab;
  end
end

return;
|
wolffd@0
|
245
|
wolffd@0
|
function vstr = numtostring(v,d)

%NUMTOSTRING Convert a vector of numbers to a cell array of strings.
%
% vstr = numtostring(v,d)
%
%  v     (vector) the values to convert
%  d     (scalar) precision given to NUM2STR; also, nonzero values whose
%                 magnitude is below 10^-d times the range of v are
%                 shown as '0.0' (positive) or '-0.0' (negative)
%
%  vstr  (cellstr) size length(v) x 1, one string per value, with all
%                 spaces removed

% Work on a column: NUM2STR turns a row vector into a single string, and
% removing the spaces would then run all the numbers together.
v = v(:);
if isempty(v), vstr = cell(0,1); return; end

r = max(v)-min(v);
if r==0, r=1; end          % constant input: compare against 1 instead
nearzero = (abs(v)/r < 10^(-d));
ipos = find(v > 0 & nearzero);
ineg = find(v < 0 & nearzero);
vstr = strrep(cellstr(num2str(v,d)),' ','');
vstr(ipos) = {'0.0'};
vstr(ineg) = {'-0.0'};
return;
|
wolffd@0
|
257
|