Mercurial > hg > camir-aes2014
comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_stats.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the values of the
%                        nunique, values, counts, mfvalue, mfcount
%                        and tiles fields are not calculated. This
%                        may be useful if there is a very large
%                        amount of data, and one wants to reduce
%                        calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with
%                        the following fields
%     .type          (string) 'som_stat'
%     .name          (string) name of the variable
%     .normalization (struct array) variable normalization (see SOM_NORMALIZE)
%     .ntotal        (scalar) total number of values
%     .nvalid        (scalar) number of valid values (not Inf or NaN)
%     .min           (scalar) minimum value
%     .max           (scalar) maximum value
%     .mean          (scalar) mean value (not Inf or NaN)
%     .std           (scalar) standard deviation (not Inf or NaN)
%     .nunique       (scalar) number of unique values
%     .mfvalue       (scalar) most frequent value
%     .mfcount       (scalar) number of occurrences of most frequent value
%     .values        (vector) sample values: all unique values if at
%                             most half of the valid values are unique,
%                             otherwise at most MAXDISCRETE (see below)
%                             evenly picked unique values
%     .counts        (vector) number of occurrences for each sampled value
%     .tiles         (vector) NT-tile values, for example
%                                NT=4   for quartiles: 25%, 50% and 75%
%                                NT=100 for percentiles: 1%, 2%, ... and 99%
%     .hist          (struct) histogram struct with the following fields
%        .type       (string) 'som_hist'
%        .bins       (vector) histogram bin centers
%        .counts     (vector) count of values in each bin
%        .binlabels  (cellstr) labels for the bins (denormalized bin
%                              center values)
%        .binlabels2 (cellstr) labels for the bins (denormalized bin
%                              edge values, e.g. '[1.4,2.5['
%
%   Constants:
%      MAXDISCRETE = 20
%      NT          = 10
%
% See also  SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;   % number of histogram bins for non-discrete variables
maxdiscrete = 20;   % max number of sampled values for non-discrete variables
ntiles      = 10;   % NT: the .tiles field holds NT-1 values

% first argument
if isstruct(D),
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  % plain matrix: generate names 'Variable1', 'Variable2', ... and use
  % empty normalizations
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen,dim] = size(D);

% other arguments

if ~isempty(varargin),
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct: fields that cannot be calculated stay NaN / empty
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(1,dim);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};
  x = D(:,i);
  x = x(isfinite(x));            % drop NaN and +-Inf values
  % basic descriptive statistics
  sS.nvalid = length(x);
  if ~isempty(x),
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted = sort(x);
      % number of unique values: j indexes the first element of each
      % run of equal values in the sorted data
      repeated = (xsorted(1:end-1)==xsorted(2:end));
      j = [1; find(~repeated)+1];
      xunique = xsorted(j);
      sS.nunique = length(xunique);
      ucount = diff([j; length(xsorted)+1]);   % length of each run
      % most frequent value
      [fcount,jmax] = max(ucount);
      sS.mfvalue = xunique(jmax);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..)
      pickind = round(linspace(1,sS.nvalid,ntiles+1));
      pickind = pickind(2:end-1);
      sS.tiles = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % unique values: the variable is treated as discrete, and the
        % unique values are also used as the histogram bin centers
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % just maxdiscrete values, evenly picked; unique() removes the
        % repeated indices which linspace+round produces when there
        % are fewer than maxdiscrete unique values
        pickind = unique(round(linspace(1,sS.nunique,maxdiscrete)));
        sS.values = xunique(pickind);
        sS.counts = ucount(pickind);

        %% OPTION 2: maxdiscrete most frequent values
        %[v,j] = sort(ucount);
        %pickind = j(1:maxdiscrete);
        %sS.values = xunique(pickind);
        %sS.counts = ucount(pickind);

        % OPTION 3: representative values - calculated using k-means
        %[y,bm,qe] = kmeans(x,maxdiscrete);
        %sS.values = y;
        %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
      end
    end
    if isempty(bins),
      % nbins equally wide bins between min and max; bins holds the centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values: single-bin histogram with zero count
    sS.hist = som_hist(x,0);
  end
  csS{i} = sS;
end

return;
166 | |
167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 | |
168 %% subfunctions | |
169 | |
function sH = som_hist(x,bins,sN)

%SOM_HIST Build a histogram struct for the values in x.
%
%  x      (vector) data values
%  [bins] (vector) bin centers; if omitted, empty or all NaN, ten
%                  centers evenly spaced between min(x) and max(x)
%         (struct) a histogram struct whose .bins, .binlabels and
%                  .binlabels2 fields are reused
%  [sN]   (struct array) normalization passed to SOM_DENORMALIZE when
%                  the bin labels are generated, default is []
%
%  sH     (struct) with fields .type ('som_hist'), .bins, .counts,
%                  .binlabels and .binlabels2

binlabels  = [];
binlabels2 = [];
if nargin<2, bins = []; end
if nargin<3, sN = []; end

% The struct case has to be handled before any numeric test on bins,
% and the values have to be read from the input struct.
if isstruct(bins),
  sHin       = bins;
  bins       = sHin.bins;
  binlabels  = sHin.binlabels;
  binlabels2 = sHin.binlabels2;
end

usedefault = isempty(bins);
if ~usedefault, usedefault = all(isnan(bins(:))); end
if usedefault,
  bins = linspace(min(x),max(x),10);
end

% The fields are assigned one by one: giving a cell array (such as the
% labels) as a value to struct() would create a struct array instead.
sH = struct('type','som_hist','bins',[],'counts',[],...
            'binlabels',[],'binlabels2',[]);
sH.bins       = bins;
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  % a single bin holds all the values
  sH.counts = length(x);
  edges = bins;
else
  % the bin edges are halfway between adjacent bin centers; the first
  % and the last bin are open-ended
  edges = (bins(1:end-1)+bins(2:end))/2;
  if ~isempty(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    sH.counts = counts(1:end-1);   % last element only counts x==Inf
  else
    sH.counts = zeros(length(bins),1);
  end
end

if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

if isempty(sH.binlabels2),
  if length(edges)==1,
    sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins separated by a single edge
      sH.binlabels2 = sH.binlabels2([1 1]);
      sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
      sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
    end
  else
    if size(edges,1)==1, edges = edges'; end
    bstr = numtostring(som_denormalize(edges,sN),4);
    % one label more than there are edges: 'e1[', '[e1,e2[', ..., '[eN'
    sH.binlabels2 = bstr([1:end end]);
    sH.binlabels2{1} = [bstr{1} '['];
    for k=2:length(sH.binlabels2)-1,
      sH.binlabels2{k} = ['[' bstr{k-1} ',' bstr{k} '['];
    end
    sH.binlabels2{end} = ['[' bstr{end}];
  end
end

return;
245 | |
function vstr = numtostring(v,d)

%NUMTOSTRING Convert a numeric vector to a cell array of strings.
%
%  v     (vector) the values
%  d     (scalar) precision given to NUM2STR
%
%  vstr  (cellstr) size length(v) x 1, one string per value with all
%                  spaces removed. Nonzero values whose magnitude,
%                  relative to the range of v, is below 10^-d are
%                  written as '0.0' or '-0.0'.

% NUM2STR makes one string per row, so a row vector would end up as a
% single string holding all the numbers: always work on a column.
v = v(:);
if isempty(v), vstr = cell(0,1); return; end

r = max(v)-min(v);
if r==0, r=1; end                  % constant vector: avoid division by zero
nearzero = (abs(v)/r < 10^(-d));
ipos = find(v > 0 & nearzero);
ineg = find(v < 0 & nearzero);
vstr = strrep(cellstr(num2str(v,d)),' ','');
vstr(ipos) = {'0.0'};
vstr(ineg) = {'-0.0'};
return;
257 |