annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/som_stats.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the nunique, mfvalue,
%                        mfcount, values, counts and tiles fields are
%                        not calculated (they keep their NaN / []
%                        defaults). This may be useful if there is a
%                        very large amount of data, and one wants to
%                        reduce calculation time.
%
%   csS         (cell array) one statistics struct per variable,
%                        returned as a 1 x dim row, with the
%                        following fields
%    .type          (string) 'som_stat'
%    .name          (string) name of the variable
%    .normalization (struct array) variable normalization (see SOM_NORMALIZE)
%    .ntotal        (scalar) total number of values
%    .nvalid        (scalar) number of valid values (not Inf or NaN)
%    .min           (scalar) minimum value
%    .max           (scalar) maximum value
%    .mean          (scalar) mean value (not Inf or NaN)
%    .std           (scalar) standard deviation (not Inf or NaN)
%    .nunique       (scalar) number of unique values
%    .mfvalue       (scalar) most frequent value
%    .mfcount       (scalar) number of occurrences of the most frequent value
%    .values        (vector) all unique values if at most half of the
%                            valid values are unique, otherwise at most
%                            MAXDISCRETE (see below) evenly picked
%                            unique values
%    .counts        (vector) number of occurrences of each value in .values
%    .tiles         (vector) NT-tile values, for example
%                               NT=4   for quartiles: 25%, 50% and 75%
%                               NT=100 for percentiles: 1%, 2%, ... and 99%
%    .hist          (struct) histogram struct with the following fields
%      .type        (string) 'som_hist'
%      .bins        (vector) histogram bin centers
%      .counts      (vector) count of values in each bin
%      .binlabels   (cellstr) labels for the bins (denormalized bin
%                             center values)
%      .binlabels2  (cellstr) labels for the bins (denormalized bin
%                             edge values, e.g. '[1.4,2.5['
%
%  Constants:
%   MAXDISCRETE = 20
%   NT          = 10
%
% See also  SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;   % histogram bins used when the unique values are not used as bins
maxdiscrete = 20;   % upper limit for the number of sampled values
ntiles      = 10;   % NT

% first argument
if isstruct(D),
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  % plain matrix: generate the names Variable1, Variable2, ... and use
  % empty normalization information
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen,dim] = size(D);

% other arguments

if length(varargin)>0,
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct: every field that cannot be calculated keeps this default
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(1,dim);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};

  % only finite values take part in the statistics
  x = D(:,i);
  x = x(isfinite(x));

  % basic descriptive statistics
  sS.nvalid = length(x);
  if length(x),
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted = sort(x);
      % number of unique values: j holds the index of the first
      % occurrence of each distinct value in the sorted data
      repeated   = (xsorted(1:end-1)==xsorted(2:end));
      j          = [1; find(~repeated)+1];
      xunique    = xsorted(j);
      sS.nunique = length(xunique);
      ucount     = diff([j; length(xsorted)+1]);
      % most frequent value
      [fcount,j] = max(ucount);
      sS.mfvalue = xunique(j);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..)
      pickind  = round(linspace(1,sS.nvalid,ntiles+1));
      pickind  = pickind(2:end-1);
      sS.tiles = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % unique values; they are also used as the histogram bins
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % at most maxdiscrete values, evenly picked. The number of picks
        % is capped at nunique: otherwise the same unique value would be
        % picked several times when there are fewer than maxdiscrete of them.
        npick     = min(maxdiscrete,sS.nunique);
        pickind   = round(linspace(1,sS.nunique,npick));
        sS.values = xunique(pickind);
        sS.counts = ucount(pickind);
      end
    end
    if isempty(bins),
      % nbins equal-width bins between min and max, given as bin centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values at all: a single-bin histogram
    sS.hist = som_hist(x,0);
  end
  csS{i} = sS;
end

return;
wolffd@0 166
wolffd@0 167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
wolffd@0 168 %% subfunctions
wolffd@0 169
function sH = som_hist(x,bins,sN)

%SOM_HIST Build a histogram struct for the values of one variable.
%
% sH = som_hist(x,[bins],[sN])
%
%   x       (vector) the values to be counted
%   [bins]  (vector) bin centers; if missing, empty or all NaN, 10
%                    values evenly spaced between min(x) and max(x)
%                    are used
%           (struct) an existing histogram struct, whose bins,
%                    binlabels and binlabels2 fields are reused
%   [sN]             normalization information, passed on to
%                    SOM_DENORMALIZE when labels are generated
%                    (default [])
%
%   sH      (struct) fields type ('som_hist'), bins, counts,
%                    binlabels and binlabels2

if nargin<3, sN = []; end
if nargin<2, bins = []; end

binlabels  = [];
binlabels2 = [];
if isstruct(bins),
  % reuse the binning and the ready-made labels of an existing histogram
  sOld       = bins;
  bins       = sOld.bins;
  binlabels  = sOld.binlabels;
  binlabels2 = sOld.binlabels2;
end

usedefault = isempty(bins);
if ~usedefault, usedefault = all(isnan(bins(:))); end
if usedefault,
  bins = linspace(min(x),max(x),10);
  % labels taken from a struct would not match the default bins
  binlabels  = [];
  binlabels2 = [];
end

% The label fields are assigned separately: giving a cell array as a
% field value to STRUCT would create a struct array instead of a
% single struct.
sH = struct('type','som_hist','bins',bins,'counts',[]);
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  sH.counts = length(x);
  edges = bins;
else
  % bin edges are the midpoints between adjacent bin centers; the first
  % and the last bin are open-ended
  edges = (bins(1:end-1)+bins(2:end))/2;
  if length(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    sH.counts = counts(1:end-1);  % last element only counts values equal to Inf
  else
    sH.counts = zeros(length(bins),1);
  end
end

if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

if isempty(sH.binlabels2),
  if length(edges)==1,
    sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins separated by a single edge
      sH.binlabels2 = sH.binlabels2([1 1]);
      sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
      sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
    end
  else
    if size(edges,1)==1, edges = edges'; end
    bstr = numtostring(som_denormalize(edges,sN),4);
    % one label more than there are edges
    sH.binlabels2 = bstr([1:end end]);
    sH.binlabels2{1} = [bstr{1} '['];
    for i=2:length(sH.binlabels2)-1,
      sH.binlabels2{i} = ['[' bstr{i-1} ',' bstr{i} '['];
    end
    sH.binlabels2{end} = ['[' bstr{end}];
  end
end

return;
wolffd@0 245
function vstr = numtostring(v,d)

%NUMTOSTRING Convert numbers to a cell array of strings.
%
% vstr = numtostring(v,d)
%
%   v     (vector) the numbers to convert
%   d     (scalar) precision given to NUM2STR; also sets the limit
%                  10^-d below which a value, relative to the range
%                  of v, is printed as zero
%
%   vstr  (cellstr) the strings with all spaces removed; nonzero
%                  values that are negligible compared to the range
%                  of v become '0.0' or '-0.0'

% range of the values; a zero range is replaced by one so that the
% comparison below is made against the absolute values
span = max(v) - min(v);
if span==0, span = 1; end

% values which are negligible compared to the range
tiny = (abs(v)/span < 10.^-d);
ipos = find(tiny & v > 0);
ineg = find(tiny & v < 0);

% convert and strip the padding produced by NUM2STR
vstr = cellstr(num2str(v,d));
vstr = strrep(vstr,' ','');
vstr(ipos) = {'0.0'};
vstr(ineg) = {'-0.0'};
return;
wolffd@0 257