Mercurial > hg > camir-aes2014
diff toolboxes/MIRtoolbox1.3.2/somtoolbox/som_stats.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolboxes/MIRtoolbox1.3.2/somtoolbox/som_stats.m Tue Feb 10 15:05:51 2015 +0000 @@ -0,0 +1,257 @@ +function csS = som_stats(D,varargin) + +%SOM_STATS Calculate descriptive statistics for the data. +% +% csS = som_stats(D,[sort]); +% +% csS = som_stats(D); +% csS = som_stats(D,'nosort'); +% som_table_print(som_stats_table(csS)) +% +% Input and output arguments ([]'s are optional): +% D (matrix) a matrix, size dlen x dim +% (struct) data or map struct +% [sort] (string) 'sort' (default) or 'nosort' +% If 'nosort' is specified, the data is not +% sorted, and therefore the values of +% nunique, uvalues, ucount, fvalues, fcount, and tiles fields +% are not calculated. This may be useful if +% there is a very large amount of data, and +% one wants to reduce calculation time. +% +% csS (cell array) size dim x 1, of statistics structs with +% the following fields +% .type (string) 'som_stat' +% .name (string) name of the variable +% .normalization (struct array) variable normalization (see SOM_NORMALIZE) +% .ntotal (scalar) total number of values +% .nvalid (scalar) number of valid values (not Inf or NaN) +% .min (scalar) minimum value +% .max (scalar) maximum value +% .mean (scalar) mean value (not Inf or NaN) +% .std (scalar) standard deviation (not Inf or NaN) +% .nunique (scalar) number of unique values +% .mfvalue (vector) most frequent value +% .mfcount (vector) number of occurances of most frequent value +% .values (vector) at most MAXDISCRETE (see below) sample values +% .counts (vector) number of occurances for each sampled value +% .tiles (vector) NT-tile values, for example +% NT=4 for quartiles: 25%, 50% and 75% +% NT=100 for percentiles: 1%, 2%, ... and 99% +% .hist (struct) histogram struct with the following fields +% .type (string) 'som_hist' +% .bins (vector) histogram bin centers +% .counts (vector) count of values in each bin +% .binlabels (cellstr) labels for the bins (denormalized bin +% center values) +% .binlabels2 (cellstr) labels for the bins (denormalized bin +% edge values, e.g. '[1.4,2.5[' +% +% Constants: +% MAXDISCRETE = 10 +% NT = 10 +% +% See also SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT. + +% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto +% Copyright (c) by Juha Vesanto +% http://www.cis.hut.fi/projects/somtoolbox/ + +% Version 2.0beta juuso 311201 + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 +%% arguments + +% default values +nosort = 0; +nbins = 10; +maxdiscrete = 20; +ntiles = 10; + +% first argument +if isstruct(D), + switch D.type, + case 'som_map', cn = D.comp_names; sN = D.comp_norm; D = D.codebook; + case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data; + otherwise, error('Invalid first argument') + end +else + cn = cell(size(D,2),1); + cn(:) = {'Variable'}; + for i=1:length(cn), cn{i} = sprintf('%s%d',cn{i},i); end + sN = cell(size(D,2),1); +end +[dlen dim] = size(D); + +% other arguments + +if length(varargin)>0, + if strcmp(varargin{1},'nosort'), nosort = 1; end +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 +%% action + +sStat = struct('type','som_stat','name','','normalization',[],... + 'min',NaN,'max',NaN,'mean',NaN,'std',NaN,... + 'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],... + 'ntotal',dlen,'nvalid',NaN,'hist',[]); +csS = cell(0); + +for i=1:dim, + sS = sStat; + sS.name = cn{i}; + sS.normalization = sN{i}; + x = D(:,i); + x(find(~isfinite(x))) = []; + % basic descriptive statistics + sS.nvalid = length(x); + if length(x), + sS.min = min(x); + sS.max = max(x); + sS.mean = mean(x); + sS.std = std(x); + bins = []; + if ~nosort, + xsorted = sort(x); + % number of unique values + repeated = (xsorted(1:end-1)==xsorted(2:end)); + j = [1; find(~repeated)+1]; + xunique = xsorted(j); + sS.nunique = length(xunique); + ucount = diff([j; length(xsorted)+1]); + % most frequent value + [fcount,j] = max(ucount); + sS.mfvalue = xunique(j); + sS.mfcount = fcount; + % -tiles (k*100/ntiles % of values, k=1..) + pickind = round(linspace(1,sS.nvalid,ntiles+1)); + pickind = pickind(2:end-1); + sS.tiles = xsorted(pickind); + if sS.nunique <= sS.nvalid/2, + % unique values + sS.values = xunique; + sS.counts = ucount; + bins = sS.values; + else + % just maxdiscrete values, evenly picked + pickind = round(linspace(1,sS.nunique,maxdiscrete)); + sS.values = xunique(pickind); + sS.counts = ucount(pickind); + + %% OPTION 2: maxdiscrete most frequent values + %[v,j] = sort(ucount); + %pickind = j(1:maxdiscrete); + %sS.values = xunique(pickind); + %sS.counts = ucount(pickind); + + % OPTION 3: representative values - calculated using k-means + %[y,bm,qe] = kmeans(x,maxdiscrete); + %sS.values = y; + %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2)); + end + end + if isempty(bins), + bins = linspace(sS.min,sS.max,nbins+1); + bins = (bins(1:end-1)+bins(2:end))/2; + end + sS.hist = som_hist(x,bins,sS.normalization); + else + sS.hist = som_hist(x,0); + end + csS{end+1} = sS; +end + +return; + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 +%% subfunctions + +function sH = som_hist(x,bins,sN) + + binlabels = []; + binlabels2 = []; + if nargin<2 | isempty(bins) | isnan(bins), + bins = linspace(min(x),max(x),10); + end + if isstruct(bins), + bins = sH.bins; + binlabels = sH.binlabels; + binlabels2 = sH.binlabels2; + end + if nargin<3, sN = []; end + + sH = struct('type','som_hist','bins',bins,'counts',[],... + 'binlabels',binlabels,'binlabels2',binlabels2); + + if length(bins)==1, + sH.counts = [length(x)]; + edges = bins; + elseif length(x), + edges = (bins(1:end-1)+bins(2:end))/2; + counts = histc(x,[-Inf; edges(:); Inf]); + sH.counts = counts(1:end-1); + end + + if isempty(sH.binlabels), + b = som_denormalize(bins(:),sN); + sH.binlabels = numtostring(b,4); + end + + if isempty(sH.binlabels2), + if length(edges)==1, + sH.binlabels2 = numtostring(som_denormalize(edges,sN),2); + if length(bins)>1, + sH.binlabels2 = sH.binlabels2([1 1]); + sH.binlabels2{1} = [']' sH.binlabels2{1} '[']; + sH.binlabels2{2} = ['[' sH.binlabels2{2} '[']; + end + else + if size(edges,1)==1, edges = edges'; end + bstr = numtostring(som_denormalize(edges,sN),4); + sH.binlabels2 = bstr([1:end end]); + sH.binlabels2{1} = [bstr{1} '[']; + for i=2:length(sH.binlabels2)-1, + sH.binlabels2{i} = ['[' bstr{i-1} ',' bstr{i} '[']; + end + sH.binlabels2{end} = ['[' bstr{end}]; + end + end + + if 0, + if length(bins)==1, sH.binlabels2 = {'constant'}; + else + ntiles = 10; + plim = [1:ntiles-1] / ntiles; + cp = cumsum(sH.counts)/sum(sH.counts); + [dummy,i] = histc(cp,[-Inf plim Inf]); + l2 = cell(length(bins),1); + for j=1:length(bins), l2{j} = sprintf('Q%d',i(j)); end + if i(1) > 1, l2{1} = ['...' l2{1}]; end + k = 0; + for j=2:length(bins), + if i(j)==i(j-1), + if k==0, l2{j-1} = [l2{j-1} '.1']; k = 1; end + k = k + 1; + l2{j} = [l2{j} '.' num2str(k)]; + else k = 0; end + end + if i(end) < ntiles, l2{end} = [l2{end} '...']; end + sH.binlabels2 = l2; + end + end + + return; + +function vstr = numtostring(v,d) + + r = max(v)-min(v); + if r==0, r=1; end + nearzero = (abs(v)/r < 10.^-d); + i1 = find(v > 0 & nearzero); + i2 = find(v < 0 & nearzero); + vstr = strrep(cellstr(num2str(v,d)),' ',''); + vstr(i1) = {'0.0'}; + vstr(i2) = {'-0.0'}; + return; +