function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the nunique, mfvalue,
%                        mfcount, values, counts and tiles fields
%                        are not calculated (they keep their NaN / []
%                        defaults). This may be useful if there is a
%                        very large amount of data, and one wants to
%                        reduce calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with
%                        the following fields
%      .type          (string) 'som_stat'
%      .name          (string) name of the variable
%      .normalization (struct array) variable normalization (see SOM_NORMALIZE)
%      .ntotal        (scalar) total number of values
%      .nvalid        (scalar) number of valid values (not Inf or NaN)
%      .min           (scalar) minimum value
%      .max           (scalar) maximum value
%      .mean          (scalar) mean value (not Inf or NaN)
%      .std           (scalar) standard deviation (not Inf or NaN)
%      .nunique       (scalar) number of unique values
%      .mfvalue       (scalar) most frequent value
%      .mfcount       (scalar) number of occurrences of most frequent value
%      .values        (vector) all unique values if at most half of the
%                              valid values are unique, otherwise at most
%                              MAXDISCRETE evenly picked unique values
%      .counts        (vector) number of occurrences for each value in .values
%      .tiles         (vector) NT-tile values, for example
%                                 NT=4   for quartiles: 25%, 50% and 75%
%                                 NT=100 for percentiles: 1%, 2%, ... and 99%
%      .hist          (struct) histogram struct with the following fields
%           .type       (string)  'som_hist'
%           .bins       (vector)  histogram bin centers
%           .counts     (vector)  count of values in each bin
%           .binlabels  (cellstr) labels for the bins (denormalized bin
%                                 center values)
%           .binlabels2 (cellstr) labels for the bins (denormalized bin
%                                 edge values, e.g. '[1.4,2.5['
%
%  Constants:
%   NBINS       = 10  (histogram bins when unique values are not used as bins)
%   MAXDISCRETE = 20
%   NT          = 10
%
% See also  SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;
maxdiscrete = 20;
ntiles      = 10;

% first argument: unpack map/data structs, or invent names for a plain matrix
if isstruct(D),
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen, dim] = size(D);

% other arguments: anything other than 'nosort' leaves sorting enabled
if length(varargin)>0,
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct; fields that are not calculated keep these defaults
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(0);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};

  % keep only the finite values of this variable
  x = D(:,i);
  x = x(isfinite(x));

  % basic descriptive statistics
  sS.nvalid = length(x);
  if sS.nvalid > 0,
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted = sort(x);
      % unique values: j holds the index of the first element of each run
      repeated = (xsorted(1:end-1)==xsorted(2:end));
      j = [1; find(~repeated(:))+1];
      xunique = xsorted(j);
      sS.nunique = length(xunique);
      ucount = diff([j; length(xsorted)+1]);
      % most frequent value
      [fcount,j] = max(ucount);
      sS.mfvalue = xunique(j);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..)
      pickind = round(linspace(1,sS.nvalid,ntiles+1));
      pickind = pickind(2:end-1);
      sS.tiles = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % few distinct values: report them all and use them as histogram bins
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % at most maxdiscrete values, evenly picked; unique() drops the
        % repeated indices that rounding produces when nunique < maxdiscrete
        pickind = unique(round(linspace(1,sS.nunique,maxdiscrete)));
        sS.values = xunique(pickind);
        sS.counts = ucount(pickind);

        %% OPTION 2: maxdiscrete most frequent values
        %[v,j] = sort(ucount);
        %pickind = j(1:maxdiscrete);
        %sS.values = xunique(pickind);
        %sS.counts = ucount(pickind);

        % OPTION 3: representative values - calculated using k-means
        %[y,bm,qe] = kmeans(x,maxdiscrete);
        %sS.values = y;
        %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
      end
    end
    if isempty(bins),
      % nbins equally wide bins between min and max; bins holds the centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values: single-bin histogram with zero count
    sS.hist = som_hist(x,0);
  end
  csS{end+1} = sS;
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% subfunctions

function sH = som_hist(x,bins,sN)

%SOM_HIST Build a histogram struct for the values in x.
%
%  x     (vector) data values
%  bins  (vector) bin centers; if missing, empty or all-NaN, 10 centers
%                 between min(x) and max(x) are used
%        (struct) an existing histogram struct whose .bins, .binlabels
%                 and .binlabels2 are reused
%  sN    (struct array) normalization passed to SOM_DENORMALIZE when
%                 the labels are generated, default = []
%
%  sH    (struct) fields .type, .bins, .counts, .binlabels, .binlabels2

if nargin<3, sN = []; end
if nargin<2, bins = []; end

binlabels  = [];
binlabels2 = [];
if isstruct(bins),
  % must be handled before any numeric test on bins (isnan fails on structs)
  sH0        = bins;
  bins       = sH0.bins;
  binlabels  = sH0.binlabels;
  binlabels2 = sH0.binlabels2;
end
if isempty(bins) | all(isnan(bins(:))),
  if isempty(x),
    % nothing to span: fall back to a single bin, as som_stats does
    bins = 0;
  else
    bins = linspace(min(x),max(x),10);
  end
end

% assign the fields after construction: passing a cellstr to struct()
% would create a struct array with one element per label
sH = struct('type','som_hist','bins',[],'counts',[],...
            'binlabels',[],'binlabels2',[]);
sH.bins       = bins;
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  sH.counts = length(x);
  edges = bins;
else
  % bin edges are the midpoints between consecutive bin centers
  edges = (bins(1:end-1)+bins(2:end))/2;
  if length(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    % the last histc element only counts values equal to Inf; drop it
    sH.counts = counts(1:end-1);
  else
    sH.counts = zeros(length(bins),1);
  end
end

if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

if isempty(sH.binlabels2),
  if length(edges)==1,
    sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins split by one edge
      sH.binlabels2 = sH.binlabels2([1 1]);
      sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
      sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
    end
  else
    if size(edges,1)==1, edges = edges'; end
    bstr = numtostring(som_denormalize(edges,sN),4);
    sH.binlabels2 = bstr([1:end end]);
    sH.binlabels2{1} = [bstr{1} '['];
    for i=2:length(sH.binlabels2)-1,
      sH.binlabels2{i} = ['[' bstr{i-1} ',' bstr{i} '['];
    end
    sH.binlabels2{end} = ['[' bstr{end}];
  end
end

return;

function vstr = numtostring(v,d)

%NUMTOSTRING Convert numbers to a cellstr using d significant digits.
%
%  Values whose magnitude is below 10^-d of the range of v are printed
%  as '0.0' (or '-0.0' when negative) instead of in exponent notation.

v = v(:);   % one value per row, so that num2str gives one string per value
r = max(v)-min(v);
if r==0, r=1; end
nearzero = (abs(v)/r < 10.^-d);
i1 = find(v > 0 & nearzero);
i2 = find(v < 0 & nearzero);
vstr = strrep(cellstr(num2str(v,d)),' ','');
vstr(i1) = {'0.0'};
vstr(i2) = {'-0.0'};
return;