function csS = som_stats(D,varargin)

%SOM_STATS Calculate descriptive statistics for the data.
%
% csS = som_stats(D,[sort]);
%
%  csS = som_stats(D);
%  csS = som_stats(D,'nosort');
%  som_table_print(som_stats_table(csS))
%
%  Input and output arguments ([]'s are optional):
%   D           (matrix) a matrix, size dlen x dim
%               (struct) data or map struct
%   [sort]      (string) 'sort' (default) or 'nosort'
%                        If 'nosort' is specified, the data is not
%                        sorted, and therefore the nunique, values,
%                        counts, mfvalue, mfcount and tiles fields
%                        are not calculated (they keep their NaN / []
%                        defaults). This may be useful if there is a
%                        very large amount of data, and one wants to
%                        reduce calculation time.
%
%   csS         (cell array) size 1 x dim, of statistics structs with
%                        the following fields
%      .type          (string) 'som_stat'
%      .name          (string) name of the variable
%      .normalization (struct array) variable normalization (see SOM_NORMALIZE)
%      .ntotal        (scalar) total number of values
%      .nvalid        (scalar) number of valid values (not Inf or NaN)
%      .min           (scalar) minimum value
%      .max           (scalar) maximum value
%      .mean          (scalar) mean value (not Inf or NaN)
%      .std           (scalar) standard deviation (not Inf or NaN)
%      .nunique       (scalar) number of unique values
%      .mfvalue       (scalar) most frequent value
%      .mfcount       (scalar) number of occurrences of most frequent value
%      .values        (vector) all unique values if at most half of the
%                              valid values are distinct, otherwise at most
%                              MAXDISCRETE (see below) evenly picked
%                              unique values
%      .counts        (vector) number of occurrences for each value in .values
%      .tiles         (vector) NT-tile values, for example
%                                 NT=4   for quartiles: 25%, 50% and 75%
%                                 NT=100 for percentiles: 1%, 2%, ... and 99%
%      .hist          (struct) histogram struct with the following fields
%           .type     (string) 'som_hist'
%           .bins     (vector) histogram bin centers
%           .counts   (vector) count of values in each bin
%           .binlabels  (cellstr) labels for the bins (denormalized bin
%                                 center values)
%           .binlabels2 (cellstr) labels for the bins (denormalized bin
%                                 edge values, e.g. '[1.4,2.5['
%
%   Constants:
%         MAXDISCRETE = 20
%         NT          = 10
%
% See also SOM_STATS_PLOT, SOM_STATS_TABLE, SOM_TABLE_PRINT, SOM_STATS_REPORT.

% Contributed to SOM Toolbox 2.0, December 31st, 2001 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 311201

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% arguments

% default values
nosort      = 0;
nbins       = 10;   % number of histogram bins for non-discrete variables
maxdiscrete = 20;   % max number of sampled unique values
ntiles      = 10;   % NT: deciles by default

% first argument: unpack map/data structs, or invent names for a matrix
if isstruct(D),
  switch D.type,
   case 'som_map',  cn = D.comp_names; sN = D.comp_norm; D = D.codebook;
   case 'som_data', cn = D.comp_names; sN = D.comp_norm; D = D.data;
   otherwise, error('Invalid first argument')
  end
else
  cn = cell(size(D,2),1);
  for i=1:length(cn), cn{i} = sprintf('Variable%d',i); end
  sN = cell(size(D,2),1);
end
[dlen dim] = size(D);

% other arguments

if ~isempty(varargin),
  if strcmp(varargin{1},'nosort'), nosort = 1; end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% template struct; fields that are not computed keep these defaults
sStat = struct('type','som_stat','name','','normalization',[],...
               'min',NaN,'max',NaN,'mean',NaN,'std',NaN,...
               'nunique',NaN,'values',[],'counts',[],'mfvalue',NaN,'mfcount',NaN,'tiles',[],...
               'ntotal',dlen,'nvalid',NaN,'hist',[]);
csS = cell(1,dim);

for i=1:dim,
  sS = sStat;
  sS.name = cn{i};
  sS.normalization = sN{i};
  x = D(:,i);
  x = x(isfinite(x));   % drop NaN and +-Inf
  % basic descriptive statistics
  sS.nvalid = length(x);
  if sS.nvalid > 0,
    sS.min  = min(x);
    sS.max  = max(x);
    sS.mean = mean(x);
    sS.std  = std(x);
    bins = [];
    if ~nosort,
      xsorted = sort(x);
      % number of unique values: j indexes the first element of each
      % run of equal values in the sorted data
      repeated = (xsorted(1:end-1)==xsorted(2:end));
      j = [1; find(~repeated)+1];
      xunique = xsorted(j);
      sS.nunique = length(xunique);
      ucount = diff([j; length(xsorted)+1]);   % run lengths
      % most frequent value (first one in case of ties)
      [fcount,j] = max(ucount);
      sS.mfvalue = xunique(j);
      sS.mfcount = fcount;
      % -tiles (k*100/ntiles % of values, k=1..)
      pickind = round(linspace(1,sS.nvalid,ntiles+1));
      pickind = pickind(2:end-1);
      sS.tiles = xsorted(pickind);
      if sS.nunique <= sS.nvalid/2,
        % discrete-looking variable: keep all unique values and use
        % them as histogram bin centers
        sS.values = xunique;
        sS.counts = ucount;
        bins = sS.values;
      else
        % at most maxdiscrete values, evenly picked; the count is capped
        % at nunique so that no unique value is picked more than once
        npick = min(maxdiscrete,sS.nunique);
        pickind = round(linspace(1,sS.nunique,npick));
        sS.values = xunique(pickind);
        sS.counts = ucount(pickind);

        %% OPTION 2: maxdiscrete most frequent values
        %[v,j] = sort(ucount);
        %pickind = j(1:maxdiscrete);
        %sS.values = xunique(pickind);
        %sS.counts = ucount(pickind);

        % OPTION 3: representative values - calculated using k-means
        %[y,bm,qe] = kmeans(x,maxdiscrete);
        %sS.values = y;
        %sS.counts = full(sum(sparse(bm,1:length(bm),1,maxdiscrete,length(bm)),2));
      end
    end
    if isempty(bins),
      % nbins equally wide bins between min and max; bins holds the centers
      bins = linspace(sS.min,sS.max,nbins+1);
      bins = (bins(1:end-1)+bins(2:end))/2;
    end
    sS.hist = som_hist(x,bins,sS.normalization);
  else
    % no valid values: single-bin histogram with zero count
    sS.hist = som_hist(x,0);
  end
  csS{i} = sS;
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% subfunctions

function sH = som_hist(x,bins,sN)

%SOM_HIST Build a histogram struct for the vector x.
%
%  x      (vector) data values
%  [bins] (vector) bin centers; 10 points between min(x) and max(x) are
%                  used if omitted, empty or all-NaN
%         (struct) an existing histogram struct, whose bins and labels
%                  are reused
%  [sN]   (struct) normalization passed to SOM_DENORMALIZE when creating
%                  the bin labels, default is []
%
%  sH     (struct) fields .type, .bins, .counts, .binlabels, .binlabels2

binlabels  = [];
binlabels2 = [];
if nargin<3, sN = []; end
if nargin<2, bins = []; end

if isstruct(bins),
  % the struct case must be handled before any numeric test on bins
  sHin       = bins;
  bins       = sHin.bins;
  binlabels  = sHin.binlabels;
  binlabels2 = sHin.binlabels2;
elseif isempty(bins) | all(isnan(bins(:))),
  if isempty(x),
    bins = 0;   % nothing to span: single bin
  else
    bins = linspace(min(x),max(x),10);
  end
end

% fields are assigned after construction, because a cell array value
% given to STRUCT would create a struct array instead of a single struct
sH = struct('type','som_hist','bins',[],'counts',[],...
            'binlabels',[],'binlabels2',[]);
sH.bins       = bins;
sH.binlabels  = binlabels;
sH.binlabels2 = binlabels2;

if length(bins)==1,
  sH.counts = length(x);
  edges = bins;
else
  % bin edges are the midpoints between neighboring bin centers; the
  % outermost bins are open-ended
  edges = (bins(1:end-1)+bins(2:end))/2;
  if ~isempty(x),
    counts = histc(x,[-Inf; edges(:); Inf]);
    sH.counts = counts(1:end-1);   % last element only counts x==Inf
  else
    sH.counts = zeros(length(bins),1);
  end
end

if isempty(sH.binlabels),
  b = som_denormalize(bins(:),sN);
  sH.binlabels = numtostring(b,4);
end

if isempty(sH.binlabels2),
  if length(edges)==1,
    sH.binlabels2 = numtostring(som_denormalize(edges,sN),2);
    if length(bins)>1,
      % two bins separated by a single edge
      sH.binlabels2 = sH.binlabels2([1 1]);
      sH.binlabels2{1} = [']' sH.binlabels2{1} '['];
      sH.binlabels2{2} = ['[' sH.binlabels2{2} '['];
    end
  else
    if size(edges,1)==1, edges = edges'; end
    bstr = numtostring(som_denormalize(edges,sN),4);
    sH.binlabels2 = bstr([1:end end]);   % one more label than edges
    sH.binlabels2{1} = [bstr{1} '['];
    for i=2:length(sH.binlabels2)-1,
      sH.binlabels2{i} = ['[' bstr{i-1} ',' bstr{i} '['];
    end
    sH.binlabels2{end} = ['[' bstr{end}];
  end
end

return;

function vstr = numtostring(v,d)

%NUMTOSTRING Convert a numeric vector to a cellstr, one string per value.
%
%  v    (vector) values
%  d    (scalar) precision given to NUM2STR
%
%  vstr (cellstr) strings with spaces removed; values that are negligible
%                 compared to the range of v are shown as '0.0' / '-0.0'

v = v(:);   % NUM2STR makes one row per element only for column input
r = max(v)-min(v);
if r==0, r=1; end
nearzero = (abs(v)/r < 10.^-d);
vstr = strrep(cellstr(num2str(v,d)),' ','');
vstr(v > 0 & nearzero) = {'0.0'};
vstr(v < 0 & nearzero) = {'-0.0'};
return;