function out = genre_stats(tagged, names, freqs, childof)
% out = genre_stats(tagged, names, freqs, childof)
%
% Plots statistics for genre distributions.
%
% Inputs:
%   tagged  - clips x genres matrix; a nonzero entry in row r, column g
%             marks clip r as tagged with genre g
%   names   - cell array of genre names, one per genre
%   freqs   - matrix with one row per genre; rows are summed to obtain
%             the overall number of appearances of each genre
%   childof - matrix with one row per genre; 1 minus the row sum is used
%             as the "root potential" of the genre
%             (presumably entry (g,h) weights "g is a child of h" --
%             TODO confirm against the caller)
%
% Output:
%   out     - currently always empty ([])
%
% Side effects:
%   * opens a bar chart of normalised appearances and root potential,
%     sorted by appearances
%   * prints the genres greedily chosen to cover pctl of all clips
%   * opens a pie chart with the share of clips each chosen genre adds
%     (skipped when no clip is covered at all)
%

ngenres = numel(names);

% The number of clips is the number of ROWS of tagged. length() would
% return the largest dimension and be wrong when genres outnumber clips.
nclips = size(tagged, 1);

% get overall genre frequency and sort accordingly
allapp = sum(freqs, 2);
maxapp = max(allapp);
if maxapp > 0
    % normalise to [0, 1]; skipped for all-zero input to avoid NaNs
    allapp = allapp / maxapp;
end

[sorted_app, idx] = sort(allapp, 'descend'); %#ok<ASGLU>

% get root potential
rootpot = 1 - sum(childof, 2);

figure;
bar(1:ngenres, [allapp(idx) rootpot(idx)])
set(gca, 'XTick', 1:ngenres);
set(gca, 'XTickLabel', names(idx));
legend('#appearances', 'root genre possibility');
title('genre statistics sorted by frequency of appearances');

% ---
% determine genres that include x% of the whole dataset
% ---
pctl = 0.98; % 98 percent included

% ---
% re-sort by appearance and root potential.
% using the multiplication, we can filter out subgenres
% ---
[sorted_score, idxrt] = sort(rootpot .* allapp, 'descend'); %#ok<ASGLU>

% iteratively add 'best' genre according to root potential
covered = false(nclips, 1);      % clips already claimed by a chosen genre
numclips = zeros(1, ngenres);    % new clips contributed by each pick
num_included = 0;
i = 1;
while i <= ngenres && num_included < pctl * nclips

    % count clips found for this genre
    fprintf('%s \n', char(names{idxrt(i)}));

    % clips tagged with this genre that no earlier genre has claimed
    fresh = (tagged(:, idxrt(i)) ~= 0) & ~covered;
    covered = covered | fresh;

    numclips(i) = full(sum(fresh));
    num_included = num_included + numclips(i);
    i = i + 1;
end

% keep only the entries for genres that were actually visited
numclips = numclips(1:i-1);

chosen = numclips > 0;
if any(chosen)
    figure;
    pie(numclips(chosen) / nclips);
    legend(names{idxrt(chosen)});
end

out = [];

