Mercurial > hg > camir-aes2014
diff core/magnatagatune/MTTAudioFeatureBasicSm.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/core/magnatagatune/MTTAudioFeatureBasicSm.m Tue Feb 10 15:05:51 2015 +0000 @@ -0,0 +1,832 @@ +classdef MTTAudioFeatureBasicSm < MTTAudioFeature & handle + % --- + % the MTTAudioFeatureBasicSm Class contains + % a basic summary of chroma, mfcc and tempo features + % a few common chroma and mfcc vectors are concatenated + % along with some clip-wide variance + % a metric / rhythm fingerprint is added + % + % The usual workflow for these features consists of three steps + % 1. extract: extracts the basic single-file dependent features + % 2. define_global_transform: calculates the global feature + % transformation parameters + % 3. finalise: applies the common transformations to a specific feature + % --- + + properties(Constant = true) + + % svn hook + my_revision = str2double(substr('$Rev$', 5, -1)); + end + + properties + % --- + % Set default parameters + % --- + my_params = struct(... + 'nchromas', 4, ... % 4 chroma vectors + 'chroma_var', 0, ... % chroma variance + 'norm_chromas', 0, ... % not implemented, chromas already rel. + 'min_kshift_chromas', 0.1, ... % treshold for key shift. set to 1 for no shift (0-1) + ... + 'ntimbres', 4, ... + 'timbre_var', 0, ... % timbre variance + 'norm_timbres', 1, ... + 'clip_timbres', 0.85, ... % percentile of data which has to be inside 0-1 bounds + ... + 'norm_weights',0, ... % globally norm weights for chroma times? + 'norm_interval',1, ... + 'max_iter',100, ... % max iterations for chroma and timbre knn + ... + 'nrhythms', 0, ... + 'nints', 11, ... + 'energy_sr', 1000, ... % sample rate for energy curve + 'norm_acorr', 1 ... % normalise arcorr locally-> shape imp... energy is normalised anyways + ); + end + + % --- + % member functions + % --- + methods + + % --- + % constructor: pointer to feature in database + % --- + function feature = MTTAudioFeatureBasicSm(varargin) + + feature = feature@MTTAudioFeature(varargin{:}); + + end + % --- + % extract feature data from raw audio features + % --- + function data = extract(feature, clip) + % --- + % get Basic Summary audio features. this includes possible + % local normalisations + % --- + + global globalvars; + + % --- + % get casimir child clip if available + % --- + if isa(clip, 'CASIMIRClip') + baseclip = clip.child_clip(); + else + baseclip = clip; + end + if isa(baseclip, 'MTTClip') + rawf = baseclip.audio_features_raw(); + elseif isa(baseclip, 'MSDClip') + rawf = baseclip.features('MSDAudioFeatureRAW'); + end + + % --- + % now extract the features + % first step: chroma clustering + % --- + weights = [rawf.data.segments_duration]; + + % normalise weights + weights = weights / rawf.data.duration; + + chroma = [rawf.data.segments_pitches]'; + + % --- + % get most present chroma vectors. + % the weighted k-means should return the four most prominent + % chroma vectors and their weight + % --- + % display error values + + op = foptions(); + op(1) = 0; + op(14) = feature.my_params.max_iter; + + % check for trivial case + if feature.my_params.nchromas == 0 + + chromas = []; + cwght = []; + + elseif feature.my_params.nchromas == 1 + + chromas = mean(chroma, 1); + chroma_var = var(chroma, 0, 1); + cwght = 1; + + elseif numel(weights) > feature.my_params.nchromas + + % --- + % there may be few chromas, try kmeans several (20) times + % --- + cont = 0; + cwght = []; + while (numel(cwght) ~= feature.my_params.nchromas) && (cont < 20); + + [chromas, cwght, post] = ... + weighted_kmeans(feature.my_params.nchromas, chroma, weights, op); + + cont = cont + 1; + end + + if (numel(cwght) ~= feature.my_params.nchromas) + + error('cannot find enough chroma centres'); + end + + % --- + % Calculate the weighted variance of the chroma clusters + % --- + if feature.my_params.chroma_var >= 1 + + chroma_var = zeros(size(chromas)); + for i = 1:size(chroma_var,1) + + % get distance from cluster centroid + tmp_var = (chroma(post(:,i),:) - repmat(chromas(i,:), sum(post(:,i)),1)).^2; + + % add up the weighted differences and normalise by sum + % of weights + chroma_var(i,:) = (weights(post(:,i)) * tmp_var) ./... + (sum(weights(post(:,i)))); + end + end + else + % --- + % odd case: less than nchroma data points. + % we repeat the mean vector at the end + % --- + chromas = [chroma; repmat(mean(chroma, 1),... + feature.my_params.nchromas - numel(weights), 1 )]; + + cwght = weights; + cwght( end + 1:feature.my_params.nchromas ) = 0; + + % --- + % TODO: get a variance for odd case : + % replicate the complete data variance? + % NO: every vector is a clsuter => zero variance + % --- + end + + % trivial case: no variance requested + if ~exist('chroma_var','var') + chroma_var = zeros(size(chromas)); + end + + % sort by associated time + [cwght, idx] = sort(cwght, 'descend'); + chromas = chromas(idx,:); + chroma_var = chroma_var(idx,:); + + % --- + % shift according to detected key, but only if + % the confidencee is high enough + % --- + shift = 0; + if rawf.data.keyConfidence > feature.my_params.min_kshift_chromas; + + shift = -rawf.data.key; + chromas = circshift(chromas, [0 shift]); + chroma_var = circshift(chroma_var, [0 shift]); + end + + % --- + % get mfcc centres: + % the same for mfccs + % --- + mfcc = [rawf.data.segments_timbre]'; + if feature.my_params.ntimbres == 0 + + mfccs = []; + mwght = []; + + elseif feature.my_params.ntimbres == 1 + + mfccs = mean(mfcc, 1); + timbre_var = var(mfccs, 0, 1); + mwght = 1; + + elseif numel(weights) > feature.my_params.ntimbres + + % --- + % there may be few mfccs, try kmeans several times + % --- + cont = 0; + mwght = []; + while (numel(mwght) ~= feature.my_params.ntimbres) && (cont < 20); + + [mfccs, mwght, post] = ... + weighted_kmeans(feature.my_params.ntimbres, mfcc, weights, op); + cont = cont + 1; + end + + if (numel(mwght) ~= feature.my_params.ntimbres) + + error('cannot find enough mfcc centres'); + end + + % --- + % Calculate the weighted variance of the chroma clusters + % --- + if feature.my_params.timbre_var >= 1 + + timbre_var = zeros(size(mfccs)); + for i = 1:size(timbre_var,1) + + % get distance from cluster centroid + tmp_var = (mfcc(post(:,i),:) - repmat(mfccs(i,:), sum(post(:,i)),1)).^2; + + % add up the weighted differences and normalise by sum + % of weights + timbre_var(i,:) = (weights(post(:,i)) * tmp_var) ./... + (sum(weights(post(:,i)))); + end + end + + else + % --- + % odd case: less than nchroma data points. + % we repeat the mean vector at the end + % --- + mfccs = [mfcc; repmat(mean(mfcc, 1),... + feature.my_params.ntimbres - numel(weights), 1)]; + mwght = weights; + mwght( end + 1:feature.my_params.ntimbres) = 0; + end + + % trivial case: no variance requested + if ~exist('timbre_var','var') + timbre_var = zeros(size(mfccs)); + end + + % sort by associated time + [mwght, idx] = sort(mwght, 'descend'); + mfccs = mfccs(idx,:); + timbre_var = timbre_var(idx,:); + + % --- + % get beat features: + % the autocorrelation curve over n quarters of length + % + % alternative: how about using the n=8 quarters relative + % volumes from the start of a sure measure? + % --- + if feature.my_params.nrhythms >= 1 + bars = rawf.data.bars; + beats = rawf.data.beats; + tatums = rawf.data.tatums; + % --- + % NOTE: the beat and tatum markers seem to have an offset :( + % --- + offset = 0.118; %seconds + + [envelope, time] = energy_envelope(feature, clip); + + % we offset the energy curve + time = time + offset; + + % --- + % we try to start at the best beat confidence more + % than sixteen eights from the end + % --- + + if rawf.data.tempo > 0 + + eightl = 30 / rawf.data.tempo; + else + % --- + % odd case: no rhythm data. assume 100 bpm + % --- + + eightl = 0.3; + end + + if isempty(beats) + % --- + % odd case: no beats detected. -> use best tatum + % --- + if ~isempty(tatums) + + beats = tatums; + else + + % ok, just take the beginning + beats = [0; 1]; + end + end + + last_valid = find(beats(1,:) < ... + (rawf.data.duration - feature.my_params.nints * eightl),1, 'last'); + + % find the best valid beat postition + [null, max_measure] = max( beats(2, 1:last_valid)); + max_mtime = beats(1,max_measure); + + % --- + % the correlation is calculated for the estimated eights lenght + % and for the 16th intervals, respectively. + % --- + + % calculate the EIGHTS correlation for the following segment + [acorr8, eight_en, eightt] = ... + beat_histogram(feature, max_mtime, eightl, envelope, time); + + % calculate the SIXTEENTHS correlation for the following segment + [acorr16, six_en, sixt] = ... + beat_histogram(feature, max_mtime, eightl / 2, envelope, time); + + % --- + % save the various features + % --- + % save rythm feature data + + data.rhythm.acorr8 = acorr8; + data.rhythm.acorr8_lag = eightt(1:end/2)-eightt(1); + + data.rhythm.energy8 = eight_en(1:end/2); + data.rhythm.energy8_time = eightt(1:end/2); + + % -- + % the interval is normed locally up to a max value + % associated to 30bpm + % --- + if feature.my_params.norm_interval + + % 1 second max value + data.rhythm.interval8 = eightl / 2; + else + data.rhythm.interval8 = eightl / 2; + end + + if feature.my_params.nrhythms >= 2 + + data.rhythm.acorr16 = acorr16; + data.rhythm.acorr16_lag = data.rhythm.acorr8_lag / 2; + + data.rhythm.energy16 = six_en(1:end/2); + data.rhythm.energy16_time = sixt(1:end/2); + + + % save beat interval / tempo + if feature.my_params.norm_interval + + % 1 second max value + data.rhythm.interval16 = eightl / 2; + else + data.rhythm.interval16 = eightl / 2; + end + + end + else + +% % save empty rythm struct +% data.rhythm = struct([]); + end + + % chroma feature data + for i = 1:size(chromas,1) + data.chroma(i).means = chromas(i,:)'; + data.chroma(i).means_weight = cwght(i); + data.chroma(i).vars = chroma_var(i,:)'; + data.chroma(i).shift = shift; + end + + % mfcc feature data + for i = 1:size(mfccs,1) + data.timbre(i).means = mfccs(i,:)'; + data.timbre(i).means_weight = mwght(i); + data.timbre(i).vars = timbre_var(i,:)'; + end + + % prepare field for final features + data.final.vector = []; + data.final.vector_info = struct(); + data.final.dim = 0; + + % save info data + data.info.type = 'MTTAudioFeatureBasicSm'; + data.info.owner = clip; + data.info.owner_id = clip.id; + data.info.creatorrev = feature.my_revision; + + % save parameters + data.info.params = feature.my_params; + end + + function define_global_transform(features) + % calculate and set normalization factors from the group of + % input features. These features will be set for the full database + + if numel(features) == 1 + error ('Insert feature array for this method'); + end + + % --- + % here, we only need to define the post-normalisation + % --- + + % --- + % get chroma variance data NORMALISATION Factors + % TODO: transport chroma variance to finalise step + % --- + if features(1).my_params.chroma_var >= 1 + allfeat = abs(cat(2, features(1).data.chroma(:).vars)); + for i = 2:numel(features) + + allfeat = cat(2 , allfeat, abs(abs(cat(2, features(i).data.chroma(:).vars)))); + end + [~, common.post_normf.chroma_var] = mapminmax(allfeat,0,1); + end + + % --- + % get timbre variance data NORMALISATION Factors + % TODO: transport chroma variance to finalise step + % --- + if features(1).my_params.timbre_var >= 1 + allfeat = abs(cat(2, features(1).data.timbre(:).vars)); + for i = 2:numel(features) + + allfeat = cat(2 , allfeat, abs(abs(cat(2, features(i).data.timbre(:).vars)))); + end + [~, common.post_normf.timbre_var] = mapminmax(allfeat,0,1); + end + + % --- + % derive normalisation for timbre features: + % MFCC's are actually special filter outputs + % (see developer.echonest.com/docs/v4/_static/AnalyzeDocumentation_2.2.pdf + % they are unbounded, so just the relative information will be + % used here. + % We normalise each bin independently + % --- + if features(1).my_params.ntimbres > 0 + + allfeat = abs(cat(2, features(1).data.timbre(:).means)); + for i = 2:numel(features) + + allfeat = cat(2 , allfeat, abs(cat(2, features(i).data.timbre(:).means))); + end + + % --- + % get normalisation factors + % NOTE: the values will later be clipped to [0,1] + % anyways + % --- + if (features(1).my_params.clip_timbres ~= 0 ) || ... + (features(1).my_params.clip_timbres ~= 1 ) + + common.post_normf.timbre = 1 ./ prctile(allfeat, features(1).my_params.clip_timbres * 100, 2); + + else + % just use the maximum + common.post_normf.timbre = 1/max(allfeat, 2); + end + + % set common feature values + features(1).my_db.set_common(common); + + else + + features(1).my_db.set_common([1]); + end + end + + + function finalise(feature) + % applies a final transformation and + % collects the information of this feature within a single vector + % see info for types in specific dimensions + + for i = 1:numel(feature) + + % check for neccesary parameters + if isempty(feature(i).my_db.commondb) + + error('Define the global transformation first') + return; + end + + if feature(1).my_params.ntimbres > 0 + % --- + % normalise features + % --- + % norm timbre features if neccesary + timbren = []; + if feature(i).my_params.norm_timbres + for j = 1:numel(feature(i).data.timbre) + + timbren = cat(1, timbren, ... + MTTAudioFeatureBasicSm.norm_timbre... + (feature(i).data.timbre(j).means, feature(i).my_db.commondb.post_normf.timbre)); + end + else + + timbren = cat(1, timbren, feature(i).data.timbre(:).means); + end + end + + % --- + % construct resulting feature vector out of features + % --- + vec = []; + info = {}; + if feature(i).my_params.nchromas > 0 + + info{numel(vec)+ 1} = 'chroma'; + vec = cat(1, vec, feature(i).data.chroma(:).means); + + info{numel(vec)+ 1} = 'chroma weights'; + vec = cat(1, vec, [feature(i).data.chroma(:).means_weight]'); + + % --- + % NORMALISE Chroma variance + % --- + if feature(i).my_params.chroma_var >= 1 + + info{numel(vec)+ 1} = 'chroma variance'; + + % normalise this pack of variance vectors + tmp_var = mapminmax('apply', [feature(i).data.chroma(:).vars],... + feature(i).common.post_normf.chroma_var); + + % concatenate normalised data to vector + for vari = 1:size(tmp_var,2) + + vec = cat(1, vec, tmp_var(:, vari)); + end + end + end + + + if feature(i).my_params.ntimbres > 0 + + info{numel(vec)+ 1} = 'timbre'; + vec = cat(1, vec, timbren); + + info{numel(vec)+ 1} = 'timbre weights'; + vec = cat(1, vec, [feature(i).data.timbre(:).means_weight]'); + + % --- + % NORMALISE timbre variance + % --- + if feature(i).my_params.timbre_var >= 1 + + info{numel(vec)+ 1} = 'timbre variance'; + + % normalise this pack of variance vectors + tmp_var = mapminmax('apply', [feature(i).data.timbre(:).vars],... + feature(i).common.post_normf.timbre_var); + + % concatenate normalised data to vector + for vari = 1:size(tmp_var,2) + + vec = cat(1, vec, tmp_var(:, vari)); + end + end + end + + if feature(i).my_params.nrhythms > 0 + + info{numel(vec)+ 1} = 'rhythm 8'; + vec = cat(1, vec, feature(i).data.rhythm.acorr8); + + info{numel(vec)+ 1} = 'int 8'; + vec = cat(1, vec, feature(i).data.rhythm.interval8); + + if feature(i).my_params.nrhythms >= 2 + + info{numel(vec)+ 1} = 'rhythm 16'; + vec = cat(1, vec, feature(i).data.rhythm.acorr16); + + info{numel(vec)+ 1} = 'int 16'; + vec = cat(1, vec, feature(i).data.rhythm.interval16); + end + end + + feature(i).data.final.vector = vec; + feature(i).data.final.dim = numel(feature(i).data.final.vector); + + % fill up info struct and append to feature + + info(end+1: feature(i).data.final.dim) = ... + cell(feature(i).data.final.dim - numel(info),1); + + feature(i).data.final.vector_info.labels = info; + end + + % --- + % TODO: Maybe delete more basic features again at this point? + % --- + end + + % --- + % destructor: do we really want to remove this + % from the database? No, but + % TODO: create marker for unused objects in db, and a cleanup + % function + % --- + function delete(feature) + + end + + + function visualise(feature) + % --- + % plots the different data types collected in this feature + % --- + for i = 1:numel(feature) + clip = feature(i).data.info.owner; + + % display raw features + if isa(clip, 'CASIMIRClip') + baseclip = clip.child_clip(); + else + baseclip = clip; + end + if isa(baseclip, 'MTTClip') + rawf = baseclip.audio_features_raw(); + elseif isa(baseclip, 'MSDClip') + rawf = baseclip.features('MSDAudioFeatureRAW'); + end + + % --- + % @todo: implement MSD feature visualisation + % --- + [a1, a2, a3] = rawf.visualise(); + + % --- + % Display chroma features + % --- + if isfield(feature(i).data, 'chroma') + + chroma_labels = {'c', 'c#', 'd','d#', 'e', 'f','f#', 'g','g#', 'a', 'a#', 'h'}; + mode_labels = {'minor', 'major'}; + + % change labels to reflect detected mode + chroma_labels{rawf.data.key + 1} = ... + sprintf('(%s) %s',mode_labels{rawf.data.mode + 1}, chroma_labels{rawf.data.key + 1}); + + % transpose labels and data + chroma_labels = circshift(chroma_labels, [0, feature(i).data.chroma(1).shift]); + chromar = circshift([rawf.data.segments_pitches], [feature(i).data.chroma(1).shift, 0]); + + % image transposed chromas again + segments = [rawf.data.segments_start]; + segments(end) = rawf.data.duration; + + hold(a1); + uimagesc(segments, 0:11, chromar, 'Parent', a1); + set(a1,'YTick',[0:11], 'YTickLabel', chroma_labels); + + % enlarge plot and plot new data after the old ones + ax = axis(a1); + ax(2) = ax(2) + 2*feature(i).my_params.nchromas + 0.5; + axis(a1, 'xy'); + axis(a1, ax); + + imagesc(rawf.data.duration + (1:feature(i).my_params.nchromas), (-1:11), ... + [ feature(i).data.chroma(:).means_weight; feature(i).data.chroma(:).means],... + 'Parent', a1); + % variance calculated? + if isfield(feature(i).data.chroma, 'vars') + + imagesc(rawf.data.duration + feature(i).my_params.nchromas + (1:feature(i).my_params.nchromas), (-1:11), ... + [feature(i).data.chroma(:).vars],... + 'Parent', a1); + end + end + + % --- + % Display timbre features + % --- + if isfield(feature(i).data, 'timbre') + + % enlarge plot and plot new data after the old ones + hold(a2); + ax = axis(a2); + ax(2) = ax(2) + 2*feature(i).my_params.ntimbres + 0.5; + + axis(a2, ax); + imagesc(rawf.data.duration + (1:feature(i).my_params.ntimbres), (-1:11), ... + [ feature(i).data.timbre(:).means_weight; feature(i).data.timbre(:).means],... + 'Parent', a2); + if isfield(feature(i).data.timbre, 'vars') + + imagesc(rawf.data.duration + feature(i).my_params.ntimbres + (1:feature(i).my_params.ntimbres), (-1:11), ... + [feature(i).data.timbre(:).vars],... + 'Parent', a1); + end + end + + % --- + % Display rhythm features + % --- + if isfield(feature(i).data, 'rhythm') + % data.rhythm.interval + % get timecode + eightt = feature(i).data.rhythm.energy8_time; + sixt = feature(i).data.rhythm.energy16_time; + + hold(a3); + % plot sixteens acorr and energy + plot(sixt, feature(i).data.rhythm.energy16, 'bx') + + plot(sixt, feature(i).data.rhythm.acorr16, 'b') + + % plot eights acorr and energy + plot(eightt, feature(i).data.rhythm.energy8, 'rx') + + plot(eightt, feature(i).data.rhythm.acorr8, 'r') + + % broaden view by fixed 4 seconds + ax = axis(a3); + axis(a3, [max(0, eightt(1)-( eightt(end) - eightt(1) + 4 )) ... + min(rawf.data.duration, eightt(end) +4) ... + ax(3:4)]); + end + end + end + end + + + methods (Hidden = true) + + function [env, time] = energy_envelope(feature, clip) + % extracts the envelope of energy for the given clip + + % --- + % TODO: externalise envelope etc in external audio features + % --- + + [null, src] = evalc('miraudio(clip.mp3file_full())'); + [null, env] = evalc('mirenvelope(src, ''Sampling'', feature.my_params.energy_sr)'); + + time = get(env,'Time'); + time = time{1}{1}; + env = mirgetdata(env); + end + + function [acorr, base_sig, base_t] = beat_histogram(feature, startt, interval, signal, signal_t) + % acorr = beat_histogram(feature, startt, interval, signal, time) + % + % compute correlation for beats of specified length in energy curve + + % get corresponding energy values + dt = signal_t(2) - signal_t(1); + base_t = startt:interval:(startt + (feature.my_params.nints*2-1) * interval); + base_sig = signal( min( numel(signal), max(1,round((base_t - signal_t(1))/dt)))); + + % normalise energy + acbase_sig = base_sig./max(base_sig); + + % calculate their cyclic autocorrelation + acorr = circshift(xcorr(acbase_sig,acbase_sig(1:end/2)),... + [numel(acbase_sig) 0]); + + % cut acorr to relevant points, normalise and square + acorr = (acorr(1:feature.my_params.nints)./feature.my_params.nints).^2; + + % --- + % NOTE: we normalise the autocorrelation locally, to compare the + % (rhythmic) shape + % --- + if feature.my_params.norm_acorr; + + acorr = acorr - min(acorr); + acorr = acorr/max(acorr); + end + end + end + + methods(Static) + + function timbre = norm_timbre(in, normfs) + % returns normed timbre data + + % --- + % individually scale the data using + % the dimensions factors + % --- + timbre = zeros(size(in)); + for i = 1:size(in,2) + + timbre(:,i) = normfs .* in(:,i); + end + + % shift to positive values + timbre = (1 + timbre) /2; + + % clip features to [0,1] + timbre = min(1, max(timbre, 0)); + end + + % --- + % returns parameter md5 hash for comparison + % --- + end + +end \ No newline at end of file