Mercurial > hg > camir-aes2014
view core/magnatagatune/MTTAudioFeatureBasicSm.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line source
classdef MTTAudioFeatureBasicSm < MTTAudioFeature & handle % --- % the MTTAudioFeatureBasicSm Class contains % a basic summary of chroma, mfcc and tempo features % a few common chroma and mfcc vectors are concatenated % along with some clip-wide variance % a metric / rhythm fingerprint is added % % The usual workflow for these features consists of three steps % 1. extract: extracts the basic single-file dependent features % 2. define_global_transform: calculates the global feature % transformation parameters % 3. finalise: applies the common transformations to a specific feature % --- properties(Constant = true) % svn hook my_revision = str2double(substr('$Rev$', 5, -1)); end properties % --- % Set default parameters % --- my_params = struct(... 'nchromas', 4, ... % 4 chroma vectors 'chroma_var', 0, ... % chroma variance 'norm_chromas', 0, ... % not implemented, chromas already rel. 'min_kshift_chromas', 0.1, ... % treshold for key shift. set to 1 for no shift (0-1) ... 'ntimbres', 4, ... 'timbre_var', 0, ... % timbre variance 'norm_timbres', 1, ... 'clip_timbres', 0.85, ... % percentile of data which has to be inside 0-1 bounds ... 'norm_weights',0, ... % globally norm weights for chroma times? 'norm_interval',1, ... 'max_iter',100, ... % max iterations for chroma and timbre knn ... 'nrhythms', 0, ... 'nints', 11, ... 'energy_sr', 1000, ... % sample rate for energy curve 'norm_acorr', 1 ... % normalise arcorr locally-> shape imp... energy is normalised anyways ); end % --- % member functions % --- methods % --- % constructor: pointer to feature in database % --- function feature = MTTAudioFeatureBasicSm(varargin) feature = feature@MTTAudioFeature(varargin{:}); end % --- % extract feature data from raw audio features % --- function data = extract(feature, clip) % --- % get Basic Summary audio features. this includes possible % local normalisations % --- global globalvars; % --- % get casimir child clip if available % --- if isa(clip, 'CASIMIRClip') baseclip = clip.child_clip(); else baseclip = clip; end if isa(baseclip, 'MTTClip') rawf = baseclip.audio_features_raw(); elseif isa(baseclip, 'MSDClip') rawf = baseclip.features('MSDAudioFeatureRAW'); end % --- % now extract the features % first step: chroma clustering % --- weights = [rawf.data.segments_duration]; % normalise weights weights = weights / rawf.data.duration; chroma = [rawf.data.segments_pitches]'; % --- % get most present chroma vectors. % the weighted k-means should return the four most prominent % chroma vectors and their weight % --- % display error values op = foptions(); op(1) = 0; op(14) = feature.my_params.max_iter; % check for trivial case if feature.my_params.nchromas == 0 chromas = []; cwght = []; elseif feature.my_params.nchromas == 1 chromas = mean(chroma, 1); chroma_var = var(chroma, 0, 1); cwght = 1; elseif numel(weights) > feature.my_params.nchromas % --- % there may be few chromas, try kmeans several (20) times % --- cont = 0; cwght = []; while (numel(cwght) ~= feature.my_params.nchromas) && (cont < 20); [chromas, cwght, post] = ... weighted_kmeans(feature.my_params.nchromas, chroma, weights, op); cont = cont + 1; end if (numel(cwght) ~= feature.my_params.nchromas) error('cannot find enough chroma centres'); end % --- % Calculate the weighted variance of the chroma clusters % --- if feature.my_params.chroma_var >= 1 chroma_var = zeros(size(chromas)); for i = 1:size(chroma_var,1) % get distance from cluster centroid tmp_var = (chroma(post(:,i),:) - repmat(chromas(i,:), sum(post(:,i)),1)).^2; % add up the weighted differences and normalise by sum % of weights chroma_var(i,:) = (weights(post(:,i)) * tmp_var) ./... (sum(weights(post(:,i)))); end end else % --- % odd case: less than nchroma data points. % we repeat the mean vector at the end % --- chromas = [chroma; repmat(mean(chroma, 1),... feature.my_params.nchromas - numel(weights), 1 )]; cwght = weights; cwght( end + 1:feature.my_params.nchromas ) = 0; % --- % TODO: get a variance for odd case : % replicate the complete data variance? % NO: every vector is a clsuter => zero variance % --- end % trivial case: no variance requested if ~exist('chroma_var','var') chroma_var = zeros(size(chromas)); end % sort by associated time [cwght, idx] = sort(cwght, 'descend'); chromas = chromas(idx,:); chroma_var = chroma_var(idx,:); % --- % shift according to detected key, but only if % the confidencee is high enough % --- shift = 0; if rawf.data.keyConfidence > feature.my_params.min_kshift_chromas; shift = -rawf.data.key; chromas = circshift(chromas, [0 shift]); chroma_var = circshift(chroma_var, [0 shift]); end % --- % get mfcc centres: % the same for mfccs % --- mfcc = [rawf.data.segments_timbre]'; if feature.my_params.ntimbres == 0 mfccs = []; mwght = []; elseif feature.my_params.ntimbres == 1 mfccs = mean(mfcc, 1); timbre_var = var(mfccs, 0, 1); mwght = 1; elseif numel(weights) > feature.my_params.ntimbres % --- % there may be few mfccs, try kmeans several times % --- cont = 0; mwght = []; while (numel(mwght) ~= feature.my_params.ntimbres) && (cont < 20); [mfccs, mwght, post] = ... weighted_kmeans(feature.my_params.ntimbres, mfcc, weights, op); cont = cont + 1; end if (numel(mwght) ~= feature.my_params.ntimbres) error('cannot find enough mfcc centres'); end % --- % Calculate the weighted variance of the chroma clusters % --- if feature.my_params.timbre_var >= 1 timbre_var = zeros(size(mfccs)); for i = 1:size(timbre_var,1) % get distance from cluster centroid tmp_var = (mfcc(post(:,i),:) - repmat(mfccs(i,:), sum(post(:,i)),1)).^2; % add up the weighted differences and normalise by sum % of weights timbre_var(i,:) = (weights(post(:,i)) * tmp_var) ./... (sum(weights(post(:,i)))); end end else % --- % odd case: less than nchroma data points. % we repeat the mean vector at the end % --- mfccs = [mfcc; repmat(mean(mfcc, 1),... feature.my_params.ntimbres - numel(weights), 1)]; mwght = weights; mwght( end + 1:feature.my_params.ntimbres) = 0; end % trivial case: no variance requested if ~exist('timbre_var','var') timbre_var = zeros(size(mfccs)); end % sort by associated time [mwght, idx] = sort(mwght, 'descend'); mfccs = mfccs(idx,:); timbre_var = timbre_var(idx,:); % --- % get beat features: % the autocorrelation curve over n quarters of length % % alternative: how about using the n=8 quarters relative % volumes from the start of a sure measure? % --- if feature.my_params.nrhythms >= 1 bars = rawf.data.bars; beats = rawf.data.beats; tatums = rawf.data.tatums; % --- % NOTE: the beat and tatum markers seem to have an offset :( % --- offset = 0.118; %seconds [envelope, time] = energy_envelope(feature, clip); % we offset the energy curve time = time + offset; % --- % we try to start at the best beat confidence more % than sixteen eights from the end % --- if rawf.data.tempo > 0 eightl = 30 / rawf.data.tempo; else % --- % odd case: no rhythm data. assume 100 bpm % --- eightl = 0.3; end if isempty(beats) % --- % odd case: no beats detected. -> use best tatum % --- if ~isempty(tatums) beats = tatums; else % ok, just take the beginning beats = [0; 1]; end end last_valid = find(beats(1,:) < ... (rawf.data.duration - feature.my_params.nints * eightl),1, 'last'); % find the best valid beat postition [null, max_measure] = max( beats(2, 1:last_valid)); max_mtime = beats(1,max_measure); % --- % the correlation is calculated for the estimated eights lenght % and for the 16th intervals, respectively. % --- % calculate the EIGHTS correlation for the following segment [acorr8, eight_en, eightt] = ... beat_histogram(feature, max_mtime, eightl, envelope, time); % calculate the SIXTEENTHS correlation for the following segment [acorr16, six_en, sixt] = ... beat_histogram(feature, max_mtime, eightl / 2, envelope, time); % --- % save the various features % --- % save rythm feature data data.rhythm.acorr8 = acorr8; data.rhythm.acorr8_lag = eightt(1:end/2)-eightt(1); data.rhythm.energy8 = eight_en(1:end/2); data.rhythm.energy8_time = eightt(1:end/2); % -- % the interval is normed locally up to a max value % associated to 30bpm % --- if feature.my_params.norm_interval % 1 second max value data.rhythm.interval8 = eightl / 2; else data.rhythm.interval8 = eightl / 2; end if feature.my_params.nrhythms >= 2 data.rhythm.acorr16 = acorr16; data.rhythm.acorr16_lag = data.rhythm.acorr8_lag / 2; data.rhythm.energy16 = six_en(1:end/2); data.rhythm.energy16_time = sixt(1:end/2); % save beat interval / tempo if feature.my_params.norm_interval % 1 second max value data.rhythm.interval16 = eightl / 2; else data.rhythm.interval16 = eightl / 2; end end else % % save empty rythm struct % data.rhythm = struct([]); end % chroma feature data for i = 1:size(chromas,1) data.chroma(i).means = chromas(i,:)'; data.chroma(i).means_weight = cwght(i); data.chroma(i).vars = chroma_var(i,:)'; data.chroma(i).shift = shift; end % mfcc feature data for i = 1:size(mfccs,1) data.timbre(i).means = mfccs(i,:)'; data.timbre(i).means_weight = mwght(i); data.timbre(i).vars = timbre_var(i,:)'; end % prepare field for final features data.final.vector = []; data.final.vector_info = struct(); data.final.dim = 0; % save info data data.info.type = 'MTTAudioFeatureBasicSm'; data.info.owner = clip; data.info.owner_id = clip.id; data.info.creatorrev = feature.my_revision; % save parameters data.info.params = feature.my_params; end function define_global_transform(features) % calculate and set normalization factors from the group of % input features. These features will be set for the full database if numel(features) == 1 error ('Insert feature array for this method'); end % --- % here, we only need to define the post-normalisation % --- % --- % get chroma variance data NORMALISATION Factors % TODO: transport chroma variance to finalise step % --- if features(1).my_params.chroma_var >= 1 allfeat = abs(cat(2, features(1).data.chroma(:).vars)); for i = 2:numel(features) allfeat = cat(2 , allfeat, abs(abs(cat(2, features(i).data.chroma(:).vars)))); end [~, common.post_normf.chroma_var] = mapminmax(allfeat,0,1); end % --- % get timbre variance data NORMALISATION Factors % TODO: transport chroma variance to finalise step % --- if features(1).my_params.timbre_var >= 1 allfeat = abs(cat(2, features(1).data.timbre(:).vars)); for i = 2:numel(features) allfeat = cat(2 , allfeat, abs(abs(cat(2, features(i).data.timbre(:).vars)))); end [~, common.post_normf.timbre_var] = mapminmax(allfeat,0,1); end % --- % derive normalisation for timbre features: % MFCC's are actually special filter outputs % (see developer.echonest.com/docs/v4/_static/AnalyzeDocumentation_2.2.pdf % they are unbounded, so just the relative information will be % used here. % We normalise each bin independently % --- if features(1).my_params.ntimbres > 0 allfeat = abs(cat(2, features(1).data.timbre(:).means)); for i = 2:numel(features) allfeat = cat(2 , allfeat, abs(cat(2, features(i).data.timbre(:).means))); end % --- % get normalisation factors % NOTE: the values will later be clipped to [0,1] % anyways % --- if (features(1).my_params.clip_timbres ~= 0 ) || ... (features(1).my_params.clip_timbres ~= 1 ) common.post_normf.timbre = 1 ./ prctile(allfeat, features(1).my_params.clip_timbres * 100, 2); else % just use the maximum common.post_normf.timbre = 1/max(allfeat, 2); end % set common feature values features(1).my_db.set_common(common); else features(1).my_db.set_common([1]); end end function finalise(feature) % applies a final transformation and % collects the information of this feature within a single vector % see info for types in specific dimensions for i = 1:numel(feature) % check for neccesary parameters if isempty(feature(i).my_db.commondb) error('Define the global transformation first') return; end if feature(1).my_params.ntimbres > 0 % --- % normalise features % --- % norm timbre features if neccesary timbren = []; if feature(i).my_params.norm_timbres for j = 1:numel(feature(i).data.timbre) timbren = cat(1, timbren, ... MTTAudioFeatureBasicSm.norm_timbre... (feature(i).data.timbre(j).means, feature(i).my_db.commondb.post_normf.timbre)); end else timbren = cat(1, timbren, feature(i).data.timbre(:).means); end end % --- % construct resulting feature vector out of features % --- vec = []; info = {}; if feature(i).my_params.nchromas > 0 info{numel(vec)+ 1} = 'chroma'; vec = cat(1, vec, feature(i).data.chroma(:).means); info{numel(vec)+ 1} = 'chroma weights'; vec = cat(1, vec, [feature(i).data.chroma(:).means_weight]'); % --- % NORMALISE Chroma variance % --- if feature(i).my_params.chroma_var >= 1 info{numel(vec)+ 1} = 'chroma variance'; % normalise this pack of variance vectors tmp_var = mapminmax('apply', [feature(i).data.chroma(:).vars],... feature(i).common.post_normf.chroma_var); % concatenate normalised data to vector for vari = 1:size(tmp_var,2) vec = cat(1, vec, tmp_var(:, vari)); end end end if feature(i).my_params.ntimbres > 0 info{numel(vec)+ 1} = 'timbre'; vec = cat(1, vec, timbren); info{numel(vec)+ 1} = 'timbre weights'; vec = cat(1, vec, [feature(i).data.timbre(:).means_weight]'); % --- % NORMALISE timbre variance % --- if feature(i).my_params.timbre_var >= 1 info{numel(vec)+ 1} = 'timbre variance'; % normalise this pack of variance vectors tmp_var = mapminmax('apply', [feature(i).data.timbre(:).vars],... feature(i).common.post_normf.timbre_var); % concatenate normalised data to vector for vari = 1:size(tmp_var,2) vec = cat(1, vec, tmp_var(:, vari)); end end end if feature(i).my_params.nrhythms > 0 info{numel(vec)+ 1} = 'rhythm 8'; vec = cat(1, vec, feature(i).data.rhythm.acorr8); info{numel(vec)+ 1} = 'int 8'; vec = cat(1, vec, feature(i).data.rhythm.interval8); if feature(i).my_params.nrhythms >= 2 info{numel(vec)+ 1} = 'rhythm 16'; vec = cat(1, vec, feature(i).data.rhythm.acorr16); info{numel(vec)+ 1} = 'int 16'; vec = cat(1, vec, feature(i).data.rhythm.interval16); end end feature(i).data.final.vector = vec; feature(i).data.final.dim = numel(feature(i).data.final.vector); % fill up info struct and append to feature info(end+1: feature(i).data.final.dim) = ... cell(feature(i).data.final.dim - numel(info),1); feature(i).data.final.vector_info.labels = info; end % --- % TODO: Maybe delete more basic features again at this point? % --- end % --- % destructor: do we really want to remove this % from the database? No, but % TODO: create marker for unused objects in db, and a cleanup % function % --- function delete(feature) end function visualise(feature) % --- % plots the different data types collected in this feature % --- for i = 1:numel(feature) clip = feature(i).data.info.owner; % display raw features if isa(clip, 'CASIMIRClip') baseclip = clip.child_clip(); else baseclip = clip; end if isa(baseclip, 'MTTClip') rawf = baseclip.audio_features_raw(); elseif isa(baseclip, 'MSDClip') rawf = baseclip.features('MSDAudioFeatureRAW'); end % --- % @todo: implement MSD feature visualisation % --- [a1, a2, a3] = rawf.visualise(); % --- % Display chroma features % --- if isfield(feature(i).data, 'chroma') chroma_labels = {'c', 'c#', 'd','d#', 'e', 'f','f#', 'g','g#', 'a', 'a#', 'h'}; mode_labels = {'minor', 'major'}; % change labels to reflect detected mode chroma_labels{rawf.data.key + 1} = ... sprintf('(%s) %s',mode_labels{rawf.data.mode + 1}, chroma_labels{rawf.data.key + 1}); % transpose labels and data chroma_labels = circshift(chroma_labels, [0, feature(i).data.chroma(1).shift]); chromar = circshift([rawf.data.segments_pitches], [feature(i).data.chroma(1).shift, 0]); % image transposed chromas again segments = [rawf.data.segments_start]; segments(end) = rawf.data.duration; hold(a1); uimagesc(segments, 0:11, chromar, 'Parent', a1); set(a1,'YTick',[0:11], 'YTickLabel', chroma_labels); % enlarge plot and plot new data after the old ones ax = axis(a1); ax(2) = ax(2) + 2*feature(i).my_params.nchromas + 0.5; axis(a1, 'xy'); axis(a1, ax); imagesc(rawf.data.duration + (1:feature(i).my_params.nchromas), (-1:11), ... [ feature(i).data.chroma(:).means_weight; feature(i).data.chroma(:).means],... 'Parent', a1); % variance calculated? if isfield(feature(i).data.chroma, 'vars') imagesc(rawf.data.duration + feature(i).my_params.nchromas + (1:feature(i).my_params.nchromas), (-1:11), ... [feature(i).data.chroma(:).vars],... 'Parent', a1); end end % --- % Display timbre features % --- if isfield(feature(i).data, 'timbre') % enlarge plot and plot new data after the old ones hold(a2); ax = axis(a2); ax(2) = ax(2) + 2*feature(i).my_params.ntimbres + 0.5; axis(a2, ax); imagesc(rawf.data.duration + (1:feature(i).my_params.ntimbres), (-1:11), ... [ feature(i).data.timbre(:).means_weight; feature(i).data.timbre(:).means],... 'Parent', a2); if isfield(feature(i).data.timbre, 'vars') imagesc(rawf.data.duration + feature(i).my_params.ntimbres + (1:feature(i).my_params.ntimbres), (-1:11), ... [feature(i).data.timbre(:).vars],... 'Parent', a1); end end % --- % Display rhythm features % --- if isfield(feature(i).data, 'rhythm') % data.rhythm.interval % get timecode eightt = feature(i).data.rhythm.energy8_time; sixt = feature(i).data.rhythm.energy16_time; hold(a3); % plot sixteens acorr and energy plot(sixt, feature(i).data.rhythm.energy16, 'bx') plot(sixt, feature(i).data.rhythm.acorr16, 'b') % plot eights acorr and energy plot(eightt, feature(i).data.rhythm.energy8, 'rx') plot(eightt, feature(i).data.rhythm.acorr8, 'r') % broaden view by fixed 4 seconds ax = axis(a3); axis(a3, [max(0, eightt(1)-( eightt(end) - eightt(1) + 4 )) ... min(rawf.data.duration, eightt(end) +4) ... ax(3:4)]); end end end end methods (Hidden = true) function [env, time] = energy_envelope(feature, clip) % extracts the envelope of energy for the given clip % --- % TODO: externalise envelope etc in external audio features % --- [null, src] = evalc('miraudio(clip.mp3file_full())'); [null, env] = evalc('mirenvelope(src, ''Sampling'', feature.my_params.energy_sr)'); time = get(env,'Time'); time = time{1}{1}; env = mirgetdata(env); end function [acorr, base_sig, base_t] = beat_histogram(feature, startt, interval, signal, signal_t) % acorr = beat_histogram(feature, startt, interval, signal, time) % % compute correlation for beats of specified length in energy curve % get corresponding energy values dt = signal_t(2) - signal_t(1); base_t = startt:interval:(startt + (feature.my_params.nints*2-1) * interval); base_sig = signal( min( numel(signal), max(1,round((base_t - signal_t(1))/dt)))); % normalise energy acbase_sig = base_sig./max(base_sig); % calculate their cyclic autocorrelation acorr = circshift(xcorr(acbase_sig,acbase_sig(1:end/2)),... [numel(acbase_sig) 0]); % cut acorr to relevant points, normalise and square acorr = (acorr(1:feature.my_params.nints)./feature.my_params.nints).^2; % --- % NOTE: we normalise the autocorrelation locally, to compare the % (rhythmic) shape % --- if feature.my_params.norm_acorr; acorr = acorr - min(acorr); acorr = acorr/max(acorr); end end end methods(Static) function timbre = norm_timbre(in, normfs) % returns normed timbre data % --- % individually scale the data using % the dimensions factors % --- timbre = zeros(size(in)); for i = 1:size(in,2) timbre(:,i) = normfs .* in(:,i); end % shift to positive values timbre = (1 + timbre) /2; % clip features to [0,1] timbre = min(1, max(timbre, 0)); end % --- % returns parameter md5 hash for comparison % --- end end