# HG changeset patch # User Chris Cannam # Date 1417698316 0 # Node ID 593054bf647638ab5606bc1b34644cf439bbf591 # Parent 6b91e40b2c04d5761a5343b0296dc1145f1aa043 Pull out normalisation and specdiff stuff into FeatureConditioner diff -r 6b91e40b2c04 -r 593054bf6476 Makefile.inc --- a/Makefile.inc Thu Nov 27 16:50:14 2014 +0000 +++ b/Makefile.inc Thu Dec 04 13:05:16 2014 +0000 @@ -20,6 +20,7 @@ depend: makedepend -Y -fMakefile.inc $(SOURCES) $(HEADERS) + # DO NOT DELETE src/DistanceMetric.o: src/DistanceMetric.h diff -r 6b91e40b2c04 -r 593054bf6476 src/FeatureConditioner.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/FeatureConditioner.cpp Thu Dec 04 13:05:16 2014 +0000 @@ -0,0 +1,80 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Vamp feature extraction plugin using the MATCH audio alignment + algorithm. + + Centre for Digital Music, Queen Mary, University of London. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. 
+*/ + +#include "FeatureConditioner.h" + +#include + +using namespace std; + +vector +FeatureConditioner::process(const vector &feature) +{ /* Conditions one feature frame: optional half-wave rectified difference against the previous frame, then silence gating and normalisation. */ + if (m_prev.empty()) { /* no previous frame stored: use an all-zero one of the same size */ + m_prev.resize(feature.size(), 0.0); + } + if (m_prev.size() != feature.size()) { + cerr << "ERROR: FeatureConditioner::process: feature size " + << feature.size() << " differs from previous feature size " + << m_prev.size() << endl; + return feature; /* size mismatch: input is handed back unconditioned and no state is updated */ + } + + int size = feature.size(); + + vector out(size, 0.0); + + double totalEnergy = 0; + if (m_params.order == OutputRectifiedDerivative) { + for (int i = 0; i < size; i++) { + totalEnergy += feature[i]; /* energy is summed from the raw feature values, not from the rectified differences */ + if (feature[i] > m_prev[i]) { + out[i] = feature[i] - m_prev[i]; + } else { + out[i] = 0; + } + } + } else { + for (int i = 0; i < size; i++) { + out[i] = feature[i]; + totalEnergy += out[i]; + } + } + + if (m_ltAverage == 0) { /* a zero average is taken to mean none has been accumulated yet, so it is seeded with this frame's energy. NOTE(review): nothing in this function assigns m_ltAverage before this read - confirm the constructor initialises it to 0 */ + m_ltAverage = totalEnergy; + } else { + double decay = m_params.decay; + m_ltAverage = m_ltAverage * decay + totalEnergy * (1.0 - decay); + } + + if (totalEnergy <= m_params.silenceThreshold) { /* total energy at or below the silence threshold: emit all zeros */ + for (int i = 0; i < size; i++) { + out[i] = 0; + } + } else if (m_params.norm == NormaliseToSum1) { + for (int i = 0; i < size; i++) { + out[i] /= totalEnergy; + } + } else if (m_params.norm == NormaliseToLTAverage) { + for (int i = 0; i < size; i++) { + out[i] /= m_ltAverage; + } + } + + m_prev = feature; /* remember the unconditioned input for the next call's difference */ + return out; +} + diff -r 6b91e40b2c04 -r 593054bf6476 src/FeatureConditioner.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/FeatureConditioner.h Thu Dec 04 13:05:16 2014 +0000 @@ -0,0 +1,105 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Vamp feature extraction plugin using the MATCH audio alignment + algorithm. + + Centre for Digital Music, Queen Mary, University of London. 
+ + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +#ifndef FEATURE_CONDITIONER_H +#define FEATURE_CONDITIONER_H + +#include + +/** + * Take a series of feature vectors and apply conditioning of some + * sort, such as normalisation or first-order derivative. + * + * Note that FeatureConditioner maintains internal frame-to-frame + * state: use one FeatureConditioner per audio source, and construct a + * new one for each new source. + */ +class FeatureConditioner +{ +public: + enum Normalisation { + + /** Do not normalise */ + NoNormalisation, + + /** Normalise each feature vector to have a sum of 1 */ + NormaliseToSum1, + + /** Normalise each feature vector by the long-term average of + * the summed energy */ + NormaliseToLTAverage, + }; + + enum OutputOrder { + + /** Output the normalised features without further processing */ + OutputFeatures, + + /** Output the half-wave rectified difference between the + * previous and current features instead of the straight + * feature values. */ + OutputRectifiedDerivative, + }; + + struct Parameters { + + Parameters() : + norm(NormaliseToSum1), + order(OutputRectifiedDerivative), + silenceThreshold(0.01), + decay(0.99) + {} + + /** Feature normalisation. */ + Normalisation norm; + + /** Type of output to generate (plain feature, derivative etc). */ + OutputOrder order; + + /** Silence threshold. If non-zero, any feature whose total + * energy (simply the sum of feature values) is below that + * threshold will be rounded down to all zeros. */ + double silenceThreshold; + + /** Frame-to-frame decay factor in calculating long-term average */ + double decay; + }; + + /** + * Construct a FeatureConditioner with the given parameters. 
+ * + * Note that FeatureConditioner maintains internal frame-to-frame + * state: use one FeatureConditioner per audio source, and construct + * a new one for each new source. + */ + FeatureConditioner(Parameters parameters) : m_params(parameters), m_ltAverage(0) { } + + /** + * Process the given feature and return the conditioned feature. + */ + std::vector process(const std::vector &feature); + +protected: + Parameters m_params; + + /** Long term average feature energy; starts at 0, which process() treats as no average accumulated yet. */ + double m_ltAverage; + + /** The most recent feature, used for calculating the feature to + * feature difference. This is therefore not yet normalised. */ + std::vector m_prev; +}; + +#endif diff -r 6b91e40b2c04 -r 593054bf6476 src/FeatureExtractor.cpp --- a/src/FeatureExtractor.cpp Thu Nov 27 16:50:14 2014 +0000 +++ b/src/FeatureExtractor.cpp Thu Dec 04 13:05:16 2014 +0000 @@ -25,12 +25,9 @@ using namespace std; FeatureExtractor::FeatureExtractor(Parameters parameters) : - m_params(parameters), - m_ltAverage(0) + m_params(parameters) { m_featureSize = getFeatureSizeFor(parameters); - m_prevFrame = vector(m_featureSize, 0.0); - makeFreqMap(); } @@ -107,15 +104,12 @@ { vector frame(m_featureSize, 0.0); - double rms = 0; for (int i = 0; i <= m_params.fftSize/2; i++) { double mag = real[i] * real[i] + imag[i] * imag[i]; - rms += mag; frame[m_freqMap[i]] += mag; } - rms = sqrt(rms / (m_params.fftSize/2)); - return postProcess(frame, rms); + return frame; } vector @@ -123,61 +117,11 @@ { vector frame(m_featureSize, 0.0); - double rms = 0; for (int i = 0; i <= m_params.fftSize/2; i++) { double mag = cframe[i*2] * cframe[i*2] + cframe[i*2+1] * cframe[i*2+1]; - rms += mag; frame[m_freqMap[i]] += mag; } - rms = sqrt(rms / (m_params.fftSize/2)); - return postProcess(frame, rms); + return frame; } -vector -FeatureExtractor::postProcess(const vector &frame, double rms) -{ - vector feature(m_featureSize, 0.0); - - double totalEnergy = 0; - if (m_params.useSpectralDifference) { - for (int i = 0; i < m_featureSize; i++) { -
totalEnergy += frame[i]; - if (frame[i] > m_prevFrame[i]) { - feature[i] = frame[i] - m_prevFrame[i]; - } else { - feature[i] = 0; - } - } - } else { - for (int i = 0; i < m_featureSize; i++) { - feature[i] = frame[i]; - totalEnergy += feature[i]; - } - } - - if (m_ltAverage == 0) { - m_ltAverage = totalEnergy; - } else { - double decay = m_params.decay; - m_ltAverage = m_ltAverage * decay + totalEnergy * (1.0 - decay); - } - - if (rms <= m_params.silenceThreshold) { - for (int i = 0; i < m_featureSize; i++) { - feature[i] = 0; - } - } else if (m_params.frameNorm == NormaliseFrameToSum1) { - for (int i = 0; i < m_featureSize; i++) { - feature[i] /= totalEnergy; - } - } else if (m_params.frameNorm == NormaliseFrameToLTAverage) { - for (int i = 0; i < m_featureSize; i++) { - feature[i] /= m_ltAverage; - } - } - - m_prevFrame = frame; - return feature; -} - diff -r 6b91e40b2c04 -r 593054bf6476 src/FeatureExtractor.h --- a/src/FeatureExtractor.h Thu Nov 27 16:50:14 2014 +0000 +++ b/src/FeatureExtractor.h Thu Dec 04 13:05:16 2014 +0000 @@ -25,50 +25,24 @@ * the frequency data to map higher frequencies into a linear scale. A * chroma mapping is also available. * - * Note that FeatureExtractor maintains internal frame-to-frame state: - * use one FeatureExtractor per audio source, and construct a new one - * for each new source. + * Note that FeatureExtractor may maintain internal frame-to-frame + * state: use one FeatureExtractor per audio source, and construct a + * new one for each new source. 
*/ class FeatureExtractor { public: - enum FrameNormalisation { - - /** Do not normalise frames */ - NoFrameNormalisation, - - /** Normalise each frame to have a sum of 1 */ - NormaliseFrameToSum1, - - /** Normalise each frame by the long-term average of the - * summed energy */ - NormaliseFrameToLTAverage, - }; - struct Parameters { Parameters(float rate_, int fftSize_) : sampleRate(rate_), - frameNorm(NormaliseFrameToSum1), - useSpectralDifference(true), useChromaFrequencyMap(false), - fftSize(fftSize_), - silenceThreshold(0.01), - decay(0.99) + fftSize(fftSize_) {} /** Sample rate of audio */ float sampleRate; - /** Type of audio frame normalisation */ - FrameNormalisation frameNorm; - - /** Flag indicating whether or not the half-wave rectified - * spectral difference should be used in calculating the - * distance metric for pairs of audio frames, instead of the - * straight spectrum values. */ - bool useSpectralDifference; - /** Flag indicating whether to use a chroma frequency map (12 * bins) instead of the default warped spectrogram */ bool useChromaFrequencyMap; @@ -82,12 +56,6 @@ * in is already in the frequency domain, so this expresses * the size of the frame that the caller will be providing. */ int fftSize; - - /** RMS level below which frame is considered silent */ - double silenceThreshold; - - /** Frame-to-frame decay factor in calculating long-term average */ - double decay; }; /** @@ -117,12 +85,8 @@ * have at least params.fftSize/2+1 elements each. * * Operates by mapping the frequency bins into a part-linear - * part-logarithmic array, then (optionally) computing the - * half-wave rectified spectral difference from the previous - * frame, then (optionally) normalising to a sum of 1. - * - * Return value is the frame (post-processed, with warping, - * rectification, and normalisation as appropriate). + * part-logarithmic array, unless useChromaFrequencyMap is true in + * which case they are mapped into chroma bins. 
*/ std::vector process(const std::vector &real, const std::vector &imag); @@ -133,12 +97,8 @@ * must have at least 2 * (params.fftSize/2 + 1) elements. * * Operates by mapping the frequency bins into a part-linear - * part-logarithmic array, then (optionally) computing the - * half-wave rectified spectral difference from the previous - * frame, then (optionally) normalising to a sum of 1. - * - * Return value is the frame (post-processed, with warping, - * rectification, and normalisation as appropriate). + * part-logarithmic array, unless useChromaFrequencyMap is true in + * which case they are mapped into chroma bins. */ std::vector process(const float *carray); @@ -157,14 +117,8 @@ /** Creates a map of FFT frequency bins to semitone chroma bins. */ void makeChromaFrequencyMap(); - std::vector postProcess(const std::vector &, double rms); - /** Configuration parameters */ Parameters m_params; - - /** Long term average frame energy (in frequency domain - * representation). */ - double m_ltAverage; /** A mapping function for mapping FFT bins to final frequency * bins. The mapping is linear (1-1) until the resolution @@ -179,11 +133,6 @@ /** The size of a returned feature. */ int m_featureSize; - - /** The most recent frame; used for calculating the frame to frame - * spectral difference. This is therefore frequency warped but - * not yet normalised. 
*/ - std::vector m_prevFrame; }; #endif diff -r 6b91e40b2c04 -r 593054bf6476 src/MatchVampPlugin.cpp --- a/src/MatchVampPlugin.cpp Thu Nov 27 16:50:14 2014 +0000 +++ b/src/MatchVampPlugin.cpp Thu Dec 04 13:05:16 2014 +0000 @@ -63,7 +63,9 @@ m_params(inputSampleRate, defaultStepTime, m_blockSize), m_defaultParams(inputSampleRate, defaultStepTime, m_blockSize), m_feParams(inputSampleRate, m_blockSize), - m_defaultFeParams(inputSampleRate, m_blockSize) + m_defaultFeParams(inputSampleRate, m_blockSize), + m_fcParams(), + m_defaultFcParams() { if (inputSampleRate < sampleRateMin) { std::cerr << "MatchVampPlugin::MatchVampPlugin: input sample rate " @@ -167,7 +169,7 @@ desc.description = "Type of normalisation to use for frequency-domain audio features"; desc.minValue = 0; desc.maxValue = 2; - desc.defaultValue = (int)m_defaultFeParams.frameNorm; + desc.defaultValue = (int)m_defaultFcParams.norm; desc.isQuantized = true; desc.quantizeStep = 1; desc.valueNames.clear(); @@ -197,7 +199,7 @@ desc.description = "Whether to use half-wave rectified spectral difference instead of straight spectrum"; desc.minValue = 0; desc.maxValue = 1; - desc.defaultValue = m_defaultFeParams.useSpectralDifference ? 1 : 0; + desc.defaultValue = (int)m_defaultFcParams.order; desc.isQuantized = true; desc.quantizeStep = 1; list.push_back(desc); @@ -263,11 +265,11 @@ if (name == "serialise") { return m_serialise ? 1.0 : 0.0; } else if (name == "framenorm") { - return (int)m_feParams.frameNorm; + return (int)m_fcParams.norm; } else if (name == "distnorm") { return (int)m_params.distanceNorm; } else if (name == "usespecdiff") { - return m_feParams.useSpectralDifference ? 1.0 : 0.0; + return (int)m_fcParams.order; } else if (name == "usechroma") { return m_feParams.useChromaFrequencyMap ? 
1.0 : 0.0; } else if (name == "gradientlimit") { @@ -289,11 +291,11 @@ if (name == "serialise") { m_serialise = (value > 0.5); } else if (name == "framenorm") { - m_feParams.frameNorm = (FeatureExtractor::FrameNormalisation)(int(value + 0.1)); + m_fcParams.norm = (FeatureConditioner::Normalisation)(int(value + 0.1)); } else if (name == "distnorm") { m_params.distanceNorm = (DistanceMetric::DistanceNormalisation)(int(value + 0.1)); } else if (name == "usespecdiff") { - m_feParams.useSpectralDifference = (value > 0.5); + m_fcParams.order = (FeatureConditioner::OutputOrder)(int(value + 0.1)); } else if (name == "usechroma") { m_feParams.useChromaFrequencyMap = (value > 0.5); } else if (name == "gradientlimit") { @@ -327,6 +329,8 @@ m_feParams.fftSize = m_blockSize; m_fe1 = new FeatureExtractor(m_feParams); m_fe2 = new FeatureExtractor(m_feParams); + m_fc1 = new FeatureConditioner(m_fcParams); + m_fc2 = new FeatureConditioner(m_fcParams); m_pm1 = new Matcher(m_params, 0, m_fe1->getFeatureSize()); m_pm2 = new Matcher(m_params, m_pm1, m_fe2->getFeatureSize()); m_pm1->setOtherMatcher(m_pm2); @@ -364,12 +368,16 @@ delete m_feeder; delete m_fe1; delete m_fe2; + delete m_fc1; + delete m_fc2; delete m_pm1; delete m_pm2; m_feeder = 0; m_fe1 = 0; m_fe2 = 0; + m_fc1 = 0; + m_fc2 = 0; m_pm1 = 0; m_pm2 = 0; @@ -521,8 +529,8 @@ if (aboveThreshold(inputBuffers[0])) m_lastFrameIn1 = m_frameNo; if (aboveThreshold(inputBuffers[1])) m_lastFrameIn2 = m_frameNo; - vector f1 = m_fe1->process(inputBuffers[0]); - vector f2 = m_fe2->process(inputBuffers[1]); + vector f1 = m_fc1->process(m_fe1->process(inputBuffers[0])); + vector f2 = m_fc2->process(m_fe2->process(inputBuffers[1])); /* second input uses its own conditioner: each conditioner keeps frame-to-frame state for a single source */ m_feeder->feed(f1, f2); diff -r 6b91e40b2c04 -r 593054bf6476 src/MatchVampPlugin.h --- a/src/MatchVampPlugin.h Thu Nov 27 16:50:14 2014 +0000 +++ b/src/MatchVampPlugin.h Thu Dec 04 13:05:16 2014 +0000 @@ -27,6 +27,7 @@ #include "Matcher.h" #include "FeatureExtractor.h" +#include "FeatureConditioner.h" class
MatchFeatureFeeder; @@ -73,6 +74,8 @@ Matcher *m_pm2; FeatureExtractor *m_fe1; FeatureExtractor *m_fe2; + FeatureConditioner *m_fc1; + FeatureConditioner *m_fc2; MatchFeatureFeeder *m_feeder; Vamp::RealTime m_startTime; @@ -94,6 +97,9 @@ FeatureExtractor::Parameters m_feParams; FeatureExtractor::Parameters m_defaultFeParams; + FeatureConditioner::Parameters m_fcParams; + FeatureConditioner::Parameters m_defaultFcParams; + mutable int m_pathOutNo; mutable int m_abOutNo; mutable int m_baOutNo;