changeset 103:593054bf6476 feature_conditioner

Pull out normalisation and specdiff stuff into FeatureConditioner
author Chris Cannam
date Thu, 04 Dec 2014 13:05:16 +0000
parents 6b91e40b2c04
children 6636aca831c0
files Makefile.inc src/FeatureConditioner.cpp src/FeatureConditioner.h src/FeatureExtractor.cpp src/FeatureExtractor.h src/MatchVampPlugin.cpp src/MatchVampPlugin.h
diffstat 7 files changed, 220 insertions(+), 127 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile.inc	Thu Nov 27 16:50:14 2014 +0000
+++ b/Makefile.inc	Thu Dec 04 13:05:16 2014 +0000
@@ -20,6 +20,7 @@
 
 depend:
 	makedepend -Y -fMakefile.inc $(SOURCES) $(HEADERS)
+	
 # DO NOT DELETE
 
 src/DistanceMetric.o: src/DistanceMetric.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/FeatureConditioner.cpp	Thu Dec 04 13:05:16 2014 +0000
@@ -0,0 +1,80 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Vamp feature extraction plugin using the MATCH audio alignment
+    algorithm.
+
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#include "FeatureConditioner.h"
+
+#include <iostream>
+
+using namespace std;
+
+vector<double>
+FeatureConditioner::process(const vector<double> &feature)
+{
+    // First call: seed the previous-feature state with zeros.
+    if (m_prev.empty()) {
+        m_prev.resize(feature.size(), 0.0);
+    }
+    // A change of feature size is reported and the input passed through.
+    if (m_prev.size() != feature.size()) {
+        cerr << "ERROR: FeatureConditioner::process: feature size "
+             << feature.size() << " differs from previous feature size "
+             << m_prev.size() << endl;
+        return feature;
+    }
+
+    const int n = feature.size();
+    const bool rectify = (m_params.order == OutputRectifiedDerivative);
+
+    // Energy is summed from the raw input whichever output mode is used.
+    vector<double> conditioned(n, 0.0);
+    double energy = 0;
+    for (int bin = 0; bin < n; ++bin) {
+        const double value = feature[bin];
+        energy += value;
+        if (!rectify) {
+            conditioned[bin] = value;
+        } else if (value > m_prev[bin]) {
+            conditioned[bin] = value - m_prev[bin];
+        }
+    }
+
+    // Update the long-term average; zero is treated as "not yet set".
+    if (m_ltAverage == 0) {
+        m_ltAverage = energy;
+    } else {
+        const double decay = m_params.decay;
+        m_ltAverage = m_ltAverage * decay + energy * (1.0 - decay);
+    }
+
+    // Quiet features are zeroed; otherwise apply the chosen normalisation.
+    if (energy <= m_params.silenceThreshold) {
+        for (int bin = 0; bin < n; ++bin) {
+            conditioned[bin] = 0;
+        }
+    } else if (m_params.norm == NormaliseToSum1) {
+        for (int bin = 0; bin < n; ++bin) {
+            conditioned[bin] /= energy;
+        }
+    } else if (m_params.norm == NormaliseToLTAverage) {
+        for (int bin = 0; bin < n; ++bin) {
+            conditioned[bin] /= m_ltAverage;
+        }
+    }
+
+    // Remember the raw input, not the conditioned output, for next time.
+    m_prev = feature;
+    return conditioned;
+}
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/FeatureConditioner.h	Thu Dec 04 13:05:16 2014 +0000
@@ -0,0 +1,105 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Vamp feature extraction plugin using the MATCH audio alignment
+    algorithm.
+
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#ifndef FEATURE_CONDITIONER_H
+#define FEATURE_CONDITIONER_H
+
+#include <vector>
+
+/**
+ * Take a series of feature vectors and apply conditioning of some
+ * sort, such as normalisation or first-order derivative.
+ *
+ * Note that FeatureConditioner maintains internal frame-to-frame
+ * state: use one FeatureConditioner per audio source, and construct a
+ * new one for each new source.
+ */
+class FeatureConditioner
+{
+public:
+    enum Normalisation {
+
+        /** Do not normalise */
+        NoNormalisation,
+
+        /** Normalise each feature vector to have a sum of 1 */
+        NormaliseToSum1,
+
+        /** Normalise each feature vector by the long-term average of
+         *  the summed energy */
+        NormaliseToLTAverage,
+    };
+
+    enum OutputOrder {
+
+        /** Output the normalised features without further processing */
+        OutputFeatures,
+
+        /** Output the half-wave rectified difference between the
+         * previous and current features instead of the straight
+         * feature values. */
+        OutputRectifiedDerivative,
+    };
+
+    struct Parameters {
+
+        Parameters() :
+            norm(NormaliseToSum1),
+            order(OutputRectifiedDerivative),
+            silenceThreshold(0.01),
+            decay(0.99)
+        {}
+
+        /** Feature normalisation. */
+        Normalisation norm;
+
+        /** Type of output to generate (plain feature, derivative etc). */
+        OutputOrder order;
+
+        /** Silence threshold. Any feature whose total energy (simply
+         *  the sum of feature values) is at or below this threshold
+         *  is replaced by all zeros. */
+        double silenceThreshold;
+
+        /** Frame-to-frame decay factor in calculating long-term average */
+        double decay;
+    };
+
+    /**
+     * Construct a FeatureConditioner with the given parameters.
+     *
+     * As it keeps frame-to-frame state, use one per audio source.
+     * The long-term average starts at 0, meaning "not yet seeded".
+     */
+    FeatureConditioner(Parameters parameters) :
+        m_params(parameters), m_ltAverage(0) { }
+
+    /**
+     * Process the given feature and return the conditioned feature.
+     */
+    std::vector<double> process(const std::vector<double> &feature);
+
+protected:
+    Parameters m_params;
+
+    /** Long term average feature energy. */
+    double m_ltAverage;
+
+    /** The most recent feature, used for calculating the feature to
+     *  feature difference. This is therefore not yet normalised. */
+    std::vector<double> m_prev;
+};
+
+#endif
--- a/src/FeatureExtractor.cpp	Thu Nov 27 16:50:14 2014 +0000
+++ b/src/FeatureExtractor.cpp	Thu Dec 04 13:05:16 2014 +0000
@@ -25,12 +25,9 @@
 using namespace std;
 
 FeatureExtractor::FeatureExtractor(Parameters parameters) :
-    m_params(parameters),
-    m_ltAverage(0)
+    m_params(parameters)
 {
     m_featureSize = getFeatureSizeFor(parameters);
-    m_prevFrame = vector<double>(m_featureSize, 0.0);
-
     makeFreqMap();
 }
 
@@ -107,15 +104,12 @@
 {
     vector<double> frame(m_featureSize, 0.0);
     
-    double rms = 0;
     for (int i = 0; i <= m_params.fftSize/2; i++) {
         double mag = real[i] * real[i] + imag[i] * imag[i];
-        rms += mag;
         frame[m_freqMap[i]] += mag;
     }
-    rms = sqrt(rms / (m_params.fftSize/2));
 
-    return postProcess(frame, rms);
+    return frame;
 }
 
 vector<double>
@@ -123,61 +117,11 @@
 {
     vector<double> frame(m_featureSize, 0.0);
     
-    double rms = 0;
     for (int i = 0; i <= m_params.fftSize/2; i++) {
         double mag = cframe[i*2] * cframe[i*2] + cframe[i*2+1] * cframe[i*2+1];
-        rms += mag;
         frame[m_freqMap[i]] += mag;
     }
-    rms = sqrt(rms / (m_params.fftSize/2));
 
-    return postProcess(frame, rms);
+    return frame;
 }
 
-vector<double>
-FeatureExtractor::postProcess(const vector<double> &frame, double rms)
-{
-    vector<double> feature(m_featureSize, 0.0);
-
-    double totalEnergy = 0;
-    if (m_params.useSpectralDifference) {
-        for (int i = 0; i < m_featureSize; i++) {
-            totalEnergy += frame[i];
-            if (frame[i] > m_prevFrame[i]) {
-                feature[i] = frame[i] - m_prevFrame[i];
-            } else {
-                feature[i] = 0;
-            }
-        }
-    } else {
-        for (int i = 0; i < m_featureSize; i++) {
-            feature[i] = frame[i];
-            totalEnergy += feature[i];
-        }
-    }
-
-    if (m_ltAverage == 0) {
-	m_ltAverage = totalEnergy;
-    } else {
-	double decay = m_params.decay;
-        m_ltAverage = m_ltAverage * decay + totalEnergy * (1.0 - decay);
-    }
-
-    if (rms <= m_params.silenceThreshold) {
-        for (int i = 0; i < m_featureSize; i++) {
-            feature[i] = 0;
-	}
-    } else if (m_params.frameNorm == NormaliseFrameToSum1) {
-        for (int i = 0; i < m_featureSize; i++) { 
-            feature[i] /= totalEnergy;
-	}
-    } else if (m_params.frameNorm == NormaliseFrameToLTAverage) {
-        for (int i = 0; i < m_featureSize; i++) {
-            feature[i] /= m_ltAverage;
-	}
-    }
-
-    m_prevFrame = frame;
-    return feature;
-}
-    
--- a/src/FeatureExtractor.h	Thu Nov 27 16:50:14 2014 +0000
+++ b/src/FeatureExtractor.h	Thu Dec 04 13:05:16 2014 +0000
@@ -25,50 +25,24 @@
  * the frequency data to map higher frequencies into a linear scale. A
  * chroma mapping is also available.
  *
- * Note that FeatureExtractor maintains internal frame-to-frame state:
- * use one FeatureExtractor per audio source, and construct a new one
- * for each new source.
+ * Note that FeatureExtractor may maintain internal frame-to-frame
+ * state: use one FeatureExtractor per audio source, and construct a
+ * new one for each new source.
  */
 class FeatureExtractor
 {
 public:
-    enum FrameNormalisation {
-
-        /** Do not normalise frames */
-        NoFrameNormalisation,
-        
-        /** Normalise each frame to have a sum of 1 */
-        NormaliseFrameToSum1,
-        
-        /** Normalise each frame by the long-term average of the
-         *  summed energy */
-        NormaliseFrameToLTAverage,
-    };
-
     struct Parameters {
 
         Parameters(float rate_, int fftSize_) :
             sampleRate(rate_),
-            frameNorm(NormaliseFrameToSum1),
-            useSpectralDifference(true),
             useChromaFrequencyMap(false),
-            fftSize(fftSize_),
-            silenceThreshold(0.01),
-            decay(0.99)
+            fftSize(fftSize_)
         {}
 
         /** Sample rate of audio */
         float sampleRate;
 
-        /** Type of audio frame normalisation */
-        FrameNormalisation frameNorm;
-
-        /** Flag indicating whether or not the half-wave rectified
-         *  spectral difference should be used in calculating the
-         *  distance metric for pairs of audio frames, instead of the
-         *  straight spectrum values. */
-        bool useSpectralDifference;
-
         /** Flag indicating whether to use a chroma frequency map (12
          *  bins) instead of the default warped spectrogram */
         bool useChromaFrequencyMap;
@@ -82,12 +56,6 @@
          *  in is already in the frequency domain, so this expresses
          *  the size of the frame that the caller will be providing. */
         int fftSize;
-
-        /** RMS level below which frame is considered silent */
-        double silenceThreshold;
-
-        /** Frame-to-frame decay factor in calculating long-term average */
-        double decay;
     };
 
     /**
@@ -117,12 +85,8 @@
      * have at least params.fftSize/2+1 elements each.
      *
      * Operates by mapping the frequency bins into a part-linear
-     * part-logarithmic array, then (optionally) computing the
-     * half-wave rectified spectral difference from the previous
-     * frame, then (optionally) normalising to a sum of 1.
-     *
-     * Return value is the frame (post-processed, with warping,
-     * rectification, and normalisation as appropriate).
+     * part-logarithmic array, unless useChromaFrequencyMap is true in
+     * which case they are mapped into chroma bins.
      */
     std::vector<double> process(const std::vector<double> &real,
                                 const std::vector<double> &imag);
@@ -133,12 +97,8 @@
      * must have at least 2 * (params.fftSize/2 + 1) elements.
      *
      * Operates by mapping the frequency bins into a part-linear
-     * part-logarithmic array, then (optionally) computing the
-     * half-wave rectified spectral difference from the previous
-     * frame, then (optionally) normalising to a sum of 1.
-     *
-     * Return value is the frame (post-processed, with warping,
-     * rectification, and normalisation as appropriate).
+     * part-logarithmic array, unless useChromaFrequencyMap is true in
+     * which case they are mapped into chroma bins.
      */
     std::vector<double> process(const float *carray);
     
@@ -157,14 +117,8 @@
     /** Creates a map of FFT frequency bins to semitone chroma bins. */
     void makeChromaFrequencyMap();
 
-    std::vector<double> postProcess(const std::vector<double> &, double rms);
-    
     /** Configuration parameters */
     Parameters m_params;
-    
-    /** Long term average frame energy (in frequency domain
-     *  representation). */
-    double m_ltAverage;
 
     /** A mapping function for mapping FFT bins to final frequency
      *  bins.  The mapping is linear (1-1) until the resolution
@@ -179,11 +133,6 @@
 
     /** The size of a returned feature. */
     int m_featureSize;
-
-    /** The most recent frame; used for calculating the frame to frame
-     *  spectral difference. This is therefore frequency warped but
-     *  not yet normalised. */
-    std::vector<double> m_prevFrame;
 };
 
 #endif
--- a/src/MatchVampPlugin.cpp	Thu Nov 27 16:50:14 2014 +0000
+++ b/src/MatchVampPlugin.cpp	Thu Dec 04 13:05:16 2014 +0000
@@ -63,7 +63,9 @@
     m_params(inputSampleRate, defaultStepTime, m_blockSize),
     m_defaultParams(inputSampleRate, defaultStepTime, m_blockSize),
     m_feParams(inputSampleRate, m_blockSize),
-    m_defaultFeParams(inputSampleRate, m_blockSize)
+    m_defaultFeParams(inputSampleRate, m_blockSize),
+    m_fcParams(),
+    m_defaultFcParams()
 {
     if (inputSampleRate < sampleRateMin) {
         std::cerr << "MatchVampPlugin::MatchVampPlugin: input sample rate "
@@ -167,7 +169,7 @@
     desc.description = "Type of normalisation to use for frequency-domain audio features";
     desc.minValue = 0;
     desc.maxValue = 2;
-    desc.defaultValue = (int)m_defaultFeParams.frameNorm;
+    desc.defaultValue = (int)m_defaultFcParams.norm;
     desc.isQuantized = true;
     desc.quantizeStep = 1;
     desc.valueNames.clear();
@@ -197,7 +199,7 @@
     desc.description = "Whether to use half-wave rectified spectral difference instead of straight spectrum";
     desc.minValue = 0;
     desc.maxValue = 1;
-    desc.defaultValue = m_defaultFeParams.useSpectralDifference ? 1 : 0;
+    desc.defaultValue = (int)m_defaultFcParams.order;
     desc.isQuantized = true;
     desc.quantizeStep = 1;
     list.push_back(desc);
@@ -263,11 +265,11 @@
     if (name == "serialise") {
         return m_serialise ? 1.0 : 0.0; 
     } else if (name == "framenorm") {
-        return (int)m_feParams.frameNorm;
+        return (int)m_fcParams.norm;
     } else if (name == "distnorm") {
         return (int)m_params.distanceNorm;
     } else if (name == "usespecdiff") {
-        return m_feParams.useSpectralDifference ? 1.0 : 0.0;
+        return (int)m_fcParams.order;
     } else if (name == "usechroma") {
         return m_feParams.useChromaFrequencyMap ? 1.0 : 0.0;
     } else if (name == "gradientlimit") {
@@ -289,11 +291,11 @@
     if (name == "serialise") {
         m_serialise = (value > 0.5);
     } else if (name == "framenorm") {
-        m_feParams.frameNorm = (FeatureExtractor::FrameNormalisation)(int(value + 0.1));
+        m_fcParams.norm = (FeatureConditioner::Normalisation)(int(value + 0.1));
     } else if (name == "distnorm") {
         m_params.distanceNorm = (DistanceMetric::DistanceNormalisation)(int(value + 0.1));
     } else if (name == "usespecdiff") {
-        m_feParams.useSpectralDifference = (value > 0.5);
+        m_fcParams.order = (FeatureConditioner::OutputOrder)(int(value + 0.1));
     } else if (name == "usechroma") {
         m_feParams.useChromaFrequencyMap = (value > 0.5);
     } else if (name == "gradientlimit") {
@@ -327,6 +329,8 @@
     m_feParams.fftSize = m_blockSize;
     m_fe1 = new FeatureExtractor(m_feParams);
     m_fe2 = new FeatureExtractor(m_feParams);
+    m_fc1 = new FeatureConditioner(m_fcParams);
+    m_fc2 = new FeatureConditioner(m_fcParams);
     m_pm1 = new Matcher(m_params, 0, m_fe1->getFeatureSize());
     m_pm2 = new Matcher(m_params, m_pm1, m_fe2->getFeatureSize());
     m_pm1->setOtherMatcher(m_pm2);
@@ -364,12 +368,16 @@
     delete m_feeder;
     delete m_fe1;
     delete m_fe2;
+    delete m_fc1;
+    delete m_fc2;
     delete m_pm1;
     delete m_pm2;
 
     m_feeder = 0;
     m_fe1 = 0;
     m_fe2 = 0;
+    m_fc1 = 0;
+    m_fc2 = 0;
     m_pm1 = 0;
     m_pm2 = 0;
 
@@ -521,8 +529,8 @@
     if (aboveThreshold(inputBuffers[0])) m_lastFrameIn1 = m_frameNo;
     if (aboveThreshold(inputBuffers[1])) m_lastFrameIn2 = m_frameNo;
 
-    vector<double> f1 = m_fe1->process(inputBuffers[0]);
-    vector<double> f2 = m_fe2->process(inputBuffers[1]);
+    vector<double> f1 = m_fc1->process(m_fe1->process(inputBuffers[0]));
+    vector<double> f2 = m_fc2->process(m_fe2->process(inputBuffers[1]));
     
     m_feeder->feed(f1, f2);
 
--- a/src/MatchVampPlugin.h	Thu Nov 27 16:50:14 2014 +0000
+++ b/src/MatchVampPlugin.h	Thu Dec 04 13:05:16 2014 +0000
@@ -27,6 +27,7 @@
 
 #include "Matcher.h"
 #include "FeatureExtractor.h"
+#include "FeatureConditioner.h"
 
 class MatchFeatureFeeder;
 
@@ -73,6 +74,8 @@
     Matcher *m_pm2;
     FeatureExtractor *m_fe1;
     FeatureExtractor *m_fe2;
+    FeatureConditioner *m_fc1;
+    FeatureConditioner *m_fc2;
     MatchFeatureFeeder *m_feeder;
 
     Vamp::RealTime m_startTime;
@@ -94,6 +97,9 @@
     FeatureExtractor::Parameters m_feParams;
     FeatureExtractor::Parameters m_defaultFeParams;
 
+    FeatureConditioner::Parameters m_fcParams;
+    FeatureConditioner::Parameters m_defaultFcParams;
+
     mutable int m_pathOutNo;
     mutable int m_abOutNo;
     mutable int m_baOutNo;