changeset 37:91410483228b refactors

refactor: Pull out feature extraction code to FeatureExtractor.cpp
author Chris Cannam
date Thu, 13 Nov 2014 12:03:52 +0000
parents 16870e8770ae
children 8cce4e13ede3
files src/FeatureExtractor.cpp src/FeatureExtractor.h
diffstat 2 files changed, 322 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/FeatureExtractor.cpp	Thu Nov 13 12:03:52 2014 +0000
@@ -0,0 +1,156 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Vamp feature extraction plugin using the MATCH audio alignment
+    algorithm.
+
+    Centre for Digital Music, Queen Mary, University of London.
+    This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#include "FeatureExtractor.h"
+
+#include <iostream>
+
+#include <cstdlib>
+#include <cassert>
+#include <cmath>
+
+using namespace std;
+
+/**
+ * Build an extractor from the given parameters: pick the feature
+ * size for the selected frequency map, start with an all-zero
+ * "previous frame" and a zero long-term average, then build the
+ * FFT-bin to feature-bin lookup table.
+ */
+FeatureExtractor::FeatureExtractor(Parameters parameters) :
+    m_params(parameters),
+    m_ltAverage(0),
+    // 13 output bins for the chroma map, 84 for the standard map
+    m_featureSize(parameters.useChromaFrequencyMap ? 13 : 84),
+    // Relies on m_featureSize being declared before m_prevFrame, so
+    // that it is already initialised at this point
+    m_prevFrame(m_featureSize, 0.0)
+{
+    makeFreqMap();
+}
+
+/**
+ * (Re)allocate the frequency map with one zeroed entry per FFT bin
+ * (fftSize/2 + 1 entries) and fill it using either the chroma or the
+ * standard mapping, as selected by m_params.useChromaFrequencyMap.
+ */
+void
+FeatureExtractor::makeFreqMap()
+{
+    const int binCount = m_params.fftSize / 2 + 1;
+    m_freqMap.assign(binCount, 0);
+
+    if (!m_params.useChromaFrequencyMap) {
+#ifdef DEBUG_MATCHER
+        cerr << "makeFreqMap: calling makeStandardFrequencyMap" << endl;
+#endif
+        makeStandardFrequencyMap();
+        return;
+    }
+
+#ifdef DEBUG_MATCHER
+    cerr << "makeFreqMap: calling makeChromaFrequencyMap" << endl;
+#endif
+    makeChromaFrequencyMap();
+}
+
+/**
+ * Fill m_freqMap with the standard (warped spectrum) mapping.
+ *
+ * FFT bins up to the crossover bin map one-to-one onto feature bins.
+ * Above that, each FFT bin is mapped to the feature bin of its
+ * nearest MIDI pitch (capped at note 127), offset so that the
+ * logarithmic region continues directly on from the linear one.
+ *
+ * The resulting index is always kept within [0, m_featureSize - 1].
+ * The feature size is fixed by the constructor, while the highest
+ * index produced by the pitch formula depends on the sample rate and
+ * FFT size, so without the clamp some configurations would make
+ * process() write past the end of its frame buffer. Configurations
+ * whose highest index falls short of the last feature bin simply
+ * leave the top feature bins unused.
+ */
+void
+FeatureExtractor::makeStandardFrequencyMap()
+{
+    // Width of one FFT bin in Hz
+    double binWidth = m_params.sampleRate / m_params.fftSize;
+
+    const int lastFftBin = m_params.fftSize / 2;
+    const int lastFeatureBin = m_featureSize - 1;
+
+    // Last FFT bin that is mapped linearly, and the MIDI pitch
+    // nearest to that bin's frequency
+    int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
+    int crossoverMidi = lrint(log(crossoverBin*binWidth/440.0)/
+                              log(2.0) * 12 + 69);
+
+    // freq = 440 * Math.pow(2, (midi-69)/12.0) / binWidth;
+
+    // Linear region. Also bounded by the map size, so that a very
+    // small FFT cannot run off the end of m_freqMap.
+    int i = 0;
+    while (i <= crossoverBin && i <= lastFftBin) {
+        m_freqMap[i] = i;
+        ++i;
+    }
+
+    // Logarithmic region: one feature bin per semitone
+    while (i <= lastFftBin) {
+        double midi = log(i*binWidth/440.0) / log(2.0) * 12 + 69;
+        if (midi > 127) midi = 127;
+        int target = crossoverBin + lrint(midi) - crossoverMidi;
+        if (target > lastFeatureBin) target = lastFeatureBin;
+        m_freqMap[i++] = target;
+    }
+}
+
+/**
+ * Fill m_freqMap with the chroma mapping.
+ *
+ * FFT bins up to the crossover bin all map to feature bin 0. Every
+ * higher FFT bin maps to feature bin 1..12 according to the pitch
+ * class (nearest MIDI pitch modulo 12) of its frequency.
+ */
+void
+FeatureExtractor::makeChromaFrequencyMap()
+{
+    // Width of one FFT bin in Hz
+    double binWidth = m_params.sampleRate / m_params.fftSize;
+
+    const int lastFftBin = m_params.fftSize / 2;
+    int crossoverBin = (int)(1 / (pow(2, 1/12.0) - 1));
+
+    // Low-frequency bins all go to bin 0. Also bounded by the map
+    // size, so that a very small FFT cannot run off the end of
+    // m_freqMap.
+    int i = 0;
+    while (i <= crossoverBin && i <= lastFftBin) {
+        m_freqMap[i++] = 0;
+    }
+
+    while (i <= lastFftBin) {
+        double midi = log(i*binWidth/440.0) / log(2.0) * 12 + 69;
+        // The remainder of a negative value is negative in C++; wrap
+        // it back into 0..11 so that very low frequencies (negative
+        // MIDI pitch, reachable with very fine bin spacing) still get
+        // a valid pitch class rather than an out-of-range index.
+        int pitchClass = lrint(midi) % 12;
+        if (pitchClass < 0) pitchClass += 12;
+        m_freqMap[i++] = pitchClass + 1;
+    }
+}
+
+/**
+ * Turn one frequency-domain frame (real and imaginary parts, indexed
+ * 0..fftSize/2) into a feature vector of m_featureSize elements.
+ *
+ * The power of each FFT bin is summed into its mapped feature bin.
+ * The feature is then either that warped frame or its half-wave
+ * rectified difference from the previous warped frame, set to zero
+ * if the frame is silent and otherwise normalised according to
+ * m_params.frameNorm. Updates m_ltAverage and m_prevFrame.
+ */
+vector<double>
+FeatureExtractor::process(const vector<double> &real, const vector<double> &imag)
+{
+    const int topBin = m_params.fftSize / 2;
+
+    // Warp the power spectrum into feature bins, summing the power
+    // of the whole frame along the way
+    vector<double> warped(m_featureSize, 0.0);
+    double powerSum = 0;
+    for (int bin = 0; bin <= topBin; ++bin) {
+        const double power = real[bin] * real[bin] + imag[bin] * imag[bin];
+        powerSum += power;
+        warped[m_freqMap[bin]] += power;
+    }
+    const double rms = sqrt(powerSum / topBin);
+
+    // Energy of the warped frame, plus the raw (un-normalised)
+    // feature: either the frame itself or its positive-only change
+    // since the previous frame
+    vector<double> feature(m_featureSize, 0.0);
+    double totalEnergy = 0;
+    for (int k = 0; k < m_featureSize; ++k) {
+        const double now = warped[k];
+        totalEnergy += now;
+        if (!m_params.useSpectralDifference) {
+            feature[k] = now;
+        } else if (now > m_prevFrame[k]) {
+            feature[k] = now - m_prevFrame[k];
+        } else {
+            feature[k] = 0;
+        }
+    }
+
+    // Long-term average energy: seeded while it is still zero,
+    // exponentially decayed afterwards
+    if (m_ltAverage != 0) {
+        const double decay = m_params.decay;
+        m_ltAverage = m_ltAverage * decay + totalEnergy * (1.0 - decay);
+    } else {
+        m_ltAverage = totalEnergy;
+    }
+
+    if (rms <= m_params.silenceThreshold) {
+        // Silent frame: emit an all-zero feature
+        feature.assign(m_featureSize, 0.0);
+    } else {
+        bool normalise = false;
+        double divisor = 0;
+        switch (m_params.frameNorm) {
+        case NormaliseFrameToSum1:
+            normalise = true;
+            divisor = totalEnergy;
+            break;
+        case NormaliseFrameToLTAverage:
+            normalise = true;
+            divisor = m_ltAverage;
+            break;
+        default:
+            break;
+        }
+        if (normalise) {
+            for (int k = 0; k < m_featureSize; ++k) {
+                feature[k] /= divisor;
+            }
+        }
+    }
+
+    // Keep the warped, un-normalised frame for the next difference
+    m_prevFrame = warped;
+    return feature;
+}
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/FeatureExtractor.h	Thu Nov 13 12:03:52 2014 +0000
@@ -0,0 +1,166 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Vamp feature extraction plugin using the MATCH audio alignment
+    algorithm.
+
+    Centre for Digital Music, Queen Mary, University of London.
+    This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#ifndef FEATURE_EXTRACTOR_H
+#define FEATURE_EXTRACTOR_H
+
+#include <vector>
+
+/**
+ * Convert frequency-domain audio frames into features suitable for
+ * MATCH alignment calculation. The default feature is a warping of
+ * the frequency data to map higher frequencies into a linear scale. A
+ * chroma mapping is also available.
+ *
+ * Note that FeatureExtractor maintains internal frame-to-frame state:
+ * use one FeatureExtractor per audio source, and construct a new one
+ * for each new source.
+ */
+class FeatureExtractor
+{
+public:
+    enum FrameNormalisation {
+
+        /** Do not normalise frames */
+        NoFrameNormalisation,
+
+        /** Normalise each frame to have a sum of 1 */
+        NormaliseFrameToSum1,
+
+        /** Normalise each frame by the long-term average of the
+         *  summed energy */
+        NormaliseFrameToLTAverage,
+    };
+
+    struct Parameters {
+
+        /**
+         * Construct parameters for the given sample rate and FFT
+         * size, with every other field set to its default value.
+         */
+        Parameters(float rate_, int fftSize_) :
+            sampleRate(rate_),
+            frameNorm(NormaliseFrameToSum1),
+            useSpectralDifference(true),
+            useChromaFrequencyMap(false),
+            // Previously left uninitialised, so copying a Parameters
+            // read an indeterminate value. NOTE(review): 0.020 s is
+            // assumed to be the intended default hop -- confirm
+            // against the code that constructs these parameters.
+            hopTime(0.020),
+            fftSize(fftSize_),
+            silenceThreshold(0.01),
+            decay(0.99)
+        {}
+
+        /** Sample rate of audio */
+        float sampleRate;
+
+        /** Type of audio frame normalisation */
+        FrameNormalisation frameNorm;
+
+        /** Flag indicating whether or not the half-wave rectified
+         *  spectral difference should be used in calculating the
+         *  distance metric for pairs of audio frames, instead of the
+         *  straight spectrum values. */
+        bool useSpectralDifference;
+
+        /** Flag indicating whether to use a chroma frequency map
+         *  instead of the default warped spectrogram. The chroma
+         *  feature has 13 elements: 12 semitone bins, preceded by
+         *  one bin that collects the lowest frequencies. */
+        bool useChromaFrequencyMap;
+
+        /** Spacing of audio frames (determines the amount of overlap or
+         *  skip between frames). This value is expressed in
+         *  seconds. */
+        double hopTime;
+
+        /** Size of an FFT frame in samples. Note that the data passed
+         *  in is already in the frequency domain, so this expresses
+         *  the size of the frame that the caller will be providing. */
+        int fftSize;
+
+        /** RMS level below which frame is considered silent */
+        double silenceThreshold;
+
+        /** Frame-to-frame decay factor in calculating long-term average */
+        double decay;
+    };
+
+    /**
+     * Construct a FeatureExtractor with the given parameters.
+     *
+     * Note that FeatureExtractor maintains internal frame-to-frame
+     * state: use one FeatureExtractor per audio source, and construct
+     * a new one for each new source.
+     */
+    FeatureExtractor(Parameters params);
+
+    /**
+     * Return the feature vector size that will be returned from process().
+     */
+    int getFeatureSize() const { return m_featureSize; }
+
+    /**
+     * Process one frequency-domain audio frame (provided as real &
+     * imaginary components from the FFT output). Return a feature
+     * vector of size given by getFeatureSize().
+     *
+     * Both vectors must hold at least fftSize/2 + 1 values; this is
+     * not checked.
+     *
+     * Operates by mapping the frequency bins into a part-linear
+     * part-logarithmic array (or into chroma bins, if so
+     * configured), then (optionally) computing the half-wave
+     * rectified spectral difference from the previous frame, then
+     * normalising as selected by Parameters::frameNorm. Frames whose
+     * RMS level does not exceed Parameters::silenceThreshold yield
+     * an all-zero feature.
+     *
+     * Return value is the frame (post-processed, with warping,
+     * rectification, and normalisation as appropriate).
+     */
+    std::vector<double> process(const std::vector<double> &real,
+                                const std::vector<double> &imag);
+
+protected:
+    /** Make either standard or chroma map, depending on m_params */
+    void makeFreqMap();
+
+    /** Creates a map of FFT frequency bins to comparison bins.  Where
+     *  the spacing of FFT bins is less than 0.5 semitones, the
+     *  mapping is one to one. Where the spacing is greater than 0.5
+     *  semitones, the FFT energy is mapped into semitone-wide
+     *  bins. No scaling is performed; that is the energy is summed
+     *  into the comparison bins. */
+    void makeStandardFrequencyMap();
+
+    /** Creates a map of FFT frequency bins to semitone chroma bins. */
+    void makeChromaFrequencyMap();
+
+    /** Configuration parameters */
+    Parameters m_params;
+
+    /** Long term average frame energy (in frequency domain
+     *  representation). */
+    double m_ltAverage;
+
+    /** A mapping function for mapping FFT bins to final frequency
+     *  bins.  The mapping is linear (1-1) until the resolution
+     *  reaches 2 points per semitone, then logarithmic with a
+     *  semitone resolution.  e.g. for 44.1kHz sampling rate and
+     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
+     *  linearly for bins 0-33 (0 to 711Hz), and logarithmically for
+     *  the remaining bins (midi notes 78 to 127, bins 34 to 83),
+     *  where all energy above note 127 is mapped into the final
+     *  bin. */
+    std::vector<int> m_freqMap;
+
+    /** The size of a returned feature. Must stay declared before
+     *  m_prevFrame if m_prevFrame is ever sized from it in a
+     *  constructor initialiser list. */
+    int m_featureSize;
+
+    /** The most recent frame; used for calculating the frame to frame
+     *  spectral difference. This is therefore frequency warped but
+     *  not yet normalised. */
+    std::vector<double> m_prevFrame;
+};
+
+#endif
+