Mercurial > hg > match-vamp
diff src/FeatureExtractor.h @ 37:91410483228b refactors
refactor: Pull out feature extraction code to FeatureExtractor.cpp
author | Chris Cannam |
---|---|
date | Thu, 13 Nov 2014 12:03:52 +0000 |
parents | |
children | 8cce4e13ede3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/FeatureExtractor.h Thu Nov 13 12:03:52 2014 +0000 @@ -0,0 +1,166 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Vamp feature extraction plugin using the MATCH audio alignment + algorithm. + + Centre for Digital Music, Queen Mary, University of London. + This file copyright 2007 Simon Dixon, Chris Cannam and QMUL. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +#ifndef FEATURE_EXTRACTOR_H +#define FEATURE_EXTRACTOR_H + +#include <vector> + +/** + * Convert frequency-domain audio frames into features suitable for + * MATCH alignment calculation. The default feature is a warping of + * the frequency data to map higher frequencies into a linear scale. A + * chroma mapping is also available. + * + * Note that FeatureExtractor maintains internal frame-to-frame state: + * use one FeatureExtractor per audio source, and construct a new one + * for each new source. + */ +class FeatureExtractor +{ +public: + enum FrameNormalisation { + + /** Do not normalise frames */ + NoFrameNormalisation, + + /** Normalise each frame to have a sum of 1 */ + NormaliseFrameToSum1, + + /** Normalise each frame by the long-term average of the + * summed energy */ + NormaliseFrameToLTAverage, + }; + + struct Parameters { + + Parameters(float rate_, int fftSize_) : + sampleRate(rate_), + frameNorm(NormaliseFrameToSum1), + useSpectralDifference(true), + useChromaFrequencyMap(false), + fftSize(fftSize_), + silenceThreshold(0.01), + decay(0.99) + {} + + /** Sample rate of audio */ + float sampleRate; + + /** Type of audio frame normalisation */ + FrameNormalisation frameNorm; + + /** Flag indicating whether or not the half-wave rectified + * spectral difference should be used in calculating the + * distance metric for pairs of audio frames, instead of the + * straight spectrum values. */ + bool useSpectralDifference; + + /** Flag indicating whether to use a chroma frequency map (12 + * bins) instead of the default warped spectrogram */ + bool useChromaFrequencyMap; + + /** Spacing of audio frames (determines the amount of overlap or + * skip between frames). This value is expressed in + * seconds. */ + double hopTime; + + /** Size of an FFT frame in samples. Note that the data passed + * in is already in the frequency domain, so this expresses + * the size of the frame that the caller will be providing. */ + int fftSize; + + /** RMS level below which frame is considered silent */ + double silenceThreshold; + + /** Frame-to-frame decay factor in calculating long-term average */ + double decay; + }; + + /** + * Construct a FeatureExtractor with the given parameters. + * + * Note that FeatureExtractor maintains internal frame-to-frame + * state: use one FeatureExtractor per audio source, and construct + * a new one for each new source. + */ + FeatureExtractor(Parameters params); + + /** + * Return the feature vector size that will be returned from process(). + */ + int getFeatureSize() const { return m_featureSize; } + + /** + * Process one frequency-domain audio frame (provided as real & + * imaginary components from the FFT output). Return a feature + * vector of size given by getFeatureSize(). + * + * Operates by mapping the frequency bins into a part-linear + * part-logarithmic array, then (optionally) computing the + * half-wave rectified spectral difference from the previous + * frame, then (optionally) normalising to a sum of 1. + * + * Return value is the frame (post-processed, with warping, + * rectification, and normalisation as appropriate). + */ + std::vector<double> process(const std::vector<double> &real, + const std::vector<double> &imag); + +protected: + /** Make either standard or chroma map, depending on m_params */ + void makeFreqMap(); + + /** Creates a map of FFT frequency bins to comparison bins. Where + * the spacing of FFT bins is less than 0.5 semitones, the + * mapping is one to one. Where the spacing is greater than 0.5 + * semitones, the FFT energy is mapped into semitone-wide + * bins. No scaling is performed; that is the energy is summed + * into the comparison bins. */ + void makeStandardFrequencyMap(); + + /** Creates a map of FFT frequency bins to semitone chroma bins. */ + void makeChromaFrequencyMap(); + + /** Configuration parameters */ + Parameters m_params; + + /** Long term average frame energy (in frequency domain + * representation). */ + double m_ltAverage; + + /** A mapping function for mapping FFT bins to final frequency + * bins. The mapping is linear (1-1) until the resolution + * reaches 2 points per semitone, then logarithmic with a + * semitone resolution. e.g. for 44.1kHz sampling rate and + * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped + * linearly for bins 0-34 (0 to 732Hz), and logarithmically for + * the remaining bins (midi notes 79 to 127, bins 35 to 83), + * where all energy above note 127 is mapped into the final + * bin. */ + std::vector<int> m_freqMap; + + /** The size of a returned feature. */ + int m_featureSize; + + /** The most recent frame; used for calculating the frame to frame + * spectral difference. This is therefore frequency warped but + * not yet normalised. */ + std::vector<double> m_prevFrame; +}; + +#endif +