/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Vamp feature extraction plugin using the MATCH audio alignment
    algorithm.

    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#ifndef FEATURE_EXTRACTOR_H
#define FEATURE_EXTRACTOR_H

#include <vector>

/**
 * Convert frequency-domain audio frames into features suitable for
 * MATCH alignment calculation. The default feature is a warping of
 * the frequency data to map higher frequencies into a linear scale. A
 * chroma mapping is also available.
 *
 * Note that FeatureExtractor maintains internal frame-to-frame state:
 * use one FeatureExtractor per audio source, and construct a new one
 * for each new source.
 */
class FeatureExtractor
{
public:
    /** How (if at all) each output frame is normalised. */
    enum FrameNormalisation {

        /** Do not normalise frames */
        NoFrameNormalisation,

        /** Normalise each frame to have a sum of 1 */
        NormaliseFrameToSum1,

        /** Normalise each frame by the long-term average of the
         *  summed energy */
        NormaliseFrameToLTAverage
    };

    /** Configuration for a FeatureExtractor. */
    struct Parameters {

        /**
         * Construct a parameter set for the given sample rate and FFT
         * frame size, with every other field set to its default.
         *
         * @param rate_    Sample rate of the audio.
         * @param fftSize_ Size of an FFT frame in samples.
         */
        Parameters(float rate_, int fftSize_) :
            sampleRate(rate_),
            frameNorm(NormaliseFrameToSum1),
            useSpectralDifference(true),
            useChromaFrequencyMap(false),
            // Previously left uninitialised, so reading it from a
            // freshly constructed Parameters was an indeterminate read.
            // NOTE(review): 0.020s (20ms) is an assumed default hop;
            // confirm against callers, which may overwrite it.
            hopTime(0.020),
            fftSize(fftSize_),
            silenceThreshold(0.01),
            decay(0.99)
        {}

        /** Sample rate of audio */
        float sampleRate;

        /** Type of audio frame normalisation */
        FrameNormalisation frameNorm;

        /** Flag indicating whether or not the half-wave rectified
         *  spectral difference should be used in calculating the
         *  distance metric for pairs of audio frames, instead of the
         *  straight spectrum values. */
        bool useSpectralDifference;

        /** Flag indicating whether to use a chroma frequency map (12
         *  bins) instead of the default warped spectrogram */
        bool useChromaFrequencyMap;

        /** Spacing of audio frames (determines the amount of overlap or
         *  skip between frames). This value is expressed in
         *  seconds. */
        double hopTime;

        /** Size of an FFT frame in samples. Note that the data passed
         *  in is already in the frequency domain, so this expresses
         *  the size of the frame that the caller will be providing. */
        int fftSize;

        /** RMS level below which frame is considered silent */
        double silenceThreshold;

        /** Frame-to-frame decay factor in calculating long-term average */
        double decay;
    };

    /**
     * Construct a FeatureExtractor with the given parameters.
     *
     * Note that FeatureExtractor maintains internal frame-to-frame
     * state: use one FeatureExtractor per audio source, and construct
     * a new one for each new source.
     */
    FeatureExtractor(Parameters params);

    /**
     * Return the feature vector size that will be returned from process().
     */
    int getFeatureSize() const { return m_featureSize; }

    /**
     * Process one frequency-domain audio frame (provided as real &
     * imaginary components from the FFT output). Return a feature
     * vector of size given by getFeatureSize().
     *
     * Operates by mapping the frequency bins into a part-linear
     * part-logarithmic array, then (optionally) computing the
     * half-wave rectified spectral difference from the previous
     * frame, then (optionally) normalising to a sum of 1.
     *
     * Return value is the frame (post-processed, with warping,
     * rectification, and normalisation as appropriate).
     */
    std::vector<double> process(const std::vector<double> &real,
                                const std::vector<double> &imag);

protected:
    /** Make either standard or chroma map, depending on m_params */
    void makeFreqMap();

    /** Creates a map of FFT frequency bins to comparison bins.  Where
     *  the spacing of FFT bins is less than 0.5 semitones, the
     *  mapping is one to one. Where the spacing is greater than 0.5
     *  semitones, the FFT energy is mapped into semitone-wide
     *  bins. No scaling is performed; that is the energy is summed
     *  into the comparison bins. */
    void makeStandardFrequencyMap();

    /** Creates a map of FFT frequency bins to semitone chroma bins. */
    void makeChromaFrequencyMap();

    /** Configuration parameters */
    Parameters m_params;

    /** Long term average frame energy (in frequency domain
     *  representation). */
    double m_ltAverage;

    /** A mapping function for mapping FFT bins to final frequency
     *  bins.  The mapping is linear (1-1) until the resolution
     *  reaches 2 points per semitone, then logarithmic with a
     *  semitone resolution.  e.g. for 44.1kHz sampling rate and
     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
     *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
     *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
     *  where all energy above note 127 is mapped into the final
     *  bin. */
    std::vector<int> m_freqMap;

    /** The size of a returned feature. */
    int m_featureSize;

    /** The most recent frame; used for calculating the frame to frame
     *  spectral difference. This is therefore frequency warped but
     *  not yet normalised. */
    std::vector<double> m_prevFrame;
};

#endif