/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Vamp feature extraction plugin using the MATCH audio alignment
    algorithm.

    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#ifndef FEATURE_EXTRACTOR_H
#define FEATURE_EXTRACTOR_H

#include <vector>

/**
 * Convert frequency-domain audio frames into features suitable for
 * MATCH alignment calculation. The default feature is a warped
 * spectrum: FFT bins are kept one-to-one where they are closely
 * spaced in pitch, and are summed into semitone-wide bins where they
 * are not. A chroma mapping (12 bins) is also available.
 *
 * Note that FeatureExtractor maintains internal frame-to-frame state
 * (the previous frame and a long-term average energy): use one
 * FeatureExtractor per audio source, and construct a new one for
 * each new source.
 */
class FeatureExtractor
{
public:
    /** How each output frame is scaled before it is returned. */
    enum FrameNormalisation {

        /** Do not normalise frames */
        NoFrameNormalisation,

        /** Normalise each frame to have a sum of 1 */
        NormaliseFrameToSum1,

        /** Normalise each frame by the long-term average of the
         *  summed energy */
        NormaliseFrameToLTAverage
    };

    /** Configuration for a FeatureExtractor. */
    struct Parameters {

        /**
         * Construct a parameter set for the given sample rate and FFT
         * size. Every other field receives a default:
         *
         *   frameNorm             NormaliseFrameToSum1
         *   useSpectralDifference true
         *   useChromaFrequencyMap false
         *   hopTime               0 (i.e. not yet specified)
         *   silenceThreshold      0.01
         *   decay                 0.99
         *
         * All fields are public and may be adjusted after
         * construction.
         */
        Parameters(float rate_, int fftSize_) :
            sampleRate(rate_),
            frameNorm(NormaliseFrameToSum1),
            useSpectralDifference(true),
            useChromaFrequencyMap(false),
            // Previously omitted from this list, which left the member
            // holding an indeterminate value in every Parameters
            // object. Listed here in declaration order.
            hopTime(0.0),
            fftSize(fftSize_),
            silenceThreshold(0.01),
            decay(0.99)
        {}

        /** Sample rate of audio */
        float sampleRate;

        /** Type of audio frame normalisation */
        FrameNormalisation frameNorm;

        /** Flag indicating whether or not the half-wave rectified
         *  spectral difference should be used in calculating the
         *  distance metric for pairs of audio frames, instead of the
         *  straight spectrum values. */
        bool useSpectralDifference;

        /** Flag indicating whether to use a chroma frequency map (12
         *  bins) instead of the default warped spectrogram */
        bool useChromaFrequencyMap;

        /** Spacing of audio frames (determines the amount of overlap
         *  or skip between frames). This value is expressed in
         *  seconds. The constructor is not told the hop and sets this
         *  to 0; assign the real value after construction if it is
         *  needed. */
        double hopTime;

        /** Size of an FFT frame in samples. Note that the data passed
         *  in is already in the frequency domain, so this expresses
         *  the size of the frame that the caller will be providing. */
        int fftSize;

        /** RMS level below which frame is considered silent */
        double silenceThreshold;

        /** Frame-to-frame decay factor in calculating long-term average */
        double decay;
    };

    /**
     * Construct a FeatureExtractor with the given parameters.
     *
     * Note that FeatureExtractor maintains internal frame-to-frame
     * state: use one FeatureExtractor per audio source, and construct
     * a new one for each new source.
     */
    FeatureExtractor(Parameters params);

    /**
     * Return the feature vector size that will be returned from process().
     */
    int getFeatureSize() const { return m_featureSize; }

    /**
     * Process one frequency-domain audio frame (provided as real &
     * imaginary components from the FFT output). Return a feature
     * vector of size given by getFeatureSize().
     *
     * Operates by mapping the frequency bins through the frequency
     * map (part-linear part-logarithmic by default, or chroma if
     * Parameters::useChromaFrequencyMap is set), then (optionally)
     * computing the half-wave rectified spectral difference from the
     * previous frame, then (optionally) normalising the frame as
     * selected by Parameters::frameNorm.
     *
     * Return value is the frame (post-processed, with warping,
     * rectification, and normalisation as appropriate).
     */
    std::vector<double> process(const std::vector<double> &real,
                                const std::vector<double> &imag);

protected:
    /** Make either standard or chroma map, depending on m_params */
    void makeFreqMap();

    /** Creates a map of FFT frequency bins to comparison bins.  Where
     *  the spacing of FFT bins is less than 0.5 semitones, the
     *  mapping is one to one. Where the spacing is greater than 0.5
     *  semitones, the FFT energy is mapped into semitone-wide
     *  bins. No scaling is performed; that is the energy is summed
     *  into the comparison bins. */
    void makeStandardFrequencyMap();

    /** Creates a map of FFT frequency bins to semitone chroma bins. */
    void makeChromaFrequencyMap();

    /** Configuration parameters */
    Parameters m_params;

    /** Long term average frame energy (in frequency domain
     *  representation). */
    double m_ltAverage;

    /** A mapping function for mapping FFT bins to final frequency
     *  bins.  The mapping is linear (1-1) until the resolution
     *  reaches 2 points per semitone, then logarithmic with a
     *  semitone resolution.  e.g. for 44.1kHz sampling rate and
     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
     *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
     *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
     *  where all energy above note 127 is mapped into the final
     *  bin. */
    std::vector<int> m_freqMap;

    /** The size of a returned feature. */
    int m_featureSize;

    /** The most recent frame; used for calculating the frame to frame
     *  spectral difference. This is therefore frequency warped but
     *  not yet normalised. */
    std::vector<double> m_prevFrame;
};

#endif