Chris@37: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
Chris@37: 
Chris@37: /*
Chris@37:     Vamp feature extraction plugin using the MATCH audio alignment
Chris@37:     algorithm.
Chris@37: 
Chris@37:     Centre for Digital Music, Queen Mary, University of London.
Chris@236:     Copyright (c) 2007-2020 Simon Dixon, Chris Cannam, and Queen Mary
Chris@230:     University of London, Copyright (c) 2014-2015 Tido GmbH.
Chris@37:     
Chris@37:     This program is free software; you can redistribute it and/or
Chris@37:     modify it under the terms of the GNU General Public License as
Chris@37:     published by the Free Software Foundation; either version 2 of the
Chris@37:     License, or (at your option) any later version.  See the file
Chris@37:     COPYING included with this distribution for more information.
Chris@37: */
Chris@37: 
Chris@37: #ifndef FEATURE_EXTRACTOR_H
Chris@37: #define FEATURE_EXTRACTOR_H
Chris@37: 
Chris@187: #include "MatchTypes.h"
Chris@37: 
Chris@37: /**
Chris@37:  * Convert frequency-domain audio frames into features suitable for
Chris@125:  * MATCH alignment calculation.
Chris@37:  *
Chris@125:  * The default feature is a warping of the frequency data to map FFT
Chris@125:  * frequency bins into feature bins. The mapping is linear (1-1) until
Chris@125:  * the resolution reaches 2 points per semitone, then logarithmic with
Chris@125:  * a semitone resolution.  e.g. for 44.1kHz sampling rate and fftSize
Chris@125:  * of 2048 (46ms), bin spacing is 21.5Hz, which is mapped linearly for
Chris@125:  * bins 0-34 (0 to 732Hz), and logarithmically for the remaining bins
Chris@125:  * (midi notes 79 to 127, bins 35 to 83), where all energy above note
Chris@125:  * 127 is mapped into the final bin.
Chris@125:  *
Chris@125:  * Alternatively a chroma mapping is also available. This produces a
Chris@125:  * 13-bin feature by mapping all FFT bins into bin 0 until the
Chris@125:  * resolution reaches 1 point per semitone, then mapping each
Chris@125:  * subsequent bin into its corresponding semitone in the remaining 12
Chris@125:  * bins (where bin 1 is C).  e.g. for 44.1kHz sampling rate and
Chris@125:  * fftSize of 2048 (46ms), frequencies up to 361 Hz go to bin 0,
Chris@125:  * subsequent frequencies to the chroma bins.
Chris@37:  */
Chris@37: class FeatureExtractor
Chris@37: {
Chris@37: public:
Chris@37:     struct Parameters {
Chris@37: 
Chris@216:         Parameters(float rate_) :
Chris@37:             sampleRate(rate_),
Chris@37:             useChromaFrequencyMap(false),
Chris@216:             fftSize(2048),
Chris@176:             referenceFrequency(440.0),
Chris@219:             minFrequency(150.),
Chris@176:             maxFrequency(rate_/2.)
Chris@37:         {}
Chris@37: 
Chris@37:         /** Sample rate of audio, as passed to the constructor */
Chris@37:         float sampleRate;
Chris@37: 
Chris@37:         /** Flag indicating whether to use a chroma frequency map (12
Chris@37:          *  bins) instead of the default warped spectrogram. Initialised to false by the constructor. NOTE(review): the class description above speaks of a 13-bin chroma feature -- confirm which bin count is meant here. */
Chris@37:         bool useChromaFrequencyMap;
Chris@37: 
Chris@37:         /** Size of an FFT frame in samples. Note that the data passed
Chris@37:          *  in is already in the frequency domain, so this expresses
Chris@37:          *  the size of the frame that the caller will be providing. Initialised to 2048 by the constructor. */
Chris@37:         int fftSize;
Chris@159: 
Chris@159:         /** Frequency of concert A; initialised to 440.0 by the constructor */
Chris@159:         double referenceFrequency;
Chris@176: 
Chris@176:         /** Minimum frequency cutoff to include in feature; initialised to 150 by the constructor */
Chris@176:         double minFrequency;
Chris@176: 
Chris@176:         /** Maximum frequency cutoff to include in feature; initialised to half the sample rate given to the constructor */
Chris@176:         double maxFrequency;
Chris@37:     };
Chris@37: 
Chris@37:     /**
Chris@37:      * Construct a FeatureExtractor with the given parameters.
Chris@37:      *
Chris@37:      * Note that FeatureExtractor maintains internal frame-to-frame
Chris@37:      * state: use one FeatureExtractor per audio source, and construct
Chris@37:      * a new one for each new source.
Chris@37:      */
Chris@37:     FeatureExtractor(Parameters params);
Chris@37: 
Chris@37:     /**
Chris@37:      * Return the feature vector size that will be returned from process(). This is the stored m_featureSize value.
Chris@37:      */
Chris@37:     int getFeatureSize() const { return m_featureSize; }
Chris@74: 
Chris@74:     /**
Chris@74:      * Return the feature vector size that would be returned from
Chris@74:      * process() with these parameters. Static, so no FeatureExtractor instance is needed.
Chris@74:      */
Chris@74:     static int getFeatureSizeFor(Parameters params);
Chris@37:     
Chris@37:     /**
Chris@201:      * Process one frequency-domain audio frame, provided as real &
Chris@201:      * imaginary components from the FFT output (vectors of double). Return a feature
Chris@38:      * vector of size given by getFeatureSize(). Input vectors must
Chris@38:      * have at least params.fftSize/2+1 elements each.
Chris@37:      *
Chris@37:      * Operates by mapping the frequency bins into a part-linear
Chris@103:      * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103:      * which case they are mapped into chroma bins.
Chris@37:      */
Chris@183:     feature_t process(const std::vector<double> &real,
Chris@183:                       const std::vector<double> &imag);
Chris@37:     
Chris@74:     /**
Chris@201:      * Process one frequency-domain audio frame, provided as real &
Chris@201:      * imaginary components from the FFT output (vectors of float). Return a feature
Chris@184:      * vector of size given by getFeatureSize(). Input vectors must
Chris@184:      * have at least params.fftSize/2+1 elements each.
Chris@184:      *
Chris@184:      * Operates by mapping the frequency bins into a part-linear
Chris@184:      * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@184:      * which case they are mapped into chroma bins.
Chris@184:      */
Chris@184:     feature_t process(const std::vector<float> &real,
Chris@184:                       const std::vector<float> &imag);
Chris@184:     
Chris@184:     /**
Chris@201:      * Process one frequency-domain audio frame, provided as real &
Chris@201:      * imaginary components from the FFT output (raw float arrays, whose length cannot be checked here). Return a feature
Chris@201:      * vector of size given by getFeatureSize(). Input arrays must
Chris@201:      * have at least params.fftSize/2+1 elements each.
Chris@201:      *
Chris@201:      * Operates by mapping the frequency bins into a part-linear
Chris@201:      * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@201:      * which case they are mapped into chroma bins.
Chris@201:      */
Chris@201:     feature_t process(const float *real, const float *imag);
Chris@201:     
Chris@201:     /**
Chris@74:      * Process one frequency-domain audio frame, provided as a single
Chris@74:      * array of alternating real and imaginary components. Input array
Chris@74:      * must have at least 2 * (params.fftSize/2 + 1) elements.
Chris@74:      *
Chris@74:      * Operates by mapping the frequency bins into a part-linear
Chris@103:      * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103:      * which case they are mapped into chroma bins.
Chris@74:      */
Chris@184:     feature_t process(const float *carray);
Chris@74:     
Chris@37: protected:
Chris@37:     /** Make either standard or chroma map, depending on m_params */
Chris@37:     void makeFreqMap();
Chris@37: 
Chris@37:     /** Creates a map of FFT frequency bins to comparison bins.  Where
Chris@37:      *  the spacing of FFT bins is less than 0.5 semitones, the
Chris@37:      *  mapping is one to one. Where the spacing is greater than 0.5
Chris@37:      *  semitones, the FFT energy is mapped into semitone-wide
Chris@37:      *  bins. No scaling is performed; that is the energy is summed
Chris@37:      *  into the comparison bins. */
Chris@37:     void makeStandardFrequencyMap();
Chris@37: 
Chris@37:     /** Creates a map of FFT frequency bins to semitone chroma bins. */
Chris@37:     void makeChromaFrequencyMap();
Chris@37: 
Chris@37:     /** Configuration parameters */
Chris@37:     Parameters m_params;
Chris@37: 
Chris@37:     /** A mapping function for mapping FFT bins to final frequency
Chris@37:      *  bins.  The mapping is linear (1-1) until the resolution
Chris@37:      *  reaches 2 points per semitone, then logarithmic with a
Chris@37:      *  semitone resolution.  e.g. for 44.1kHz sampling rate and
Chris@37:      *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
Chris@37:      *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
Chris@37:      *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
Chris@37:      *  where all energy above note 127 is mapped into the final
Chris@176:      *  bin.
Chris@176:      * 
Chris@176:      *  If a bin's frequency is outside the minFrequency->maxFrequency
Chris@176:      *  range, it will be mapped to a target bin of -1 and should be
Chris@176:      *  discarded.
Chris@176:      */
Chris@37:     std::vector<int> m_freqMap;
Chris@37: 
Chris@184:     feature_t processMags(const std::vector<float> &mags); // NOTE(review): presumably turns per-bin magnitudes into a feature using m_freqMap -- confirm against the implementation
Chris@184:     std::vector<float> scaleMags(const std::vector<float> &mags); // NOTE(review): presumably returns a rescaled copy of the magnitudes -- confirm against the implementation
Chris@169:     
Chris@37:     /** The size of a returned feature, as reported by getFeatureSize(). */
Chris@37:     int m_featureSize;
Chris@37: };
Chris@37: 
Chris@37: #endif
Chris@37: