match-vamp: src/FeatureExtractor.h annotate

annotate src/FeatureExtractor.h @ 37:91410483228b refactors

refactor: Pull out feature extraction code to FeatureExtractor.cpp

author	Chris Cannam
date	Thu, 13 Nov 2014 12:03:52 +0000
parents
children	8cce4e13ede3

rev	line source
Chris@37	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
Chris@37	2
Chris@37	3 /*
Chris@37	4 Vamp feature extraction plugin using the MATCH audio alignment
Chris@37	5 algorithm.
Chris@37	6
Chris@37	7 Centre for Digital Music, Queen Mary, University of London.
Chris@37	8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
Chris@37	9
Chris@37	10 This program is free software; you can redistribute it and/or
Chris@37	11 modify it under the terms of the GNU General Public License as
Chris@37	12 published by the Free Software Foundation; either version 2 of the
Chris@37	13 License, or (at your option) any later version. See the file
Chris@37	14 COPYING included with this distribution for more information.
Chris@37	15 */
Chris@37	16
Chris@37	17 #ifndef FEATURE_EXTRACTOR_H
Chris@37	18 #define FEATURE_EXTRACTOR_H
Chris@37	19
Chris@37	20 #include <vector>
Chris@37	21
Chris@37	22 /**
Chris@37	23 * Convert frequency-domain audio frames into features suitable for
Chris@37	24 * MATCH alignment calculation. The default feature is a warping of
Chris@37	25 * the frequency data to map higher frequencies into a linear scale. A
Chris@37	26 * chroma mapping is also available.
Chris@37	27 *
Chris@37	28 * Note that FeatureExtractor maintains internal frame-to-frame state:
Chris@37	29 * use one FeatureExtractor per audio source, and construct a new one
Chris@37	30 * for each new source.
Chris@37	31 */
class FeatureExtractor
{
public:
    /** Available strategies for normalising each feature frame. */
    enum FrameNormalisation {

        /** Do not normalise frames */
        NoFrameNormalisation,

        /** Normalise each frame to have a sum of 1 */
        NormaliseFrameToSum1,

        /** Normalise each frame by the long-term average of the
         *  summed energy */
        NormaliseFrameToLTAverage
    };

    /**
     * Configuration for a FeatureExtractor. The constructor takes the
     * two values that have no sensible default (sample rate and FFT
     * size); every other field is given a default and may be
     * overwritten by the caller before the struct is passed to
     * FeatureExtractor.
     */
    struct Parameters {

        /**
         * Build a parameter set for audio at sample rate rate_ whose
         * frequency-domain frames come from an FFT of fftSize_
         * samples.
         *
         * Defaults: sum-to-1 frame normalisation, spectral difference
         * enabled, chroma map disabled, silence threshold 0.01, decay
         * 0.99. hopTime is not derived from the arguments; it is
         * zero-initialised so that it never holds an indeterminate
         * value (the struct is copied by value into
         * FeatureExtractor), and should be assigned explicitly by any
         * caller that relies on it.
         */
        Parameters(float rate_, int fftSize_) :
            sampleRate(rate_),
            frameNorm(NormaliseFrameToSum1),
            useSpectralDifference(true),
            useChromaFrequencyMap(false),
            hopTime(0.0),
            fftSize(fftSize_),
            silenceThreshold(0.01),
            decay(0.99)
        {}

        /** Sample rate of audio */
        float sampleRate;

        /** Type of audio frame normalisation */
        FrameNormalisation frameNorm;

        /** Flag indicating whether or not the half-wave rectified
         *  spectral difference should be used in calculating the
         *  distance metric for pairs of audio frames, instead of the
         *  straight spectrum values. */
        bool useSpectralDifference;

        /** Flag indicating whether to use a chroma frequency map (12
         *  bins) instead of the default warped spectrogram */
        bool useChromaFrequencyMap;

        /** Spacing of audio frames (determines the amount of overlap or
         *  skip between frames). This value is expressed in
         *  seconds. Defaults to 0; set it explicitly if required. */
        double hopTime;

        /** Size of an FFT frame in samples. Note that the data passed
         *  in is already in the frequency domain, so this expresses
         *  the size of the frame that the caller will be providing. */
        int fftSize;

        /** RMS level below which frame is considered silent */
        double silenceThreshold;

        /** Frame-to-frame decay factor in calculating long-term average */
        double decay;
    };

    /**
     * Construct a FeatureExtractor with the given parameters.
     *
     * Note that FeatureExtractor maintains internal frame-to-frame
     * state: use one FeatureExtractor per audio source, and construct
     * a new one for each new source.
     */
    FeatureExtractor(Parameters params);

    /**
     * Return the feature vector size that will be returned from process().
     */
    int getFeatureSize() const { return m_featureSize; }

    /**
     * Process one frequency-domain audio frame (provided as real &
     * imaginary components from the FFT output). Return a feature
     * vector of size given by getFeatureSize().
     *
     * Operates by mapping the frequency bins into a part-linear
     * part-logarithmic array, then (optionally) computing the
     * half-wave rectified spectral difference from the previous
     * frame, then (optionally) normalising to a sum of 1.
     *
     * NOTE(review): only sum-to-1 normalisation is mentioned here;
     * presumably the mode actually applied is the one selected by
     * Parameters::frameNorm -- confirm against the implementation.
     *
     * Return value is the frame (post-processed, with warping,
     * rectification, and normalisation as appropriate).
     */
    std::vector<double> process(const std::vector<double> &real,
                                const std::vector<double> &imag);

protected:
    /** Make either standard or chroma map, depending on m_params */
    void makeFreqMap();

    /** Creates a map of FFT frequency bins to comparison bins. Where
     *  the spacing of FFT bins is less than 0.5 semitones, the
     *  mapping is one to one. Where the spacing is greater than 0.5
     *  semitones, the FFT energy is mapped into semitone-wide
     *  bins. No scaling is performed; that is the energy is summed
     *  into the comparison bins. */
    void makeStandardFrequencyMap();

    /** Creates a map of FFT frequency bins to semitone chroma bins. */
    void makeChromaFrequencyMap();

    /** Configuration parameters */
    Parameters m_params;

    /** Long term average frame energy (in frequency domain
     *  representation). */
    double m_ltAverage;

    /** A mapping function for mapping FFT bins to final frequency
     *  bins. The mapping is linear (1-1) until the resolution
     *  reaches 2 points per semitone, then logarithmic with a
     *  semitone resolution. e.g. for 44.1kHz sampling rate and
     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
     *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
     *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
     *  where all energy above note 127 is mapped into the final
     *  bin. */
    std::vector<int> m_freqMap;

    /** The size of a returned feature. */
    int m_featureSize;

    /** The most recent frame; used for calculating the frame to frame
     *  spectral difference. This is therefore frequency warped but
     *  not yet normalised. */
    std::vector<double> m_prevFrame;
};
Chris@37	164
Chris@37	165 #endif
Chris@37	166

Mercurial > hg > match-vamp

annotate src/FeatureExtractor.h @ 37:91410483228b refactors