Mercurial > hg > match-vamp
comparison src/FeatureExtractor.h @ 103:593054bf6476 feature_conditioner
Pull out normalisation and specdiff stuff into FeatureConditioner
author | Chris Cannam |
---|---|
date | Thu, 04 Dec 2014 13:05:16 +0000 |
parents | b9aa663a607b |
children | 3792bcd34470 |
comparison legend: equal | deleted | inserted | replaced
96:6b91e40b2c04 | 103:593054bf6476 |
---|---|
23 * Convert frequency-domain audio frames into features suitable for | 23 * Convert frequency-domain audio frames into features suitable for |
24 * MATCH alignment calculation. The default feature is a warping of | 24 * MATCH alignment calculation. The default feature is a warping of |
25 * the frequency data to map higher frequencies into a linear scale. A | 25 * the frequency data to map higher frequencies into a linear scale. A |
26 * chroma mapping is also available. | 26 * chroma mapping is also available. |
27 * | 27 * |
28 * Note that FeatureExtractor maintains internal frame-to-frame state: | 28 * Note that FeatureExtractor may maintain internal frame-to-frame |
29 * use one FeatureExtractor per audio source, and construct a new one | 29 * state: use one FeatureExtractor per audio source, and construct a |
30 * for each new source. | 30 * new one for each new source. |
31 */ | 31 */ |
32 class FeatureExtractor | 32 class FeatureExtractor |
33 { | 33 { |
34 public: | 34 public: |
35 enum FrameNormalisation { | |
36 | |
37 /** Do not normalise frames */ | |
38 NoFrameNormalisation, | |
39 | |
40 /** Normalise each frame to have a sum of 1 */ | |
41 NormaliseFrameToSum1, | |
42 | |
43 /** Normalise each frame by the long-term average of the | |
44 * summed energy */ | |
45 NormaliseFrameToLTAverage, | |
46 }; | |
47 | |
48 struct Parameters { | 35 struct Parameters { |
49 | 36 |
50 Parameters(float rate_, int fftSize_) : | 37 Parameters(float rate_, int fftSize_) : |
51 sampleRate(rate_), | 38 sampleRate(rate_), |
52 frameNorm(NormaliseFrameToSum1), | |
53 useSpectralDifference(true), | |
54 useChromaFrequencyMap(false), | 39 useChromaFrequencyMap(false), |
55 fftSize(fftSize_), | 40 fftSize(fftSize_) |
56 silenceThreshold(0.01), | |
57 decay(0.99) | |
58 {} | 41 {} |
59 | 42 |
60 /** Sample rate of audio */ | 43 /** Sample rate of audio */ |
61 float sampleRate; | 44 float sampleRate; |
62 | |
63 /** Type of audio frame normalisation */ | |
64 FrameNormalisation frameNorm; | |
65 | |
66 /** Flag indicating whether or not the half-wave rectified | |
67 * spectral difference should be used in calculating the | |
68 * distance metric for pairs of audio frames, instead of the | |
69 * straight spectrum values. */ | |
70 bool useSpectralDifference; | |
71 | 45 |
72 /** Flag indicating whether to use a chroma frequency map (12 | 46 /** Flag indicating whether to use a chroma frequency map (12 |
73 * bins) instead of the default warped spectrogram */ | 47 * bins) instead of the default warped spectrogram */ |
74 bool useChromaFrequencyMap; | 48 bool useChromaFrequencyMap; |
75 | 49 |
80 | 54 |
81 /** Size of an FFT frame in samples. Note that the data passed | 55 /** Size of an FFT frame in samples. Note that the data passed |
82 * in is already in the frequency domain, so this expresses | 56 * in is already in the frequency domain, so this expresses |
83 * the size of the frame that the caller will be providing. */ | 57 * the size of the frame that the caller will be providing. */ |
84 int fftSize; | 58 int fftSize; |
85 | |
86 /** RMS level below which frame is considered silent */ | |
87 double silenceThreshold; | |
88 | |
89 /** Frame-to-frame decay factor in calculating long-term average */ | |
90 double decay; | |
91 }; | 59 }; |
92 | 60 |
93 /** | 61 /** |
94 * Construct a FeatureExtractor with the given parameters. | 62 * Construct a FeatureExtractor with the given parameters. |
95 * | 63 * |
115 * imaginary components from the FFT output). Return a feature | 83 * imaginary components from the FFT output). Return a feature |
116 * vector of size given by getFeatureSize(). Input vectors must | 84 * vector of size given by getFeatureSize(). Input vectors must |
117 * have at least params.fftSize/2+1 elements each. | 85 * have at least params.fftSize/2+1 elements each. |
118 * | 86 * |
119 * Operates by mapping the frequency bins into a part-linear | 87 * Operates by mapping the frequency bins into a part-linear |
120 * part-logarithmic array, then (optionally) computing the | 88 * part-logarithmic array, unless useChromaFrequencyMap is true in |
121 * half-wave rectified spectral difference from the previous | 89 * which case they are mapped into chroma bins. |
122 * frame, then (optionally) normalising to a sum of 1. | |
123 * | |
124 * Return value is the frame (post-processed, with warping, | |
125 * rectification, and normalisation as appropriate). | |
126 */ | 90 */ |
127 std::vector<double> process(const std::vector<double> &real, | 91 std::vector<double> process(const std::vector<double> &real, |
128 const std::vector<double> &imag); | 92 const std::vector<double> &imag); |
129 | 93 |
130 /** | 94 /** |
131 * Process one frequency-domain audio frame, provided as a single | 95 * Process one frequency-domain audio frame, provided as a single |
132 * array of alternating real and imaginary components. Input array | 96 * array of alternating real and imaginary components. Input array |
133 * must have at least 2 * (params.fftSize/2 + 1) elements. | 97 * must have at least 2 * (params.fftSize/2 + 1) elements. |
134 * | 98 * |
135 * Operates by mapping the frequency bins into a part-linear | 99 * Operates by mapping the frequency bins into a part-linear |
136 * part-logarithmic array, then (optionally) computing the | 100 * part-logarithmic array, unless useChromaFrequencyMap is true in |
137 * half-wave rectified spectral difference from the previous | 101 * which case they are mapped into chroma bins. |
138 * frame, then (optionally) normalising to a sum of 1. | |
139 * | |
140 * Return value is the frame (post-processed, with warping, | |
141 * rectification, and normalisation as appropriate). | |
142 */ | 102 */ |
143 std::vector<double> process(const float *carray); | 103 std::vector<double> process(const float *carray); |
144 | 104 |
145 protected: | 105 protected: |
146 /** Make either standard or chroma map, depending on m_params */ | 106 /** Make either standard or chroma map, depending on m_params */ |
155 void makeStandardFrequencyMap(); | 115 void makeStandardFrequencyMap(); |
156 | 116 |
157 /** Creates a map of FFT frequency bins to semitone chroma bins. */ | 117 /** Creates a map of FFT frequency bins to semitone chroma bins. */ |
158 void makeChromaFrequencyMap(); | 118 void makeChromaFrequencyMap(); |
159 | 119 |
160 std::vector<double> postProcess(const std::vector<double> &, double rms); | |
161 | |
162 /** Configuration parameters */ | 120 /** Configuration parameters */ |
163 Parameters m_params; | 121 Parameters m_params; |
164 | |
165 /** Long term average frame energy (in frequency domain | |
166 * representation). */ | |
167 double m_ltAverage; | |
168 | 122 |
169 /** A mapping function for mapping FFT bins to final frequency | 123 /** A mapping function for mapping FFT bins to final frequency |
170 * bins. The mapping is linear (1-1) until the resolution | 124 * bins. The mapping is linear (1-1) until the resolution |
171 * reaches 2 points per semitone, then logarithmic with a | 125 * reaches 2 points per semitone, then logarithmic with a |
172 * semitone resolution. e.g. for 44.1kHz sampling rate and | 126 * semitone resolution. e.g. for 44.1kHz sampling rate and |
177 * bin. */ | 131 * bin. */ |
178 std::vector<int> m_freqMap; | 132 std::vector<int> m_freqMap; |
179 | 133 |
180 /** The size of a returned feature. */ | 134 /** The size of a returned feature. */ |
181 int m_featureSize; | 135 int m_featureSize; |
182 | |
183 /** The most recent frame; used for calculating the frame to frame | |
184 * spectral difference. This is therefore frequency warped but | |
185 * not yet normalised. */ | |
186 std::vector<double> m_prevFrame; | |
187 }; | 136 }; |
188 | 137 |
189 #endif | 138 #endif |
190 | 139 |