comparison src/FeatureExtractor.h @ 103:593054bf6476 feature_conditioner

Pull out normalisation and spectral-difference (specdiff) processing into FeatureConditioner
author Chris Cannam
date Thu, 04 Dec 2014 13:05:16 +0000
parents b9aa663a607b
children 3792bcd34470
comparison
equal deleted inserted replaced
96:6b91e40b2c04 103:593054bf6476
23 * Convert frequency-domain audio frames into features suitable for 23 * Convert frequency-domain audio frames into features suitable for
24 * MATCH alignment calculation. The default feature is a warping of 24 * MATCH alignment calculation. The default feature is a warping of
25 * the frequency data to map higher frequencies into a linear scale. A 25 * the frequency data to map higher frequencies into a linear scale. A
26 * chroma mapping is also available. 26 * chroma mapping is also available.
27 * 27 *
28 * Note that FeatureExtractor maintains internal frame-to-frame state: 28 * Note that FeatureExtractor may maintain internal frame-to-frame
29 * use one FeatureExtractor per audio source, and construct a new one 29 * state: use one FeatureExtractor per audio source, and construct a
30 * for each new source. 30 * new one for each new source.
31 */ 31 */
32 class FeatureExtractor 32 class FeatureExtractor
33 { 33 {
34 public: 34 public:
35 enum FrameNormalisation {
36
37 /** Do not normalise frames */
38 NoFrameNormalisation,
39
40 /** Normalise each frame to have a sum of 1 */
41 NormaliseFrameToSum1,
42
43 /** Normalise each frame by the long-term average of the
44 * summed energy */
45 NormaliseFrameToLTAverage,
46 };
47
48 struct Parameters { 35 struct Parameters {
49 36
50 Parameters(float rate_, int fftSize_) : 37 Parameters(float rate_, int fftSize_) :
51 sampleRate(rate_), 38 sampleRate(rate_),
52 frameNorm(NormaliseFrameToSum1),
53 useSpectralDifference(true),
54 useChromaFrequencyMap(false), 39 useChromaFrequencyMap(false),
55 fftSize(fftSize_), 40 fftSize(fftSize_)
56 silenceThreshold(0.01),
57 decay(0.99)
58 {} 41 {}
59 42
60 /** Sample rate of audio */ 43 /** Sample rate of audio */
61 float sampleRate; 44 float sampleRate;
62
63 /** Type of audio frame normalisation */
64 FrameNormalisation frameNorm;
65
66 /** Flag indicating whether or not the half-wave rectified
67 * spectral difference should be used in calculating the
68 * distance metric for pairs of audio frames, instead of the
69 * straight spectrum values. */
70 bool useSpectralDifference;
71 45
72 /** Flag indicating whether to use a chroma frequency map (12 46 /** Flag indicating whether to use a chroma frequency map (12
73 * bins) instead of the default warped spectrogram */ 47 * bins) instead of the default warped spectrogram */
74 bool useChromaFrequencyMap; 48 bool useChromaFrequencyMap;
75 49
80 54
81 /** Size of an FFT frame in samples. Note that the data passed 55 /** Size of an FFT frame in samples. Note that the data passed
82 * in is already in the frequency domain, so this expresses 56 * in is already in the frequency domain, so this expresses
83 * the size of the frame that the caller will be providing. */ 57 * the size of the frame that the caller will be providing. */
84 int fftSize; 58 int fftSize;
85
86 /** RMS level below which frame is considered silent */
87 double silenceThreshold;
88
89 /** Frame-to-frame decay factor in calculating long-term average */
90 double decay;
91 }; 59 };
92 60
93 /** 61 /**
94 * Construct a FeatureExtractor with the given parameters. 62 * Construct a FeatureExtractor with the given parameters.
95 * 63 *
115 * imaginary components from the FFT output). Return a feature 83 * imaginary components from the FFT output). Return a feature
116 * vector of size given by getFeatureSize(). Input vectors must 84 * vector of size given by getFeatureSize(). Input vectors must
117 * have at least params.fftSize/2+1 elements each. 85 * have at least params.fftSize/2+1 elements each.
118 * 86 *
119 * Operates by mapping the frequency bins into a part-linear 87 * Operates by mapping the frequency bins into a part-linear
120 * part-logarithmic array, then (optionally) computing the 88 * part-logarithmic array, unless useChromaFrequencyMap is true in
121 * half-wave rectified spectral difference from the previous 89 * which case they are mapped into chroma bins.
122 * frame, then (optionally) normalising to a sum of 1.
123 *
124 * Return value is the frame (post-processed, with warping,
125 * rectification, and normalisation as appropriate).
126 */ 90 */
127 std::vector<double> process(const std::vector<double> &real, 91 std::vector<double> process(const std::vector<double> &real,
128 const std::vector<double> &imag); 92 const std::vector<double> &imag);
129 93
130 /** 94 /**
131 * Process one frequency-domain audio frame, provided as a single 95 * Process one frequency-domain audio frame, provided as a single
132 * array of alternating real and imaginary components. Input array 96 * array of alternating real and imaginary components. Input array
133 * must have at least 2 * (params.fftSize/2 + 1) elements. 97 * must have at least 2 * (params.fftSize/2 + 1) elements.
134 * 98 *
135 * Operates by mapping the frequency bins into a part-linear 99 * Operates by mapping the frequency bins into a part-linear
136 * part-logarithmic array, then (optionally) computing the 100 * part-logarithmic array, unless useChromaFrequencyMap is true in
137 * half-wave rectified spectral difference from the previous 101 * which case they are mapped into chroma bins.
138 * frame, then (optionally) normalising to a sum of 1.
139 *
140 * Return value is the frame (post-processed, with warping,
141 * rectification, and normalisation as appropriate).
142 */ 102 */
143 std::vector<double> process(const float *carray); 103 std::vector<double> process(const float *carray);
144 104
145 protected: 105 protected:
146 /** Make either standard or chroma map, depending on m_params */ 106 /** Make either standard or chroma map, depending on m_params */
155 void makeStandardFrequencyMap(); 115 void makeStandardFrequencyMap();
156 116
157 /** Creates a map of FFT frequency bins to semitone chroma bins. */ 117 /** Creates a map of FFT frequency bins to semitone chroma bins. */
158 void makeChromaFrequencyMap(); 118 void makeChromaFrequencyMap();
159 119
160 std::vector<double> postProcess(const std::vector<double> &, double rms);
161
162 /** Configuration parameters */ 120 /** Configuration parameters */
163 Parameters m_params; 121 Parameters m_params;
164
165 /** Long term average frame energy (in frequency domain
166 * representation). */
167 double m_ltAverage;
168 122
169 /** A mapping function for mapping FFT bins to final frequency 123 /** A mapping function for mapping FFT bins to final frequency
170 * bins. The mapping is linear (1-1) until the resolution 124 * bins. The mapping is linear (1-1) until the resolution
171 * reaches 2 points per semitone, then logarithmic with a 125 * reaches 2 points per semitone, then logarithmic with a
172 * semitone resolution. e.g. for 44.1kHz sampling rate and 126 * semitone resolution. e.g. for 44.1kHz sampling rate and
177 * bin. */ 131 * bin. */
178 std::vector<int> m_freqMap; 132 std::vector<int> m_freqMap;
179 133
180 /** The size of a returned feature. */ 134 /** The size of a returned feature. */
181 int m_featureSize; 135 int m_featureSize;
182
183 /** The most recent frame; used for calculating the frame to frame
184 * spectral difference. This is therefore frequency warped but
185 * not yet normalised. */
186 std::vector<double> m_prevFrame;
187 }; 136 };
188 137
189 #endif 138 #endif
190 139