Chris@37
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@37
|
2
|
Chris@37
|
3 /*
|
Chris@37
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
Chris@37
|
5 algorithm.
|
Chris@37
|
6
|
Chris@37
|
7 Centre for Digital Music, Queen Mary, University of London.
|
Chris@236
|
8 Copyright (c) 2007-2020 Simon Dixon, Chris Cannam, and Queen Mary
|
Chris@230
|
9 University of London, Copyright (c) 2014-2015 Tido GmbH.
|
Chris@37
|
10
|
Chris@37
|
11 This program is free software; you can redistribute it and/or
|
Chris@37
|
12 modify it under the terms of the GNU General Public License as
|
Chris@37
|
13 published by the Free Software Foundation; either version 2 of the
|
Chris@37
|
14 License, or (at your option) any later version. See the file
|
Chris@37
|
15 COPYING included with this distribution for more information.
|
Chris@37
|
16 */
|
Chris@37
|
17
|
Chris@37
|
18 #ifndef FEATURE_EXTRACTOR_H
|
Chris@37
|
19 #define FEATURE_EXTRACTOR_H
|
Chris@37
|
20
|
Chris@187
|
21 #include "MatchTypes.h"
|
Chris@37
|
22
|
Chris@37
|
23 /**
|
Chris@37
|
24 * Convert frequency-domain audio frames into features suitable for
|
Chris@125
|
25 * MATCH alignment calculation.
|
Chris@37
|
26 *
|
Chris@125
|
27 * The default feature is a warping of the frequency data to map FFT
|
Chris@125
|
28 * frequency bins into feature bins. The mapping is linear (1-1) until
|
Chris@125
|
29 * the resolution reaches 2 points per semitone, then logarithmic with
|
Chris@125
|
30 * a semitone resolution. e.g. for 44.1kHz sampling rate and fftSize
|
Chris@125
|
31 * of 2048 (46ms), bin spacing is 21.5Hz, which is mapped linearly for
|
Chris@125
|
32 * bins 0-34 (0 to 732Hz), and logarithmically for the remaining bins
|
Chris@125
|
33 * (midi notes 79 to 127, bins 35 to 83), where all energy above note
|
Chris@125
|
34 * 127 is mapped into the final bin.
|
Chris@125
|
35 *
|
Chris@125
|
36 * Alternatively a chroma mapping is also available. This produces a
|
Chris@125
|
37 * 13-bin feature by mapping all FFT bins into bin 0 until the
|
Chris@125
|
38 * resolution reaches 1 point per semitone, then mapping each
|
Chris@125
|
39 * subsequent bin into its corresponding semitone in the remaining 12
|
Chris@125
|
40 * bins (where bin 1 is C). e.g. for 44.1kHz sampling rate and
|
Chris@125
|
41 * fftSize of 2048 (46ms), frequencies up to 361 Hz go to bin 0,
|
Chris@125
|
42 * subsequent frequencies to the chroma bins.
|
Chris@37
|
43 */
|
Chris@37
|
44 class FeatureExtractor
|
Chris@37
|
45 {
|
Chris@37
|
46 public:
|
Chris@37
|
/** Configuration for a FeatureExtractor. The constructor takes only the
 * sample rate and gives every other field a default value. */
47 struct Parameters {
|
Chris@37
|
48
|
Chris@216
|
/** Defaults set here: standard (non-chroma) frequency map, fftSize 2048,
 * referenceFrequency 440.0, minFrequency 150, maxFrequency rate_/2.
 * Not declared explicit, so a float converts implicitly to Parameters. */
49 Parameters(float rate_) :
|
Chris@37
|
50 sampleRate(rate_),
|
Chris@37
|
51 useChromaFrequencyMap(false),
|
Chris@216
|
52 fftSize(2048),
|
Chris@176
|
53 referenceFrequency(440.0),
|
Chris@219
|
54 minFrequency(150.),
|
Chris@176
|
55 maxFrequency(rate_/2.)
|
Chris@37
|
56 {}
|
Chris@37
|
57
|
Chris@37
|
58 /** Sample rate of audio, as passed to the constructor */
|
Chris@37
|
59 float sampleRate;
|
Chris@37
|
60
|
Chris@37
|
61 /** Flag indicating whether to use a chroma frequency map (12
|
Chris@37
|
62 * semitone bins plus the catch-all bin 0 described in the class
 * comment, i.e. a 13-bin feature) instead of the default warped
 * spectrogram. Defaults to false. */
|
Chris@37
|
63 bool useChromaFrequencyMap;
|
Chris@37
|
64
|
Chris@37
|
65 /** Size of an FFT frame in samples. Note that the data passed
|
Chris@37
|
66 * in is already in the frequency domain, so this expresses
|
Chris@37
|
67 * the size of the frame that the caller will be providing.
 * Defaults to 2048. */
|
Chris@37
|
68 int fftSize;
|
Chris@159
|
69
|
Chris@159
|
70 /** Frequency of concert A (the constructor default is 440.0) */
|
Chris@159
|
71 double referenceFrequency;
|
Chris@176
|
72
|
Chris@176
|
73 /** Minimum frequency cutoff to include in feature (the
 * constructor default is 150) */
|
Chris@176
|
74 double minFrequency;
|
Chris@176
|
75
|
Chris@176
|
76 /** Maximum frequency cutoff to include in feature (the
 * constructor default is half the sample rate) */
|
Chris@176
|
77 double maxFrequency;
|
Chris@37
|
78 };
|
Chris@37
|
79
|
Chris@37
|
80 /**
|
Chris@37
|
81 * Construct a FeatureExtractor with the given parameters.
|
Chris@37
|
82 *
|
Chris@37
|
83 * Note that FeatureExtractor maintains internal frame-to-frame
|
Chris@37
|
84 * state: use one FeatureExtractor per audio source, and construct
|
Chris@37
|
85 * a new one for each new source.
 *
 * The constructor is not declared explicit, so a Parameters value
 * converts implicitly to a FeatureExtractor.
|
Chris@37
|
86 */
|
Chris@37
|
87 FeatureExtractor(Parameters params);
|
Chris@37
|
88
|
Chris@37
|
89 /**
|
Chris@37
|
90 * Return the feature vector size that will be returned from process().
|
Chris@37
|
91 */
|
Chris@37
|
92 int getFeatureSize() const { return m_featureSize; }
|
Chris@74
|
93
|
Chris@74
|
94 /**
|
Chris@74
|
95 * Return the feature vector size that would be returned from
|
Chris@74
|
96 * process() with these parameters. Static, so it can be called
 * without constructing a FeatureExtractor.
|
Chris@74
|
97 */
|
Chris@74
|
98 static int getFeatureSizeFor(Parameters params);
|
Chris@37
|
99
|
Chris@37
|
100 /**
|
Chris@201
|
101 * Process one frequency-domain audio frame, provided as real &
|
Chris@201
|
102 * imaginary components from the FFT output. Return a feature
|
Chris@38
|
103 * vector of size given by getFeatureSize(). Input vectors must
|
Chris@38
|
104 * have at least params.fftSize/2+1 elements each.
|
Chris@37
|
105 *
|
Chris@37
|
106 * Operates by mapping the frequency bins into a part-linear
|
Chris@103
|
107 * part-logarithmic array, unless useChromaFrequencyMap is true in
|
Chris@103
|
108 * which case they are mapped into chroma bins.
|
Chris@37
|
109 */
|
Chris@183
|
110 feature_t process(const std::vector<double> &real,
|
Chris@183
|
111 const std::vector<double> &imag);
|
Chris@37
|
112
|
Chris@74
|
113 /**
|
Chris@201
|
114 * Process one frequency-domain audio frame, provided as real &
|
Chris@201
|
115 * imaginary components from the FFT output. Return a feature
|
Chris@184
|
116 * vector of size given by getFeatureSize(). Input vectors must
|
Chris@184
|
117 * have at least params.fftSize/2+1 elements each.
|
Chris@184
|
118 *
|
Chris@184
|
119 * Operates by mapping the frequency bins into a part-linear
|
Chris@184
|
120 * part-logarithmic array, unless useChromaFrequencyMap is true in
|
Chris@184
|
121 * which case they are mapped into chroma bins.
 *
 * Single-precision counterpart of the std::vector<double> overload.
|
Chris@184
|
122 */
|
Chris@184
|
123 feature_t process(const std::vector<float> &real,
|
Chris@184
|
124 const std::vector<float> &imag);
|
Chris@184
|
125
|
Chris@184
|
126 /**
|
Chris@201
|
127 * Process one frequency-domain audio frame, provided as real &
|
Chris@201
|
128 * imaginary components from the FFT output. Return a feature
|
Chris@201
|
129 * vector of size given by getFeatureSize(). Input arrays must
|
Chris@201
|
130 * have at least params.fftSize/2+1 elements each.
|
Chris@201
|
131 *
|
Chris@201
|
132 * Operates by mapping the frequency bins into a part-linear
|
Chris@201
|
133 * part-logarithmic array, unless useChromaFrequencyMap is true in
|
Chris@201
|
134 * which case they are mapped into chroma bins.
 *
 * No length is passed with the raw pointers, so the size
 * requirement above is the caller's responsibility.
|
Chris@201
|
135 */
|
Chris@201
|
136 feature_t process(const float *real, const float *imag);
|
Chris@201
|
137
|
Chris@201
|
138 /**
|
Chris@74
|
139 * Process one frequency-domain audio frame, provided as a single
|
Chris@74
|
140 * array of alternating real and imaginary components. Input array
|
Chris@74
|
141 * must have at least 2 * (params.fftSize/2 + 1) elements.
|
Chris@74
|
142 *
|
Chris@74
|
143 * Operates by mapping the frequency bins into a part-linear
|
Chris@103
|
144 * part-logarithmic array, unless useChromaFrequencyMap is true in
|
Chris@103
|
145 * which case they are mapped into chroma bins.
|
Chris@74
|
146 */
|
Chris@184
|
147 feature_t process(const float *carray);
|
Chris@74
|
148
|
Chris@37
|
149 protected:
|
Chris@37
|
150 /** Make either standard or chroma map, depending on m_params */
|
Chris@37
|
151 void makeFreqMap();
|
Chris@37
|
152
|
Chris@37
|
153 /** Creates a map of FFT frequency bins to comparison bins. Where
|
Chris@37
|
154 * the spacing of FFT bins is greater than 0.5 semitones (the
|
Chris@37
|
155 * low-frequency end), the mapping is one to one. Where the
 * spacing is less than 0.5
|
Chris@37
|
156 * semitones, the FFT energy is mapped into semitone-wide
|
Chris@37
|
157 * bins. No scaling is performed; that is the energy is summed
|
Chris@37
|
158 * into the comparison bins.
 *
 * NOTE(review): this comment previously had "less" and "greater"
 * the other way round, which contradicts the worked example on
 * m_freqMap (21.5Hz bins are mapped linearly from 0 to 732Hz,
 * where a bin is wider than half a semitone). Confirm against
 * the implementation. */
|
Chris@37
|
159 void makeStandardFrequencyMap();
|
Chris@37
|
160
|
Chris@37
|
161 /** Creates a map of FFT frequency bins to semitone chroma bins. */
|
Chris@37
|
162 void makeChromaFrequencyMap();
|
Chris@37
|
163
|
Chris@37
|
164 /** Configuration parameters */
|
Chris@37
|
165 Parameters m_params;
|
Chris@37
|
166
|
Chris@37
|
167 /** A mapping function for mapping FFT bins to final frequency
|
Chris@37
|
168 * bins. The mapping is linear (1-1) until the resolution
|
Chris@37
|
169 * reaches 2 points per semitone, then logarithmic with a
|
Chris@37
|
170 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
Chris@37
|
171 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
Chris@37
|
172 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
Chris@37
|
173 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
Chris@37
|
174 * where all energy above note 127 is mapped into the final
|
Chris@176
|
175 * bin.
|
Chris@176
|
176 *
|
Chris@176
|
177 * If a bin's frequency is outside the minFrequency->maxFrequency
|
Chris@176
|
178 * range, it will be mapped to a target bin of -1 and should be
|
Chris@176
|
179 * discarded.
|
Chris@176
|
180 */
|
Chris@37
|
181 std::vector<int> m_freqMap;
|
Chris@37
|
182
|
Chris@184
|
/** NOTE(review): undocumented in this header. Presumably the shared
 * back end that turns one frame of bin magnitudes into a feature via
 * m_freqMap -- confirm against the implementation. */
183 feature_t processMags(const std::vector<float> &mags);
|
Chris@184
|
/** NOTE(review): undocumented in this header. Presumably rescales the
 * bin magnitudes before they are mapped -- confirm against the
 * implementation. */
184 std::vector<float> scaleMags(const std::vector<float> &mags);
|
Chris@169
|
185
|
Chris@37
|
186 /** The size of a returned feature. */
|
Chris@37
|
187 int m_featureSize;
|
Chris@37
|
188 };
|
Chris@37
|
189
|
Chris@37
|
190 #endif
|
Chris@37
|
191
|