cannam@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
cannam@0
|
2
|
cannam@0
|
3 /*
|
cannam@0
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
cannam@0
|
5 algorithm.
|
cannam@0
|
6
|
cannam@0
|
7 Centre for Digital Music, Queen Mary, University of London.
|
cannam@0
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
cannam@0
|
9
|
cannam@0
|
10 This program is free software; you can redistribute it and/or
|
cannam@0
|
11 modify it under the terms of the GNU General Public License as
|
cannam@0
|
12 published by the Free Software Foundation; either version 2 of the
|
cannam@0
|
13 License, or (at your option) any later version. See the file
|
cannam@0
|
14 COPYING included with this distribution for more information.
|
cannam@0
|
15 */
|
cannam@0
|
16
|
cannam@0
|
17 #ifndef _MATCHER_H_
|
cannam@0
|
18 #define _MATCHER_H_
|
cannam@0
|
19
|
cannam@0
|
20 #include <vector>
|
cannam@0
|
21 #include <iostream>
|
cannam@0
|
22 #include <sstream>
|
cannam@0
|
23 #include <cmath>
|
cannam@0
|
24
|
cannam@0
|
25 #define ADVANCE_THIS 1
|
cannam@0
|
26 #define ADVANCE_OTHER 2
|
cannam@0
|
27 #define ADVANCE_BOTH 3
|
cannam@0
|
28 #define MASK 0xfc
|
cannam@0
|
29
|
cannam@0
|
30
|
cannam@0
|
31 using std::vector;
|
cannam@0
|
32 using std::string;
|
cannam@0
|
33 using std::cerr;
|
cannam@0
|
34 using std::endl;
|
cannam@0
|
35
|
cannam@0
|
36 /** Represents an audio stream that can be matched to another audio
|
cannam@0
|
37 * stream of the same piece of music. The matching algorithm uses
|
cannam@0
|
38 * dynamic time warping. The distance metric is a Euclidean metric
|
cannam@0
|
39 * on the FFT data with the higher frequencies mapped onto a linear
|
cannam@0
|
40 * scale.
|
cannam@0
|
41 */
|
cannam@0
|
42 class Matcher
|
cannam@0
|
43 {
|
Chris@15
|
44 public:
|
Chris@15
|
45 enum FrameNormalisation {
|
Chris@15
|
46
|
Chris@15
|
47 /** Do not normalise audio frames */
|
Chris@15
|
48 NoFrameNormalisation,
|
Chris@15
|
49
|
Chris@15
|
50 /** Normalise each frame of audio to have a sum of 1 */
|
Chris@15
|
51 NormaliseFrameToSum1,
|
Chris@15
|
52
|
Chris@15
|
53 /** Normalise each frame of audio by the long-term average
|
Chris@15
|
54 * of the summed energy */
|
Chris@15
|
55 NormaliseFrameToLTAverage,
|
Chris@15
|
56 };
|
Chris@15
|
57
|
Chris@15
|
58 enum DistanceNormalisation {
|
Chris@15
|
59
|
Chris@15
|
60 /** Do not normalise distance metrics */
|
Chris@15
|
61 NoDistanceNormalisation,
|
Chris@15
|
62
|
Chris@15
|
63 /** Normalise distance metric for pairs of audio frames by
|
Chris@15
|
64 * the sum of the two frames. */
|
Chris@15
|
65 NormaliseDistanceToSum,
|
Chris@15
|
66
|
Chris@15
|
67 /** Normalise distance metric for pairs of audio frames by
|
Chris@15
|
68 * the log of the sum of the frames. */
|
Chris@15
|
69 NormaliseDistanceToLogSum,
|
Chris@15
|
70 };
|
Chris@15
|
71
|
Chris@15
|
72 struct Parameters {
|
Chris@15
|
73
|
Chris@15
|
74 Parameters(float rate_, double hopTime_, int fftSize_) :
|
Chris@15
|
75 sampleRate(rate_),
|
Chris@15
|
76 frameNorm(NormaliseFrameToSum1),
|
Chris@15
|
77 distanceNorm(NormaliseDistanceToLogSum),
|
Chris@15
|
78 useSpectralDifference(true),
|
Chris@15
|
79 useChromaFrequencyMap(false),
|
Chris@15
|
80 hopTime(hopTime_),
|
Chris@15
|
81 fftSize(fftSize_),
|
Chris@15
|
82 blockTime(10.0),
|
Chris@15
|
83 silenceThreshold(0.01),
|
Chris@15
|
84 decay(0.99),
|
Chris@15
|
85 maxRunCount(3)
|
Chris@15
|
86 {}
|
Chris@15
|
87
|
Chris@15
|
88 /** Sample rate of audio */
|
Chris@15
|
89 float sampleRate;
|
Chris@15
|
90
|
Chris@15
|
91 /** Type of audio frame normalisation */
|
Chris@15
|
92 FrameNormalisation frameNorm;
|
Chris@15
|
93
|
Chris@15
|
94 /** Type of distance metric normalisation */
|
Chris@15
|
95 DistanceNormalisation distanceNorm;
|
Chris@15
|
96
|
Chris@15
|
97 /** Flag indicating whether or not the half-wave rectified
|
Chris@15
|
98 * spectral difference should be used in calculating the
|
Chris@15
|
99 * distance metric for pairs of audio frames, instead of the
|
Chris@15
|
100 * straight spectrum values. */
|
Chris@15
|
101 bool useSpectralDifference;
|
Chris@15
|
102
|
Chris@15
|
103 /** Flag indicating whether to use a chroma frequency map (12
|
Chris@15
|
104 * bins) instead of the default warped spectrogram */
|
Chris@15
|
105 bool useChromaFrequencyMap;
|
Chris@15
|
106
|
Chris@15
|
107 /** Spacing of audio frames (determines the amount of overlap or
|
Chris@15
|
108 * skip between frames). This value is expressed in
|
Chris@15
|
109 * seconds. */
|
Chris@15
|
110 double hopTime;
|
Chris@15
|
111
|
Chris@15
|
112 /** Size of an FFT frame in samples. Note that the data passed
|
Chris@15
|
113 * in to Matcher is already in the frequency domain, so this
|
Chris@15
|
114 * expresses the size of the frame that the caller will be
|
Chris@15
|
115 * providing.
|
Chris@15
|
116 */
|
Chris@15
|
117 int fftSize;
|
Chris@15
|
118
|
Chris@15
|
119 /** The width of the search band (error margin) around the current
|
Chris@15
|
120 * match position, measured in seconds. Strictly speaking the
|
Chris@15
|
121 * width is measured backwards from the current point, since the
|
Chris@15
|
122 * algorithm has to work causally.
|
Chris@15
|
123 */
|
Chris@15
|
124 double blockTime;
|
Chris@15
|
125
|
Chris@15
|
126 /** RMS level below which frame is considered silent */
|
Chris@15
|
127 double silenceThreshold;
|
Chris@15
|
128
|
Chris@15
|
129 /** Frame-to-frame decay factor in calculating long-term average */
|
Chris@15
|
130 double decay;
|
Chris@15
|
131
|
Chris@15
|
132 /** Maximum number of frames sequentially processed by this
|
Chris@15
|
133 * matcher, without a frame of the other matcher being
|
Chris@15
|
134 * processed.
|
Chris@15
|
135 */
|
Chris@15
|
136 int maxRunCount;
|
Chris@15
|
137 };
|
Chris@15
|
138
|
cannam@0
|
139 protected:
|
cannam@0
|
140 /** Points to the other performance with which this one is being
|
cannam@0
|
141 * compared. The data for the distance metric and the dynamic
|
cannam@0
|
142 * time warping is shared between the two matchers. In the
|
cannam@0
|
143 * original version, only one of the two performance matchers
|
cannam@0
|
144 * contained the distance metric. (See <code>first</code>)
|
cannam@0
|
145 */
|
cannam@0
|
146 Matcher *otherMatcher;
|
cannam@0
|
147
|
cannam@0
|
148 /** Indicates which performance is considered primary (the
|
cannam@0
|
149 * score). This is the performance shown on the vertical axis,
|
cannam@0
|
150 * and referred to as "this" in the codes for the direction of
|
cannam@0
|
151 * DTW steps. */
|
cannam@0
|
152 bool firstPM;
|
cannam@0
|
153
|
Chris@15
|
154 /** Configuration parameters */
|
Chris@15
|
155 Parameters params;
|
cannam@0
|
156
|
cannam@0
|
157 /** Scaling factor for distance metric; must guarantee that the
|
cannam@0
|
158 * final value fits in the data type used, that is, unsigned
|
Chris@15
|
159 * char.
|
cannam@0
|
160 */
|
cannam@0
|
161 double scale;
|
cannam@0
|
162
|
cannam@0
|
163 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
|
cannam@0
|
164 int blockSize;
|
cannam@0
|
165
|
cannam@0
|
166 /** The number of frames of audio data which have been read. */
|
cannam@0
|
167 int frameCount;
|
cannam@0
|
168
|
cannam@0
|
169 /** Long term average frame energy (in frequency domain
|
cannam@0
|
170 * representation). */
|
cannam@0
|
171 double ltAverage;
|
cannam@0
|
172
|
cannam@0
|
173 /** The number of frames sequentially processed by this matcher,
|
cannam@0
|
174 * without a frame of the other matcher being processed.
|
cannam@0
|
175 */
|
cannam@0
|
176 int runCount;
|
cannam@0
|
177
|
cannam@0
|
178 /** A mapping function for mapping FFT bins to final frequency
|
cannam@0
|
179 * bins. The mapping is linear (1-1) until the resolution
|
cannam@0
|
180 * reaches 2 points per semitone, then logarithmic with a
|
cannam@0
|
181 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
cannam@0
|
182 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
cannam@0
|
183 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
cannam@0
|
184 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
cannam@0
|
185 * where all energy above note 127 is mapped into the final
|
cannam@0
|
186 * bin. */
|
cannam@0
|
187 vector<int> freqMap;
|
cannam@0
|
188
|
cannam@0
|
189 /** The number of entries in <code>freqMap</code>. Note that the
|
cannam@0
|
190 * length of the array is greater, because its size is not known
|
cannam@0
|
191 * at creation time. */
|
cannam@0
|
192 int freqMapSize;
|
cannam@0
|
193
|
cannam@0
|
194 /** The most recent frame; used for calculating the frame to frame
|
Chris@13
|
195 * spectral difference. These are therefore frequency warped but
|
Chris@13
|
196 * not yet normalised. */
|
cannam@0
|
197 vector<double> prevFrame;
|
cannam@0
|
198 vector<double> newFrame;
|
cannam@0
|
199
|
cannam@0
|
200 /** A block of previously seen frames are stored in this structure
|
cannam@0
|
201 * for calculation of the distance matrix as the new frames are
|
cannam@0
|
202 * read in. One can think of the structure of the array as a
|
Chris@13
|
203 * circular buffer of vectors. These are the frames with all
|
Chris@13
|
204 * applicable processing applied (e.g. spectral difference,
|
Chris@13
|
205 * normalisation), unlike prevFrame and newFrame. The total
|
Chris@13
|
206 * energy of frames[i] is stored in totalEnergies[i]. */
|
cannam@0
|
207 vector<vector<double> > frames;
|
cannam@0
|
208
|
Chris@13
|
209 /** The total energy of each frame in the frames block. */
|
Chris@13
|
210 vector<double> totalEnergies;
|
Chris@13
|
211
|
cannam@0
|
212 /** The best path cost matrix. */
|
cannam@0
|
213 int **bestPathCost;
|
cannam@0
|
214
|
cannam@0
|
215 /** The distance matrix. */
|
cannam@0
|
216 unsigned char **distance;
|
cannam@0
|
217
|
cannam@0
|
218 /** The bounds of each row of data in the distance and path cost matrices.*/
|
cannam@0
|
219 int *first;
|
cannam@0
|
220 int *last;
|
cannam@0
|
221
|
cannam@0
|
222 /** Height of each column in distance and bestPathCost matrices */
|
cannam@0
|
223 int *distYSizes;
|
cannam@0
|
224
|
cannam@0
|
225 /** Width of distance and bestPathCost matrices and first and last vectors */
|
cannam@0
|
226 int distXSize;
|
cannam@0
|
227
|
cannam@0
|
228 bool initialised;
|
cannam@0
|
229
|
cannam@0
|
230 /** Disable or enable debugging output */
|
cannam@0
|
231 static bool silent;
|
cannam@0
|
232
|
cannam@0
|
233 public:
|
cannam@0
|
234 /** Constructor for Matcher.
|
cannam@0
|
235 *
|
cannam@0
|
236 * @param p The Matcher representing the performance with which
|
cannam@0
|
237 * this one is going to be matched. Some information is shared
|
cannam@0
|
238 * between the two matchers (currently one possesses the distance
|
cannam@0
|
239 * matrix and optimal path matrix).
|
cannam@0
|
240 */
|
Chris@15
|
241 Matcher(Parameters parameters, Matcher *p);
|
cannam@0
|
242
|
cannam@0
|
243 ~Matcher();
|
cannam@0
|
244
|
cannam@0
|
245 /** For debugging, outputs information about the Matcher to
|
cannam@0
|
246 * standard error.
|
cannam@0
|
247 */
|
cannam@0
|
248 void print();
|
cannam@0
|
249
|
cannam@0
|
250 /** Adds a link to the Matcher object representing the performance
|
cannam@0
|
251 * which is going to be matched to this one.
|
cannam@0
|
252 *
|
cannam@0
|
253 * @param p the Matcher representing the other performance
|
cannam@0
|
254 */
|
cannam@0
|
255 void setOtherMatcher(Matcher *p) {
|
cannam@0
|
256 otherMatcher = p;
|
cannam@0
|
257 } // setOtherMatcher()
|
cannam@0
|
258
|
cannam@0
|
259 int getFrameCount() {
|
cannam@0
|
260 return frameCount;
|
cannam@0
|
261 }
|
cannam@0
|
262
|
Chris@16
|
263 /**
|
Chris@16
|
264 * Return the feature vector size that will be used for the given
|
Chris@16
|
265 * parameters.
|
Chris@16
|
266 */
|
Chris@16
|
267 static int getFeatureSize(Parameters params);
|
Chris@16
|
268
|
cannam@0
|
269 protected:
|
cannam@0
|
270 template <typename T>
|
cannam@0
|
271 void initVector(vector<T> &vec, int sz, T dflt = 0) {
|
cannam@0
|
272 vec.clear();
|
cannam@0
|
273 while ((int)vec.size() < sz) vec.push_back(dflt);
|
cannam@0
|
274 }
|
cannam@0
|
275
|
cannam@0
|
276 template <typename T>
|
cannam@0
|
277 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
|
cannam@0
|
278 T dflt = 0, int fillTo = -1) {
|
cannam@0
|
279 mat.clear();
|
cannam@0
|
280 if (fillTo < 0) fillTo = hsz;
|
cannam@0
|
281 for (int i = 0; i < hsz; ++i) {
|
cannam@0
|
282 mat.push_back(vector<T>());
|
cannam@0
|
283 if (i < fillTo) {
|
cannam@0
|
284 while ((int)mat[i].size() < vsz) {
|
cannam@0
|
285 mat[i].push_back(dflt);
|
cannam@0
|
286 }
|
cannam@0
|
287 }
|
cannam@0
|
288 }
|
cannam@0
|
289 }
|
cannam@0
|
290
|
cannam@0
|
291 void init();
|
cannam@0
|
292
|
Chris@15
|
293 void makeFreqMap();
|
cannam@0
|
294
|
cannam@0
|
295 /** Creates a map of FFT frequency bins to comparison bins. Where
|
cannam@0
|
296 * the spacing of FFT bins is less than 0.5 semitones, the
|
cannam@0
|
297 * mapping is one to one. Where the spacing is greater than 0.5
|
cannam@0
|
298 * semitones, the FFT energy is mapped into semitone-wide
|
cannam@0
|
299 * bins. No scaling is performed; that is the energy is summed
|
Chris@21
|
300 * into the comparison bins. See also consumeFrame()
|
cannam@0
|
301 */
|
Chris@15
|
302 void makeStandardFrequencyMap();
|
cannam@0
|
303
|
Chris@15
|
304 void makeChromaFrequencyMap();
|
cannam@0
|
305
|
cannam@0
|
306 /** Processes a frame of audio data by first computing the STFT
|
cannam@0
|
307 * with a Hamming window, then mapping the frequency bins into a
|
cannam@0
|
308 * part-linear part-logarithmic array, then (optionally)
|
cannam@0
|
309 * computing the half-wave rectified spectral difference from the
|
cannam@0
|
310 * previous frame, then (optionally) normalising to a sum of 1,
|
cannam@0
|
311 * then calculating the distance to all frames stored in the
|
cannam@0
|
312 * otherMatcher and storing them in the distance matrix, and
|
cannam@0
|
313 * finally updating the optimal path matrix using the dynamic
|
cannam@0
|
314 * time warping algorithm.
|
Chris@14
|
315 *
|
Chris@14
|
316 * Return value is the frame (post-processed, with warping,
|
Chris@14
|
317 * rectification, and normalisation as appropriate).
|
cannam@0
|
318 */
|
Chris@21
|
319 std::vector<double> consumeFrame(double *reBuffer, double *imBuffer);
|
cannam@0
|
320
|
cannam@0
|
321 /** Calculates the Manhattan distance between two vectors, with an
|
cannam@0
|
322 * optional normalisation by the combined values in the
|
cannam@0
|
323 * vectors. Since the vectors contain energy, this could be
|
cannam@0
|
324 * considered as a squared Euclidean distance metric. Note that
|
cannam@0
|
325 * normalisation assumes the values are all non-negative.
|
cannam@0
|
326 *
|
cannam@0
|
327 * @param f1 one of the vectors involved in the distance calculation
|
cannam@0
|
328 * @param f2 one of the vectors involved in the distance calculation
|
cannam@0
|
329 * @return the distance, scaled and truncated to an integer
|
cannam@0
|
330 */
|
cannam@0
|
331 int calcDistance(const vector<double> &f1, const vector<double> &f2);
|
cannam@0
|
332
|
cannam@0
|
333 /** Retrieves values from the minimum cost matrix.
|
cannam@0
|
334 *
|
cannam@0
|
335 * @param i the frame number of this Matcher
|
cannam@0
|
336 * @param j the frame number of the other Matcher
|
cannam@0
|
337 * @return the cost of the minimum cost path to this location
|
cannam@0
|
338 */
|
cannam@0
|
339 int getValue(int i, int j, bool firstAttempt);
|
cannam@0
|
340
|
cannam@0
|
341 /** Stores entries in the distance matrix and the optimal path matrix.
|
cannam@0
|
342 *
|
cannam@0
|
343 * @param i the frame number of this Matcher
|
cannam@0
|
344 * @param j the frame number of the other Matcher
|
cannam@0
|
345 * @param dir the direction from which this position is reached with
|
cannam@0
|
346 * minimum cost
|
cannam@0
|
347 * @param value the cost of the minimum path except the current step
|
cannam@0
|
348 * @param dMN the distance cost between the two frames
|
cannam@0
|
349 */
|
cannam@0
|
350 void setValue(int i, int j, int dir, int value, int dMN);
|
cannam@0
|
351
|
Chris@21
|
352 vector<double> processFrameFromFreqData(double *, double *);
|
Chris@21
|
353 void calcAdvance();
|
Chris@21
|
354
|
cannam@0
|
355 friend class MatchFeeder;
|
Chris@15
|
356 friend class Finder;
|
cannam@0
|
357
|
cannam@0
|
358 }; // class Matcher
|
cannam@0
|
359
|
cannam@0
|
360 #endif
|