cannam@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
cannam@0
|
2
|
cannam@0
|
3 /*
|
cannam@0
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
cannam@0
|
5 algorithm.
|
cannam@0
|
6
|
cannam@0
|
7 Centre for Digital Music, Queen Mary, University of London.
|
cannam@0
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
cannam@0
|
9
|
cannam@0
|
10 This program is free software; you can redistribute it and/or
|
cannam@0
|
11 modify it under the terms of the GNU General Public License as
|
cannam@0
|
12 published by the Free Software Foundation; either version 2 of the
|
cannam@0
|
13 License, or (at your option) any later version. See the file
|
cannam@0
|
14 COPYING included with this distribution for more information.
|
cannam@0
|
15 */
|
cannam@0
|
16
|
cannam@0
|
17 #ifndef _MATCHER_H_
|
cannam@0
|
18 #define _MATCHER_H_
|
cannam@0
|
19
|
cannam@0
|
20 #include <vector>
|
cannam@0
|
21 #include <iostream>
|
cannam@0
|
22 #include <sstream>
|
cannam@0
|
23 #include <cmath>
|
cannam@0
|
24
|
cannam@0
|
25 #define ADVANCE_THIS 1
|
cannam@0
|
26 #define ADVANCE_OTHER 2
|
cannam@0
|
27 #define ADVANCE_BOTH 3
|
cannam@0
|
28 #define MASK 0xfc
|
cannam@0
|
29
|
Chris@26
|
30 #include "DistanceMetric.h"
|
cannam@0
|
31
|
cannam@0
|
32 using std::vector;
|
cannam@0
|
33 using std::string;
|
cannam@0
|
34 using std::cerr;
|
cannam@0
|
35 using std::endl;
|
cannam@0
|
36
|
cannam@0
|
37 /** Represents an audio stream that can be matched to another audio
|
cannam@0
|
38 * stream of the same piece of music. The matching algorithm uses
|
cannam@0
|
39 * dynamic time warping. The distance metric is a Euclidean metric
|
cannam@0
|
40 * on the FFT data with the higher frequencies mapped onto a linear
|
cannam@0
|
41 * scale.
|
cannam@0
|
42 */
|
cannam@0
|
43 class Matcher
|
cannam@0
|
44 {
|
Chris@15
|
45 public:
|
Chris@15
|
46 enum FrameNormalisation {
|
Chris@15
|
47
|
Chris@15
|
48 /** Do not normalise audio frames */
|
Chris@15
|
49 NoFrameNormalisation,
|
Chris@15
|
50
|
Chris@15
|
51 /** Normalise each frame of audio to have a sum of 1 */
|
Chris@15
|
52 NormaliseFrameToSum1,
|
Chris@15
|
53
|
Chris@15
|
54 /** Normalise each frame of audio by the long-term average
|
Chris@15
|
55 * of the summed energy */
|
Chris@15
|
56 NormaliseFrameToLTAverage,
|
Chris@15
|
57 };
|
Chris@15
|
58
|
Chris@15
|
59 struct Parameters {
|
Chris@15
|
60
|
Chris@15
|
61 Parameters(float rate_, double hopTime_, int fftSize_) :
|
Chris@15
|
62 sampleRate(rate_),
|
Chris@15
|
63 frameNorm(NormaliseFrameToSum1),
|
Chris@26
|
64 distanceNorm(DistanceMetric::NormaliseDistanceToLogSum),
|
Chris@29
|
65 distanceScale(90.0),
|
Chris@15
|
66 useSpectralDifference(true),
|
Chris@15
|
67 useChromaFrequencyMap(false),
|
Chris@15
|
68 hopTime(hopTime_),
|
Chris@15
|
69 fftSize(fftSize_),
|
Chris@15
|
70 blockTime(10.0),
|
Chris@15
|
71 silenceThreshold(0.01),
|
Chris@15
|
72 decay(0.99),
|
Chris@15
|
73 maxRunCount(3)
|
Chris@15
|
74 {}
|
Chris@15
|
75
|
Chris@15
|
76 /** Sample rate of audio */
|
Chris@15
|
77 float sampleRate;
|
Chris@15
|
78
|
Chris@15
|
79 /** Type of audio frame normalisation */
|
Chris@15
|
80 FrameNormalisation frameNorm;
|
Chris@15
|
81
|
Chris@15
|
82 /** Type of distance metric normalisation */
|
Chris@26
|
83 DistanceMetric::DistanceNormalisation distanceNorm;
|
Chris@15
|
84
|
Chris@29
|
85 /** Scaling factor for distance metric; must guarantee that the
|
Chris@29
|
86 * final value fits in the data type used, that is, unsigned
|
Chris@29
|
87 * char.
|
Chris@29
|
88 */
|
Chris@29
|
89 double distanceScale;
|
Chris@29
|
90
|
Chris@15
|
91 /** Flag indicating whether or not the half-wave rectified
|
Chris@15
|
92 * spectral difference should be used in calculating the
|
Chris@15
|
93 * distance metric for pairs of audio frames, instead of the
|
Chris@15
|
94 * straight spectrum values. */
|
Chris@15
|
95 bool useSpectralDifference;
|
Chris@15
|
96
|
Chris@15
|
97 /** Flag indicating whether to use a chroma frequency map (12
|
Chris@15
|
98 * bins) instead of the default warped spectrogram */
|
Chris@15
|
99 bool useChromaFrequencyMap;
|
Chris@15
|
100
|
Chris@15
|
101 /** Spacing of audio frames (determines the amount of overlap or
|
Chris@15
|
102 * skip between frames). This value is expressed in
|
Chris@15
|
103 * seconds. */
|
Chris@15
|
104 double hopTime;
|
Chris@15
|
105
|
Chris@15
|
106 /** Size of an FFT frame in samples. Note that the data passed
|
Chris@15
|
107 * in to Matcher is already in the frequency domain, so this
|
Chris@15
|
108 * expresses the size of the frame that the caller will be
|
Chris@15
|
109 * providing.
|
Chris@15
|
110 */
|
Chris@15
|
111 int fftSize;
|
Chris@15
|
112
|
Chris@15
|
113 /** The width of the search band (error margin) around the current
|
Chris@15
|
114 * match position, measured in seconds. Strictly speaking the
|
Chris@15
|
115 * width is measured backwards from the current point, since the
|
Chris@15
|
116 * algorithm has to work causally.
|
Chris@15
|
117 */
|
Chris@15
|
118 double blockTime;
|
Chris@15
|
119
|
Chris@15
|
120 /** RMS level below which frame is considered silent */
|
Chris@15
|
121 double silenceThreshold;
|
Chris@15
|
122
|
Chris@15
|
123 /** Frame-to-frame decay factor in calculating long-term average */
|
Chris@15
|
124 double decay;
|
Chris@15
|
125
|
Chris@15
|
126 /** Maximum number of frames sequentially processed by this
|
Chris@15
|
127 * matcher, without a frame of the other matcher being
|
Chris@15
|
128 * processed.
|
Chris@15
|
129 */
|
Chris@15
|
130 int maxRunCount;
|
Chris@15
|
131 };
|
Chris@15
|
132
|
cannam@0
|
133 protected:
|
cannam@0
|
134 /** Points to the other performance with which this one is being
|
cannam@0
|
135 * compared. The data for the distance metric and the dynamic
|
cannam@0
|
136 * time warping is shared between the two matchers. In the
|
cannam@0
|
137 * original version, only one of the two performance matchers
|
cannam@0
|
138 * contained the distance metric. (See <code>first</code>)
|
cannam@0
|
139 */
|
cannam@0
|
140 Matcher *otherMatcher;
|
cannam@0
|
141
|
cannam@0
|
142 /** Indicates which performance is considered primary (the
|
cannam@0
|
143 * score). This is the performance shown on the vertical axis,
|
cannam@0
|
144 * and referred to as "this" in the codes for the direction of
|
cannam@0
|
145 * DTW steps. */
|
cannam@0
|
146 bool firstPM;
|
cannam@0
|
147
|
Chris@15
|
148 /** Configuration parameters */
|
Chris@15
|
149 Parameters params;
|
cannam@0
|
150
|
cannam@0
|
151 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
|
cannam@0
|
152 int blockSize;
|
cannam@0
|
153
|
cannam@0
|
154 /** The number of frames of audio data which have been read. */
|
cannam@0
|
155 int frameCount;
|
cannam@0
|
156
|
cannam@0
|
157 /** Long term average frame energy (in frequency domain
|
cannam@0
|
158 * representation). */
|
cannam@0
|
159 double ltAverage;
|
cannam@0
|
160
|
cannam@0
|
161 /** The number of frames sequentially processed by this matcher,
|
cannam@0
|
162 * without a frame of the other matcher being processed.
|
cannam@0
|
163 */
|
cannam@0
|
164 int runCount;
|
cannam@0
|
165
|
cannam@0
|
166 /** A mapping function for mapping FFT bins to final frequency
|
cannam@0
|
167 * bins. The mapping is linear (1-1) until the resolution
|
cannam@0
|
168 * reaches 2 points per semitone, then logarithmic with a
|
cannam@0
|
169 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
cannam@0
|
170 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
cannam@0
|
171 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
cannam@0
|
172 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
cannam@0
|
173 * where all energy above note 127 is mapped into the final
|
cannam@0
|
174 * bin. */
|
cannam@0
|
175 vector<int> freqMap;
|
cannam@0
|
176
|
Chris@23
|
177 /** The number of entries in <code>freqMap</code>. */
|
cannam@0
|
178 int freqMapSize;
|
cannam@0
|
179
|
Chris@23
|
180 /** The number of values in an externally-supplied feature vector,
|
Chris@23
|
181 * used in preference to freqMap/freqMapSize if constructed with
|
Chris@23
|
182 * the external feature version of the Matcher constructor. If
|
Chris@23
|
183 * this is zero, the internal feature extractor will be used as
|
Chris@23
|
184 * normal.
|
Chris@23
|
185 */
|
Chris@23
|
186 int externalFeatureSize;
|
Chris@23
|
187
|
Chris@23
|
188 /** The number of values in the feature vectors actually in
|
Chris@23
|
189 * use. This will be externalFeatureSize if greater than zero, or
|
Chris@23
|
190 * freqMapSize otherwise.
|
Chris@23
|
191 */
|
Chris@23
|
192 int featureSize;
|
Chris@23
|
193
|
cannam@0
|
194 /** The most recent frame; used for calculating the frame to frame
|
Chris@13
|
195 * spectral difference. These are therefore frequency warped but
|
Chris@13
|
196 * not yet normalised. */
|
cannam@0
|
197 vector<double> prevFrame;
|
cannam@0
|
198 vector<double> newFrame;
|
cannam@0
|
199
|
cannam@0
|
200 /** A block of previously seen frames are stored in this structure
|
cannam@0
|
201 * for calculation of the distance matrix as the new frames are
|
cannam@0
|
202 * read in. One can think of the structure of the array as a
|
Chris@13
|
203 * circular buffer of vectors. These are the frames with all
|
Chris@13
|
204 * applicable processing applied (e.g. spectral difference,
|
Chris@13
|
205 * normalisation), unlike prevFrame and newFrame. The total
|
Chris@13
|
206 * energy of frames[i] is stored in totalEnergies[i]. */
|
cannam@0
|
207 vector<vector<double> > frames;
|
cannam@0
|
208
|
Chris@13
|
209 /** The total energy of each frame in the frames block. */
|
Chris@13
|
210 vector<double> totalEnergies;
|
Chris@13
|
211
|
cannam@0
|
212 /** The best path cost matrix. */
|
cannam@0
|
213 int **bestPathCost;
|
cannam@0
|
214
|
cannam@0
|
215 /** The distance matrix. */
|
cannam@0
|
216 unsigned char **distance;
|
cannam@0
|
217
|
cannam@0
|
218 /** The bounds of each row of data in the distance and path cost matrices.*/
|
cannam@0
|
219 int *first;
|
cannam@0
|
220 int *last;
|
cannam@0
|
221
|
cannam@0
|
222 /** Height of each column in distance and bestPathCost matrices */
|
cannam@0
|
223 int *distYSizes;
|
cannam@0
|
224
|
cannam@0
|
225 /** Width of distance and bestPathCost matrices and first and last vectors */
|
cannam@0
|
226 int distXSize;
|
cannam@0
|
227
|
cannam@0
|
228 bool initialised;
|
cannam@0
|
229
|
cannam@0
|
230 /** Disable or enable debugging output */
|
cannam@0
|
231 static bool silent;
|
cannam@0
|
232
|
cannam@0
|
233 public:
|
cannam@0
|
234 /** Constructor for Matcher.
|
cannam@0
|
235 *
|
cannam@0
|
236 * @param p The Matcher representing the performance with which
|
cannam@0
|
237 * this one is going to be matched. Some information is shared
|
cannam@0
|
238 * between the two matchers (currently one possesses the distance
|
cannam@0
|
239 * matrix and optimal path matrix).
|
cannam@0
|
240 */
|
Chris@15
|
241 Matcher(Parameters parameters, Matcher *p);
|
cannam@0
|
242
|
Chris@23
|
243 /** Constructor for Matcher using externally supplied features.
|
Chris@23
|
244 * A Matcher made using this constructor will not carry out its
|
Chris@23
|
245 * own feature extraction from frequency-domain audio data, but
|
Chris@23
|
246 * instead will accept arbitrary feature frames calculated by
|
Chris@23
|
247 * some external code.
|
Chris@23
|
248 *
|
Chris@23
|
249 * @param p The Matcher representing the performance with which
|
Chris@23
|
250 * this one is going to be matched. Some information is shared
|
Chris@23
|
251 * between the two matchers (currently one possesses the distance
|
Chris@23
|
252 * matrix and optimal path matrix).
|
Chris@23
|
253 *
|
Chris@23
|
254 * @param featureSize Number of values in each feature vector.
|
Chris@23
|
255 */
|
Chris@23
|
256 Matcher(Parameters parameters, Matcher *p, int featureSize);
|
Chris@23
|
257
|
cannam@0
|
258 ~Matcher();
|
cannam@0
|
259
|
cannam@0
|
260 /** For debugging, outputs information about the Matcher to
|
cannam@0
|
261 * standard error.
|
cannam@0
|
262 */
|
cannam@0
|
263 void print();
|
cannam@0
|
264
|
cannam@0
|
265 /** Adds a link to the Matcher object representing the performance
|
cannam@0
|
266 * which is going to be matched to this one.
|
cannam@0
|
267 *
|
cannam@0
|
268 * @param p the Matcher representing the other performance
|
cannam@0
|
269 */
|
cannam@0
|
270 void setOtherMatcher(Matcher *p) {
|
cannam@0
|
271 otherMatcher = p;
|
cannam@0
|
272 } // setOtherMatcher()
|
cannam@0
|
273
|
cannam@0
|
274 int getFrameCount() {
|
cannam@0
|
275 return frameCount;
|
cannam@0
|
276 }
|
cannam@0
|
277
|
Chris@16
|
278 /**
|
Chris@16
|
279 * Return the feature vector size that will be used for the given
|
Chris@16
|
280 * parameters.
|
Chris@16
|
281 */
|
Chris@23
|
282 static int getFeatureSizeFor(Parameters params);
|
Chris@16
|
283
|
cannam@0
|
284 protected:
|
cannam@0
|
285 template <typename T>
|
cannam@0
|
286 void initVector(vector<T> &vec, int sz, T dflt = 0) {
|
cannam@0
|
287 vec.clear();
|
cannam@0
|
288 while ((int)vec.size() < sz) vec.push_back(dflt);
|
cannam@0
|
289 }
|
cannam@0
|
290
|
cannam@0
|
291 template <typename T>
|
cannam@0
|
292 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
|
cannam@0
|
293 T dflt = 0, int fillTo = -1) {
|
cannam@0
|
294 mat.clear();
|
cannam@0
|
295 if (fillTo < 0) fillTo = hsz;
|
cannam@0
|
296 for (int i = 0; i < hsz; ++i) {
|
cannam@0
|
297 mat.push_back(vector<T>());
|
cannam@0
|
298 if (i < fillTo) {
|
cannam@0
|
299 while ((int)mat[i].size() < vsz) {
|
cannam@0
|
300 mat[i].push_back(dflt);
|
cannam@0
|
301 }
|
cannam@0
|
302 }
|
cannam@0
|
303 }
|
cannam@0
|
304 }
|
cannam@0
|
305
|
cannam@0
|
306 void init();
|
cannam@0
|
307
|
Chris@15
|
308 void makeFreqMap();
|
cannam@0
|
309
|
cannam@0
|
310 /** Creates a map of FFT frequency bins to comparison bins. Where
|
cannam@0
|
311 * the spacing of FFT bins is less than 0.5 semitones, the
|
cannam@0
|
312 * mapping is one to one. Where the spacing is greater than 0.5
|
cannam@0
|
313 * semitones, the FFT energy is mapped into semitone-wide
|
cannam@0
|
314 * bins. No scaling is performed; that is the energy is summed
|
Chris@21
|
315 * into the comparison bins. See also consumeFrame()
|
cannam@0
|
316 */
|
Chris@15
|
317 void makeStandardFrequencyMap();
|
cannam@0
|
318
|
Chris@15
|
319 void makeChromaFrequencyMap();
|
cannam@0
|
320
|
cannam@0
|
321 /** Processes a frame of audio data by first computing the STFT
|
cannam@0
|
322 * with a Hamming window, then mapping the frequency bins into a
|
cannam@0
|
323 * part-linear part-logarithmic array, then (optionally)
|
cannam@0
|
324 * computing the half-wave rectified spectral difference from the
|
cannam@0
|
325 * previous frame, then (optionally) normalising to a sum of 1,
|
cannam@0
|
326 * then calculating the distance to all frames stored in the
|
cannam@0
|
327 * otherMatcher and storing them in the distance matrix, and
|
cannam@0
|
328 * finally updating the optimal path matrix using the dynamic
|
cannam@0
|
329 * time warping algorithm.
|
Chris@14
|
330 *
|
Chris@14
|
331 * Return value is the frame (post-processed, with warping,
|
Chris@14
|
332 * rectification, and normalisation as appropriate).
|
Chris@23
|
333 *
|
Chris@23
|
334 * The Matcher must have been constructed using the constructor
|
Chris@23
|
335 * without an external featureSize parameter in order to use this
|
Chris@23
|
336 * function. (Otherwise it will be expecting you to call
|
Chris@23
|
337 * consumeFeatureVector.)
|
cannam@0
|
338 */
|
Chris@21
|
339 std::vector<double> consumeFrame(double *reBuffer, double *imBuffer);
|
cannam@0
|
340
|
Chris@23
|
341 /** Processes a feature vector frame (presumably calculated from
|
Chris@23
|
342 * audio data by some external code). As consumeFrame, except
|
Chris@23
|
343 * that it does not calculate a feature from audio data but
|
Chris@23
|
344 * instead uses the supplied feature directly.
|
Chris@23
|
345 *
|
Chris@23
|
346 * The Matcher must have been constructed using the constructor
|
Chris@23
|
347 * that accepts an external featureSize parameter in order to
|
Chris@23
|
348 * use this function. The supplied feature must be of the size
|
Chris@23
|
349 * that was passed to the constructor.
|
Chris@23
|
350 */
|
Chris@23
|
351 void consumeFeatureVector(std::vector<double> feature);
|
Chris@23
|
352
|
cannam@0
|
353 /** Retrieves values from the minimum cost matrix.
|
cannam@0
|
354 *
|
cannam@0
|
355 * @param i the frame number of this Matcher
|
cannam@0
|
356 * @param j the frame number of the other Matcher
|
cannam@0
|
357 * @return the cost of the minimum cost path to this location
|
cannam@0
|
358 */
|
cannam@0
|
359 int getValue(int i, int j, bool firstAttempt);
|
cannam@0
|
360
|
cannam@0
|
361 /** Stores entries in the distance matrix and the optimal path matrix.
|
cannam@0
|
362 *
|
cannam@0
|
363 * @param i the frame number of this Matcher
|
cannam@0
|
364 * @param j the frame number of the other Matcher
|
cannam@0
|
365 * @param dir the direction from which this position is reached with
|
cannam@0
|
366 * minimum cost
|
cannam@0
|
367 * @param value the cost of the minimum path except the current step
|
cannam@0
|
368 * @param dMN the distance cost between the two frames
|
cannam@0
|
369 */
|
cannam@0
|
370 void setValue(int i, int j, int dir, int value, int dMN);
|
cannam@0
|
371
|
Chris@21
|
372 vector<double> processFrameFromFreqData(double *, double *);
|
Chris@21
|
373 void calcAdvance();
|
Chris@21
|
374
|
Chris@26
|
375 DistanceMetric metric;
|
Chris@26
|
376
|
cannam@0
|
377 friend class MatchFeeder;
|
Chris@24
|
378 friend class MatchFeatureFeeder;
|
Chris@15
|
379 friend class Finder;
|
cannam@0
|
380
|
cannam@0
|
381 }; // class Matcher
|
cannam@0
|
382
|
cannam@0
|
383 #endif
|