cannam@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
cannam@0
|
2
|
cannam@0
|
3 /*
|
cannam@0
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
cannam@0
|
5 algorithm.
|
cannam@0
|
6
|
cannam@0
|
7 Centre for Digital Music, Queen Mary, University of London.
|
cannam@0
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
cannam@0
|
9
|
cannam@0
|
10 This program is free software; you can redistribute it and/or
|
cannam@0
|
11 modify it under the terms of the GNU General Public License as
|
cannam@0
|
12 published by the Free Software Foundation; either version 2 of the
|
cannam@0
|
13 License, or (at your option) any later version. See the file
|
cannam@0
|
14 COPYING included with this distribution for more information.
|
cannam@0
|
15 */
|
cannam@0
|
16
|
cannam@0
|
17 #ifndef _MATCHER_H_
|
cannam@0
|
18 #define _MATCHER_H_
|
cannam@0
|
19
|
cannam@0
|
20 #include <vector>
|
cannam@0
|
21 #include <iostream>
|
cannam@0
|
22 #include <sstream>
|
cannam@0
|
23 #include <cmath>
|
cannam@0
|
24
|
cannam@0
|
25 #define ADVANCE_THIS 1
|
cannam@0
|
26 #define ADVANCE_OTHER 2
|
cannam@0
|
27 #define ADVANCE_BOTH 3
|
cannam@0
|
28 #define MASK 0xfc
|
cannam@0
|
29
|
Chris@26
|
30 #include "DistanceMetric.h"
|
cannam@0
|
31
|
cannam@0
|
32 using std::vector;
|
cannam@0
|
33 using std::string;
|
cannam@0
|
34 using std::cerr;
|
cannam@0
|
35 using std::endl;
|
cannam@0
|
36
|
cannam@0
|
37 /** Represents an audio stream that can be matched to another audio
|
cannam@0
|
38 * stream of the same piece of music. The matching algorithm uses
|
cannam@0
|
39 * dynamic time warping. The distance metric is a Euclidean metric
|
cannam@0
|
40 * on the FFT data with the higher frequencies mapped onto a linear
|
cannam@0
|
41 * scale.
|
cannam@0
|
42 */
|
cannam@0
|
43 class Matcher
|
cannam@0
|
44 {
|
Chris@15
|
45 public:
|
Chris@15
|
46 enum FrameNormalisation {
|
Chris@15
|
47
|
Chris@15
|
48 /** Do not normalise audio frames */
|
Chris@15
|
49 NoFrameNormalisation,
|
Chris@15
|
50
|
Chris@15
|
51 /** Normalise each frame of audio to have a sum of 1 */
|
Chris@15
|
52 NormaliseFrameToSum1,
|
Chris@15
|
53
|
Chris@15
|
54 /** Normalise each frame of audio by the long-term average
|
Chris@15
|
55 * of the summed energy */
|
Chris@15
|
56 NormaliseFrameToLTAverage,
|
Chris@15
|
57 };
|
Chris@15
|
58
|
Chris@15
|
59 struct Parameters {
|
Chris@15
|
60
|
Chris@15
|
61 Parameters(float rate_, double hopTime_, int fftSize_) :
|
Chris@15
|
62 sampleRate(rate_),
|
Chris@15
|
63 frameNorm(NormaliseFrameToSum1),
|
Chris@26
|
64 distanceNorm(DistanceMetric::NormaliseDistanceToLogSum),
|
Chris@15
|
65 useSpectralDifference(true),
|
Chris@15
|
66 useChromaFrequencyMap(false),
|
Chris@15
|
67 hopTime(hopTime_),
|
Chris@15
|
68 fftSize(fftSize_),
|
Chris@15
|
69 blockTime(10.0),
|
Chris@15
|
70 silenceThreshold(0.01),
|
Chris@15
|
71 decay(0.99),
|
Chris@15
|
72 maxRunCount(3)
|
Chris@15
|
73 {}
|
Chris@15
|
74
|
Chris@15
|
75 /** Sample rate of audio */
|
Chris@15
|
76 float sampleRate;
|
Chris@15
|
77
|
Chris@15
|
78 /** Type of audio frame normalisation */
|
Chris@15
|
79 FrameNormalisation frameNorm;
|
Chris@15
|
80
|
Chris@15
|
81 /** Type of distance metric normalisation */
|
Chris@26
|
82 DistanceMetric::DistanceNormalisation distanceNorm;
|
Chris@15
|
83
|
Chris@15
|
84 /** Flag indicating whether or not the half-wave rectified
|
Chris@15
|
85 * spectral difference should be used in calculating the
|
Chris@15
|
86 * distance metric for pairs of audio frames, instead of the
|
Chris@15
|
87 * straight spectrum values. */
|
Chris@15
|
88 bool useSpectralDifference;
|
Chris@15
|
89
|
Chris@15
|
90 /** Flag indicating whether to use a chroma frequency map (12
|
Chris@15
|
91 * bins) instead of the default warped spectrogram */
|
Chris@15
|
92 bool useChromaFrequencyMap;
|
Chris@15
|
93
|
Chris@15
|
94 /** Spacing of audio frames (determines the amount of overlap or
|
Chris@15
|
95 * skip between frames). This value is expressed in
|
Chris@15
|
96 * seconds. */
|
Chris@15
|
97 double hopTime;
|
Chris@15
|
98
|
Chris@15
|
99 /** Size of an FFT frame in samples. Note that the data passed
|
Chris@15
|
100 * in to Matcher is already in the frequency domain, so this
|
Chris@15
|
101 * expresses the size of the frame that the caller will be
|
Chris@15
|
102 * providing.
|
Chris@15
|
103 */
|
Chris@15
|
104 int fftSize;
|
Chris@15
|
105
|
Chris@15
|
106 /** The width of the search band (error margin) around the current
|
Chris@15
|
107 * match position, measured in seconds. Strictly speaking the
|
Chris@15
|
108 * width is measured backwards from the current point, since the
|
Chris@15
|
109 * algorithm has to work causally.
|
Chris@15
|
110 */
|
Chris@15
|
111 double blockTime;
|
Chris@15
|
112
|
Chris@15
|
113 /** RMS level below which frame is considered silent */
|
Chris@15
|
114 double silenceThreshold;
|
Chris@15
|
115
|
Chris@15
|
116 /** Frame-to-frame decay factor in calculating long-term average */
|
Chris@15
|
117 double decay;
|
Chris@15
|
118
|
Chris@15
|
119 /** Maximum number of frames sequentially processed by this
|
Chris@15
|
120 * matcher, without a frame of the other matcher being
|
Chris@15
|
121 * processed.
|
Chris@15
|
122 */
|
Chris@15
|
123 int maxRunCount;
|
Chris@15
|
124 };
|
Chris@15
|
125
|
cannam@0
|
126 protected:
|
cannam@0
|
127 /** Points to the other performance with which this one is being
|
cannam@0
|
128 * compared. The data for the distance metric and the dynamic
|
cannam@0
|
129 * time warping is shared between the two matchers. In the
|
cannam@0
|
130 * original version, only one of the two performance matchers
|
cannam@0
|
131 * contained the distance metric. (See <code>first</code>)
|
cannam@0
|
132 */
|
cannam@0
|
133 Matcher *otherMatcher;
|
cannam@0
|
134
|
cannam@0
|
135 /** Indicates which performance is considered primary (the
|
cannam@0
|
136 * score). This is the performance shown on the vertical axis,
|
cannam@0
|
137 * and referred to as "this" in the codes for the direction of
|
cannam@0
|
138 * DTW steps. */
|
cannam@0
|
139 bool firstPM;
|
cannam@0
|
140
|
Chris@15
|
141 /** Configuration parameters */
|
Chris@15
|
142 Parameters params;
|
cannam@0
|
143
|
cannam@0
|
144 /** Scaling factor for distance metric; must guarantee that the
|
cannam@0
|
145 * final value fits in the data type used, that is, unsigned
|
Chris@15
|
146 * char.
|
cannam@0
|
147 */
|
cannam@0
|
148 double scale;
|
cannam@0
|
149
|
cannam@0
|
150 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
|
cannam@0
|
151 int blockSize;
|
cannam@0
|
152
|
cannam@0
|
153 /** The number of frames of audio data which have been read. */
|
cannam@0
|
154 int frameCount;
|
cannam@0
|
155
|
cannam@0
|
156 /** Long term average frame energy (in frequency domain
|
cannam@0
|
157 * representation). */
|
cannam@0
|
158 double ltAverage;
|
cannam@0
|
159
|
cannam@0
|
160 /** The number of frames sequentially processed by this matcher,
|
cannam@0
|
161 * without a frame of the other matcher being processed.
|
cannam@0
|
162 */
|
cannam@0
|
163 int runCount;
|
cannam@0
|
164
|
cannam@0
|
165 /** A mapping function for mapping FFT bins to final frequency
|
cannam@0
|
166 * bins. The mapping is linear (1-1) until the resolution
|
cannam@0
|
167 * reaches 2 points per semitone, then logarithmic with a
|
cannam@0
|
168 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
cannam@0
|
169 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
cannam@0
|
170 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
cannam@0
|
171 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
cannam@0
|
172 * where all energy above note 127 is mapped into the final
|
cannam@0
|
173 * bin. */
|
cannam@0
|
174 vector<int> freqMap;
|
cannam@0
|
175
|
Chris@23
|
176 /** The number of entries in <code>freqMap</code>. */
|
cannam@0
|
177 int freqMapSize;
|
cannam@0
|
178
|
Chris@23
|
179 /** The number of values in an externally-supplied feature vector,
|
Chris@23
|
180 * used in preference to freqMap/freqMapSize if constructed with
|
Chris@23
|
181 * the external feature version of the Matcher constructor. If
|
Chris@23
|
182 * this is zero, the internal feature extractor will be used as
|
Chris@23
|
183 * normal.
|
Chris@23
|
184 */
|
Chris@23
|
185 int externalFeatureSize;
|
Chris@23
|
186
|
Chris@23
|
187 /** The number of values in the feature vectors actually in
|
Chris@23
|
188 * use. This will be externalFeatureSize if greater than zero, or
|
Chris@23
|
189 * freqMapSize otherwise.
|
Chris@23
|
190 */
|
Chris@23
|
191 int featureSize;
|
Chris@23
|
192
|
cannam@0
|
193 /** The most recent frame; used for calculating the frame to frame
|
Chris@13
|
194 * spectral difference. These are therefore frequency warped but
|
Chris@13
|
195 * not yet normalised. */
|
cannam@0
|
196 vector<double> prevFrame;
|
cannam@0
|
197 vector<double> newFrame;
|
cannam@0
|
198
|
cannam@0
|
199 /** A block of previously seen frames are stored in this structure
|
cannam@0
|
200 * for calculation of the distance matrix as the new frames are
|
cannam@0
|
201 * read in. One can think of the structure of the array as a
|
Chris@13
|
202 * circular buffer of vectors. These are the frames with all
|
Chris@13
|
203 * applicable processing applied (e.g. spectral difference,
|
Chris@13
|
204 * normalisation), unlike prevFrame and newFrame. The total
|
Chris@13
|
205 * energy of frames[i] is stored in totalEnergies[i]. */
|
cannam@0
|
206 vector<vector<double> > frames;
|
cannam@0
|
207
|
Chris@13
|
208 /** The total energy of each frame in the frames block. */
|
Chris@13
|
209 vector<double> totalEnergies;
|
Chris@13
|
210
|
cannam@0
|
211 /** The best path cost matrix. */
|
cannam@0
|
212 int **bestPathCost;
|
cannam@0
|
213
|
cannam@0
|
214 /** The distance matrix. */
|
cannam@0
|
215 unsigned char **distance;
|
cannam@0
|
216
|
cannam@0
|
217 /** The bounds of each row of data in the distance and path cost matrices.*/
|
cannam@0
|
218 int *first;
|
cannam@0
|
219 int *last;
|
cannam@0
|
220
|
cannam@0
|
221 /** Height of each column in distance and bestPathCost matrices */
|
cannam@0
|
222 int *distYSizes;
|
cannam@0
|
223
|
cannam@0
|
224 /** Width of distance and bestPathCost matrices and first and last vectors */
|
cannam@0
|
225 int distXSize;
|
cannam@0
|
226
|
cannam@0
|
227 bool initialised;
|
cannam@0
|
228
|
cannam@0
|
229 /** Disable or enable debugging output */
|
cannam@0
|
230 static bool silent;
|
cannam@0
|
231
|
cannam@0
|
232 public:
|
cannam@0
|
233 /** Constructor for Matcher.
|
cannam@0
|
234 *
|
cannam@0
|
235 * @param p The Matcher representing the performance with which
|
cannam@0
|
236 * this one is going to be matched. Some information is shared
|
cannam@0
|
237 * between the two matchers (currently one possesses the distance
|
cannam@0
|
238 * matrix and optimal path matrix).
|
cannam@0
|
239 */
|
Chris@15
|
240 Matcher(Parameters parameters, Matcher *p);
|
cannam@0
|
241
|
Chris@23
|
242 /** Constructor for Matcher using externally supplied features.
|
Chris@23
|
243 * A Matcher made using this constructor will not carry out its
|
Chris@23
|
244 * own feature extraction from frequency-domain audio data, but
|
Chris@23
|
245 * instead will accept arbitrary feature frames calculated by
|
Chris@23
|
246 * some external code.
|
Chris@23
|
247 *
|
Chris@23
|
248 * @param p The Matcher representing the performance with which
|
Chris@23
|
249 * this one is going to be matched. Some information is shared
|
Chris@23
|
250 * between the two matchers (currently one possesses the distance
|
Chris@23
|
251 * matrix and optimal path matrix).
|
Chris@23
|
252 *
|
Chris@23
|
253 * @param featureSize Number of values in each feature vector.
|
Chris@23
|
254 */
|
Chris@23
|
255 Matcher(Parameters parameters, Matcher *p, int featureSize);
|
Chris@23
|
256
|
cannam@0
|
257 ~Matcher();
|
cannam@0
|
258
|
cannam@0
|
259 /** For debugging, outputs information about the Matcher to
|
cannam@0
|
260 * standard error.
|
cannam@0
|
261 */
|
cannam@0
|
262 void print();
|
cannam@0
|
263
|
cannam@0
|
264 /** Adds a link to the Matcher object representing the performance
|
cannam@0
|
265 * which is going to be matched to this one.
|
cannam@0
|
266 *
|
cannam@0
|
267 * @param p the Matcher representing the other performance
|
cannam@0
|
268 */
|
cannam@0
|
269 void setOtherMatcher(Matcher *p) {
|
cannam@0
|
270 otherMatcher = p;
|
cannam@0
|
271 } // setOtherMatcher()
|
cannam@0
|
272
|
cannam@0
|
273 int getFrameCount() {
|
cannam@0
|
274 return frameCount;
|
cannam@0
|
275 }
|
cannam@0
|
276
|
Chris@16
|
277 /**
|
Chris@16
|
278 * Return the feature vector size that will be used for the given
|
Chris@16
|
279 * parameters.
|
Chris@16
|
280 */
|
Chris@23
|
281 static int getFeatureSizeFor(Parameters params);
|
Chris@16
|
282
|
cannam@0
|
283 protected:
|
cannam@0
|
284 template <typename T>
|
cannam@0
|
285 void initVector(vector<T> &vec, int sz, T dflt = 0) {
|
cannam@0
|
286 vec.clear();
|
cannam@0
|
287 while ((int)vec.size() < sz) vec.push_back(dflt);
|
cannam@0
|
288 }
|
cannam@0
|
289
|
cannam@0
|
290 template <typename T>
|
cannam@0
|
291 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
|
cannam@0
|
292 T dflt = 0, int fillTo = -1) {
|
cannam@0
|
293 mat.clear();
|
cannam@0
|
294 if (fillTo < 0) fillTo = hsz;
|
cannam@0
|
295 for (int i = 0; i < hsz; ++i) {
|
cannam@0
|
296 mat.push_back(vector<T>());
|
cannam@0
|
297 if (i < fillTo) {
|
cannam@0
|
298 while ((int)mat[i].size() < vsz) {
|
cannam@0
|
299 mat[i].push_back(dflt);
|
cannam@0
|
300 }
|
cannam@0
|
301 }
|
cannam@0
|
302 }
|
cannam@0
|
303 }
|
cannam@0
|
304
|
cannam@0
|
305 void init();
|
cannam@0
|
306
|
Chris@15
|
307 void makeFreqMap();
|
cannam@0
|
308
|
cannam@0
|
309 /** Creates a map of FFT frequency bins to comparison bins. Where
|
cannam@0
|
310 * the spacing of FFT bins is less than 0.5 semitones, the
|
cannam@0
|
311 * mapping is one to one. Where the spacing is greater than 0.5
|
cannam@0
|
312 * semitones, the FFT energy is mapped into semitone-wide
|
cannam@0
|
313 * bins. No scaling is performed; that is the energy is summed
|
Chris@21
|
314 * into the comparison bins. See also consumeFrame()
|
cannam@0
|
315 */
|
Chris@15
|
316 void makeStandardFrequencyMap();
|
cannam@0
|
317
|
Chris@15
|
318 void makeChromaFrequencyMap();
|
cannam@0
|
319
|
cannam@0
|
320 /** Processes a frame of audio data by first computing the STFT
|
cannam@0
|
321 * with a Hamming window, then mapping the frequency bins into a
|
cannam@0
|
322 * part-linear part-logarithmic array, then (optionally)
|
cannam@0
|
323 * computing the half-wave rectified spectral difference from the
|
cannam@0
|
324 * previous frame, then (optionally) normalising to a sum of 1,
|
cannam@0
|
325 * then calculating the distance to all frames stored in the
|
cannam@0
|
326 * otherMatcher and storing them in the distance matrix, and
|
cannam@0
|
327 * finally updating the optimal path matrix using the dynamic
|
cannam@0
|
328 * time warping algorithm.
|
Chris@14
|
329 *
|
Chris@14
|
330 * Return value is the frame (post-processed, with warping,
|
Chris@14
|
331 * rectification, and normalisation as appropriate).
|
Chris@23
|
332 *
|
Chris@23
|
333 * The Matcher must have been constructed using the constructor
|
Chris@23
|
334 * without an external featureSize parameter in order to use this
|
Chris@23
|
335 * function. (Otherwise it will be expecting you to call
|
Chris@23
|
336 * consumeFeatureVector.)
|
cannam@0
|
337 */
|
Chris@21
|
338 std::vector<double> consumeFrame(double *reBuffer, double *imBuffer);
|
cannam@0
|
339
|
Chris@23
|
340 /** Processes a feature vector frame (presumably calculated from
|
Chris@23
|
341 * audio data by some external code). As consumeFrame, except
|
Chris@23
|
342 * that it does not calculate a feature from audio data but
|
Chris@23
|
343 * instead uses the supplied feature directly.
|
Chris@23
|
344 *
|
Chris@23
|
345 * The Matcher must have been constructed using the constructor
|
Chris@23
|
346 * that accepts an external featureSize parameter in order to
|
Chris@23
|
347 * use this function. The supplied feature must be of the size
|
Chris@23
|
348 * that was passed to the constructor.
|
Chris@23
|
349 */
|
Chris@23
|
350 void consumeFeatureVector(std::vector<double> feature);
|
Chris@23
|
351
|
cannam@0
|
352 /** Retrieves values from the minimum cost matrix.
|
cannam@0
|
353 *
|
cannam@0
|
354 * @param i the frame number of this Matcher
|
cannam@0
|
355 * @param j the frame number of the other Matcher
|
cannam@0
|
356 * @return the cost of the minimum cost path to this location
|
cannam@0
|
357 */
|
cannam@0
|
358 int getValue(int i, int j, bool firstAttempt);
|
cannam@0
|
359
|
cannam@0
|
360 /** Stores entries in the distance matrix and the optimal path matrix.
|
cannam@0
|
361 *
|
cannam@0
|
362 * @param i the frame number of this Matcher
|
cannam@0
|
363 * @param j the frame number of the other Matcher
|
cannam@0
|
364 * @param dir the direction from which this position is reached with
|
cannam@0
|
365 * minimum cost
|
cannam@0
|
366 * @param value the cost of the minimum path except the current step
|
cannam@0
|
367 * @param dMN the distance cost between the two frames
|
cannam@0
|
368 */
|
cannam@0
|
369 void setValue(int i, int j, int dir, int value, int dMN);
|
cannam@0
|
370
|
Chris@21
|
371 vector<double> processFrameFromFreqData(double *, double *);
|
Chris@21
|
372 void calcAdvance();
|
Chris@21
|
373
|
Chris@26
|
374 DistanceMetric metric;
|
Chris@26
|
375
|
cannam@0
|
376 friend class MatchFeeder;
|
Chris@24
|
377 friend class MatchFeatureFeeder;
|
Chris@15
|
378 friend class Finder;
|
cannam@0
|
379
|
cannam@0
|
380 }; // class Matcher
|
cannam@0
|
381
|
cannam@0
|
382 #endif
|