cannam@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
cannam@0
|
2
|
cannam@0
|
3 /*
|
cannam@0
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
cannam@0
|
5 algorithm.
|
cannam@0
|
6
|
cannam@0
|
7 Centre for Digital Music, Queen Mary, University of London.
|
cannam@0
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
cannam@0
|
9
|
cannam@0
|
10 This program is free software; you can redistribute it and/or
|
cannam@0
|
11 modify it under the terms of the GNU General Public License as
|
cannam@0
|
12 published by the Free Software Foundation; either version 2 of the
|
cannam@0
|
13 License, or (at your option) any later version. See the file
|
cannam@0
|
14 COPYING included with this distribution for more information.
|
cannam@0
|
15 */
|
cannam@0
|
16
|
cannam@0
|
17 #ifndef _MATCHER_H_
|
cannam@0
|
18 #define _MATCHER_H_
|
cannam@0
|
19
|
cannam@0
|
20 #include <vector>
|
cannam@0
|
21 #include <iostream>
|
cannam@0
|
22 #include <sstream>
|
cannam@0
|
23 #include <cmath>
|
cannam@0
|
24
|
cannam@0
|
25 #define ADVANCE_THIS 1
|
cannam@0
|
26 #define ADVANCE_OTHER 2
|
cannam@0
|
27 #define ADVANCE_BOTH 3
|
cannam@0
|
28 #define MASK 0xfc
|
cannam@0
|
29
|
cannam@0
|
30
|
cannam@0
|
31 using std::vector;
|
cannam@0
|
32 using std::string;
|
cannam@0
|
33 using std::cerr;
|
cannam@0
|
34 using std::endl;
|
cannam@0
|
35
|
cannam@0
|
36 /** Represents an audio stream that can be matched to another audio
|
cannam@0
|
37 * stream of the same piece of music. The matching algorithm uses
|
cannam@0
|
38 * dynamic time warping. The distance metric is a Euclidean metric
|
cannam@0
|
39 * on the FFT data with the higher frequencies mapped onto a linear
|
cannam@0
|
40 * scale.
|
cannam@0
|
41 */
|
cannam@0
|
42 class Matcher
|
cannam@0
|
43 {
|
Chris@15
|
44 public:
|
Chris@15
|
45 enum FrameNormalisation {
|
Chris@15
|
46
|
Chris@15
|
47 /** Do not normalise audio frames */
|
Chris@15
|
48 NoFrameNormalisation,
|
Chris@15
|
49
|
Chris@15
|
50 /** Normalise each frame of audio to have a sum of 1 */
|
Chris@15
|
51 NormaliseFrameToSum1,
|
Chris@15
|
52
|
Chris@15
|
53 /** Normalise each frame of audio by the long-term average
|
Chris@15
|
54 * of the summed energy */
|
Chris@15
|
55 NormaliseFrameToLTAverage,
|
Chris@15
|
56 };
|
Chris@15
|
57
|
Chris@15
|
58 enum DistanceNormalisation {
|
Chris@15
|
59
|
Chris@15
|
60 /** Do not normalise distance metrics */
|
Chris@15
|
61 NoDistanceNormalisation,
|
Chris@15
|
62
|
Chris@15
|
63 /** Normalise distance metric for pairs of audio frames by
|
Chris@15
|
64 * the sum of the two frames. */
|
Chris@15
|
65 NormaliseDistanceToSum,
|
Chris@15
|
66
|
Chris@15
|
67 /** Normalise distance metric for pairs of audio frames by
|
Chris@15
|
68 * the log of the sum of the frames. */
|
Chris@15
|
69 NormaliseDistanceToLogSum,
|
Chris@15
|
70 };
|
Chris@15
|
71
|
Chris@15
|
72 struct Parameters {
|
Chris@15
|
73
|
Chris@15
|
74 Parameters(float rate_, double hopTime_, int fftSize_) :
|
Chris@15
|
75 sampleRate(rate_),
|
Chris@15
|
76 frameNorm(NormaliseFrameToSum1),
|
Chris@15
|
77 distanceNorm(NormaliseDistanceToLogSum),
|
Chris@15
|
78 useSpectralDifference(true),
|
Chris@15
|
79 useChromaFrequencyMap(false),
|
Chris@15
|
80 hopTime(hopTime_),
|
Chris@15
|
81 fftSize(fftSize_),
|
Chris@15
|
82 blockTime(10.0),
|
Chris@15
|
83 silenceThreshold(0.01),
|
Chris@15
|
84 decay(0.99),
|
Chris@15
|
85 maxRunCount(3)
|
Chris@15
|
86 {}
|
Chris@15
|
87
|
Chris@15
|
88 /** Sample rate of audio */
|
Chris@15
|
89 float sampleRate;
|
Chris@15
|
90
|
Chris@15
|
91 /** Type of audio frame normalisation */
|
Chris@15
|
92 FrameNormalisation frameNorm;
|
Chris@15
|
93
|
Chris@15
|
94 /** Type of distance metric normalisation */
|
Chris@15
|
95 DistanceNormalisation distanceNorm;
|
Chris@15
|
96
|
Chris@15
|
97 /** Flag indicating whether or not the half-wave rectified
|
Chris@15
|
98 * spectral difference should be used in calculating the
|
Chris@15
|
99 * distance metric for pairs of audio frames, instead of the
|
Chris@15
|
100 * straight spectrum values. */
|
Chris@15
|
101 bool useSpectralDifference;
|
Chris@15
|
102
|
Chris@15
|
103 /** Flag indicating whether to use a chroma frequency map (12
|
Chris@15
|
104 * bins) instead of the default warped spectrogram */
|
Chris@15
|
105 bool useChromaFrequencyMap;
|
Chris@15
|
106
|
Chris@15
|
107 /** Spacing of audio frames (determines the amount of overlap or
|
Chris@15
|
108 * skip between frames). This value is expressed in
|
Chris@15
|
109 * seconds. */
|
Chris@15
|
110 double hopTime;
|
Chris@15
|
111
|
Chris@15
|
112 /** Size of an FFT frame in samples. Note that the data passed
|
Chris@15
|
113 * in to Matcher is already in the frequency domain, so this
|
Chris@15
|
114 * expresses the size of the frame that the caller will be
|
Chris@15
|
115 * providing.
|
Chris@15
|
116 */
|
Chris@15
|
117 int fftSize;
|
Chris@15
|
118
|
Chris@15
|
119 /** The width of the search band (error margin) around the current
|
Chris@15
|
120 * match position, measured in seconds. Strictly speaking the
|
Chris@15
|
121 * width is measured backwards from the current point, since the
|
Chris@15
|
122 * algorithm has to work causally.
|
Chris@15
|
123 */
|
Chris@15
|
124 double blockTime;
|
Chris@15
|
125
|
Chris@15
|
126 /** RMS level below which frame is considered silent */
|
Chris@15
|
127 double silenceThreshold;
|
Chris@15
|
128
|
Chris@15
|
129 /** Frame-to-frame decay factor in calculating long-term average */
|
Chris@15
|
130 double decay;
|
Chris@15
|
131
|
Chris@15
|
132 /** Maximum number of frames sequentially processed by this
|
Chris@15
|
133 * matcher, without a frame of the other matcher being
|
Chris@15
|
134 * processed.
|
Chris@15
|
135 */
|
Chris@15
|
136 int maxRunCount;
|
Chris@15
|
137 };
|
Chris@15
|
138
|
cannam@0
|
139 protected:
|
cannam@0
|
140 /** Points to the other performance with which this one is being
|
cannam@0
|
141 * compared. The data for the distance metric and the dynamic
|
cannam@0
|
142 * time warping is shared between the two matchers. In the
|
cannam@0
|
143 * original version, only one of the two performance matchers
|
cannam@0
|
144 * contained the distance metric. (See <code>first</code>)
|
cannam@0
|
145 */
|
cannam@0
|
146 Matcher *otherMatcher;
|
cannam@0
|
147
|
cannam@0
|
148 /** Indicates which performance is considered primary (the
|
cannam@0
|
149 * score). This is the performance shown on the vertical axis,
|
cannam@0
|
150 * and referred to as "this" in the codes for the direction of
|
cannam@0
|
151 * DTW steps. */
|
cannam@0
|
152 bool firstPM;
|
cannam@0
|
153
|
Chris@15
|
154 /** Configuration parameters */
|
Chris@15
|
155 Parameters params;
|
cannam@0
|
156
|
cannam@0
|
157 /** Scaling factor for distance metric; must guarantee that the
|
cannam@0
|
158 * final value fits in the data type used, that is, unsigned
|
Chris@15
|
159 * char.
|
cannam@0
|
160 */
|
cannam@0
|
161 double scale;
|
cannam@0
|
162
|
cannam@0
|
163 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
|
cannam@0
|
164 int blockSize;
|
cannam@0
|
165
|
cannam@0
|
166 /** The number of frames of audio data which have been read. */
|
cannam@0
|
167 int frameCount;
|
cannam@0
|
168
|
cannam@0
|
169 /** Long term average frame energy (in frequency domain
|
cannam@0
|
170 * representation). */
|
cannam@0
|
171 double ltAverage;
|
cannam@0
|
172
|
cannam@0
|
173 /** The number of frames sequentially processed by this matcher,
|
cannam@0
|
174 * without a frame of the other matcher being processed.
|
cannam@0
|
175 */
|
cannam@0
|
176 int runCount;
|
cannam@0
|
177
|
cannam@0
|
178 /** A mapping function for mapping FFT bins to final frequency
|
cannam@0
|
179 * bins. The mapping is linear (1-1) until the resolution
|
cannam@0
|
180 * reaches 2 points per semitone, then logarithmic with a
|
cannam@0
|
181 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
cannam@0
|
182 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
cannam@0
|
183 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
cannam@0
|
184 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
cannam@0
|
185 * where all energy above note 127 is mapped into the final
|
cannam@0
|
186 * bin. */
|
cannam@0
|
187 vector<int> freqMap;
|
cannam@0
|
188
|
cannam@0
|
189 /** The number of entries in <code>freqMap</code>. Note that the
|
cannam@0
|
190 * length of the array is greater, because its size is not known
|
cannam@0
|
191 * at creation time. */
|
cannam@0
|
192 int freqMapSize;
|
cannam@0
|
193
|
cannam@0
|
194 /** The most recent frame; used for calculating the frame to frame
|
Chris@13
|
195 * spectral difference. These are therefore frequency warped but
|
Chris@13
|
196 * not yet normalised. */
|
cannam@0
|
197 vector<double> prevFrame;
|
cannam@0
|
198 vector<double> newFrame;
|
cannam@0
|
199
|
cannam@0
|
200 /** A block of previously seen frames are stored in this structure
|
cannam@0
|
201 * for calculation of the distance matrix as the new frames are
|
cannam@0
|
202 * read in. One can think of the structure of the array as a
|
Chris@13
|
203 * circular buffer of vectors. These are the frames with all
|
Chris@13
|
204 * applicable processing applied (e.g. spectral difference,
|
Chris@13
|
205 * normalisation), unlike prevFrame and newFrame. The total
|
Chris@13
|
206 * energy of frames[i] is stored in totalEnergies[i]. */
|
cannam@0
|
207 vector<vector<double> > frames;
|
cannam@0
|
208
|
Chris@13
|
209 /** The total energy of each frame in the frames block. */
|
Chris@13
|
210 vector<double> totalEnergies;
|
Chris@13
|
211
|
cannam@0
|
212 /** The best path cost matrix. */
|
cannam@0
|
213 int **bestPathCost;
|
cannam@0
|
214
|
cannam@0
|
215 /** The distance matrix. */
|
cannam@0
|
216 unsigned char **distance;
|
cannam@0
|
217
|
cannam@0
|
218 /** The bounds of each row of data in the distance and path cost matrices.*/
|
cannam@0
|
219 int *first;
|
cannam@0
|
220 int *last;
|
cannam@0
|
221
|
cannam@0
|
222 /** Height of each column in distance and bestPathCost matrices */
|
cannam@0
|
223 int *distYSizes;
|
cannam@0
|
224
|
cannam@0
|
225 /** Width of distance and bestPathCost matrices and first and last vectors */
|
cannam@0
|
226 int distXSize;
|
cannam@0
|
227
|
cannam@0
|
228 bool initialised;
|
cannam@0
|
229
|
cannam@0
|
230 /** Disable or enable debugging output */
|
cannam@0
|
231 static bool silent;
|
cannam@0
|
232
|
cannam@0
|
233 public:
|
cannam@0
|
234 /** Constructor for Matcher.
|
cannam@0
|
235 *
|
cannam@0
|
236 * @param p The Matcher representing the performance with which
|
cannam@0
|
237 * this one is going to be matched. Some information is shared
|
cannam@0
|
238 * between the two matchers (currently one possesses the distance
|
cannam@0
|
239 * matrix and optimal path matrix).
|
cannam@0
|
240 */
|
Chris@15
|
241 Matcher(Parameters parameters, Matcher *p);
|
cannam@0
|
242
|
cannam@0
|
243 ~Matcher();
|
cannam@0
|
244
|
cannam@0
|
245 /** For debugging, outputs information about the Matcher to
|
cannam@0
|
246 * standard error.
|
cannam@0
|
247 */
|
cannam@0
|
248 void print();
|
cannam@0
|
249
|
cannam@0
|
250 /** Gives some basic `header' information about the Matcher. */
|
cannam@0
|
251 string toString();
|
cannam@0
|
252
|
cannam@0
|
253 /** Adds a link to the Matcher object representing the performance
|
cannam@0
|
254 * which is going to be matched to this one.
|
cannam@0
|
255 *
|
cannam@0
|
256 * @param p the Matcher representing the other performance
|
cannam@0
|
257 */
|
cannam@0
|
258 void setOtherMatcher(Matcher *p) {
|
cannam@0
|
259 otherMatcher = p;
|
cannam@0
|
260 } // setOtherMatcher()
|
cannam@0
|
261
|
cannam@0
|
262 int getFrameCount() {
|
cannam@0
|
263 return frameCount;
|
cannam@0
|
264 }
|
cannam@0
|
265
|
cannam@0
|
266 protected:
|
cannam@0
|
267 template <typename T>
|
cannam@0
|
268 void initVector(vector<T> &vec, int sz, T dflt = 0) {
|
cannam@0
|
269 vec.clear();
|
cannam@0
|
270 while ((int)vec.size() < sz) vec.push_back(dflt);
|
cannam@0
|
271 }
|
cannam@0
|
272
|
cannam@0
|
273 template <typename T>
|
cannam@0
|
274 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
|
cannam@0
|
275 T dflt = 0, int fillTo = -1) {
|
cannam@0
|
276 mat.clear();
|
cannam@0
|
277 if (fillTo < 0) fillTo = hsz;
|
cannam@0
|
278 for (int i = 0; i < hsz; ++i) {
|
cannam@0
|
279 mat.push_back(vector<T>());
|
cannam@0
|
280 if (i < fillTo) {
|
cannam@0
|
281 while ((int)mat[i].size() < vsz) {
|
cannam@0
|
282 mat[i].push_back(dflt);
|
cannam@0
|
283 }
|
cannam@0
|
284 }
|
cannam@0
|
285 }
|
cannam@0
|
286 }
|
cannam@0
|
287
|
cannam@0
|
288 void init();
|
cannam@0
|
289
|
Chris@15
|
290 void makeFreqMap();
|
cannam@0
|
291
|
cannam@0
|
292 /** Creates a map of FFT frequency bins to comparison bins. Where
|
cannam@0
|
293 * the spacing of FFT bins is less than 0.5 semitones, the
|
cannam@0
|
294 * mapping is one to one. Where the spacing is greater than 0.5
|
cannam@0
|
295 * semitones, the FFT energy is mapped into semitone-wide
|
cannam@0
|
296 * bins. No scaling is performed; that is the energy is summed
|
cannam@0
|
297 * into the comparison bins. See also processFrame()
|
cannam@0
|
298 */
|
Chris@15
|
299 void makeStandardFrequencyMap();
|
cannam@0
|
300
|
Chris@15
|
301 void makeChromaFrequencyMap();
|
cannam@0
|
302
|
cannam@0
|
303 /** Processes a frame of audio data by first computing the STFT
|
cannam@0
|
304 * with a Hamming window, then mapping the frequency bins into a
|
cannam@0
|
305 * part-linear part-logarithmic array, then (optionally)
|
cannam@0
|
306 * computing the half-wave rectified spectral difference from the
|
cannam@0
|
307 * previous frame, then (optionally) normalising to a sum of 1,
|
cannam@0
|
308 * then calculating the distance to all frames stored in the
|
cannam@0
|
309 * otherMatcher and storing them in the distance matrix, and
|
cannam@0
|
310 * finally updating the optimal path matrix using the dynamic
|
cannam@0
|
311 * time warping algorithm.
|
Chris@14
|
312 *
|
Chris@14
|
313 * Return value is the frame (post-processed, with warping,
|
Chris@14
|
314 * rectification, and normalisation as appropriate).
|
cannam@0
|
315 */
|
Chris@14
|
316 std::vector<double> processFrame(double *reBuffer, double *imBuffer);
|
cannam@0
|
317
|
cannam@0
|
318 /** Calculates the Manhattan distance between two vectors, with an
|
cannam@0
|
319 * optional normalisation by the combined values in the
|
cannam@0
|
320 * vectors. Since the vectors contain energy, this could be
|
cannam@0
|
321 * considered as a squared Euclidean distance metric. Note that
|
cannam@0
|
322 * normalisation assumes the values are all non-negative.
|
cannam@0
|
323 *
|
cannam@0
|
324 * @param f1 one of the vectors involved in the distance calculation
|
cannam@0
|
325 * @param f2 one of the vectors involved in the distance calculation
|
cannam@0
|
326 * @return the distance, scaled and truncated to an integer
|
cannam@0
|
327 */
|
cannam@0
|
328 int calcDistance(const vector<double> &f1, const vector<double> &f2);
|
cannam@0
|
329
|
cannam@0
|
330 /** Retrieves values from the minimum cost matrix.
|
cannam@0
|
331 *
|
cannam@0
|
332 * @param i the frame number of this Matcher
|
cannam@0
|
333 * @param j the frame number of the other Matcher
|
cannam@0
|
334 * @return the cost of the minimum cost path to this location
|
cannam@0
|
335 */
|
cannam@0
|
336 int getValue(int i, int j, bool firstAttempt);
|
cannam@0
|
337
|
cannam@0
|
338 /** Stores entries in the distance matrix and the optimal path matrix.
|
cannam@0
|
339 *
|
cannam@0
|
340 * @param i the frame number of this Matcher
|
cannam@0
|
341 * @param j the frame number of the other Matcher
|
cannam@0
|
342 * @param dir the direction from which this position is reached with
|
cannam@0
|
343 * minimum cost
|
cannam@0
|
344 * @param value the cost of the minimum path except the current step
|
cannam@0
|
345 * @param dMN the distance cost between the two frames
|
cannam@0
|
346 */
|
cannam@0
|
347 void setValue(int i, int j, int dir, int value, int dMN);
|
cannam@0
|
348
|
cannam@0
|
349 friend class MatchFeeder;
|
Chris@15
|
350 friend class Finder;
|
cannam@0
|
351
|
cannam@0
|
352 }; // class Matcher
|
cannam@0
|
353
|
cannam@0
|
354 #endif
|