cannam@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
cannam@0
|
2
|
cannam@0
|
3 /*
|
cannam@0
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
cannam@0
|
5 algorithm.
|
cannam@0
|
6
|
cannam@0
|
7 Centre for Digital Music, Queen Mary, University of London.
|
cannam@0
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
cannam@0
|
9
|
cannam@0
|
10 This program is free software; you can redistribute it and/or
|
cannam@0
|
11 modify it under the terms of the GNU General Public License as
|
cannam@0
|
12 published by the Free Software Foundation; either version 2 of the
|
cannam@0
|
13 License, or (at your option) any later version. See the file
|
cannam@0
|
14 COPYING included with this distribution for more information.
|
cannam@0
|
15 */
|
cannam@0
|
16
|
cannam@0
|
17 #ifndef _MATCHER_H_
|
cannam@0
|
18 #define _MATCHER_H_
|
cannam@0
|
19
|
cannam@0
|
20 #include <vector>
|
cannam@0
|
21 #include <iostream>
|
cannam@0
|
22 #include <sstream>
|
cannam@0
|
23 #include <cmath>
|
cannam@0
|
24
|
cannam@0
|
25 #define ADVANCE_THIS 1
|
cannam@0
|
26 #define ADVANCE_OTHER 2
|
cannam@0
|
27 #define ADVANCE_BOTH 3
|
cannam@0
|
28 #define MASK 0xfc
|
cannam@0
|
29
|
cannam@0
|
30
|
cannam@0
|
31 using std::vector;
|
cannam@0
|
32 using std::string;
|
cannam@0
|
33 using std::cerr;
|
cannam@0
|
34 using std::endl;
|
cannam@0
|
35
|
cannam@0
|
36 /** Represents an audio stream that can be matched to another audio
|
cannam@0
|
37 * stream of the same piece of music. The matching algorithm uses
|
cannam@0
|
38 * dynamic time warping. The distance metric is a Euclidean metric
|
cannam@0
|
39 * on the FFT data with the higher frequencies mapped onto a linear
|
cannam@0
|
40 * scale.
|
cannam@0
|
41 */
|
cannam@0
|
42
|
cannam@0
|
43 class Matcher
|
cannam@0
|
44 {
|
cannam@0
|
45 protected:
|
cannam@0
|
46 /** Points to the other performance with which this one is being
|
cannam@0
|
47 * compared. The data for the distance metric and the dynamic
|
cannam@0
|
48 * time warping is shared between the two matchers. In the
|
cannam@0
|
49 * original version, only one of the two performance matchers
|
cannam@0
|
50 * contained the distance metric. (See <code>first</code>)
|
cannam@0
|
51 */
|
cannam@0
|
52 Matcher *otherMatcher;
|
cannam@0
|
53
|
cannam@0
|
54 /** Indicates which performance is considered primary (the
|
cannam@0
|
55 * score). This is the performance shown on the vertical axis,
|
cannam@0
|
56 * and referred to as "this" in the codes for the direction of
|
cannam@0
|
57 * DTW steps. */
|
cannam@0
|
58 bool firstPM;
|
cannam@0
|
59
|
cannam@0
|
60 /** Sample rate of audio */
|
cannam@0
|
61 float sampleRate;
|
cannam@0
|
62
|
cannam@0
|
63 /** Onset time of the first note in the audio file, in order to
|
cannam@0
|
64 * establish synchronisation between the match file and the audio
|
cannam@0
|
65 * data. */
|
cannam@0
|
66 double matchFileOffset;
|
cannam@0
|
67
|
cannam@0
|
68 /** Flag indicating whether or not each frame of audio should be
|
cannam@0
|
69 * normalised to have a sum of 1. (Default = false). */
|
cannam@0
|
70 bool normalise1;
|
cannam@0
|
71
|
cannam@0
|
72 /** Flag indicating whether or not the distance metric for pairs
|
cannam@0
|
73 * of audio frames should be normalised by the sum of the two
|
cannam@0
|
74 * frames. (Default = false). */
|
cannam@0
|
75 bool normalise2;
|
cannam@0
|
76
|
cannam@0
|
77 /** Flag indicating whether or not each frame of audio should be
|
cannam@0
|
78 * normalised by the long term average of the summed energy.
|
cannam@0
|
79 * (Default = false; assumes normalise1 == false). */
|
cannam@0
|
80 bool normalise3;
|
cannam@0
|
81
|
cannam@0
|
82 /** Flag indicating whether or not the distance metric for pairs
|
cannam@0
|
83 * of audio frames should be normalised by the log of the sum of
|
cannam@0
|
84 * the frames. (Default = false; assumes normalise2 ==
|
cannam@0
|
85 * false). */
|
cannam@0
|
86 bool normalise4;
|
cannam@0
|
87
|
cannam@0
|
88 /** Flag indicating whether or not the half-wave rectified
|
cannam@0
|
89 * spectral difference should be used in calculating the distance
|
cannam@0
|
90 * metric for pairs of audio frames, instead of the straight
|
cannam@0
|
91 * spectrum values. (Default = true). */
|
cannam@0
|
92 bool useSpectralDifference;
|
cannam@0
|
93
|
cannam@0
|
94 bool useChromaFrequencyMap;
|
cannam@0
|
95
|
cannam@0
|
96 /** Scaling factor for distance metric; must guarantee that the
|
cannam@0
|
97 * final value fits in the data type used, that is, unsigned
|
cannam@0
|
98 * char. (Default = 16).
|
cannam@0
|
99 */
|
cannam@0
|
100 double scale;
|
cannam@0
|
101
|
cannam@0
|
102 /** Spacing of audio frames (determines the amount of overlap or
|
cannam@0
|
103 * skip between frames). This value is expressed in
|
cannam@0
|
104 * seconds. (Default = 0.020s) */
|
cannam@0
|
105 double hopTime;
|
cannam@0
|
106
|
cannam@0
|
107 /** The size of an FFT frame in seconds. (Default = 0.04644s).
|
cannam@0
|
108 * Note that the value is not taken to be precise; it is adjusted
|
cannam@0
|
109 * so that <code>fftSize</code> is always power of 2. */
|
cannam@0
|
110 double fftTime;
|
cannam@0
|
111
|
cannam@0
|
112 /** The width of the search band (error margin) around the current
|
cannam@0
|
113 * match position, measured in seconds. Strictly speaking the
|
cannam@0
|
114 * width is measured backwards from the current point, since the
|
cannam@0
|
115 * algorithm has to work causally.
|
cannam@0
|
116 */
|
cannam@0
|
117 double blockTime;
|
cannam@0
|
118
|
cannam@0
|
119 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
|
cannam@0
|
120 int hopSize;
|
cannam@0
|
121
|
cannam@0
|
122 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
|
cannam@0
|
123 int fftSize;
|
cannam@0
|
124
|
cannam@0
|
125 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
|
cannam@0
|
126 int blockSize;
|
cannam@0
|
127
|
cannam@0
|
128 /** The number of frames of audio data which have been read. */
|
cannam@0
|
129 int frameCount;
|
cannam@0
|
130
|
cannam@0
|
131 /** RMS amplitude of the current frame. */
|
cannam@0
|
132 // double frameRMS;
|
cannam@0
|
133
|
cannam@0
|
134 /** Long term average frame energy (in frequency domain
|
cannam@0
|
135 * representation). */
|
cannam@0
|
136 double ltAverage;
|
cannam@0
|
137
|
cannam@0
|
138 /** The number of frames sequentially processed by this matcher,
|
cannam@0
|
139 * without a frame of the other matcher being processed.
|
cannam@0
|
140 */
|
cannam@0
|
141 int runCount;
|
cannam@0
|
142
|
cannam@0
|
143 /** Interactive control of the matching process allows pausing
|
cannam@0
|
144 * computation of the cost matrices in one direction.
|
cannam@0
|
145 */
|
cannam@0
|
146 bool paused;
|
cannam@0
|
147
|
cannam@0
|
148 /** The total number of frames of audio data to be read. */
|
cannam@0
|
149 int maxFrames;
|
cannam@0
|
150
|
cannam@0
|
151 /** A mapping function for mapping FFT bins to final frequency
|
cannam@0
|
152 * bins. The mapping is linear (1-1) until the resolution
|
cannam@0
|
153 * reaches 2 points per semitone, then logarithmic with a
|
cannam@0
|
154 * semitone resolution. e.g. for 44.1kHz sampling rate and
|
cannam@0
|
155 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
|
cannam@0
|
156 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
|
cannam@0
|
157 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
|
cannam@0
|
158 * where all energy above note 127 is mapped into the final
|
cannam@0
|
159 * bin. */
|
cannam@0
|
160 vector<int> freqMap;
|
cannam@0
|
161
|
cannam@0
|
162 /** The number of entries in <code>freqMap</code>. Note that the
|
cannam@0
|
163 * length of the array is greater, because its size is not known
|
cannam@0
|
164 * at creation time. */
|
cannam@0
|
165 int freqMapSize;
|
cannam@0
|
166
|
cannam@0
|
167 /** The most recent frame; used for calculating the frame to frame
|
cannam@0
|
168 * spectral difference. */
|
cannam@0
|
169 vector<double> prevFrame;
|
cannam@0
|
170 vector<double> newFrame;
|
cannam@0
|
171
|
cannam@0
|
172 /** A block of previously seen frames are stored in this structure
|
cannam@0
|
173 * for calculation of the distance matrix as the new frames are
|
cannam@0
|
174 * read in. One can think of the structure of the array as a
|
cannam@0
|
175 * circular buffer of vectors. The last element of each vector
|
cannam@0
|
176 * stores the total energy. */
|
cannam@0
|
177 vector<vector<double> > frames;
|
cannam@0
|
178
|
cannam@0
|
179 /** The best path cost matrix. */
|
cannam@0
|
180 int **bestPathCost;
|
cannam@0
|
181
|
cannam@0
|
182 /** The distance matrix. */
|
cannam@0
|
183 unsigned char **distance;
|
cannam@0
|
184
|
cannam@0
|
185 /** The bounds of each row of data in the distance and path cost matrices.*/
|
cannam@0
|
186 int *first;
|
cannam@0
|
187 int *last;
|
cannam@0
|
188
|
cannam@0
|
189 /** Height of each column in distance and bestPathCost matrices */
|
cannam@0
|
190 int *distYSizes;
|
cannam@0
|
191
|
cannam@0
|
192 /** Width of distance and bestPathCost matrices and first and last vectors */
|
cannam@0
|
193 int distXSize;
|
cannam@0
|
194
|
cannam@0
|
195 /** Total number of audio frames, or -1 for live or compressed input. */
|
cannam@0
|
196 long fileLength;
|
cannam@0
|
197
|
cannam@0
|
198 bool initialised;
|
cannam@0
|
199
|
cannam@0
|
200 //!!! bool atEnd; //!!!
|
cannam@0
|
201
|
cannam@0
|
202 /** Disable or enable debugging output */
|
cannam@0
|
203 static bool silent;
|
cannam@0
|
204
|
cannam@0
|
205 static const double decay = 0.99;
|
cannam@0
|
206 static const double silenceThreshold = 0.0004;
|
cannam@0
|
207 static const int MAX_RUN_COUNT = 3;
|
cannam@0
|
208
|
cannam@0
|
209 friend class Finder; //!!!
|
cannam@0
|
210
|
cannam@0
|
211 public:
|
cannam@0
|
212 /** Constructor for Matcher.
|
cannam@0
|
213 *
|
cannam@0
|
214 * @param p The Matcher representing the performance with which
|
cannam@0
|
215 * this one is going to be matched. Some information is shared
|
cannam@0
|
216 * between the two matchers (currently one possesses the distance
|
cannam@0
|
217 * matrix and optimal path matrix).
|
cannam@0
|
218 */
|
cannam@0
|
219 Matcher(float rate, Matcher *p);
|
cannam@0
|
220
|
cannam@0
|
221 ~Matcher();
|
cannam@0
|
222
|
cannam@0
|
223 /** For debugging, outputs information about the Matcher to
|
cannam@0
|
224 * standard error.
|
cannam@0
|
225 */
|
cannam@0
|
226 void print();
|
cannam@0
|
227
|
cannam@0
|
228 /** Gives some basic `header' information about the Matcher. */
|
cannam@0
|
229 string toString();
|
cannam@0
|
230
|
cannam@0
|
231 /** Adds a link to the Matcher object representing the performance
|
cannam@0
|
232 * which is going to be matched to this one.
|
cannam@0
|
233 *
|
cannam@0
|
234 * @param p the Matcher representing the other performance
|
cannam@0
|
235 */
|
cannam@0
|
236 void setOtherMatcher(Matcher *p) {
|
cannam@0
|
237 otherMatcher = p;
|
cannam@0
|
238 } // setOtherMatcher()
|
cannam@0
|
239
|
cannam@0
|
240 int getFFTSize() {
|
cannam@0
|
241 return fftSize;
|
cannam@0
|
242 }
|
cannam@0
|
243
|
cannam@0
|
244 int getHopSize() {
|
cannam@0
|
245 return hopSize;
|
cannam@0
|
246 }
|
cannam@0
|
247
|
cannam@0
|
248 int getFrameCount() {
|
cannam@0
|
249 return frameCount;
|
cannam@0
|
250 }
|
cannam@0
|
251
|
cannam@1
|
252 void setHopSize(int);
|
cannam@1
|
253
|
cannam@0
|
254 protected:
|
cannam@0
|
255 template <typename T>
|
cannam@0
|
256 void initVector(vector<T> &vec, int sz, T dflt = 0) {
|
cannam@0
|
257 std::cerr << "initVector: " << sz << " * " << sizeof(T) << " = "
|
cannam@0
|
258 << sz * sizeof(T) << std::endl;
|
cannam@0
|
259 vec.clear();
|
cannam@0
|
260 while ((int)vec.size() < sz) vec.push_back(dflt);
|
cannam@0
|
261 }
|
cannam@0
|
262
|
cannam@0
|
263 template <typename T>
|
cannam@0
|
264 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
|
cannam@0
|
265 T dflt = 0, int fillTo = -1) {
|
cannam@0
|
266 std::cerr << "initMatrix: " << hsz << " * " << vsz << " * "
|
cannam@0
|
267 << sizeof(T) << " = "
|
cannam@0
|
268 << hsz * vsz * sizeof(T) << std::endl;
|
cannam@0
|
269 mat.clear();
|
cannam@0
|
270 if (fillTo < 0) fillTo = hsz;
|
cannam@0
|
271 for (int i = 0; i < hsz; ++i) {
|
cannam@0
|
272 mat.push_back(vector<T>());
|
cannam@0
|
273 if (i < fillTo) {
|
cannam@0
|
274 while ((int)mat[i].size() < vsz) {
|
cannam@0
|
275 mat[i].push_back(dflt);
|
cannam@0
|
276 }
|
cannam@0
|
277 }
|
cannam@0
|
278 }
|
cannam@0
|
279 }
|
cannam@0
|
280
|
cannam@0
|
281 void init();
|
cannam@0
|
282
|
cannam@0
|
283 void makeFreqMap(int fftSize, float sampleRate);
|
cannam@0
|
284
|
cannam@0
|
285 /** Creates a map of FFT frequency bins to comparison bins. Where
|
cannam@0
|
286 * the spacing of FFT bins is less than 0.5 semitones, the
|
cannam@0
|
287 * mapping is one to one. Where the spacing is greater than 0.5
|
cannam@0
|
288 * semitones, the FFT energy is mapped into semitone-wide
|
cannam@0
|
289 * bins. No scaling is performed; that is the energy is summed
|
cannam@0
|
290 * into the comparison bins. See also processFrame()
|
cannam@0
|
291 */
|
cannam@0
|
292 void makeStandardFrequencyMap(int fftSize, float sampleRate);
|
cannam@0
|
293
|
cannam@0
|
294 void makeChromaFrequencyMap(int fftSize, float sampleRate);
|
cannam@0
|
295
|
cannam@0
|
296 /** Processes a frame of audio data by first computing the STFT
|
cannam@0
|
297 * with a Hamming window, then mapping the frequency bins into a
|
cannam@0
|
298 * part-linear part-logarithmic array, then (optionally)
|
cannam@0
|
299 * computing the half-wave rectified spectral difference from the
|
cannam@0
|
300 * previous frame, then (optionally) normalising to a sum of 1,
|
cannam@0
|
301 * then calculating the distance to all frames stored in the
|
cannam@0
|
302 * otherMatcher and storing them in the distance matrix, and
|
cannam@0
|
303 * finally updating the optimal path matrix using the dynamic
|
cannam@0
|
304 * time warping algorithm.
|
cannam@0
|
305 */
|
cannam@0
|
306 void processFrame(double *reBuffer, double *imBuffer);
|
cannam@0
|
307
|
cannam@0
|
308 /** Calculates the Manhattan distance between two vectors, with an
|
cannam@0
|
309 * optional normalisation by the combined values in the
|
cannam@0
|
310 * vectors. Since the vectors contain energy, this could be
|
cannam@0
|
311 * considered as a squared Euclidean distance metric. Note that
|
cannam@0
|
312 * normalisation assumes the values are all non-negative.
|
cannam@0
|
313 *
|
cannam@0
|
314 * @param f1 one of the vectors involved in the distance calculation
|
cannam@0
|
315 * @param f2 one of the vectors involved in the distance calculation
|
cannam@0
|
316 * @return the distance, scaled and truncated to an integer
|
cannam@0
|
317 */
|
cannam@0
|
318 int calcDistance(const vector<double> &f1, const vector<double> &f2);
|
cannam@0
|
319
|
cannam@0
|
320 /** Retrieves values from the minimum cost matrix.
|
cannam@0
|
321 *
|
cannam@0
|
322 * @param i the frame number of this Matcher
|
cannam@0
|
323 * @param j the frame number of the other Matcher
|
cannam@0
|
324 * @return the cost of the minimum cost path to this location
|
cannam@0
|
325 */
|
cannam@0
|
326 int getValue(int i, int j, bool firstAttempt);
|
cannam@0
|
327
|
cannam@0
|
328 /** Stores entries in the distance matrix and the optimal path matrix.
|
cannam@0
|
329 *
|
cannam@0
|
330 * @param i the frame number of this Matcher
|
cannam@0
|
331 * @param j the frame number of the other Matcher
|
cannam@0
|
332 * @param dir the direction from which this position is reached with
|
cannam@0
|
333 * minimum cost
|
cannam@0
|
334 * @param value the cost of the minimum path except the current step
|
cannam@0
|
335 * @param dMN the distance cost between the two frames
|
cannam@0
|
336 */
|
cannam@0
|
337 void setValue(int i, int j, int dir, int value, int dMN);
|
cannam@0
|
338
|
cannam@0
|
339 friend class MatchFeeder;
|
cannam@0
|
340
|
cannam@0
|
341 }; // class Matcher
|
cannam@0
|
342
|
cannam@0
|
343 #endif
|