annotate Matcher.h @ 13:66082cc488c3

Pull out total energies into a separate vector (so that the frame vectors only contain the real features)
author Chris Cannam
date Fri, 10 Oct 2014 10:52:07 +0100
parents b3ec48d9cd2e
children cdead4a52755
rev   line source
cannam@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
cannam@0 2
cannam@0 3 /*
cannam@0 4 Vamp feature extraction plugin using the MATCH audio alignment
cannam@0 5 algorithm.
cannam@0 6
cannam@0 7 Centre for Digital Music, Queen Mary, University of London.
cannam@0 8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
cannam@0 9
cannam@0 10 This program is free software; you can redistribute it and/or
cannam@0 11 modify it under the terms of the GNU General Public License as
cannam@0 12 published by the Free Software Foundation; either version 2 of the
cannam@0 13 License, or (at your option) any later version. See the file
cannam@0 14 COPYING included with this distribution for more information.
cannam@0 15 */
cannam@0 16
cannam@0 17 #ifndef _MATCHER_H_
cannam@0 18 #define _MATCHER_H_
cannam@0 19
cannam@0 20 #include <vector>
cannam@0 21 #include <iostream>
cannam@0 22 #include <sstream>
cannam@0 23 #include <cmath>
cannam@0 24
cannam@0 25 #define ADVANCE_THIS 1
cannam@0 26 #define ADVANCE_OTHER 2
cannam@0 27 #define ADVANCE_BOTH 3
cannam@0 28 #define MASK 0xfc
cannam@0 29
cannam@0 30
cannam@0 31 using std::vector;
cannam@0 32 using std::string;
cannam@0 33 using std::cerr;
cannam@0 34 using std::endl;
cannam@0 35
cannam@0 36 /** Represents an audio stream that can be matched to another audio
cannam@0 37 * stream of the same piece of music. The matching algorithm uses
cannam@0 38 * dynamic time warping. The distance metric is a Euclidean metric
cannam@0 39 * on the FFT data with the higher frequencies mapped onto a linear
cannam@0 40 * scale.
cannam@0 41 */
cannam@0 42
cannam@0 43 class Matcher
cannam@0 44 {
cannam@0 45 protected:
cannam@0 46 /** Points to the other performance with which this one is being
cannam@0 47 * compared. The data for the distance metric and the dynamic
cannam@0 48 * time warping is shared between the two matchers. In the
cannam@0 49 * original version, only one of the two performance matchers
cannam@0 50 * contained the distance metric. (See <code>first</code>)
cannam@0 51 */
cannam@0 52 Matcher *otherMatcher;
cannam@0 53
cannam@0 54 /** Indicates which performance is considered primary (the
cannam@0 55 * score). This is the performance shown on the vertical axis,
cannam@0 56 * and referred to as "this" in the codes for the direction of
cannam@0 57 * DTW steps. */
cannam@0 58 bool firstPM;
cannam@0 59
cannam@0 60 /** Sample rate of audio */
cannam@0 61 float sampleRate;
cannam@0 62
cannam@0 63 /** Onset time of the first note in the audio file, in order to
cannam@0 64 * establish synchronisation between the match file and the audio
cannam@0 65 * data. */
cannam@0 66 double matchFileOffset;
cannam@0 67
cannam@0 68 /** Flag indicating whether or not each frame of audio should be
cannam@0 69 * normalised to have a sum of 1. (Default = false). */
cannam@0 70 bool normalise1;
cannam@0 71
cannam@0 72 /** Flag indicating whether or not the distance metric for pairs
cannam@0 73 * of audio frames should be normalised by the sum of the two
cannam@0 74 * frames. (Default = false). */
cannam@0 75 bool normalise2;
cannam@0 76
cannam@0 77 /** Flag indicating whether or not each frame of audio should be
cannam@0 78 * normalised by the long term average of the summed energy.
cannam@0 79 * (Default = false; assumes normalise1 == false). */
cannam@0 80 bool normalise3;
cannam@0 81
cannam@0 82 /** Flag indicating whether or not the distance metric for pairs
cannam@0 83 * of audio frames should be normalised by the log of the sum of
cannam@0 84 * the frames. (Default = false; assumes normalise2 ==
cannam@0 85 * false). */
cannam@0 86 bool normalise4;
cannam@0 87
cannam@0 88 /** Flag indicating whether or not the half-wave rectified
cannam@0 89 * spectral difference should be used in calculating the distance
cannam@0 90 * metric for pairs of audio frames, instead of the straight
cannam@0 91 * spectrum values. (Default = true). */
cannam@0 92 bool useSpectralDifference;
cannam@0 93
cannam@0 94 bool useChromaFrequencyMap;
cannam@0 95
cannam@0 96 /** Scaling factor for distance metric; must guarantee that the
cannam@0 97 * final value fits in the data type used, that is, unsigned
cannam@0 98 * char. (Default = 16).
cannam@0 99 */
cannam@0 100 double scale;
cannam@0 101
cannam@0 102 /** Spacing of audio frames (determines the amount of overlap or
cannam@0 103 * skip between frames). This value is expressed in
cannam@0 104 * seconds. (Default = 0.020s) */
cannam@0 105 double hopTime;
cannam@0 106
cannam@0 107 /** The size of an FFT frame in seconds. (Default = 0.04644s).
cannam@0 108 * Note that the value is not taken to be precise; it is adjusted
cannam@0 109 * so that <code>fftSize</code> is always power of 2. */
cannam@0 110 double fftTime;
cannam@0 111
cannam@0 112 /** The width of the search band (error margin) around the current
cannam@0 113 * match position, measured in seconds. Strictly speaking the
cannam@0 114 * width is measured backwards from the current point, since the
cannam@0 115 * algorithm has to work causally.
cannam@0 116 */
cannam@0 117 double blockTime;
cannam@0 118
cannam@0 119 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
cannam@0 120 int hopSize;
cannam@0 121
cannam@0 122 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
cannam@0 123 int fftSize;
cannam@0 124
cannam@0 125 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
cannam@0 126 int blockSize;
cannam@0 127
cannam@0 128 /** The number of frames of audio data which have been read. */
cannam@0 129 int frameCount;
cannam@0 130
cannam@0 131 /** RMS amplitude of the current frame. */
cannam@0 132 // double frameRMS;
cannam@0 133
cannam@0 134 /** Long term average frame energy (in frequency domain
cannam@0 135 * representation). */
cannam@0 136 double ltAverage;
cannam@0 137
cannam@0 138 /** The number of frames sequentially processed by this matcher,
cannam@0 139 * without a frame of the other matcher being processed.
cannam@0 140 */
cannam@0 141 int runCount;
cannam@0 142
cannam@0 143 /** Interactive control of the matching process allows pausing
cannam@0 144 * computation of the cost matrices in one direction.
cannam@0 145 */
cannam@0 146 bool paused;
cannam@0 147
cannam@0 148 /** The total number of frames of audio data to be read. */
cannam@0 149 int maxFrames;
cannam@0 150
cannam@0 151 /** A mapping function for mapping FFT bins to final frequency
cannam@0 152 * bins. The mapping is linear (1-1) until the resolution
cannam@0 153 * reaches 2 points per semitone, then logarithmic with a
cannam@0 154 * semitone resolution. e.g. for 44.1kHz sampling rate and
cannam@0 155 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
cannam@0 156 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
cannam@0 157 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
cannam@0 158 * where all energy above note 127 is mapped into the final
cannam@0 159 * bin. */
cannam@0 160 vector<int> freqMap;
cannam@0 161
cannam@0 162 /** The number of entries in <code>freqMap</code>. Note that the
cannam@0 163 * length of the array is greater, because its size is not known
cannam@0 164 * at creation time. */
cannam@0 165 int freqMapSize;
cannam@0 166
cannam@0 167 /** The most recent frame; used for calculating the frame to frame
Chris@13 168 * spectral difference. These are therefore frequency warped but
Chris@13 169 * not yet normalised. */
cannam@0 170 vector<double> prevFrame;
cannam@0 171 vector<double> newFrame;
cannam@0 172
cannam@0 173 /** A block of previously seen frames are stored in this structure
cannam@0 174 * for calculation of the distance matrix as the new frames are
cannam@0 175 * read in. One can think of the structure of the array as a
Chris@13 176 * circular buffer of vectors. These are the frames with all
Chris@13 177 * applicable processing applied (e.g. spectral difference,
Chris@13 178 * normalisation), unlike prevFrame and newFrame. The total
Chris@13 179 * energy of frames[i] is stored in totalEnergies[i]. */
cannam@0 180 vector<vector<double> > frames;
cannam@0 181
Chris@13 182 /** The total energy of each frame in the frames block. */
Chris@13 183 vector<double> totalEnergies;
Chris@13 184
cannam@0 185 /** The best path cost matrix. */
cannam@0 186 int **bestPathCost;
cannam@0 187
cannam@0 188 /** The distance matrix. */
cannam@0 189 unsigned char **distance;
cannam@0 190
cannam@0 191 /** The bounds of each row of data in the distance and path cost matrices.*/
cannam@0 192 int *first;
cannam@0 193 int *last;
cannam@0 194
cannam@0 195 /** Height of each column in distance and bestPathCost matrices */
cannam@0 196 int *distYSizes;
cannam@0 197
cannam@0 198 /** Width of distance and bestPathCost matrices and first and last vectors */
cannam@0 199 int distXSize;
cannam@0 200
cannam@0 201 /** Total number of audio frames, or -1 for live or compressed input. */
cannam@0 202 long fileLength;
cannam@0 203
cannam@0 204 bool initialised;
cannam@0 205
cannam@0 206 //!!! bool atEnd; //!!!
cannam@0 207
cannam@0 208 /** Disable or enable debugging output */
cannam@0 209 static bool silent;
cannam@0 210
cannam@7 211 static const double decay;
cannam@7 212 static const double silenceThreshold;
cannam@7 213 static const int MAX_RUN_COUNT;
cannam@0 214
cannam@0 215 friend class Finder; //!!!
cannam@0 216
cannam@0 217 public:
cannam@0 218 /** Constructor for Matcher.
cannam@0 219 *
cannam@0 220 * @param p The Matcher representing the performance with which
cannam@0 221 * this one is going to be matched. Some information is shared
cannam@0 222 * between the two matchers (currently one possesses the distance
cannam@0 223 * matrix and optimal path matrix).
cannam@0 224 */
cannam@0 225 Matcher(float rate, Matcher *p);
cannam@0 226
cannam@0 227 ~Matcher();
cannam@0 228
cannam@0 229 /** For debugging, outputs information about the Matcher to
cannam@0 230 * standard error.
cannam@0 231 */
cannam@0 232 void print();
cannam@0 233
cannam@0 234 /** Gives some basic `header' information about the Matcher. */
cannam@0 235 string toString();
cannam@0 236
cannam@0 237 /** Adds a link to the Matcher object representing the performance
cannam@0 238 * which is going to be matched to this one.
cannam@0 239 *
cannam@0 240 * @param p the Matcher representing the other performance
cannam@0 241 */
cannam@0 242 void setOtherMatcher(Matcher *p) {
cannam@0 243 otherMatcher = p;
cannam@0 244 } // setOtherMatcher()
cannam@0 245
cannam@0 246 int getFFTSize() {
cannam@0 247 return fftSize;
cannam@0 248 }
cannam@0 249
cannam@0 250 int getHopSize() {
cannam@0 251 return hopSize;
cannam@0 252 }
cannam@0 253
cannam@0 254 int getFrameCount() {
cannam@0 255 return frameCount;
cannam@0 256 }
cannam@0 257
cannam@1 258 void setHopSize(int);
cannam@1 259
cannam@0 260 protected:
cannam@0 261 template <typename T>
cannam@0 262 void initVector(vector<T> &vec, int sz, T dflt = 0) {
cannam@0 263 vec.clear();
cannam@0 264 while ((int)vec.size() < sz) vec.push_back(dflt);
cannam@0 265 }
cannam@0 266
cannam@0 267 template <typename T>
cannam@0 268 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
cannam@0 269 T dflt = 0, int fillTo = -1) {
cannam@0 270 mat.clear();
cannam@0 271 if (fillTo < 0) fillTo = hsz;
cannam@0 272 for (int i = 0; i < hsz; ++i) {
cannam@0 273 mat.push_back(vector<T>());
cannam@0 274 if (i < fillTo) {
cannam@0 275 while ((int)mat[i].size() < vsz) {
cannam@0 276 mat[i].push_back(dflt);
cannam@0 277 }
cannam@0 278 }
cannam@0 279 }
cannam@0 280 }
cannam@0 281
cannam@0 282 void init();
cannam@0 283
cannam@0 284 void makeFreqMap(int fftSize, float sampleRate);
cannam@0 285
cannam@0 286 /** Creates a map of FFT frequency bins to comparison bins. Where
cannam@0 287 * the spacing of FFT bins is less than 0.5 semitones, the
cannam@0 288 * mapping is one to one. Where the spacing is greater than 0.5
cannam@0 289 * semitones, the FFT energy is mapped into semitone-wide
cannam@0 290 * bins. No scaling is performed; that is the energy is summed
cannam@0 291 * into the comparison bins. See also processFrame()
cannam@0 292 */
cannam@0 293 void makeStandardFrequencyMap(int fftSize, float sampleRate);
cannam@0 294
cannam@0 295 void makeChromaFrequencyMap(int fftSize, float sampleRate);
cannam@0 296
cannam@0 297 /** Processes a frame of audio data by first computing the STFT
cannam@0 298 * with a Hamming window, then mapping the frequency bins into a
cannam@0 299 * part-linear part-logarithmic array, then (optionally)
cannam@0 300 * computing the half-wave rectified spectral difference from the
cannam@0 301 * previous frame, then (optionally) normalising to a sum of 1,
cannam@0 302 * then calculating the distance to all frames stored in the
cannam@0 303 * otherMatcher and storing them in the distance matrix, and
cannam@0 304 * finally updating the optimal path matrix using the dynamic
cannam@0 305 * time warping algorithm.
cannam@0 306 */
cannam@0 307 void processFrame(double *reBuffer, double *imBuffer);
cannam@0 308
cannam@0 309 /** Calculates the Manhattan distance between two vectors, with an
cannam@0 310 * optional normalisation by the combined values in the
cannam@0 311 * vectors. Since the vectors contain energy, this could be
cannam@0 312 * considered as a squared Euclidean distance metric. Note that
cannam@0 313 * normalisation assumes the values are all non-negative.
cannam@0 314 *
cannam@0 315 * @param f1 one of the vectors involved in the distance calculation
cannam@0 316 * @param f2 one of the vectors involved in the distance calculation
cannam@0 317 * @return the distance, scaled and truncated to an integer
cannam@0 318 */
cannam@0 319 int calcDistance(const vector<double> &f1, const vector<double> &f2);
cannam@0 320
cannam@0 321 /** Retrieves values from the minimum cost matrix.
cannam@0 322 *
cannam@0 323 * @param i the frame number of this Matcher
cannam@0 324 * @param j the frame number of the other Matcher
cannam@0 325 * @return the cost of the minimum cost path to this location
cannam@0 326 */
cannam@0 327 int getValue(int i, int j, bool firstAttempt);
cannam@0 328
cannam@0 329 /** Stores entries in the distance matrix and the optimal path matrix.
cannam@0 330 *
cannam@0 331 * @param i the frame number of this Matcher
cannam@0 332 * @param j the frame number of the other Matcher
cannam@0 333 * @param dir the direction from which this position is reached with
cannam@0 334 * minimum cost
cannam@0 335 * @param value the cost of the minimum path except the current step
cannam@0 336 * @param dMN the distance cost between the two frames
cannam@0 337 */
cannam@0 338 void setValue(int i, int j, int dir, int value, int dMN);
cannam@0 339
cannam@0 340 friend class MatchFeeder;
cannam@0 341
cannam@0 342 }; // class Matcher
cannam@0 343
cannam@0 344 #endif