annotate Matcher.h @ 23:64c4c0cf80c9

Add alternate Matcher API calls to provide external feature data
author Chris Cannam
date Fri, 10 Oct 2014 16:27:45 +0100
parents b15106b0abcd
children 89929e9e54fb
rev   line source
cannam@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
cannam@0 2
cannam@0 3 /*
cannam@0 4 Vamp feature extraction plugin using the MATCH audio alignment
cannam@0 5 algorithm.
cannam@0 6
cannam@0 7 Centre for Digital Music, Queen Mary, University of London.
cannam@0 8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
cannam@0 9
cannam@0 10 This program is free software; you can redistribute it and/or
cannam@0 11 modify it under the terms of the GNU General Public License as
cannam@0 12 published by the Free Software Foundation; either version 2 of the
cannam@0 13 License, or (at your option) any later version. See the file
cannam@0 14 COPYING included with this distribution for more information.
cannam@0 15 */
cannam@0 16
cannam@0 17 #ifndef _MATCHER_H_
cannam@0 18 #define _MATCHER_H_
cannam@0 19
cannam@0 20 #include <vector>
cannam@0 21 #include <iostream>
cannam@0 22 #include <sstream>
cannam@0 23 #include <cmath>
cannam@0 24
cannam@0 25 #define ADVANCE_THIS 1
cannam@0 26 #define ADVANCE_OTHER 2
cannam@0 27 #define ADVANCE_BOTH 3
cannam@0 28 #define MASK 0xfc
cannam@0 29
cannam@0 30
cannam@0 31 using std::vector;
cannam@0 32 using std::string;
cannam@0 33 using std::cerr;
cannam@0 34 using std::endl;
cannam@0 35
cannam@0 36 /** Represents an audio stream that can be matched to another audio
cannam@0 37 * stream of the same piece of music. The matching algorithm uses
cannam@0 38 * dynamic time warping. The distance metric is a Euclidean metric
cannam@0 39 * on the FFT data with the higher frequencies mapped onto a linear
cannam@0 40 * scale.
cannam@0 41 */
cannam@0 42 class Matcher
cannam@0 43 {
Chris@15 44 public:
Chris@15 45 enum FrameNormalisation {
Chris@15 46
Chris@15 47 /** Do not normalise audio frames */
Chris@15 48 NoFrameNormalisation,
Chris@15 49
Chris@15 50 /** Normalise each frame of audio to have a sum of 1 */
Chris@15 51 NormaliseFrameToSum1,
Chris@15 52
Chris@15 53 /** Normalise each frame of audio by the long-term average
Chris@15 54 * of the summed energy */
Chris@15 55 NormaliseFrameToLTAverage,
Chris@15 56 };
Chris@15 57
Chris@15 58 enum DistanceNormalisation {
Chris@15 59
Chris@15 60 /** Do not normalise distance metrics */
Chris@15 61 NoDistanceNormalisation,
Chris@15 62
Chris@15 63 /** Normalise distance metric for pairs of audio frames by
Chris@15 64 * the sum of the two frames. */
Chris@15 65 NormaliseDistanceToSum,
Chris@15 66
Chris@15 67 /** Normalise distance metric for pairs of audio frames by
Chris@15 68 * the log of the sum of the frames. */
Chris@15 69 NormaliseDistanceToLogSum,
Chris@15 70 };
Chris@15 71
Chris@15 72 struct Parameters {
Chris@15 73
Chris@15 74 Parameters(float rate_, double hopTime_, int fftSize_) :
Chris@15 75 sampleRate(rate_),
Chris@15 76 frameNorm(NormaliseFrameToSum1),
Chris@15 77 distanceNorm(NormaliseDistanceToLogSum),
Chris@15 78 useSpectralDifference(true),
Chris@15 79 useChromaFrequencyMap(false),
Chris@15 80 hopTime(hopTime_),
Chris@15 81 fftSize(fftSize_),
Chris@15 82 blockTime(10.0),
Chris@15 83 silenceThreshold(0.01),
Chris@15 84 decay(0.99),
Chris@15 85 maxRunCount(3)
Chris@15 86 {}
Chris@15 87
Chris@15 88 /** Sample rate of audio */
Chris@15 89 float sampleRate;
Chris@15 90
Chris@15 91 /** Type of audio frame normalisation */
Chris@15 92 FrameNormalisation frameNorm;
Chris@15 93
Chris@15 94 /** Type of distance metric normalisation */
Chris@15 95 DistanceNormalisation distanceNorm;
Chris@15 96
Chris@15 97 /** Flag indicating whether or not the half-wave rectified
Chris@15 98 * spectral difference should be used in calculating the
Chris@15 99 * distance metric for pairs of audio frames, instead of the
Chris@15 100 * straight spectrum values. */
Chris@15 101 bool useSpectralDifference;
Chris@15 102
Chris@15 103 /** Flag indicating whether to use a chroma frequency map (12
Chris@15 104 * bins) instead of the default warped spectrogram */
Chris@15 105 bool useChromaFrequencyMap;
Chris@15 106
Chris@15 107 /** Spacing of audio frames (determines the amount of overlap or
Chris@15 108 * skip between frames). This value is expressed in
Chris@15 109 * seconds. */
Chris@15 110 double hopTime;
Chris@15 111
Chris@15 112 /** Size of an FFT frame in samples. Note that the data passed
Chris@15 113 * in to Matcher is already in the frequency domain, so this
Chris@15 114 * expresses the size of the frame that the caller will be
Chris@15 115 * providing.
Chris@15 116 */
Chris@15 117 int fftSize;
Chris@15 118
Chris@15 119 /** The width of the search band (error margin) around the current
Chris@15 120 * match position, measured in seconds. Strictly speaking the
Chris@15 121 * width is measured backwards from the current point, since the
Chris@15 122 * algorithm has to work causally.
Chris@15 123 */
Chris@15 124 double blockTime;
Chris@15 125
Chris@15 126 /** RMS level below which frame is considered silent */
Chris@15 127 double silenceThreshold;
Chris@15 128
Chris@15 129 /** Frame-to-frame decay factor in calculating long-term average */
Chris@15 130 double decay;
Chris@15 131
Chris@15 132 /** Maximum number of frames sequentially processed by this
Chris@15 133 * matcher, without a frame of the other matcher being
Chris@15 134 * processed.
Chris@15 135 */
Chris@15 136 int maxRunCount;
Chris@15 137 };
Chris@15 138
cannam@0 139 protected:
cannam@0 140 /** Points to the other performance with which this one is being
cannam@0 141 * compared. The data for the distance metric and the dynamic
cannam@0 142 * time warping is shared between the two matchers. In the
cannam@0 143 * original version, only one of the two performance matchers
cannam@0 144 * contained the distance metric. (See <code>first</code>)
cannam@0 145 */
cannam@0 146 Matcher *otherMatcher;
cannam@0 147
cannam@0 148 /** Indicates which performance is considered primary (the
cannam@0 149 * score). This is the performance shown on the vertical axis,
cannam@0 150 * and referred to as "this" in the codes for the direction of
cannam@0 151 * DTW steps. */
cannam@0 152 bool firstPM;
cannam@0 153
Chris@15 154 /** Configuration parameters */
Chris@15 155 Parameters params;
cannam@0 156
cannam@0 157 /** Scaling factor for distance metric; must guarantee that the
cannam@0 158 * final value fits in the data type used, that is, unsigned
Chris@15 159 * char.
cannam@0 160 */
cannam@0 161 double scale;
cannam@0 162
cannam@0 163 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
cannam@0 164 int blockSize;
cannam@0 165
cannam@0 166 /** The number of frames of audio data which have been read. */
cannam@0 167 int frameCount;
cannam@0 168
cannam@0 169 /** Long term average frame energy (in frequency domain
cannam@0 170 * representation). */
cannam@0 171 double ltAverage;
cannam@0 172
cannam@0 173 /** The number of frames sequentially processed by this matcher,
cannam@0 174 * without a frame of the other matcher being processed.
cannam@0 175 */
cannam@0 176 int runCount;
cannam@0 177
cannam@0 178 /** A mapping function for mapping FFT bins to final frequency
cannam@0 179 * bins. The mapping is linear (1-1) until the resolution
cannam@0 180 * reaches 2 points per semitone, then logarithmic with a
cannam@0 181 * semitone resolution. e.g. for 44.1kHz sampling rate and
cannam@0 182 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
cannam@0 183 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
cannam@0 184 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
cannam@0 185 * where all energy above note 127 is mapped into the final
cannam@0 186 * bin. */
cannam@0 187 vector<int> freqMap;
cannam@0 188
Chris@23 189 /** The number of entries in <code>freqMap</code>. */
cannam@0 190 int freqMapSize;
cannam@0 191
Chris@23 192 /** The number of values in an externally-supplied feature vector,
Chris@23 193 * used in preference to freqMap/freqMapSize if constructed with
Chris@23 194 * the external feature version of the Matcher constructor. If
Chris@23 195 * this is zero, the internal feature extractor will be used as
Chris@23 196 * normal.
Chris@23 197 */
Chris@23 198 int externalFeatureSize;
Chris@23 199
Chris@23 200 /** The number of values in the feature vectors actually in
Chris@23 201 * use. This will be externalFeatureSize if greater than zero, or
Chris@23 202 * freqMapSize otherwise.
Chris@23 203 */
Chris@23 204 int featureSize;
Chris@23 205
cannam@0 206 /** The most recent frame; used for calculating the frame to frame
Chris@13 207 * spectral difference. These are therefore frequency warped but
Chris@13 208 * not yet normalised. */
cannam@0 209 vector<double> prevFrame;
cannam@0 210 vector<double> newFrame;
cannam@0 211
cannam@0 212 /** A block of previously seen frames are stored in this structure
cannam@0 213 * for calculation of the distance matrix as the new frames are
cannam@0 214 * read in. One can think of the structure of the array as a
Chris@13 215 * circular buffer of vectors. These are the frames with all
Chris@13 216 * applicable processing applied (e.g. spectral difference,
Chris@13 217 * normalisation), unlike prevFrame and newFrame. The total
Chris@13 218 * energy of frames[i] is stored in totalEnergies[i]. */
cannam@0 219 vector<vector<double> > frames;
cannam@0 220
Chris@13 221 /** The total energy of each frame in the frames block. */
Chris@13 222 vector<double> totalEnergies;
Chris@13 223
cannam@0 224 /** The best path cost matrix. */
cannam@0 225 int **bestPathCost;
cannam@0 226
cannam@0 227 /** The distance matrix. */
cannam@0 228 unsigned char **distance;
cannam@0 229
cannam@0 230 /** The bounds of each row of data in the distance and path cost matrices.*/
cannam@0 231 int *first;
cannam@0 232 int *last;
cannam@0 233
cannam@0 234 /** Height of each column in distance and bestPathCost matrices */
cannam@0 235 int *distYSizes;
cannam@0 236
cannam@0 237 /** Width of distance and bestPathCost matrices and first and last vectors */
cannam@0 238 int distXSize;
cannam@0 239
cannam@0 240 bool initialised;
cannam@0 241
cannam@0 242 /** Disable or enable debugging output */
cannam@0 243 static bool silent;
cannam@0 244
cannam@0 245 public:
cannam@0 246 /** Constructor for Matcher.
cannam@0 247 *
cannam@0 248 * @param p The Matcher representing the performance with which
cannam@0 249 * this one is going to be matched. Some information is shared
cannam@0 250 * between the two matchers (currently one possesses the distance
cannam@0 251 * matrix and optimal path matrix).
cannam@0 252 */
Chris@15 253 Matcher(Parameters parameters, Matcher *p);
cannam@0 254
Chris@23 255 /** Constructor for Matcher using externally supplied features.
Chris@23 256 * A Matcher made using this constructor will not carry out its
Chris@23 257 * own feature extraction from frequency-domain audio data, but
Chris@23 258 * instead will accept arbitrary feature frames calculated by
Chris@23 259 * some external code.
Chris@23 260 *
Chris@23 261 * @param p The Matcher representing the performance with which
Chris@23 262 * this one is going to be matched. Some information is shared
Chris@23 263 * between the two matchers (currently one possesses the distance
Chris@23 264 * matrix and optimal path matrix).
Chris@23 265 *
Chris@23 266 * @param featureSize Number of values in each feature vector.
Chris@23 267 */
Chris@23 268 Matcher(Parameters parameters, Matcher *p, int featureSize);
Chris@23 269
cannam@0 270 ~Matcher();
cannam@0 271
cannam@0 272 /** For debugging, outputs information about the Matcher to
cannam@0 273 * standard error.
cannam@0 274 */
cannam@0 275 void print();
cannam@0 276
cannam@0 277 /** Adds a link to the Matcher object representing the performance
cannam@0 278 * which is going to be matched to this one.
cannam@0 279 *
cannam@0 280 * @param p the Matcher representing the other performance
cannam@0 281 */
cannam@0 282 void setOtherMatcher(Matcher *p) {
cannam@0 283 otherMatcher = p;
cannam@0 284 } // setOtherMatcher()
cannam@0 285
cannam@0 286 int getFrameCount() {
cannam@0 287 return frameCount;
cannam@0 288 }
cannam@0 289
Chris@16 290 /**
Chris@16 291 * Return the feature vector size that will be used for the given
Chris@16 292 * parameters.
Chris@16 293 */
Chris@23 294 static int getFeatureSizeFor(Parameters params);
Chris@16 295
cannam@0 296 protected:
cannam@0 297 template <typename T>
cannam@0 298 void initVector(vector<T> &vec, int sz, T dflt = 0) {
cannam@0 299 vec.clear();
cannam@0 300 while ((int)vec.size() < sz) vec.push_back(dflt);
cannam@0 301 }
cannam@0 302
cannam@0 303 template <typename T>
cannam@0 304 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
cannam@0 305 T dflt = 0, int fillTo = -1) {
cannam@0 306 mat.clear();
cannam@0 307 if (fillTo < 0) fillTo = hsz;
cannam@0 308 for (int i = 0; i < hsz; ++i) {
cannam@0 309 mat.push_back(vector<T>());
cannam@0 310 if (i < fillTo) {
cannam@0 311 while ((int)mat[i].size() < vsz) {
cannam@0 312 mat[i].push_back(dflt);
cannam@0 313 }
cannam@0 314 }
cannam@0 315 }
cannam@0 316 }
cannam@0 317
cannam@0 318 void init();
cannam@0 319
Chris@15 320 void makeFreqMap();
cannam@0 321
cannam@0 322 /** Creates a map of FFT frequency bins to comparison bins. Where
cannam@0 323 * the spacing of FFT bins is less than 0.5 semitones, the
cannam@0 324 * mapping is one to one. Where the spacing is greater than 0.5
cannam@0 325 * semitones, the FFT energy is mapped into semitone-wide
cannam@0 326 * bins. No scaling is performed; that is the energy is summed
Chris@21 327 * into the comparison bins. See also consumeFrame()
cannam@0 328 */
Chris@15 329 void makeStandardFrequencyMap();
cannam@0 330
Chris@15 331 void makeChromaFrequencyMap();
cannam@0 332
cannam@0 333 /** Processes a frame of audio data by first computing the STFT
cannam@0 334 * with a Hamming window, then mapping the frequency bins into a
cannam@0 335 * part-linear part-logarithmic array, then (optionally)
cannam@0 336 * computing the half-wave rectified spectral difference from the
cannam@0 337 * previous frame, then (optionally) normalising to a sum of 1,
cannam@0 338 * then calculating the distance to all frames stored in the
cannam@0 339 * otherMatcher and storing them in the distance matrix, and
cannam@0 340 * finally updating the optimal path matrix using the dynamic
cannam@0 341 * time warping algorithm.
Chris@14 342 *
Chris@14 343 * Return value is the frame (post-processed, with warping,
Chris@14 344 * rectification, and normalisation as appropriate).
Chris@23 345 *
Chris@23 346 * The Matcher must have been constructed using the constructor
Chris@23 347 * without an external featureSize parameter in order to use this
Chris@23 348 * function. (Otherwise it will be expecting you to call
Chris@23 349 * consumeFeatureVector.)
cannam@0 350 */
Chris@21 351 std::vector<double> consumeFrame(double *reBuffer, double *imBuffer);
cannam@0 352
Chris@23 353 /** Processes a feature vector frame (presumably calculated from
Chris@23 354 * audio data by some external code). As consumeFrame, except
Chris@23 355 * that it does not calculate a feature from audio data but
Chris@23 356 * instead uses the supplied feature directly.
Chris@23 357 *
Chris@23 358 * The Matcher must have been constructed using the constructor
Chris@23 359 * that accepts an external featureSize parameter in order to
Chris@23 360 * use this function. The supplied feature must be of the size
Chris@23 361 * that was passed to the constructor.
Chris@23 362 */
Chris@23 363 void consumeFeatureVector(std::vector<double> feature);
Chris@23 364
cannam@0 365 /** Calculates the Manhattan distance between two vectors, with an
cannam@0 366 * optional normalisation by the combined values in the
cannam@0 367 * vectors. Since the vectors contain energy, this could be
cannam@0 368 * considered as a squared Euclidean distance metric. Note that
cannam@0 369 * normalisation assumes the values are all non-negative.
cannam@0 370 *
cannam@0 371 * @param f1 one of the vectors involved in the distance calculation
cannam@0 372 * @param f2 one of the vectors involved in the distance calculation
cannam@0 373 * @return the distance, scaled and truncated to an integer
cannam@0 374 */
cannam@0 375 int calcDistance(const vector<double> &f1, const vector<double> &f2);
cannam@0 376
cannam@0 377 /** Retrieves values from the minimum cost matrix.
cannam@0 378 *
cannam@0 379 * @param i the frame number of this Matcher
cannam@0 380 * @param j the frame number of the other Matcher
cannam@0 381 * @return the cost of the minimum cost path to this location
cannam@0 382 */
cannam@0 383 int getValue(int i, int j, bool firstAttempt);
cannam@0 384
cannam@0 385 /** Stores entries in the distance matrix and the optimal path matrix.
cannam@0 386 *
cannam@0 387 * @param i the frame number of this Matcher
cannam@0 388 * @param j the frame number of the other Matcher
cannam@0 389 * @param dir the direction from which this position is reached with
cannam@0 390 * minimum cost
cannam@0 391 * @param value the cost of the minimum path except the current step
cannam@0 392 * @param dMN the distance cost between the two frames
cannam@0 393 */
cannam@0 394 void setValue(int i, int j, int dir, int value, int dMN);
cannam@0 395
Chris@21 396 vector<double> processFrameFromFreqData(double *, double *);
Chris@21 397 void calcAdvance();
Chris@21 398
cannam@0 399 friend class MatchFeeder;
Chris@15 400 friend class Finder;
cannam@0 401
cannam@0 402 }; // class Matcher
cannam@0 403
cannam@0 404 #endif