match-vamp: Matcher.h annotate

annotate Matcher.h @ 0:640f92242cc1

* initial import

author	cannam
date	Wed, 24 Oct 2007 12:13:43 +0000
parents
children	de792b8c2801

rev	line source
cannam@0	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
cannam@0	2
cannam@0	3 /*
cannam@0	4 Vamp feature extraction plugin using the MATCH audio alignment
cannam@0	5 algorithm.
cannam@0	6
cannam@0	7 Centre for Digital Music, Queen Mary, University of London.
cannam@0	8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
cannam@0	9
cannam@0	10 This program is free software; you can redistribute it and/or
cannam@0	11 modify it under the terms of the GNU General Public License as
cannam@0	12 published by the Free Software Foundation; either version 2 of the
cannam@0	13 License, or (at your option) any later version. See the file
cannam@0	14 COPYING included with this distribution for more information.
cannam@0	15 */
cannam@0	16
cannam@0	17 #ifndef _MATCHER_H_
cannam@0	18 #define _MATCHER_H_
cannam@0	19
cannam@0	20 #include <vector>
cannam@0	21 #include <iostream>
cannam@0	22 #include <sstream>
cannam@0	23 #include <cmath>
cannam@0	24
cannam@0	25 #define ADVANCE_THIS 1
cannam@0	26 #define ADVANCE_OTHER 2
cannam@0	27 #define ADVANCE_BOTH 3
cannam@0	28 #define MASK 0xfc
cannam@0	29
cannam@0	30
cannam@0	31 using std::vector;
cannam@0	32 using std::string;
cannam@0	33 using std::cerr;
cannam@0	34 using std::endl;
cannam@0	35
cannam@0	36 /** Represents an audio stream that can be matched to another audio
cannam@0	37 * stream of the same piece of music. The matching algorithm uses
cannam@0	38 * dynamic time warping. The distance metric is a Euclidean metric
cannam@0	39 * on the FFT data with the higher frequencies mapped onto a linear
cannam@0	40 * scale.
cannam@0	41 */
cannam@0	42
cannam@0	43 class Matcher
cannam@0	44 {
cannam@0	45 protected:
cannam@0	46 /** Points to the other performance with which this one is being
cannam@0	47 * compared. The data for the distance metric and the dynamic
cannam@0	48 * time warping is shared between the two matchers. In the
cannam@0	49 * original version, only one of the two performance matchers
cannam@0	50 * contained the distance metric. (See <code>first</code>)
cannam@0	51 */
cannam@0	52 Matcher *otherMatcher;
cannam@0	53
cannam@0	54 /** Indicates which performance is considered primary (the
cannam@0	55 * score). This is the performance shown on the vertical axis,
cannam@0	56 * and referred to as "this" in the codes for the direction of
cannam@0	57 * DTW steps. */
cannam@0	58 bool firstPM;
cannam@0	59
cannam@0	60 /** Sample rate of audio */
cannam@0	61 float sampleRate;
cannam@0	62
cannam@0	63 /** Onset time of the first note in the audio file, in order to
cannam@0	64 * establish synchronisation between the match file and the audio
cannam@0	65 * data. */
cannam@0	66 double matchFileOffset;
cannam@0	67
cannam@0	68 /** Flag indicating whether or not each frame of audio should be
cannam@0	69 * normalised to have a sum of 1. (Default = false). */
cannam@0	70 bool normalise1;
cannam@0	71
cannam@0	72 /** Flag indicating whether or not the distance metric for pairs
cannam@0	73 * of audio frames should be normalised by the sum of the two
cannam@0	74 * frames. (Default = false). */
cannam@0	75 bool normalise2;
cannam@0	76
cannam@0	77 /** Flag indicating whether or not each frame of audio should be
cannam@0	78 * normalised by the long term average of the summed energy.
cannam@0	79 * (Default = false; assumes normalise1 == false). */
cannam@0	80 bool normalise3;
cannam@0	81
cannam@0	82 /** Flag indicating whether or not the distance metric for pairs
cannam@0	83 * of audio frames should be normalised by the log of the sum of
cannam@0	84 * the frames. (Default = false; assumes normalise2 ==
cannam@0	85 * false). */
cannam@0	86 bool normalise4;
cannam@0	87
cannam@0	88 /** Flag indicating whether or not the half-wave rectified
cannam@0	89 * spectral difference should be used in calculating the distance
cannam@0	90 * metric for pairs of audio frames, instead of the straight
cannam@0	91 * spectrum values. (Default = true). */
cannam@0	92 bool useSpectralDifference;
cannam@0	93
cannam@0	94 bool useChromaFrequencyMap;
cannam@0	95
cannam@0	96 /** Scaling factor for distance metric; must guarantee that the
cannam@0	97 * final value fits in the data type used, that is, unsigned
cannam@0	98 * char. (Default = 16).
cannam@0	99 */
cannam@0	100 double scale;
cannam@0	101
cannam@0	102 /** Spacing of audio frames (determines the amount of overlap or
cannam@0	103 * skip between frames). This value is expressed in
cannam@0	104 * seconds. (Default = 0.020s) */
cannam@0	105 double hopTime;
cannam@0	106
cannam@0	107 /** The size of an FFT frame in seconds. (Default = 0.04644s).
cannam@0	108 * Note that the value is not taken to be precise; it is adjusted
cannam@0	109 * so that <code>fftSize</code> is always power of 2. */
cannam@0	110 double fftTime;
cannam@0	111
cannam@0	112 /** The width of the search band (error margin) around the current
cannam@0	113 * match position, measured in seconds. Strictly speaking the
cannam@0	114 * width is measured backwards from the current point, since the
cannam@0	115 * algorithm has to work causally.
cannam@0	116 */
cannam@0	117 double blockTime;
cannam@0	118
cannam@0	119 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
cannam@0	120 int hopSize;
cannam@0	121
cannam@0	122 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
cannam@0	123 int fftSize;
cannam@0	124
cannam@0	125 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
cannam@0	126 int blockSize;
cannam@0	127
cannam@0	128 /** The number of frames of audio data which have been read. */
cannam@0	129 int frameCount;
cannam@0	130
cannam@0	131 /** RMS amplitude of the current frame. */
cannam@0	132 // double frameRMS;
cannam@0	133
cannam@0	134 /** Long term average frame energy (in frequency domain
cannam@0	135 * representation). */
cannam@0	136 double ltAverage;
cannam@0	137
cannam@0	138 /** The number of frames sequentially processed by this matcher,
cannam@0	139 * without a frame of the other matcher being processed.
cannam@0	140 */
cannam@0	141 int runCount;
cannam@0	142
cannam@0	143 /** Interactive control of the matching process allows pausing
cannam@0	144 * computation of the cost matrices in one direction.
cannam@0	145 */
cannam@0	146 bool paused;
cannam@0	147
cannam@0	148 /** The total number of frames of audio data to be read. */
cannam@0	149 int maxFrames;
cannam@0	150
cannam@0	151 /** A mapping function for mapping FFT bins to final frequency
cannam@0	152 * bins. The mapping is linear (1-1) until the resolution
cannam@0	153 * reaches 2 points per semitone, then logarithmic with a
cannam@0	154 * semitone resolution. e.g. for 44.1kHz sampling rate and
cannam@0	155 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
cannam@0	156 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
cannam@0	157 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
cannam@0	158 * where all energy above note 127 is mapped into the final
cannam@0	159 * bin. */
cannam@0	160 vector<int> freqMap;
cannam@0	161
cannam@0	162 /** The number of entries in <code>freqMap</code>. Note that the
cannam@0	163 * length of the array is greater, because its size is not known
cannam@0	164 * at creation time. */
cannam@0	165 int freqMapSize;
cannam@0	166
cannam@0	167 /** The most recent frame; used for calculating the frame to frame
cannam@0	168 * spectral difference. */
cannam@0	169 vector<double> prevFrame;
cannam@0	170 vector<double> newFrame;
cannam@0	171
cannam@0	172 /** A block of previously seen frames are stored in this structure
cannam@0	173 * for calculation of the distance matrix as the new frames are
cannam@0	174 * read in. One can think of the structure of the array as a
cannam@0	175 * circular buffer of vectors. The last element of each vector
cannam@0	176 * stores the total energy. */
cannam@0	177 vector<vector<double> > frames;
cannam@0	178
cannam@0	179 /** The best path cost matrix. */
cannam@0	180 int **bestPathCost;
cannam@0	181
cannam@0	182 /** The distance matrix. */
cannam@0	183 unsigned char **distance;
cannam@0	184
cannam@0	185 /** The bounds of each row of data in the distance and path cost matrices.*/
cannam@0	186 int *first;
cannam@0	187 int *last;
cannam@0	188
cannam@0	189 /** Height of each column in distance and bestPathCost matrices */
cannam@0	190 int *distYSizes;
cannam@0	191
cannam@0	192 /** Width of distance and bestPathCost matrices and first and last vectors */
cannam@0	193 int distXSize;
cannam@0	194
cannam@0	195 /** Total number of audio frames, or -1 for live or compressed input. */
cannam@0	196 long fileLength;
cannam@0	197
cannam@0	198 bool initialised;
cannam@0	199
cannam@0	200 //!!! bool atEnd; //!!!
cannam@0	201
cannam@0	202 /** Disable or enable debugging output */
cannam@0	203 static bool silent;
cannam@0	204
cannam@0	205 static const double decay = 0.99;
cannam@0	206 static const double silenceThreshold = 0.0004;
cannam@0	207 static const int MAX_RUN_COUNT = 3;
cannam@0	208
cannam@0	209 friend class Finder; //!!!
cannam@0	210
cannam@0	211 public:
cannam@0	212 /** Constructor for Matcher.
cannam@0	213 *
cannam@0	214 * @param p The Matcher representing the performance with which
cannam@0	215 * this one is going to be matched. Some information is shared
cannam@0	216 * between the two matchers (currently one possesses the distance
cannam@0	217 * matrix and optimal path matrix).
cannam@0	218 */
cannam@0	219 Matcher(float rate, Matcher *p);
cannam@0	220
cannam@0	221 ~Matcher();
cannam@0	222
cannam@0	223 /** For debugging, outputs information about the Matcher to
cannam@0	224 * standard error.
cannam@0	225 */
cannam@0	226 void print();
cannam@0	227
cannam@0	228 /** Gives some basic `header' information about the Matcher. */
cannam@0	229 string toString();
cannam@0	230
cannam@0	231 /** Adds a link to the Matcher object representing the performance
cannam@0	232 * which is going to be matched to this one.
cannam@0	233 *
cannam@0	234 * @param p the Matcher representing the other performance
cannam@0	235 */
cannam@0	236 void setOtherMatcher(Matcher *p) {
cannam@0	237 otherMatcher = p;
cannam@0	238 } // setOtherMatcher()
cannam@0	239
cannam@0	240 int getFFTSize() {
cannam@0	241 return fftSize;
cannam@0	242 }
cannam@0	243
cannam@0	244 int getHopSize() {
cannam@0	245 return hopSize;
cannam@0	246 }
cannam@0	247
cannam@0	248 int getFrameCount() {
cannam@0	249 return frameCount;
cannam@0	250 }
cannam@0	251
cannam@0	252 protected:
cannam@0	253 template <typename T>
cannam@0	254 void initVector(vector<T> &vec, int sz, T dflt = 0) {
cannam@0	255 std::cerr << "initVector: " << sz << " * " << sizeof(T) << " = "
cannam@0	256 << sz * sizeof(T) << std::endl;
cannam@0	257 vec.clear();
cannam@0	258 while ((int)vec.size() < sz) vec.push_back(dflt);
cannam@0	259 }
cannam@0	260
cannam@0	261 template <typename T>
cannam@0	262 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
cannam@0	263 T dflt = 0, int fillTo = -1) {
cannam@0	264 std::cerr << "initMatrix: " << hsz << " * " << vsz << " * "
cannam@0	265 << sizeof(T) << " = "
cannam@0	266 << hsz * vsz * sizeof(T) << std::endl;
cannam@0	267 mat.clear();
cannam@0	268 if (fillTo < 0) fillTo = hsz;
cannam@0	269 for (int i = 0; i < hsz; ++i) {
cannam@0	270 mat.push_back(vector<T>());
cannam@0	271 if (i < fillTo) {
cannam@0	272 while ((int)mat[i].size() < vsz) {
cannam@0	273 mat[i].push_back(dflt);
cannam@0	274 }
cannam@0	275 }
cannam@0	276 }
cannam@0	277 }
cannam@0	278
cannam@0	279 void init();
cannam@0	280
cannam@0	281 void makeFreqMap(int fftSize, float sampleRate);
cannam@0	282
cannam@0	283 /** Creates a map of FFT frequency bins to comparison bins. Where
cannam@0	284 * the spacing of FFT bins is less than 0.5 semitones, the
cannam@0	285 * mapping is one to one. Where the spacing is greater than 0.5
cannam@0	286 * semitones, the FFT energy is mapped into semitone-wide
cannam@0	287 * bins. No scaling is performed; that is the energy is summed
cannam@0	288 * into the comparison bins. See also processFrame()
cannam@0	289 */
cannam@0	290 void makeStandardFrequencyMap(int fftSize, float sampleRate);
cannam@0	291
cannam@0	292 void makeChromaFrequencyMap(int fftSize, float sampleRate);
cannam@0	293
cannam@0	294 /** Processes a frame of audio data by first computing the STFT
cannam@0	295 * with a Hamming window, then mapping the frequency bins into a
cannam@0	296 * part-linear part-logarithmic array, then (optionally)
cannam@0	297 * computing the half-wave rectified spectral difference from the
cannam@0	298 * previous frame, then (optionally) normalising to a sum of 1,
cannam@0	299 * then calculating the distance to all frames stored in the
cannam@0	300 * otherMatcher and storing them in the distance matrix, and
cannam@0	301 * finally updating the optimal path matrix using the dynamic
cannam@0	302 * time warping algorithm.
cannam@0	303 */
cannam@0	304 void processFrame(double reBuffer, double imBuffer);
cannam@0	305
cannam@0	306 /** Calculates the Manhattan distance between two vectors, with an
cannam@0	307 * optional normalisation by the combined values in the
cannam@0	308 * vectors. Since the vectors contain energy, this could be
cannam@0	309 * considered as a squared Euclidean distance metric. Note that
cannam@0	310 * normalisation assumes the values are all non-negative.
cannam@0	311 *
cannam@0	312 * @param f1 one of the vectors involved in the distance calculation
cannam@0	313 * @param f2 one of the vectors involved in the distance calculation
cannam@0	314 * @return the distance, scaled and truncated to an integer
cannam@0	315 */
cannam@0	316 int calcDistance(const vector<double> &f1, const vector<double> &f2);
cannam@0	317
cannam@0	318 /** Retrieves values from the minimum cost matrix.
cannam@0	319 *
cannam@0	320 * @param i the frame number of this Matcher
cannam@0	321 * @param j the frame number of the other Matcher
cannam@0	322 * @return the cost of the minimum cost path to this location
cannam@0	323 */
cannam@0	324 int getValue(int i, int j, bool firstAttempt);
cannam@0	325
cannam@0	326 /** Stores entries in the distance matrix and the optimal path matrix.
cannam@0	327 *
cannam@0	328 * @param i the frame number of this Matcher
cannam@0	329 * @param j the frame number of the other Matcher
cannam@0	330 * @param dir the direction from which this position is reached with
cannam@0	331 * minimum cost
cannam@0	332 * @param value the cost of the minimum path except the current step
cannam@0	333 * @param dMN the distance cost between the two frames
cannam@0	334 */
cannam@0	335 void setValue(int i, int j, int dir, int value, int dMN);
cannam@0	336
cannam@0	337 friend class MatchFeeder;
cannam@0	338
cannam@0	339 }; // class Matcher
cannam@0	340
cannam@0	341 #endif

Mercurial > hg > match-vamp

annotate Matcher.h @ 0:640f92242cc1