match-vamp: Matcher.h annotate

annotate Matcher.h @ 26:9f60d097f0b2

Pull out DistanceMetric into its own class

author	Chris Cannam
date	Fri, 31 Oct 2014 11:31:08 +0000
parents	89929e9e54fb
children	cd0ffe6b1e68

rev	line source
cannam@0	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
cannam@0	2
cannam@0	3 /*
cannam@0	4 Vamp feature extraction plugin using the MATCH audio alignment
cannam@0	5 algorithm.
cannam@0	6
cannam@0	7 Centre for Digital Music, Queen Mary, University of London.
cannam@0	8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
cannam@0	9
cannam@0	10 This program is free software; you can redistribute it and/or
cannam@0	11 modify it under the terms of the GNU General Public License as
cannam@0	12 published by the Free Software Foundation; either version 2 of the
cannam@0	13 License, or (at your option) any later version. See the file
cannam@0	14 COPYING included with this distribution for more information.
cannam@0	15 */
cannam@0	16
cannam@0	17 #ifndef _MATCHER_H_
cannam@0	18 #define _MATCHER_H_
cannam@0	19
cannam@0	20 #include <vector>
cannam@0	21 #include <iostream>
cannam@0	22 #include <sstream>
cannam@0	23 #include <cmath>
cannam@0	24
cannam@0	25 #define ADVANCE_THIS 1
cannam@0	26 #define ADVANCE_OTHER 2
cannam@0	27 #define ADVANCE_BOTH 3
cannam@0	28 #define MASK 0xfc
cannam@0	29
Chris@26	30 #include "DistanceMetric.h"
cannam@0	31
cannam@0	32 using std::vector;
cannam@0	33 using std::string;
cannam@0	34 using std::cerr;
cannam@0	35 using std::endl;
cannam@0	36
cannam@0	37 /** Represents an audio stream that can be matched to another audio
cannam@0	38 * stream of the same piece of music. The matching algorithm uses
cannam@0	39 * dynamic time warping. The distance metric is a Euclidean metric
cannam@0	40 * on the FFT data with the higher frequencies mapped onto a linear
cannam@0	41 * scale.
cannam@0	42 */
cannam@0	43 class Matcher
cannam@0	44 {
Chris@15	45 public:
Chris@15	46 enum FrameNormalisation {
Chris@15	47
Chris@15	48 /** Do not normalise audio frames */
Chris@15	49 NoFrameNormalisation,
Chris@15	50
Chris@15	51 /** Normalise each frame of audio to have a sum of 1 */
Chris@15	52 NormaliseFrameToSum1,
Chris@15	53
Chris@15	54 /** Normalise each frame of audio by the long-term average
Chris@15	55 * of the summed energy */
Chris@15	56 NormaliseFrameToLTAverage,
Chris@15	57 };
Chris@15	58
Chris@15	59 struct Parameters {
Chris@15	60
Chris@15	61 Parameters(float rate_, double hopTime_, int fftSize_) :
Chris@15	62 sampleRate(rate_),
Chris@15	63 frameNorm(NormaliseFrameToSum1),
Chris@26	64 distanceNorm(DistanceMetric::NormaliseDistanceToLogSum),
Chris@15	65 useSpectralDifference(true),
Chris@15	66 useChromaFrequencyMap(false),
Chris@15	67 hopTime(hopTime_),
Chris@15	68 fftSize(fftSize_),
Chris@15	69 blockTime(10.0),
Chris@15	70 silenceThreshold(0.01),
Chris@15	71 decay(0.99),
Chris@15	72 maxRunCount(3)
Chris@15	73 {}
Chris@15	74
Chris@15	75 /** Sample rate of audio */
Chris@15	76 float sampleRate;
Chris@15	77
Chris@15	78 /** Type of audio frame normalisation */
Chris@15	79 FrameNormalisation frameNorm;
Chris@15	80
Chris@15	81 /** Type of distance metric normalisation */
Chris@26	82 DistanceMetric::DistanceNormalisation distanceNorm;
Chris@15	83
Chris@15	84 /** Flag indicating whether or not the half-wave rectified
Chris@15	85 * spectral difference should be used in calculating the
Chris@15	86 * distance metric for pairs of audio frames, instead of the
Chris@15	87 * straight spectrum values. */
Chris@15	88 bool useSpectralDifference;
Chris@15	89
Chris@15	90 /** Flag indicating whether to use a chroma frequency map (12
Chris@15	91 * bins) instead of the default warped spectrogram */
Chris@15	92 bool useChromaFrequencyMap;
Chris@15	93
Chris@15	94 /** Spacing of audio frames (determines the amount of overlap or
Chris@15	95 * skip between frames). This value is expressed in
Chris@15	96 * seconds. */
Chris@15	97 double hopTime;
Chris@15	98
Chris@15	99 /** Size of an FFT frame in samples. Note that the data passed
Chris@15	100 * in to Matcher is already in the frequency domain, so this
Chris@15	101 * expresses the size of the frame that the caller will be
Chris@15	102 * providing.
Chris@15	103 */
Chris@15	104 int fftSize;
Chris@15	105
Chris@15	106 /** The width of the search band (error margin) around the current
Chris@15	107 * match position, measured in seconds. Strictly speaking the
Chris@15	108 * width is measured backwards from the current point, since the
Chris@15	109 * algorithm has to work causally.
Chris@15	110 */
Chris@15	111 double blockTime;
Chris@15	112
Chris@15	113 /** RMS level below which frame is considered silent */
Chris@15	114 double silenceThreshold;
Chris@15	115
Chris@15	116 /** Frame-to-frame decay factor in calculating long-term average */
Chris@15	117 double decay;
Chris@15	118
Chris@15	119 /** Maximum number of frames sequentially processed by this
Chris@15	120 * matcher, without a frame of the other matcher being
Chris@15	121 * processed.
Chris@15	122 */
Chris@15	123 int maxRunCount;
Chris@15	124 };
Chris@15	125
cannam@0	126 protected:
cannam@0	127 /** Points to the other performance with which this one is being
cannam@0	128 * compared. The data for the distance metric and the dynamic
cannam@0	129 * time warping is shared between the two matchers. In the
cannam@0	130 * original version, only one of the two performance matchers
cannam@0	131 * contained the distance metric. (See <code>first</code>)
cannam@0	132 */
cannam@0	133 Matcher *otherMatcher;
cannam@0	134
cannam@0	135 /** Indicates which performance is considered primary (the
cannam@0	136 * score). This is the performance shown on the vertical axis,
cannam@0	137 * and referred to as "this" in the codes for the direction of
cannam@0	138 * DTW steps. */
cannam@0	139 bool firstPM;
cannam@0	140
Chris@15	141 /** Configuration parameters */
Chris@15	142 Parameters params;
cannam@0	143
cannam@0	144 /** Scaling factor for distance metric; must guarantee that the
cannam@0	145 * final value fits in the data type used, that is, unsigned
Chris@15	146 * char.
cannam@0	147 */
cannam@0	148 double scale;
cannam@0	149
cannam@0	150 /** Width of the search band in FFT frames (see <code>blockTime</code>) */
cannam@0	151 int blockSize;
cannam@0	152
cannam@0	153 /** The number of frames of audio data which have been read. */
cannam@0	154 int frameCount;
cannam@0	155
cannam@0	156 /** Long term average frame energy (in frequency domain
cannam@0	157 * representation). */
cannam@0	158 double ltAverage;
cannam@0	159
cannam@0	160 /** The number of frames sequentially processed by this matcher,
cannam@0	161 * without a frame of the other matcher being processed.
cannam@0	162 */
cannam@0	163 int runCount;
cannam@0	164
cannam@0	165 /** A mapping function for mapping FFT bins to final frequency
cannam@0	166 * bins. The mapping is linear (1-1) until the resolution
cannam@0	167 * reaches 2 points per semitone, then logarithmic with a
cannam@0	168 * semitone resolution. e.g. for 44.1kHz sampling rate and
cannam@0	169 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
cannam@0	170 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
cannam@0	171 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
cannam@0	172 * where all energy above note 127 is mapped into the final
cannam@0	173 * bin. */
cannam@0	174 vector<int> freqMap;
cannam@0	175
Chris@23	176 /** The number of entries in <code>freqMap</code>. */
cannam@0	177 int freqMapSize;
cannam@0	178
Chris@23	179 /** The number of values in an externally-supplied feature vector,
Chris@23	180 * used in preference to freqMap/freqMapSize if constructed with
Chris@23	181 * the external feature version of the Matcher constructor. If
Chris@23	182 * this is zero, the internal feature extractor will be used as
Chris@23	183 * normal.
Chris@23	184 */
Chris@23	185 int externalFeatureSize;
Chris@23	186
Chris@23	187 /** The number of values in the feature vectors actually in
Chris@23	188 * use. This will be externalFeatureSize if greater than zero, or
Chris@23	189 * freqMapSize otherwise.
Chris@23	190 */
Chris@23	191 int featureSize;
Chris@23	192
cannam@0	193 /** The most recent frame; used for calculating the frame to frame
Chris@13	194 * spectral difference. These are therefore frequency warped but
Chris@13	195 * not yet normalised. */
cannam@0	196 vector<double> prevFrame;
cannam@0	197 vector<double> newFrame;
cannam@0	198
cannam@0	199 /** A block of previously seen frames are stored in this structure
cannam@0	200 * for calculation of the distance matrix as the new frames are
cannam@0	201 * read in. One can think of the structure of the array as a
Chris@13	202 * circular buffer of vectors. These are the frames with all
Chris@13	203 * applicable processing applied (e.g. spectral difference,
Chris@13	204 * normalisation), unlike prevFrame and newFrame. The total
Chris@13	205 * energy of frames[i] is stored in totalEnergies[i]. */
cannam@0	206 vector<vector<double> > frames;
cannam@0	207
Chris@13	208 /** The total energy of each frame in the frames block. */
Chris@13	209 vector<double> totalEnergies;
Chris@13	210
cannam@0	211 /** The best path cost matrix. */
cannam@0	212 int **bestPathCost;
cannam@0	213
cannam@0	214 /** The distance matrix. */
cannam@0	215 unsigned char **distance;
cannam@0	216
cannam@0	217 /** The bounds of each row of data in the distance and path cost matrices.*/
cannam@0	218 int *first;
cannam@0	219 int *last;
cannam@0	220
cannam@0	221 /** Height of each column in distance and bestPathCost matrices */
cannam@0	222 int *distYSizes;
cannam@0	223
cannam@0	224 /** Width of distance and bestPathCost matrices and first and last vectors */
cannam@0	225 int distXSize;
cannam@0	226
cannam@0	227 bool initialised;
cannam@0	228
cannam@0	229 /** Disable or enable debugging output */
cannam@0	230 static bool silent;
cannam@0	231
cannam@0	232 public:
cannam@0	233 /** Constructor for Matcher.
cannam@0	234 *
cannam@0	235 * @param p The Matcher representing the performance with which
cannam@0	236 * this one is going to be matched. Some information is shared
cannam@0	237 * between the two matchers (currently one possesses the distance
cannam@0	238 * matrix and optimal path matrix).
cannam@0	239 */
Chris@15	240 Matcher(Parameters parameters, Matcher *p);
cannam@0	241
Chris@23	242 /** Constructor for Matcher using externally supplied features.
Chris@23	243 * A Matcher made using this constructor will not carry out its
Chris@23	244 * own feature extraction from frequency-domain audio data, but
Chris@23	245 * instead will accept arbitrary feature frames calculated by
Chris@23	246 * some external code.
Chris@23	247 *
Chris@23	248 * @param p The Matcher representing the performance with which
Chris@23	249 * this one is going to be matched. Some information is shared
Chris@23	250 * between the two matchers (currently one possesses the distance
Chris@23	251 * matrix and optimal path matrix).
Chris@23	252 *
Chris@23	253 * @param featureSize Number of values in each feature vector.
Chris@23	254 */
Chris@23	255 Matcher(Parameters parameters, Matcher *p, int featureSize);
Chris@23	256
cannam@0	257 ~Matcher();
cannam@0	258
cannam@0	259 /** For debugging, outputs information about the Matcher to
cannam@0	260 * standard error.
cannam@0	261 */
cannam@0	262 void print();
cannam@0	263
cannam@0	264 /** Adds a link to the Matcher object representing the performance
cannam@0	265 * which is going to be matched to this one.
cannam@0	266 *
cannam@0	267 * @param p the Matcher representing the other performance
cannam@0	268 */
cannam@0	269 void setOtherMatcher(Matcher *p) {
cannam@0	270 otherMatcher = p;
cannam@0	271 } // setOtherMatcher()
cannam@0	272
cannam@0	273 int getFrameCount() {
cannam@0	274 return frameCount;
cannam@0	275 }
cannam@0	276
Chris@16	277 /**
Chris@16	278 * Return the feature vector size that will be used for the given
Chris@16	279 * parameters.
Chris@16	280 */
Chris@23	281 static int getFeatureSizeFor(Parameters params);
Chris@16	282
cannam@0	283 protected:
cannam@0	284 template <typename T>
cannam@0	285 void initVector(vector<T> &vec, int sz, T dflt = 0) {
cannam@0	286 vec.clear();
cannam@0	287 while ((int)vec.size() < sz) vec.push_back(dflt);
cannam@0	288 }
cannam@0	289
cannam@0	290 template <typename T>
cannam@0	291 void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
cannam@0	292 T dflt = 0, int fillTo = -1) {
cannam@0	293 mat.clear();
cannam@0	294 if (fillTo < 0) fillTo = hsz;
cannam@0	295 for (int i = 0; i < hsz; ++i) {
cannam@0	296 mat.push_back(vector<T>());
cannam@0	297 if (i < fillTo) {
cannam@0	298 while ((int)mat[i].size() < vsz) {
cannam@0	299 mat[i].push_back(dflt);
cannam@0	300 }
cannam@0	301 }
cannam@0	302 }
cannam@0	303 }
cannam@0	304
cannam@0	305 void init();
cannam@0	306
Chris@15	307 void makeFreqMap();
cannam@0	308
cannam@0	309 /** Creates a map of FFT frequency bins to comparison bins. Where
cannam@0	310 * the spacing of FFT bins is less than 0.5 semitones, the
cannam@0	311 * mapping is one to one. Where the spacing is greater than 0.5
cannam@0	312 * semitones, the FFT energy is mapped into semitone-wide
cannam@0	313 * bins. No scaling is performed; that is the energy is summed
Chris@21	314 * into the comparison bins. See also consumeFrame()
cannam@0	315 */
Chris@15	316 void makeStandardFrequencyMap();
cannam@0	317
Chris@15	318 void makeChromaFrequencyMap();
cannam@0	319
cannam@0	320 /** Processes a frame of audio data by first computing the STFT
cannam@0	321 * with a Hamming window, then mapping the frequency bins into a
cannam@0	322 * part-linear part-logarithmic array, then (optionally)
cannam@0	323 * computing the half-wave rectified spectral difference from the
cannam@0	324 * previous frame, then (optionally) normalising to a sum of 1,
cannam@0	325 * then calculating the distance to all frames stored in the
cannam@0	326 * otherMatcher and storing them in the distance matrix, and
cannam@0	327 * finally updating the optimal path matrix using the dynamic
cannam@0	328 * time warping algorithm.
Chris@14	329 *
Chris@14	330 * Return value is the frame (post-processed, with warping,
Chris@14	331 * rectification, and normalisation as appropriate).
Chris@23	332 *
Chris@23	333 * The Matcher must have been constructed using the constructor
Chris@23	334 * without an external featureSize parameter in order to use this
Chris@23	335 * function. (Otherwise it will be expecting you to call
Chris@23	336 * consumeFeatureVector.)
cannam@0	337 */
Chris@21	338 std::vector<double> consumeFrame(double reBuffer, double imBuffer);
cannam@0	339
Chris@23	340 /** Processes a feature vector frame (presumably calculated from
Chris@23	341 * audio data by some external code). As consumeFrame, except
Chris@23	342 * that it does not calculate a feature from audio data but
Chris@23	343 * instead uses the supplied feature directly.
Chris@23	344 *
Chris@23	345 * The Matcher must have been constructed using the constructor
Chris@23	346 * that accepts an external featureSize parameter in order to
Chris@23	347 * use this function. The supplied feature must be of the size
Chris@23	348 * that was passed to the constructor.
Chris@23	349 */
Chris@23	350 void consumeFeatureVector(std::vector<double> feature);
Chris@23	351
cannam@0	352 /** Retrieves values from the minimum cost matrix.
cannam@0	353 *
cannam@0	354 * @param i the frame number of this Matcher
cannam@0	355 * @param j the frame number of the other Matcher
cannam@0	356 * @return the cost of the minimum cost path to this location
cannam@0	357 */
cannam@0	358 int getValue(int i, int j, bool firstAttempt);
cannam@0	359
cannam@0	360 /** Stores entries in the distance matrix and the optimal path matrix.
cannam@0	361 *
cannam@0	362 * @param i the frame number of this Matcher
cannam@0	363 * @param j the frame number of the other Matcher
cannam@0	364 * @param dir the direction from which this position is reached with
cannam@0	365 * minimum cost
cannam@0	366 * @param value the cost of the minimum path except the current step
cannam@0	367 * @param dMN the distance cost between the two frames
cannam@0	368 */
cannam@0	369 void setValue(int i, int j, int dir, int value, int dMN);
cannam@0	370
Chris@21	371 vector<double> processFrameFromFreqData(double , double );
Chris@21	372 void calcAdvance();
Chris@21	373
Chris@26	374 DistanceMetric metric;
Chris@26	375
cannam@0	376 friend class MatchFeeder;
Chris@24	377 friend class MatchFeatureFeeder;
Chris@15	378 friend class Finder;
cannam@0	379
cannam@0	380 }; // class Matcher
cannam@0	381
cannam@0	382 #endif

Mercurial > hg > match-vamp

annotate Matcher.h @ 26:9f60d097f0b2