cannam@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ cannam@0: cannam@0: /* cannam@0: Vamp feature extraction plugin using the MATCH audio alignment cannam@0: algorithm. cannam@0: cannam@0: Centre for Digital Music, Queen Mary, University of London. cannam@0: This file copyright 2007 Simon Dixon, Chris Cannam and QMUL. cannam@0: cannam@0: This program is free software; you can redistribute it and/or cannam@0: modify it under the terms of the GNU General Public License as cannam@0: published by the Free Software Foundation; either version 2 of the cannam@0: License, or (at your option) any later version. See the file cannam@0: COPYING included with this distribution for more information. cannam@0: */ cannam@0: cannam@0: #ifndef _MATCHER_H_ cannam@0: #define _MATCHER_H_ cannam@0: cannam@0: #include cannam@0: #include cannam@0: #include cannam@0: #include cannam@0: cannam@0: #define ADVANCE_THIS 1 cannam@0: #define ADVANCE_OTHER 2 cannam@0: #define ADVANCE_BOTH 3 cannam@0: #define MASK 0xfc cannam@0: Chris@26: #include "DistanceMetric.h" cannam@0: cannam@0: using std::vector; cannam@0: using std::string; cannam@0: using std::cerr; cannam@0: using std::endl; cannam@0: cannam@0: /** Represents an audio stream that can be matched to another audio cannam@0: * stream of the same piece of music. The matching algorithm uses cannam@0: * dynamic time warping. The distance metric is a Euclidean metric cannam@0: * on the FFT data with the higher frequencies mapped onto a linear cannam@0: * scale. cannam@0: */ cannam@0: class Matcher cannam@0: { Chris@15: public: Chris@15: enum FrameNormalisation { Chris@15: Chris@15: /** Do not normalise audio frames */ Chris@15: NoFrameNormalisation, Chris@15: Chris@15: /** Normalise each frame of audio to have a sum of 1 */ Chris@15: NormaliseFrameToSum1, Chris@15: Chris@15: /** Normalise each frame of audio by the long-term average Chris@15: * of the summed energy */ Chris@15: NormaliseFrameToLTAverage, Chris@15: }; Chris@15: Chris@15: struct Parameters { Chris@15: Chris@15: Parameters(float rate_, double hopTime_, int fftSize_) : Chris@15: sampleRate(rate_), Chris@15: frameNorm(NormaliseFrameToSum1), Chris@26: distanceNorm(DistanceMetric::NormaliseDistanceToLogSum), Chris@29: distanceScale(90.0), Chris@15: useSpectralDifference(true), Chris@15: useChromaFrequencyMap(false), Chris@15: hopTime(hopTime_), Chris@15: fftSize(fftSize_), Chris@15: blockTime(10.0), Chris@15: silenceThreshold(0.01), Chris@15: decay(0.99), Chris@15: maxRunCount(3) Chris@15: {} Chris@15: Chris@15: /** Sample rate of audio */ Chris@15: float sampleRate; Chris@15: Chris@15: /** Type of audio frame normalisation */ Chris@15: FrameNormalisation frameNorm; Chris@15: Chris@15: /** Type of distance metric normalisation */ Chris@26: DistanceMetric::DistanceNormalisation distanceNorm; Chris@15: Chris@29: /** Scaling factor for distance metric; must guarantee that the Chris@29: * final value fits in the data type used, that is, unsigned Chris@29: * char. Chris@29: */ Chris@29: double distanceScale; Chris@29: Chris@15: /** Flag indicating whether or not the half-wave rectified Chris@15: * spectral difference should be used in calculating the Chris@15: * distance metric for pairs of audio frames, instead of the Chris@15: * straight spectrum values. */ Chris@15: bool useSpectralDifference; Chris@15: Chris@15: /** Flag indicating whether to use a chroma frequency map (12 Chris@15: * bins) instead of the default warped spectrogram */ Chris@15: bool useChromaFrequencyMap; Chris@15: Chris@15: /** Spacing of audio frames (determines the amount of overlap or Chris@15: * skip between frames). This value is expressed in Chris@15: * seconds. */ Chris@15: double hopTime; Chris@15: Chris@15: /** Size of an FFT frame in samples. Note that the data passed Chris@15: * in to Matcher is already in the frequency domain, so this Chris@15: * expresses the size of the frame that the caller will be Chris@15: * providing. Chris@15: */ Chris@15: int fftSize; Chris@15: Chris@15: /** The width of the search band (error margin) around the current Chris@15: * match position, measured in seconds. Strictly speaking the Chris@15: * width is measured backwards from the current point, since the Chris@15: * algorithm has to work causally. Chris@15: */ Chris@15: double blockTime; Chris@15: Chris@15: /** RMS level below which frame is considered silent */ Chris@15: double silenceThreshold; Chris@15: Chris@15: /** Frame-to-frame decay factor in calculating long-term average */ Chris@15: double decay; Chris@15: Chris@15: /** Maximum number of frames sequentially processed by this Chris@15: * matcher, without a frame of the other matcher being Chris@15: * processed. Chris@15: */ Chris@15: int maxRunCount; Chris@15: }; Chris@15: cannam@0: protected: cannam@0: /** Points to the other performance with which this one is being cannam@0: * compared. The data for the distance metric and the dynamic cannam@0: * time warping is shared between the two matchers. In the cannam@0: * original version, only one of the two performance matchers cannam@0: * contained the distance metric. (See first) cannam@0: */ cannam@0: Matcher *otherMatcher; cannam@0: cannam@0: /** Indicates which performance is considered primary (the cannam@0: * score). This is the performance shown on the vertical axis, cannam@0: * and referred to as "this" in the codes for the direction of cannam@0: * DTW steps. */ cannam@0: bool firstPM; cannam@0: Chris@15: /** Configuration parameters */ Chris@15: Parameters params; cannam@0: cannam@0: /** Width of the search band in FFT frames (see blockTime) */ cannam@0: int blockSize; cannam@0: cannam@0: /** The number of frames of audio data which have been read. */ cannam@0: int frameCount; cannam@0: cannam@0: /** Long term average frame energy (in frequency domain cannam@0: * representation). */ cannam@0: double ltAverage; cannam@0: cannam@0: /** The number of frames sequentially processed by this matcher, cannam@0: * without a frame of the other matcher being processed. cannam@0: */ cannam@0: int runCount; cannam@0: cannam@0: /** A mapping function for mapping FFT bins to final frequency cannam@0: * bins. The mapping is linear (1-1) until the resolution cannam@0: * reaches 2 points per semitone, then logarithmic with a cannam@0: * semitone resolution. e.g. for 44.1kHz sampling rate and cannam@0: * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped cannam@0: * linearly for bins 0-34 (0 to 732Hz), and logarithmically for cannam@0: * the remaining bins (midi notes 79 to 127, bins 35 to 83), cannam@0: * where all energy above note 127 is mapped into the final cannam@0: * bin. */ cannam@0: vector freqMap; cannam@0: Chris@23: /** The number of entries in freqMap. */ cannam@0: int freqMapSize; cannam@0: Chris@23: /** The number of values in an externally-supplied feature vector, Chris@23: * used in preference to freqMap/freqMapSize if constructed with Chris@23: * the external feature version of the Matcher constructor. If Chris@23: * this is zero, the internal feature extractor will be used as Chris@23: * normal. Chris@23: */ Chris@23: int externalFeatureSize; Chris@23: Chris@23: /** The number of values in the feature vectors actually in Chris@23: * use. This will be externalFeatureSize if greater than zero, or Chris@23: * freqMapSize otherwise. Chris@23: */ Chris@23: int featureSize; Chris@23: cannam@0: /** The most recent frame; used for calculating the frame to frame Chris@13: * spectral difference. These are therefore frequency warped but Chris@13: * not yet normalised. */ cannam@0: vector prevFrame; cannam@0: vector newFrame; cannam@0: cannam@0: /** A block of previously seen frames are stored in this structure cannam@0: * for calculation of the distance matrix as the new frames are cannam@0: * read in. One can think of the structure of the array as a Chris@13: * circular buffer of vectors. These are the frames with all Chris@13: * applicable processing applied (e.g. spectral difference, Chris@13: * normalisation), unlike prevFrame and newFrame. The total Chris@13: * energy of frames[i] is stored in totalEnergies[i]. */ cannam@0: vector > frames; cannam@0: Chris@13: /** The total energy of each frame in the frames block. */ Chris@13: vector totalEnergies; Chris@13: cannam@0: /** The best path cost matrix. */ cannam@0: int **bestPathCost; cannam@0: cannam@0: /** The distance matrix. */ cannam@0: unsigned char **distance; cannam@0: cannam@0: /** The bounds of each row of data in the distance and path cost matrices.*/ cannam@0: int *first; cannam@0: int *last; cannam@0: cannam@0: /** Height of each column in distance and bestPathCost matrices */ cannam@0: int *distYSizes; cannam@0: cannam@0: /** Width of distance and bestPathCost matrices and first and last vectors */ cannam@0: int distXSize; cannam@0: cannam@0: bool initialised; cannam@0: cannam@0: /** Disable or enable debugging output */ cannam@0: static bool silent; cannam@0: cannam@0: public: cannam@0: /** Constructor for Matcher. cannam@0: * cannam@0: * @param p The Matcher representing the performance with which cannam@0: * this one is going to be matched. Some information is shared cannam@0: * between the two matchers (currently one possesses the distance cannam@0: * matrix and optimal path matrix). cannam@0: */ Chris@15: Matcher(Parameters parameters, Matcher *p); cannam@0: Chris@23: /** Constructor for Matcher using externally supplied features. Chris@23: * A Matcher made using this constructor will not carry out its Chris@23: * own feature extraction from frequency-domain audio data, but Chris@23: * instead will accept arbitrary feature frames calculated by Chris@23: * some external code. Chris@23: * Chris@23: * @param p The Matcher representing the performance with which Chris@23: * this one is going to be matched. Some information is shared Chris@23: * between the two matchers (currently one possesses the distance Chris@23: * matrix and optimal path matrix). Chris@23: * Chris@23: * @param featureSize Number of values in each feature vector. Chris@23: */ Chris@23: Matcher(Parameters parameters, Matcher *p, int featureSize); Chris@23: cannam@0: ~Matcher(); cannam@0: cannam@0: /** For debugging, outputs information about the Matcher to cannam@0: * standard error. cannam@0: */ cannam@0: void print(); cannam@0: cannam@0: /** Adds a link to the Matcher object representing the performance cannam@0: * which is going to be matched to this one. cannam@0: * cannam@0: * @param p the Matcher representing the other performance cannam@0: */ cannam@0: void setOtherMatcher(Matcher *p) { cannam@0: otherMatcher = p; cannam@0: } // setOtherMatcher() cannam@0: cannam@0: int getFrameCount() { cannam@0: return frameCount; cannam@0: } cannam@0: Chris@16: /** Chris@16: * Return the feature vector size that will be used for the given Chris@16: * parameters. Chris@16: */ Chris@23: static int getFeatureSizeFor(Parameters params); Chris@16: cannam@0: protected: cannam@0: template cannam@0: void initVector(vector &vec, int sz, T dflt = 0) { cannam@0: vec.clear(); cannam@0: while ((int)vec.size() < sz) vec.push_back(dflt); cannam@0: } cannam@0: cannam@0: template cannam@0: void initMatrix(vector > &mat, int hsz, int vsz, cannam@0: T dflt = 0, int fillTo = -1) { cannam@0: mat.clear(); cannam@0: if (fillTo < 0) fillTo = hsz; cannam@0: for (int i = 0; i < hsz; ++i) { cannam@0: mat.push_back(vector()); cannam@0: if (i < fillTo) { cannam@0: while ((int)mat[i].size() < vsz) { cannam@0: mat[i].push_back(dflt); cannam@0: } cannam@0: } cannam@0: } cannam@0: } cannam@0: cannam@0: void init(); cannam@0: Chris@15: void makeFreqMap(); cannam@0: cannam@0: /** Creates a map of FFT frequency bins to comparison bins. Where cannam@0: * the spacing of FFT bins is less than 0.5 semitones, the cannam@0: * mapping is one to one. Where the spacing is greater than 0.5 cannam@0: * semitones, the FFT energy is mapped into semitone-wide cannam@0: * bins. No scaling is performed; that is the energy is summed Chris@21: * into the comparison bins. See also consumeFrame() cannam@0: */ Chris@15: void makeStandardFrequencyMap(); cannam@0: Chris@15: void makeChromaFrequencyMap(); cannam@0: cannam@0: /** Processes a frame of audio data by first computing the STFT cannam@0: * with a Hamming window, then mapping the frequency bins into a cannam@0: * part-linear part-logarithmic array, then (optionally) cannam@0: * computing the half-wave rectified spectral difference from the cannam@0: * previous frame, then (optionally) normalising to a sum of 1, cannam@0: * then calculating the distance to all frames stored in the cannam@0: * otherMatcher and storing them in the distance matrix, and cannam@0: * finally updating the optimal path matrix using the dynamic cannam@0: * time warping algorithm. Chris@14: * Chris@14: * Return value is the frame (post-processed, with warping, Chris@14: * rectification, and normalisation as appropriate). Chris@23: * Chris@23: * The Matcher must have been constructed using the constructor Chris@23: * without an external featureSize parameter in order to use this Chris@23: * function. (Otherwise it will be expecting you to call Chris@23: * consumeFeatureVector.) cannam@0: */ Chris@21: std::vector consumeFrame(double *reBuffer, double *imBuffer); cannam@0: Chris@23: /** Processes a feature vector frame (presumably calculated from Chris@23: * audio data by some external code). As consumeFrame, except Chris@23: * that it does not calculate a feature from audio data but Chris@23: * instead uses the supplied feature directly. Chris@23: * Chris@23: * The Matcher must have been constructed using the constructor Chris@23: * that accepts an external featureSize parameter in order to Chris@23: * use this function. The supplied feature must be of the size Chris@23: * that was passed to the constructor. Chris@23: */ Chris@23: void consumeFeatureVector(std::vector feature); Chris@23: cannam@0: /** Retrieves values from the minimum cost matrix. cannam@0: * cannam@0: * @param i the frame number of this Matcher cannam@0: * @param j the frame number of the other Matcher cannam@0: * @return the cost of the minimum cost path to this location cannam@0: */ cannam@0: int getValue(int i, int j, bool firstAttempt); cannam@0: cannam@0: /** Stores entries in the distance matrix and the optimal path matrix. cannam@0: * cannam@0: * @param i the frame number of this Matcher cannam@0: * @param j the frame number of the other Matcher cannam@0: * @param dir the direction from which this position is reached with cannam@0: * minimum cost cannam@0: * @param value the cost of the minimum path except the current step cannam@0: * @param dMN the distance cost between the two frames cannam@0: */ cannam@0: void setValue(int i, int j, int dir, int value, int dMN); cannam@0: Chris@21: vector processFrameFromFreqData(double *, double *); Chris@21: void calcAdvance(); Chris@21: Chris@26: DistanceMetric metric; Chris@26: cannam@0: friend class MatchFeeder; Chris@24: friend class MatchFeatureFeeder; Chris@15: friend class Finder; cannam@0: cannam@0: }; // class Matcher cannam@0: cannam@0: #endif