cannam@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ cannam@0: cannam@0: /* cannam@0: Vamp feature extraction plugin using the MATCH audio alignment cannam@0: algorithm. cannam@0: cannam@0: Centre for Digital Music, Queen Mary, University of London. cannam@0: This file copyright 2007 Simon Dixon, Chris Cannam and QMUL. cannam@0: cannam@0: This program is free software; you can redistribute it and/or cannam@0: modify it under the terms of the GNU General Public License as cannam@0: published by the Free Software Foundation; either version 2 of the cannam@0: License, or (at your option) any later version. See the file cannam@0: COPYING included with this distribution for more information. cannam@0: */ cannam@0: cannam@0: #ifndef _MATCHER_H_ cannam@0: #define _MATCHER_H_ cannam@0: cannam@0: #include cannam@0: #include cannam@0: #include cannam@0: #include cannam@0: cannam@0: #define ADVANCE_THIS 1 cannam@0: #define ADVANCE_OTHER 2 cannam@0: #define ADVANCE_BOTH 3 cannam@0: #define MASK 0xfc cannam@0: cannam@0: cannam@0: using std::vector; cannam@0: using std::string; cannam@0: using std::cerr; cannam@0: using std::endl; cannam@0: cannam@0: /** Represents an audio stream that can be matched to another audio cannam@0: * stream of the same piece of music. The matching algorithm uses cannam@0: * dynamic time warping. The distance metric is a Euclidean metric cannam@0: * on the FFT data with the higher frequencies mapped onto a linear cannam@0: * scale. cannam@0: */ cannam@0: cannam@0: class Matcher cannam@0: { cannam@0: protected: cannam@0: /** Points to the other performance with which this one is being cannam@0: * compared. The data for the distance metric and the dynamic cannam@0: * time warping is shared between the two matchers. In the cannam@0: * original version, only one of the two performance matchers cannam@0: * contained the distance metric. (See first) cannam@0: */ cannam@0: Matcher *otherMatcher; cannam@0: cannam@0: /** Indicates which performance is considered primary (the cannam@0: * score). This is the performance shown on the vertical axis, cannam@0: * and referred to as "this" in the codes for the direction of cannam@0: * DTW steps. */ cannam@0: bool firstPM; cannam@0: cannam@0: /** Sample rate of audio */ cannam@0: float sampleRate; cannam@0: cannam@0: /** Onset time of the first note in the audio file, in order to cannam@0: * establish synchronisation between the match file and the audio cannam@0: * data. */ cannam@0: double matchFileOffset; cannam@0: cannam@0: /** Flag indicating whether or not each frame of audio should be cannam@0: * normalised to have a sum of 1. (Default = false). */ cannam@0: bool normalise1; cannam@0: cannam@0: /** Flag indicating whether or not the distance metric for pairs cannam@0: * of audio frames should be normalised by the sum of the two cannam@0: * frames. (Default = false). */ cannam@0: bool normalise2; cannam@0: cannam@0: /** Flag indicating whether or not each frame of audio should be cannam@0: * normalised by the long term average of the summed energy. cannam@0: * (Default = false; assumes normalise1 == false). */ cannam@0: bool normalise3; cannam@0: cannam@0: /** Flag indicating whether or not the distance metric for pairs cannam@0: * of audio frames should be normalised by the log of the sum of cannam@0: * the frames. (Default = false; assumes normalise2 == cannam@0: * false). */ cannam@0: bool normalise4; cannam@0: cannam@0: /** Flag indicating whether or not the half-wave rectified cannam@0: * spectral difference should be used in calculating the distance cannam@0: * metric for pairs of audio frames, instead of the straight cannam@0: * spectrum values. (Default = true). */ cannam@0: bool useSpectralDifference; cannam@0: cannam@0: bool useChromaFrequencyMap; cannam@0: cannam@0: /** Scaling factor for distance metric; must guarantee that the cannam@0: * final value fits in the data type used, that is, unsigned cannam@0: * char. (Default = 16). cannam@0: */ cannam@0: double scale; cannam@0: cannam@0: /** Spacing of audio frames (determines the amount of overlap or cannam@0: * skip between frames). This value is expressed in cannam@0: * seconds. (Default = 0.020s) */ cannam@0: double hopTime; cannam@0: cannam@0: /** The size of an FFT frame in seconds. (Default = 0.04644s). cannam@0: * Note that the value is not taken to be precise; it is adjusted cannam@0: * so that fftSize is always power of 2. */ cannam@0: double fftTime; cannam@0: cannam@0: /** The width of the search band (error margin) around the current cannam@0: * match position, measured in seconds. Strictly speaking the cannam@0: * width is measured backwards from the current point, since the cannam@0: * algorithm has to work causally. cannam@0: */ cannam@0: double blockTime; cannam@0: cannam@0: /** Spacing of audio frames in samples (see hopTime) */ cannam@0: int hopSize; cannam@0: cannam@0: /** The size of an FFT frame in samples (see fftTime) */ cannam@0: int fftSize; cannam@0: cannam@0: /** Width of the search band in FFT frames (see blockTime) */ cannam@0: int blockSize; cannam@0: cannam@0: /** The number of frames of audio data which have been read. */ cannam@0: int frameCount; cannam@0: cannam@0: /** RMS amplitude of the current frame. */ cannam@0: // double frameRMS; cannam@0: cannam@0: /** Long term average frame energy (in frequency domain cannam@0: * representation). */ cannam@0: double ltAverage; cannam@0: cannam@0: /** The number of frames sequentially processed by this matcher, cannam@0: * without a frame of the other matcher being processed. cannam@0: */ cannam@0: int runCount; cannam@0: cannam@0: /** Interactive control of the matching process allows pausing cannam@0: * computation of the cost matrices in one direction. cannam@0: */ cannam@0: bool paused; cannam@0: cannam@0: /** The total number of frames of audio data to be read. */ cannam@0: int maxFrames; cannam@0: cannam@0: /** A mapping function for mapping FFT bins to final frequency cannam@0: * bins. The mapping is linear (1-1) until the resolution cannam@0: * reaches 2 points per semitone, then logarithmic with a cannam@0: * semitone resolution. e.g. for 44.1kHz sampling rate and cannam@0: * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped cannam@0: * linearly for bins 0-34 (0 to 732Hz), and logarithmically for cannam@0: * the remaining bins (midi notes 79 to 127, bins 35 to 83), cannam@0: * where all energy above note 127 is mapped into the final cannam@0: * bin. */ cannam@0: vector freqMap; cannam@0: cannam@0: /** The number of entries in freqMap. Note that the cannam@0: * length of the array is greater, because its size is not known cannam@0: * at creation time. */ cannam@0: int freqMapSize; cannam@0: cannam@0: /** The most recent frame; used for calculating the frame to frame cannam@0: * spectral difference. */ cannam@0: vector prevFrame; cannam@0: vector newFrame; cannam@0: cannam@0: /** A block of previously seen frames are stored in this structure cannam@0: * for calculation of the distance matrix as the new frames are cannam@0: * read in. One can think of the structure of the array as a cannam@0: * circular buffer of vectors. The last element of each vector cannam@0: * stores the total energy. */ cannam@0: vector > frames; cannam@0: cannam@0: /** The best path cost matrix. */ cannam@0: int **bestPathCost; cannam@0: cannam@0: /** The distance matrix. */ cannam@0: unsigned char **distance; cannam@0: cannam@0: /** The bounds of each row of data in the distance and path cost matrices.*/ cannam@0: int *first; cannam@0: int *last; cannam@0: cannam@0: /** Height of each column in distance and bestPathCost matrices */ cannam@0: int *distYSizes; cannam@0: cannam@0: /** Width of distance and bestPathCost matrices and first and last vectors */ cannam@0: int distXSize; cannam@0: cannam@0: /** Total number of audio frames, or -1 for live or compressed input. */ cannam@0: long fileLength; cannam@0: cannam@0: bool initialised; cannam@0: cannam@0: //!!! bool atEnd; //!!! cannam@0: cannam@0: /** Disable or enable debugging output */ cannam@0: static bool silent; cannam@0: cannam@0: static const double decay = 0.99; cannam@0: static const double silenceThreshold = 0.0004; cannam@0: static const int MAX_RUN_COUNT = 3; cannam@0: cannam@0: friend class Finder; //!!! cannam@0: cannam@0: public: cannam@0: /** Constructor for Matcher. cannam@0: * cannam@0: * @param p The Matcher representing the performance with which cannam@0: * this one is going to be matched. Some information is shared cannam@0: * between the two matchers (currently one possesses the distance cannam@0: * matrix and optimal path matrix). cannam@0: */ cannam@0: Matcher(float rate, Matcher *p); cannam@0: cannam@0: ~Matcher(); cannam@0: cannam@0: /** For debugging, outputs information about the Matcher to cannam@0: * standard error. cannam@0: */ cannam@0: void print(); cannam@0: cannam@0: /** Gives some basic `header' information about the Matcher. */ cannam@0: string toString(); cannam@0: cannam@0: /** Adds a link to the Matcher object representing the performance cannam@0: * which is going to be matched to this one. cannam@0: * cannam@0: * @param p the Matcher representing the other performance cannam@0: */ cannam@0: void setOtherMatcher(Matcher *p) { cannam@0: otherMatcher = p; cannam@0: } // setOtherMatcher() cannam@0: cannam@0: int getFFTSize() { cannam@0: return fftSize; cannam@0: } cannam@0: cannam@0: int getHopSize() { cannam@0: return hopSize; cannam@0: } cannam@0: cannam@0: int getFrameCount() { cannam@0: return frameCount; cannam@0: } cannam@0: cannam@1: void setHopSize(int); cannam@1: cannam@0: protected: cannam@0: template cannam@0: void initVector(vector &vec, int sz, T dflt = 0) { cannam@0: std::cerr << "initVector: " << sz << " * " << sizeof(T) << " = " cannam@0: << sz * sizeof(T) << std::endl; cannam@0: vec.clear(); cannam@0: while ((int)vec.size() < sz) vec.push_back(dflt); cannam@0: } cannam@0: cannam@0: template cannam@0: void initMatrix(vector > &mat, int hsz, int vsz, cannam@0: T dflt = 0, int fillTo = -1) { cannam@0: std::cerr << "initMatrix: " << hsz << " * " << vsz << " * " cannam@0: << sizeof(T) << " = " cannam@0: << hsz * vsz * sizeof(T) << std::endl; cannam@0: mat.clear(); cannam@0: if (fillTo < 0) fillTo = hsz; cannam@0: for (int i = 0; i < hsz; ++i) { cannam@0: mat.push_back(vector()); cannam@0: if (i < fillTo) { cannam@0: while ((int)mat[i].size() < vsz) { cannam@0: mat[i].push_back(dflt); cannam@0: } cannam@0: } cannam@0: } cannam@0: } cannam@0: cannam@0: void init(); cannam@0: cannam@0: void makeFreqMap(int fftSize, float sampleRate); cannam@0: cannam@0: /** Creates a map of FFT frequency bins to comparison bins. Where cannam@0: * the spacing of FFT bins is less than 0.5 semitones, the cannam@0: * mapping is one to one. Where the spacing is greater than 0.5 cannam@0: * semitones, the FFT energy is mapped into semitone-wide cannam@0: * bins. No scaling is performed; that is the energy is summed cannam@0: * into the comparison bins. See also processFrame() cannam@0: */ cannam@0: void makeStandardFrequencyMap(int fftSize, float sampleRate); cannam@0: cannam@0: void makeChromaFrequencyMap(int fftSize, float sampleRate); cannam@0: cannam@0: /** Processes a frame of audio data by first computing the STFT cannam@0: * with a Hamming window, then mapping the frequency bins into a cannam@0: * part-linear part-logarithmic array, then (optionally) cannam@0: * computing the half-wave rectified spectral difference from the cannam@0: * previous frame, then (optionally) normalising to a sum of 1, cannam@0: * then calculating the distance to all frames stored in the cannam@0: * otherMatcher and storing them in the distance matrix, and cannam@0: * finally updating the optimal path matrix using the dynamic cannam@0: * time warping algorithm. cannam@0: */ cannam@0: void processFrame(double *reBuffer, double *imBuffer); cannam@0: cannam@0: /** Calculates the Manhattan distance between two vectors, with an cannam@0: * optional normalisation by the combined values in the cannam@0: * vectors. Since the vectors contain energy, this could be cannam@0: * considered as a squared Euclidean distance metric. Note that cannam@0: * normalisation assumes the values are all non-negative. cannam@0: * cannam@0: * @param f1 one of the vectors involved in the distance calculation cannam@0: * @param f2 one of the vectors involved in the distance calculation cannam@0: * @return the distance, scaled and truncated to an integer cannam@0: */ cannam@0: int calcDistance(const vector &f1, const vector &f2); cannam@0: cannam@0: /** Retrieves values from the minimum cost matrix. cannam@0: * cannam@0: * @param i the frame number of this Matcher cannam@0: * @param j the frame number of the other Matcher cannam@0: * @return the cost of the minimum cost path to this location cannam@0: */ cannam@0: int getValue(int i, int j, bool firstAttempt); cannam@0: cannam@0: /** Stores entries in the distance matrix and the optimal path matrix. cannam@0: * cannam@0: * @param i the frame number of this Matcher cannam@0: * @param j the frame number of the other Matcher cannam@0: * @param dir the direction from which this position is reached with cannam@0: * minimum cost cannam@0: * @param value the cost of the minimum path except the current step cannam@0: * @param dMN the distance cost between the two frames cannam@0: */ cannam@0: void setValue(int i, int j, int dir, int value, int dMN); cannam@0: cannam@0: friend class MatchFeeder; cannam@0: cannam@0: }; // class Matcher cannam@0: cannam@0: #endif