Mercurial > hg > match-vamp
diff src/Matcher.h @ 35:16870e8770ae refactors
Move source to src
author | Chris Cannam |
---|---|
date | Thu, 13 Nov 2014 09:50:14 +0000 |
parents | Matcher.h@cd0ffe6b1e68 |
children | 8cce4e13ede3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Matcher.h Thu Nov 13 09:50:14 2014 +0000 @@ -0,0 +1,383 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Vamp feature extraction plugin using the MATCH audio alignment + algorithm. + + Centre for Digital Music, Queen Mary, University of London. + This file copyright 2007 Simon Dixon, Chris Cannam and QMUL. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +#ifndef _MATCHER_H_ +#define _MATCHER_H_ + +#include <vector> +#include <iostream> +#include <sstream> +#include <cmath> + +#define ADVANCE_THIS 1 +#define ADVANCE_OTHER 2 +#define ADVANCE_BOTH 3 +#define MASK 0xfc + +#include "DistanceMetric.h" + +using std::vector; +using std::string; +using std::cerr; +using std::endl; + +/** Represents an audio stream that can be matched to another audio + * stream of the same piece of music. The matching algorithm uses + * dynamic time warping. The distance metric is a Euclidean metric + * on the FFT data with the higher frequencies mapped onto a linear + * scale. + */ +class Matcher +{ +public: + enum FrameNormalisation { + + /** Do not normalise audio frames */ + NoFrameNormalisation, + + /** Normalise each frame of audio to have a sum of 1 */ + NormaliseFrameToSum1, + + /** Normalise each frame of audio by the long-term average + * of the summed energy */ + NormaliseFrameToLTAverage, + }; + + struct Parameters { + + Parameters(float rate_, double hopTime_, int fftSize_) : + sampleRate(rate_), + frameNorm(NormaliseFrameToSum1), + distanceNorm(DistanceMetric::NormaliseDistanceToLogSum), + distanceScale(90.0), + useSpectralDifference(true), + useChromaFrequencyMap(false), + hopTime(hopTime_), + fftSize(fftSize_), + blockTime(10.0), + silenceThreshold(0.01), + decay(0.99), + maxRunCount(3) + {} + + /** Sample rate of audio */ + float sampleRate; + + /** Type of audio frame normalisation */ + FrameNormalisation frameNorm; + + /** Type of distance metric normalisation */ + DistanceMetric::DistanceNormalisation distanceNorm; + + /** Scaling factor for distance metric; must guarantee that the + * final value fits in the data type used, that is, unsigned + * char. + */ + double distanceScale; + + /** Flag indicating whether or not the half-wave rectified + * spectral difference should be used in calculating the + * distance metric for pairs of audio frames, instead of the + * straight spectrum values. */ + bool useSpectralDifference; + + /** Flag indicating whether to use a chroma frequency map (12 + * bins) instead of the default warped spectrogram */ + bool useChromaFrequencyMap; + + /** Spacing of audio frames (determines the amount of overlap or + * skip between frames). This value is expressed in + * seconds. */ + double hopTime; + + /** Size of an FFT frame in samples. Note that the data passed + * in to Matcher is already in the frequency domain, so this + * expresses the size of the frame that the caller will be + * providing. + */ + int fftSize; + + /** The width of the search band (error margin) around the current + * match position, measured in seconds. Strictly speaking the + * width is measured backwards from the current point, since the + * algorithm has to work causally. + */ + double blockTime; + + /** RMS level below which frame is considered silent */ + double silenceThreshold; + + /** Frame-to-frame decay factor in calculating long-term average */ + double decay; + + /** Maximum number of frames sequentially processed by this + * matcher, without a frame of the other matcher being + * processed. + */ + int maxRunCount; + }; + +protected: + /** Points to the other performance with which this one is being + * compared. The data for the distance metric and the dynamic + * time warping is shared between the two matchers. In the + * original version, only one of the two performance matchers + * contained the distance metric. (See <code>first</code>) + */ + Matcher *otherMatcher; + + /** Indicates which performance is considered primary (the + * score). This is the performance shown on the vertical axis, + * and referred to as "this" in the codes for the direction of + * DTW steps. */ + bool firstPM; + + /** Configuration parameters */ + Parameters params; + + /** Width of the search band in FFT frames (see <code>blockTime</code>) */ + int blockSize; + + /** The number of frames of audio data which have been read. */ + int frameCount; + + /** Long term average frame energy (in frequency domain + * representation). */ + double ltAverage; + + /** The number of frames sequentially processed by this matcher, + * without a frame of the other matcher being processed. + */ + int runCount; + + /** A mapping function for mapping FFT bins to final frequency + * bins. The mapping is linear (1-1) until the resolution + * reaches 2 points per semitone, then logarithmic with a + * semitone resolution. e.g. for 44.1kHz sampling rate and + * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped + * linearly for bins 0-34 (0 to 732Hz), and logarithmically for + * the remaining bins (midi notes 79 to 127, bins 35 to 83), + * where all energy above note 127 is mapped into the final + * bin. */ + vector<int> freqMap; + + /** The number of entries in <code>freqMap</code>. */ + int freqMapSize; + + /** The number of values in an externally-supplied feature vector, + * used in preference to freqMap/freqMapSize if constructed with + * the external feature version of the Matcher constructor. If + * this is zero, the internal feature extractor will be used as + * normal. + */ + int externalFeatureSize; + + /** The number of values in the feature vectors actually in + * use. This will be externalFeatureSize if greater than zero, or + * freqMapSize otherwise. + */ + int featureSize; + + /** The most recent frame; used for calculating the frame to frame + * spectral difference. These are therefore frequency warped but + * not yet normalised. */ + vector<double> prevFrame; + vector<double> newFrame; + + /** A block of previously seen frames are stored in this structure + * for calculation of the distance matrix as the new frames are + * read in. One can think of the structure of the array as a + * circular buffer of vectors. These are the frames with all + * applicable processing applied (e.g. spectral difference, + * normalisation), unlike prevFrame and newFrame. The total + * energy of frames[i] is stored in totalEnergies[i]. */ + vector<vector<double> > frames; + + /** The total energy of each frame in the frames block. */ + vector<double> totalEnergies; + + /** The best path cost matrix. */ + int **bestPathCost; + + /** The distance matrix. */ + unsigned char **distance; + + /** The bounds of each row of data in the distance and path cost matrices.*/ + int *first; + int *last; + + /** Height of each column in distance and bestPathCost matrices */ + int *distYSizes; + + /** Width of distance and bestPathCost matrices and first and last vectors */ + int distXSize; + + bool initialised; + + /** Disable or enable debugging output */ + static bool silent; + +public: + /** Constructor for Matcher. + * + * @param p The Matcher representing the performance with which + * this one is going to be matched. Some information is shared + * between the two matchers (currently one possesses the distance + * matrix and optimal path matrix). + */ + Matcher(Parameters parameters, Matcher *p); + + /** Constructor for Matcher using externally supplied features. + * A Matcher made using this constructor will not carry out its + * own feature extraction from frequency-domain audio data, but + * instead will accept arbitrary feature frames calculated by + * some external code. + * + * @param p The Matcher representing the performance with which + * this one is going to be matched. Some information is shared + * between the two matchers (currently one possesses the distance + * matrix and optimal path matrix). + * + * @param featureSize Number of values in each feature vector. + */ + Matcher(Parameters parameters, Matcher *p, int featureSize); + + ~Matcher(); + + /** For debugging, outputs information about the Matcher to + * standard error. + */ + void print(); + + /** Adds a link to the Matcher object representing the performance + * which is going to be matched to this one. + * + * @param p the Matcher representing the other performance + */ + void setOtherMatcher(Matcher *p) { + otherMatcher = p; + } // setOtherMatcher() + + int getFrameCount() { + return frameCount; + } + + /** + * Return the feature vector size that will be used for the given + * parameters. + */ + static int getFeatureSizeFor(Parameters params); + +protected: + template <typename T> + void initVector(vector<T> &vec, int sz, T dflt = 0) { + vec.clear(); + while ((int)vec.size() < sz) vec.push_back(dflt); + } + + template <typename T> + void initMatrix(vector<vector<T> > &mat, int hsz, int vsz, + T dflt = 0, int fillTo = -1) { + mat.clear(); + if (fillTo < 0) fillTo = hsz; + for (int i = 0; i < hsz; ++i) { + mat.push_back(vector<T>()); + if (i < fillTo) { + while ((int)mat[i].size() < vsz) { + mat[i].push_back(dflt); + } + } + } + } + + void init(); + + void makeFreqMap(); + + /** Creates a map of FFT frequency bins to comparison bins. Where + * the spacing of FFT bins is less than 0.5 semitones, the + * mapping is one to one. Where the spacing is greater than 0.5 + * semitones, the FFT energy is mapped into semitone-wide + * bins. No scaling is performed; that is the energy is summed + * into the comparison bins. See also consumeFrame() + */ + void makeStandardFrequencyMap(); + + void makeChromaFrequencyMap(); + + /** Processes a frame of audio data by first computing the STFT + * with a Hamming window, then mapping the frequency bins into a + * part-linear part-logarithmic array, then (optionally) + * computing the half-wave rectified spectral difference from the + * previous frame, then (optionally) normalising to a sum of 1, + * then calculating the distance to all frames stored in the + * otherMatcher and storing them in the distance matrix, and + * finally updating the optimal path matrix using the dynamic + * time warping algorithm. + * + * Return value is the frame (post-processed, with warping, + * rectification, and normalisation as appropriate). + * + * The Matcher must have been constructed using the constructor + * without an external featureSize parameter in order to use this + * function. (Otherwise it will be expecting you to call + * consumeFeatureVector.) + */ + std::vector<double> consumeFrame(double *reBuffer, double *imBuffer); + + /** Processes a feature vector frame (presumably calculated from + * audio data by some external code). As consumeFrame, except + * that it does not calculate a feature from audio data but + * instead uses the supplied feature directly. + * + * The Matcher must have been constructed using the constructor + * that accepts an external featureSize parameter in order to + * use this function. The supplied feature must be of the size + * that was passed to the constructor. + */ + void consumeFeatureVector(std::vector<double> feature); + + /** Retrieves values from the minimum cost matrix. + * + * @param i the frame number of this Matcher + * @param j the frame number of the other Matcher + * @return the cost of the minimum cost path to this location + */ + int getValue(int i, int j, bool firstAttempt); + + /** Stores entries in the distance matrix and the optimal path matrix. + * + * @param i the frame number of this Matcher + * @param j the frame number of the other Matcher + * @param dir the direction from which this position is reached with + * minimum cost + * @param value the cost of the minimum path except the current step + * @param dMN the distance cost between the two frames + */ + void setValue(int i, int j, int dir, int value, int dMN); + + vector<double> processFrameFromFreqData(double *, double *); + void calcAdvance(); + + DistanceMetric metric; + + friend class MatchFeeder; + friend class MatchFeatureFeeder; + friend class Finder; + +}; // class Matcher + +#endif