diff Matcher.h @ 0:640f92242cc1

* initial import
author cannam
date Wed, 24 Oct 2007 12:13:43 +0000
parents
children de792b8c2801
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Matcher.h	Wed Oct 24 12:13:43 2007 +0000
@@ -0,0 +1,341 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Vamp feature extraction plugin using the MATCH audio alignment
+    algorithm.
+
+    Centre for Digital Music, Queen Mary, University of London.
+    This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#ifndef _MATCHER_H_
+#define _MATCHER_H_
+
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <cmath>
+
+#define ADVANCE_THIS 1
+#define ADVANCE_OTHER 2
+#define ADVANCE_BOTH 3
+#define MASK 0xfc
+
+
+using std::vector;
+using std::string;
+using std::cerr;
+using std::endl;
+
+/** Represents an audio stream that can be matched to another audio
+ *  stream of the same piece of music.  The matching algorithm uses
+ *  dynamic time warping.  The distance metric is a Euclidean metric
+ *  on the FFT data with the higher frequencies mapped onto a linear
+ *  scale.
+ */
+
+class Matcher
+{
+protected:
+    /** Points to the other performance with which this one is being
+     *  compared.  The data for the distance metric and the dynamic
+     *  time warping is shared between the two matchers. In the
+     *  original version, only one of the two performance matchers
+     *  contained the distance metric. (See <code>first</code>)
+     */
+    Matcher *otherMatcher;
+
+    /** Indicates which performance is considered primary (the
+     *  score). This is the performance shown on the vertical axis,
+     *  and referred to as "this" in the codes for the direction of
+     *  DTW steps. */
+    bool firstPM;
+
+    /** Sample rate of audio */
+    float sampleRate;
+
+    /** Onset time of the first note in the audio file, in order to
+     *  establish synchronisation between the match file and the audio
+     *  data. */
+    double matchFileOffset;
+
+    /** Flag indicating whether or not each frame of audio should be
+     *  normalised to have a sum of 1.  (Default = false). */
+    bool normalise1;
+	
+    /** Flag indicating whether or not the distance metric for pairs
+     *  of audio frames should be normalised by the sum of the two
+     *  frames.  (Default = false). */
+    bool normalise2;
+
+    /** Flag indicating whether or not each frame of audio should be
+     *  normalised by the long term average of the summed energy.
+     *  (Default = false; assumes normalise1 == false). */
+    bool normalise3;
+	
+    /** Flag indicating whether or not the distance metric for pairs
+     *  of audio frames should be normalised by the log of the sum of
+     *  the frames.  (Default = false; assumes normalise2 ==
+     *  false). */
+    bool normalise4;
+
+    /** Flag indicating whether or not the half-wave rectified
+     *  spectral difference should be used in calculating the distance
+     *  metric for pairs of audio frames, instead of the straight
+     *  spectrum values. (Default = true). */
+    bool useSpectralDifference;
+
+    bool useChromaFrequencyMap;
+
+    /** Scaling factor for distance metric; must guarantee that the
+     *  final value fits in the data type used, that is, unsigned
+     *  char. (Default = 16).
+     */
+    double scale;
+
+    /** Spacing of audio frames (determines the amount of overlap or
+     *  skip between frames). This value is expressed in
+     *  seconds. (Default = 0.020s) */
+    double hopTime;
+
+    /** The size of an FFT frame in seconds. (Default = 0.04644s).
+     *  Note that the value is not taken to be precise; it is adjusted
+     *  so that <code>fftSize</code> is always power of 2. */
+    double fftTime;
+
+    /** The width of the search band (error margin) around the current
+     *  match position, measured in seconds. Strictly speaking the
+     *  width is measured backwards from the current point, since the
+     *  algorithm has to work causally.
+     */
+    double blockTime;
+
+    /** Spacing of audio frames in samples (see <code>hopTime</code>) */
+    int hopSize;
+
+    /** The size of an FFT frame in samples (see <code>fftTime</code>) */
+    int fftSize;
+
+    /** Width of the search band in FFT frames (see <code>blockTime</code>) */
+    int blockSize;
+
+    /** The number of frames of audio data which have been read. */
+    int frameCount;
+
+    /** RMS amplitude of the current frame. */
+//    double frameRMS;
+
+    /** Long term average frame energy (in frequency domain
+     *  representation). */
+    double ltAverage;
+
+    /** The number of frames sequentially processed by this matcher,
+     *  without a frame of the other matcher being processed.
+     */
+    int runCount;
+
+    /** Interactive control of the matching process allows pausing
+     *  computation of the cost matrices in one direction.
+     */
+    bool paused;
+
+    /** The total number of frames of audio data to be read. */
+    int maxFrames;
+
+    /** A mapping function for mapping FFT bins to final frequency
+     *  bins.  The mapping is linear (1-1) until the resolution
+     *  reaches 2 points per semitone, then logarithmic with a
+     *  semitone resolution.  e.g. for 44.1kHz sampling rate and
+     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
+     *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
+     *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
+     *  where all energy above note 127 is mapped into the final
+     *  bin. */
+    vector<int> freqMap;
+
+    /** The number of entries in <code>freqMap</code>. Note that the
+     *  length of the array is greater, because its size is not known
+     *  at creation time. */
+    int freqMapSize;
+
+    /** The most recent frame; used for calculating the frame to frame
+     *  spectral difference. */
+    vector<double> prevFrame;
+    vector<double> newFrame;
+
+    /** A block of previously seen frames are stored in this structure
+     *  for calculation of the distance matrix as the new frames are
+     *  read in.  One can think of the structure of the array as a
+     *  circular buffer of vectors. The last element of each vector
+     *  stores the total energy. */
+    vector<vector<double> > frames;
+
+    /** The best path cost matrix. */
+    int **bestPathCost;
+
+    /** The distance matrix. */
+    unsigned char **distance;
+
+    /** The bounds of each row of data in the distance and path cost matrices.*/
+    int *first;
+    int *last;
+
+    /** Height of each column in distance and bestPathCost matrices */
+    int *distYSizes;
+
+    /** Width of distance and bestPathCost matrices and first and last vectors */
+    int  distXSize;
+
+    /** Total number of audio frames, or -1 for live or compressed input. */
+    long fileLength;
+
+    bool initialised;
+
+//!!!    bool atEnd; //!!!
+
+    /** Disable or enable debugging output */
+    static bool silent;
+
+    static const double decay = 0.99;
+    static const double silenceThreshold = 0.0004;
+    static const int MAX_RUN_COUNT = 3;
+
+    friend class Finder; //!!!
+
+public:
+    /** Constructor for Matcher.
+     *
+     *  @param p The Matcher representing the performance with which
+     *  this one is going to be matched.  Some information is shared
+     *  between the two matchers (currently one possesses the distance
+     *  matrix and optimal path matrix).
+     */
+    Matcher(float rate, Matcher *p);
+
+    ~Matcher();
+
+    /** For debugging, outputs information about the Matcher to
+     *  standard error.
+     */
+    void print();
+
+    /** Gives some basic `header' information about the Matcher. */
+    string toString();
+
+    /** Adds a link to the Matcher object representing the performance
+     *  which is going to be matched to this one.
+     *
+     *  @param p the Matcher representing the other performance
+     */
+    void setOtherMatcher(Matcher *p) {
+        otherMatcher = p;
+    } // setOtherMatcher()
+
+    int getFFTSize() {
+        return fftSize;
+    }
+    
+    int getHopSize() {
+        return hopSize;
+    }
+
+    int getFrameCount() { 
+        return frameCount;
+    }
+
+protected:
+    template <typename T>
+    void initVector(vector<T> &vec, int sz, T dflt = 0) {
+        std::cerr << "initVector: " << sz << " * " << sizeof(T) << " = "
+                  << sz * sizeof(T) << std::endl;
+        vec.clear();
+        while ((int)vec.size() < sz) vec.push_back(dflt);
+    }
+
+    template <typename T>
+    void initMatrix(vector<vector<T> > &mat, int hsz, int vsz,
+                    T dflt = 0, int fillTo = -1) {
+        std::cerr << "initMatrix: " << hsz << " * " << vsz << " * "
+                  << sizeof(T) << " = "
+                  << hsz * vsz * sizeof(T) << std::endl;
+        mat.clear();
+        if (fillTo < 0) fillTo = hsz;
+        for (int i = 0; i < hsz; ++i) {
+            mat.push_back(vector<T>());
+            if (i < fillTo) {
+                while ((int)mat[i].size() < vsz) {
+                    mat[i].push_back(dflt);
+                }
+            }
+        }
+    }
+
+    void init();
+
+    void makeFreqMap(int fftSize, float sampleRate);
+
+    /** Creates a map of FFT frequency bins to comparison bins.  Where
+     *  the spacing of FFT bins is less than 0.5 semitones, the
+     *  mapping is one to one. Where the spacing is greater than 0.5
+     *  semitones, the FFT energy is mapped into semitone-wide
+     *  bins. No scaling is performed; that is the energy is summed
+     *  into the comparison bins. See also processFrame()
+     */
+    void makeStandardFrequencyMap(int fftSize, float sampleRate);
+
+    void makeChromaFrequencyMap(int fftSize, float sampleRate);
+
+    /** Processes a frame of audio data by first computing the STFT
+     *  with a Hamming window, then mapping the frequency bins into a
+     *  part-linear part-logarithmic array, then (optionally)
+     *  computing the half-wave rectified spectral difference from the
+     *  previous frame, then (optionally) normalising to a sum of 1,
+     *  then calculating the distance to all frames stored in the
+     *  otherMatcher and storing them in the distance matrix, and
+     *  finally updating the optimal path matrix using the dynamic
+     *  time warping algorithm.
+     */
+    void processFrame(double *reBuffer, double *imBuffer);
+
+    /** Calculates the Manhattan distance between two vectors, with an
+     *  optional normalisation by the combined values in the
+     *  vectors. Since the vectors contain energy, this could be
+     *  considered as a squared Euclidean distance metric. Note that
+     *  normalisation assumes the values are all non-negative.
+     *
+     *  @param f1 one of the vectors involved in the distance calculation
+     *  @param f2 one of the vectors involved in the distance calculation
+     *  @return the distance, scaled and truncated to an integer
+     */
+    int calcDistance(const vector<double> &f1, const vector<double> &f2);
+
+    /** Retrieves values from the minimum cost matrix.
+     *
+     *  @param i the frame number of this Matcher
+     *  @param j the frame number of the other Matcher
+     *  @return the cost of the minimum cost path to this location
+     */
+    int getValue(int i, int j, bool firstAttempt);
+
+    /** Stores entries in the distance matrix and the optimal path matrix.
+     *
+     *  @param i the frame number of this Matcher
+     *  @param j the frame number of the other Matcher
+     *  @param dir the direction from which this position is reached with
+     *  minimum cost
+     *  @param value the cost of the minimum path except the current step
+     *  @param dMN the distance cost between the two frames
+     */
+    void setValue(int i, int j, int dir, int value, int dMN);
+
+    friend class MatchFeeder;
+
+}; // class Matcher
+
+#endif