annotate src/FeatureExtractor.h @ 246:aac9ad4064ea subsequence tip

Fix incorrect handling of silent tail in the non-subsequence MATCH phase; some debug output changes
author Chris Cannam
date Fri, 24 Jul 2020 14:29:55 +0100
parents 39fe8728e1ca
children
rev   line source
Chris@37 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@37 2
Chris@37 3 /*
Chris@37 4 Vamp feature extraction plugin using the MATCH audio alignment
Chris@37 5 algorithm.
Chris@37 6
Chris@37 7 Centre for Digital Music, Queen Mary, University of London.
Chris@236 8 Copyright (c) 2007-2020 Simon Dixon, Chris Cannam, and Queen Mary
Chris@230 9 University of London, Copyright (c) 2014-2015 Tido GmbH.
Chris@37 10
Chris@37 11 This program is free software; you can redistribute it and/or
Chris@37 12 modify it under the terms of the GNU General Public License as
Chris@37 13 published by the Free Software Foundation; either version 2 of the
Chris@37 14 License, or (at your option) any later version. See the file
Chris@37 15 COPYING included with this distribution for more information.
Chris@37 16 */
Chris@37 17
Chris@37 18 #ifndef FEATURE_EXTRACTOR_H
Chris@37 19 #define FEATURE_EXTRACTOR_H
Chris@37 20
Chris@187 21 #include "MatchTypes.h"
Chris@37 22
Chris@37 23 /**
Chris@37 24 * Convert frequency-domain audio frames into features suitable for
Chris@125 25 * MATCH alignment calculation.
Chris@37 26 *
Chris@125 27 * The default feature is a warping of the frequency data to map FFT
Chris@125 28 * frequency bins into feature bins. The mapping is linear (1-1) until
Chris@125 29 * the resolution reaches 2 points per semitone, then logarithmic with
Chris@125 30 * a semitone resolution. e.g. for 44.1kHz sampling rate and fftSize
Chris@125 31 * of 2048 (46ms), bin spacing is 21.5Hz, which is mapped linearly for
Chris@125 32 * bins 0-34 (0 to 732Hz), and logarithmically for the remaining bins
Chris@125 33 * (midi notes 79 to 127, bins 35 to 83), where all energy above note
Chris@125 34 * 127 is mapped into the final bin.
Chris@125 35 *
Chris@125 36 * Alternatively a chroma mapping is also available. This produces a
Chris@125 37 * 13-bin feature by mapping all FFT bins into bin 0 until the
Chris@125 38 * resolution reaches 1 point per semitone, then mapping each
Chris@125 39 * subsequent bin into its corresponding semitone in the remaining 12
Chris@125 40 * bins (where bin 1 is C). e.g. for 44.1kHz sampling rate and
Chris@125 41 * fftSize of 2048 (46ms), frequencies up to 361 Hz go to bin 0,
Chris@125 42 * subsequent frequencies to the chroma bins.
Chris@37 43 */
Chris@37 44 class FeatureExtractor
Chris@37 45 {
Chris@37 46 public:
Chris@37 47 struct Parameters {
Chris@37 48
Chris@216 49 Parameters(float rate_) :
Chris@37 50 sampleRate(rate_),
Chris@37 51 useChromaFrequencyMap(false),
Chris@216 52 fftSize(2048),
Chris@176 53 referenceFrequency(440.0),
Chris@219 54 minFrequency(150.),
Chris@176 55 maxFrequency(rate_/2.)
Chris@37 56 {}
Chris@37 57
Chris@37 58 /** Sample rate of audio, as passed to the constructor */
Chris@37 59 float sampleRate;
Chris@37 60
Chris@37 61 /** Flag indicating whether to use a chroma frequency map (13
Chris@37 62 * bins) instead of the default warped spectrogram. Defaults to false. */
Chris@37 63 bool useChromaFrequencyMap;
Chris@37 64
Chris@37 65 /** Size of an FFT frame in samples. Note that the data passed
Chris@37 66 * in is already in the frequency domain, so this expresses
Chris@37 67 * the size of the frame that the caller will be providing. Defaults to 2048. */
Chris@37 68 int fftSize;
Chris@159 69
Chris@159 70 /** Frequency of concert A. Defaults to 440. */
Chris@159 71 double referenceFrequency;
Chris@176 72
Chris@176 73 /** Minimum frequency cutoff to include in feature. Defaults to 150. */
Chris@176 74 double minFrequency;
Chris@176 75
Chris@176 76 /** Maximum frequency cutoff to include in feature. Defaults to half the sample rate. */
Chris@176 77 double maxFrequency;
Chris@37 78 };
Chris@37 79
Chris@37 80 /**
Chris@37 81 * Construct a FeatureExtractor with the given parameters.
Chris@37 82 *
Chris@37 83 * Note that FeatureExtractor maintains internal frame-to-frame
Chris@37 84 * state: use one FeatureExtractor per audio source, and construct
Chris@37 85 * a new one for each new source.
Chris@37 86 */
Chris@37 87 FeatureExtractor(Parameters params);
Chris@37 88
Chris@37 89 /**
Chris@37 90 * Return the feature vector size that will be returned from process().
Chris@37 91 */
Chris@37 92 int getFeatureSize() const { return m_featureSize; }
Chris@74 93
Chris@74 94 /**
Chris@74 95 * Return the feature vector size that would be returned from
Chris@74 96 * process() with these parameters.
Chris@74 97 */
Chris@74 98 static int getFeatureSizeFor(Parameters params);
Chris@37 99
Chris@37 100 /**
Chris@201 101 * Process one frequency-domain audio frame, provided as real &
Chris@201 102 * imaginary components from the FFT output. Return a feature
Chris@38 103 * vector of size given by getFeatureSize(). Input vectors must
Chris@38 104 * have at least params.fftSize/2+1 elements each.
Chris@37 105 *
Chris@37 106 * Operates by mapping the frequency bins into a part-linear
Chris@103 107 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103 108 * which case they are mapped into chroma bins.
Chris@37 109 */
Chris@183 110 feature_t process(const std::vector<double> &real,
Chris@183 111 const std::vector<double> &imag);
Chris@37 112
Chris@74 113 /**
Chris@201 114 * Process one frequency-domain audio frame, provided as real &
Chris@201 115 * imaginary components from the FFT output. Return a feature
Chris@184 116 * vector of size given by getFeatureSize(). Input vectors must
Chris@184 117 * have at least params.fftSize/2+1 elements each.
Chris@184 118 *
Chris@184 119 * Operates by mapping the frequency bins into a part-linear
Chris@184 120 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@184 121 * which case they are mapped into chroma bins.
Chris@184 122 */
Chris@184 123 feature_t process(const std::vector<float> &real,
Chris@184 124 const std::vector<float> &imag);
Chris@184 125
Chris@184 126 /**
Chris@201 127 * Process one frequency-domain audio frame, provided as real &
Chris@201 128 * imaginary components from the FFT output. Return a feature
Chris@201 129 * vector of size given by getFeatureSize(). Input arrays must
Chris@201 130 * have at least params.fftSize/2+1 elements each.
Chris@201 131 *
Chris@201 132 * Operates by mapping the frequency bins into a part-linear
Chris@201 133 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@201 134 * which case they are mapped into chroma bins.
Chris@201 135 */
Chris@201 136 feature_t process(const float *real, const float *imag);
Chris@201 137
Chris@201 138 /**
Chris@74 139 * Process one frequency-domain audio frame, provided as a single
Chris@74 140 * array of alternating real and imaginary components. Input array
Chris@74 141 * must have at least 2 * (params.fftSize/2 + 1) elements. Return a feature vector of size given by getFeatureSize().
Chris@74 142 *
Chris@74 143 * Operates by mapping the frequency bins into a part-linear
Chris@103 144 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103 145 * which case they are mapped into chroma bins.
Chris@74 146 */
Chris@184 147 feature_t process(const float *carray);
Chris@74 148
Chris@37 149 protected:
Chris@37 150 /** Make either standard or chroma map, depending on m_params */
Chris@37 151 void makeFreqMap();
Chris@37 152
Chris@37 153 /** Creates a map of FFT frequency bins to comparison bins. Where
Chris@37 154 * the spacing of FFT bins is greater than 0.5 semitones, the
Chris@37 155 * mapping is one to one. Where the spacing is less than 0.5
Chris@37 156 * semitones, the FFT energy is mapped into semitone-wide
Chris@37 157 * bins. No scaling is performed; that is the energy is summed
Chris@37 158 * into the comparison bins. */
Chris@37 159 void makeStandardFrequencyMap();
Chris@37 160
Chris@37 161 /** Creates a map of FFT frequency bins to semitone chroma bins. */
Chris@37 162 void makeChromaFrequencyMap();
Chris@37 163
Chris@37 164 /** Configuration parameters */
Chris@37 165 Parameters m_params;
Chris@37 166
Chris@37 167 /** A map from FFT bins to final frequency
Chris@37 168 * bins. The mapping is linear (1-1) until the resolution
Chris@37 169 * reaches 2 points per semitone, then logarithmic with a
Chris@37 170 * semitone resolution. e.g. for 44.1kHz sampling rate and
Chris@37 171 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
Chris@37 172 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
Chris@37 173 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
Chris@37 174 * where all energy above note 127 is mapped into the final
Chris@176 175 * bin.
Chris@176 176 *
Chris@176 177 * If a bin's frequency is outside the minFrequency->maxFrequency
Chris@176 178 * range, it will be mapped to a target bin of -1 and should be
Chris@176 179 * discarded.
Chris@176 180 */
Chris@37 181 std::vector<int> m_freqMap;
Chris@37 182
Chris@184 183 feature_t processMags(const std::vector<float> &mags);
Chris@184 184 std::vector<float> scaleMags(const std::vector<float> &mags);
Chris@169 185
Chris@37 186 /** The size of a returned feature. */
Chris@37 187 int m_featureSize;
Chris@37 188 };
Chris@37 189
Chris@37 190 #endif
Chris@37 191