annotate src/FeatureExtractor.h @ 172:30d59e1e4232 structure

Minor tidy
author Chris Cannam
date Fri, 06 Feb 2015 18:09:18 +0000
parents 4ca5e4219684
children cdbee79699b0
rev   line source
Chris@37 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@37 2
Chris@37 3 /*
Chris@37 4 Vamp feature extraction plugin using the MATCH audio alignment
Chris@37 5 algorithm.
Chris@37 6
Chris@37 7 Centre for Digital Music, Queen Mary, University of London.
Chris@37 8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
Chris@37 9
Chris@37 10 This program is free software; you can redistribute it and/or
Chris@37 11 modify it under the terms of the GNU General Public License as
Chris@37 12 published by the Free Software Foundation; either version 2 of the
Chris@37 13 License, or (at your option) any later version. See the file
Chris@37 14 COPYING included with this distribution for more information.
Chris@37 15 */
Chris@37 16
Chris@37 17 #ifndef FEATURE_EXTRACTOR_H
Chris@37 18 #define FEATURE_EXTRACTOR_H
Chris@37 19
Chris@37 20 #include <vector>
Chris@37 21
Chris@37 22 /**
Chris@37 23 * Convert frequency-domain audio frames into features suitable for
Chris@125 24 * MATCH alignment calculation.
Chris@37 25 *
Chris@125 26 * The default feature is a warping of the frequency data to map FFT
Chris@125 27 * frequency bins into feature bins. The mapping is linear (1-1) until
Chris@125 28 * the resolution reaches 2 points per semitone, then logarithmic with
Chris@125 29 * a semitone resolution. e.g. for 44.1kHz sampling rate and fftSize
Chris@125 30 * of 2048 (46ms), bin spacing is 21.5Hz, which is mapped linearly for
Chris@125 31 * bins 0-34 (0 to 732Hz), and logarithmically for the remaining bins
Chris@125 32 * (midi notes 79 to 127, bins 35 to 83), where all energy above note
Chris@125 33 * 127 is mapped into the final bin.
Chris@125 34 *
Chris@125 35 * Alternatively a chroma mapping is also available. This produces a
Chris@125 36 * 13-bin feature by mapping all FFT bins into bin 0 until the
Chris@125 37 * resolution reaches 1 point per semitone, then mapping each
Chris@125 38 * subsequent bin into its corresponding semitone in the remaining 12
Chris@125 39 * bins (where bin 1 is C). e.g. for 44.1kHz sampling rate and
Chris@125 40 * fftSize of 2048 (46ms), frequencies up to 361 Hz go to bin 0,
Chris@125 41 * subsequent frequencies to the chroma bins.
Chris@37 42 */
Chris@37 43 class FeatureExtractor
Chris@37 44 {
Chris@37 45 public:
Chris@37 46 struct Parameters {
Chris@37 47 /** Construct with the given sample rate and FFT size; the chroma frequency map is off by default. */
Chris@37 48 Parameters(float rate_, int fftSize_) :
Chris@37 49 sampleRate(rate_),
Chris@37 50 useChromaFrequencyMap(false),
Chris@103 51 fftSize(fftSize_)
Chris@37 52 {}
Chris@37 53
Chris@37 54 /** Sample rate of audio */
Chris@37 55 float sampleRate;
Chris@37 56
Chris@37 57 /** Flag indicating whether to use a chroma frequency map (13 bins:
Chris@37 58 * bin 0 plus 12 semitone bins) instead of the default warped spectrogram */
Chris@37 59 bool useChromaFrequencyMap;
Chris@37 60
Chris@37 61 /** Size of an FFT frame in samples. Note that the data passed
Chris@37 62 * in is already in the frequency domain, so this expresses
Chris@37 63 * the size of the frame that the caller will be providing. */
Chris@37 64 int fftSize;
Chris@37 65 };
Chris@37 66
Chris@37 67 /**
Chris@37 68 * Construct a FeatureExtractor with the given parameters.
Chris@37 69 *
Chris@37 70 * Note that FeatureExtractor maintains internal frame-to-frame
Chris@37 71 * state: use one FeatureExtractor per audio source, and construct
Chris@37 72 * a new one for each new source.
Chris@37 73 */
Chris@37 74 FeatureExtractor(Parameters params);
Chris@37 75
Chris@37 76 /**
Chris@37 77 * Return the feature vector size that will be returned from process().
Chris@37 78 */
Chris@37 79 int getFeatureSize() const { return m_featureSize; }
Chris@74 80
Chris@74 81 /**
Chris@74 82 * Return the feature vector size that would be returned from
Chris@74 83 * process() with these parameters.
Chris@74 84 */
Chris@74 85 static int getFeatureSizeFor(Parameters params);
Chris@37 86
Chris@37 87 /**
Chris@37 88 * Process one frequency-domain audio frame (provided as real &
Chris@37 89 * imaginary components from the FFT output). Return a feature
Chris@38 90 * vector of size given by getFeatureSize(). Input vectors must
Chris@38 91 * have at least params.fftSize/2+1 elements each.
Chris@37 92 *
Chris@37 93 * Operates by mapping the frequency bins into a part-linear
Chris@103 94 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103 95 * which case they are mapped into chroma bins.
Chris@37 96 */
Chris@37 97 std::vector<double> process(const std::vector<double> &real,
Chris@37 98 const std::vector<double> &imag);
Chris@37 99
Chris@74 100 /**
Chris@74 101 * Process one frequency-domain audio frame, provided as a single
Chris@74 102 * array of alternating real and imaginary components. Input array
Chris@74 103 * must have at least 2 * (params.fftSize/2 + 1) elements.
Chris@74 104 *
Chris@74 105 * Operates by mapping the frequency bins into a part-linear
Chris@103 106 * part-logarithmic array, unless useChromaFrequencyMap is true in
Chris@103 107 * which case they are mapped into chroma bins.
Chris@74 108 */
Chris@74 109 std::vector<double> process(const float *carray);
Chris@74 110
Chris@37 111 protected:
Chris@37 112 /** Make either standard or chroma map, depending on m_params */
Chris@37 113 void makeFreqMap();
Chris@37 114
Chris@37 115 /** Creates a map of FFT frequency bins to comparison bins. Where
Chris@37 116 * the spacing of FFT bins is less than 0.5 semitones, the
Chris@37 117 * mapping is one to one. Where the spacing is greater than 0.5
Chris@37 118 * semitones, the FFT energy is mapped into semitone-wide
Chris@37 119 * bins. No scaling is performed; that is the energy is summed
Chris@37 120 * into the comparison bins. */
Chris@37 121 void makeStandardFrequencyMap();
Chris@37 122
Chris@37 123 /** Creates a map of FFT frequency bins to semitone chroma bins. */
Chris@37 124 void makeChromaFrequencyMap();
Chris@37 125
Chris@37 126 /** Configuration parameters */
Chris@37 127 Parameters m_params;
Chris@37 128
Chris@37 129 /** A mapping function for mapping FFT bins to final frequency
Chris@37 130 * bins. The mapping is linear (1-1) until the resolution
Chris@37 131 * reaches 2 points per semitone, then logarithmic with a
Chris@37 132 * semitone resolution. e.g. for 44.1kHz sampling rate and
Chris@37 133 * fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
Chris@37 134 * linearly for bins 0-34 (0 to 732Hz), and logarithmically for
Chris@37 135 * the remaining bins (midi notes 79 to 127, bins 35 to 83),
Chris@37 136 * where all energy above note 127 is mapped into the final
Chris@37 137 * bin. */
Chris@37 138 std::vector<int> m_freqMap;
Chris@37 139
Chris@37 140 /** The size of a returned feature. */
Chris@37 141 int m_featureSize;
Chris@37 142 };
Chris@37 143
Chris@37 144 #endif
Chris@37 145