Chris@37
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@37
|
2
|
Chris@37
|
3 /*
|
Chris@37
|
4 Vamp feature extraction plugin using the MATCH audio alignment
|
Chris@37
|
5 algorithm.
|
Chris@37
|
6
|
Chris@37
|
7 Centre for Digital Music, Queen Mary, University of London.
|
Chris@37
|
8 This file copyright 2007 Simon Dixon, Chris Cannam and QMUL.
|
Chris@37
|
9
|
Chris@37
|
10 This program is free software; you can redistribute it and/or
|
Chris@37
|
11 modify it under the terms of the GNU General Public License as
|
Chris@37
|
12 published by the Free Software Foundation; either version 2 of the
|
Chris@37
|
13 License, or (at your option) any later version. See the file
|
Chris@37
|
14 COPYING included with this distribution for more information.
|
Chris@37
|
15 */
|
Chris@37
|
16
|
Chris@37
|
17 #ifndef FEATURE_EXTRACTOR_H
|
Chris@37
|
18 #define FEATURE_EXTRACTOR_H
|
Chris@37
|
19
|
Chris@37
|
20 #include <vector>
|
Chris@37
|
21
|
Chris@37
|
/**
 * Convert frequency-domain audio frames into features suitable for
 * MATCH alignment calculation. The default feature is a warping of
 * the frequency data: FFT bins are mapped one-to-one at low
 * frequencies and into semitone-wide bins at higher frequencies (see
 * makeStandardFrequencyMap()). A chroma mapping is also available.
 *
 * Note that FeatureExtractor maintains internal frame-to-frame state:
 * use one FeatureExtractor per audio source, and construct a new one
 * for each new source.
 */
class FeatureExtractor
{
public:
    /** Available ways of normalising a feature frame. */
    enum FrameNormalisation {

        /** Do not normalise frames */
        NoFrameNormalisation,

        /** Normalise each frame to have a sum of 1 */
        NormaliseFrameToSum1,

        /** Normalise each frame by the long-term average of the
         *  summed energy */
        NormaliseFrameToLTAverage
    };

    /** Configuration for a FeatureExtractor. */
    struct Parameters {

        /**
         * Construct a parameter set for the given sample rate and FFT
         * size, with every other field set to its default value.
         *
         * @param rate_    Sample rate of the audio.
         * @param fftSize_ Size of an FFT frame in samples.
         */
        Parameters(float rate_, int fftSize_) :
            sampleRate(rate_),
            frameNorm(NormaliseFrameToSum1),
            useSpectralDifference(true),
            useChromaFrequencyMap(false),
            // hopTime has no constructor argument. It is initialised
            // to zero here so that it never holds an indeterminate
            // value; callers that need a hop time must assign it.
            hopTime(0.0),
            fftSize(fftSize_),
            silenceThreshold(0.01),
            decay(0.99)
        {}

        /** Sample rate of audio */
        float sampleRate;

        /** Type of audio frame normalisation */
        FrameNormalisation frameNorm;

        /** Flag indicating whether or not the half-wave rectified
         *  spectral difference should be used in calculating the
         *  distance metric for pairs of audio frames, instead of the
         *  straight spectrum values. */
        bool useSpectralDifference;

        /** Flag indicating whether to use a chroma frequency map (12
         *  bins) instead of the default warped spectrogram */
        bool useChromaFrequencyMap;

        /** Spacing of audio frames (determines the amount of overlap or
         *  skip between frames). This value is expressed in
         *  seconds. Defaults to 0.0, meaning "not set". */
        double hopTime;

        /** Size of an FFT frame in samples. Note that the data passed
         *  in is already in the frequency domain, so this expresses
         *  the size of the frame that the caller will be providing. */
        int fftSize;

        /** RMS level below which frame is considered silent */
        double silenceThreshold;

        /** Frame-to-frame decay factor in calculating long-term average */
        double decay;
    };

    /**
     * Construct a FeatureExtractor with the given parameters.
     *
     * Note that FeatureExtractor maintains internal frame-to-frame
     * state: use one FeatureExtractor per audio source, and construct
     * a new one for each new source.
     */
    FeatureExtractor(Parameters params);

    /**
     * Return the feature vector size that will be returned from process().
     */
    int getFeatureSize() const { return m_featureSize; }

    /**
     * Process one frequency-domain audio frame (provided as real &
     * imaginary components from the FFT output). Return a feature
     * vector of size given by getFeatureSize(). Input vectors must
     * have at least params.fftSize/2+1 elements each.
     *
     * Operates by mapping the frequency bins into a part-linear
     * part-logarithmic array, then (optionally) computing the
     * half-wave rectified spectral difference from the previous
     * frame, then (optionally) normalising to a sum of 1.
     *
     * Return value is the frame (post-processed, with warping,
     * rectification, and normalisation as appropriate).
     */
    std::vector<double> process(const std::vector<double> &real,
                                const std::vector<double> &imag);

protected:
    /** Make either standard or chroma map, depending on m_params */
    void makeFreqMap();

    /** Creates a map of FFT frequency bins to comparison bins. Where
     *  the spacing of FFT bins is less than 0.5 semitones, the
     *  mapping is one to one. Where the spacing is greater than 0.5
     *  semitones, the FFT energy is mapped into semitone-wide
     *  bins. No scaling is performed; that is, the energy is summed
     *  into the comparison bins. */
    void makeStandardFrequencyMap();

    /** Creates a map of FFT frequency bins to semitone chroma bins. */
    void makeChromaFrequencyMap();

    /** Configuration parameters */
    Parameters m_params;

    /** Long term average frame energy (in frequency domain
     *  representation). */
    double m_ltAverage;

    /** A mapping function for mapping FFT bins to final frequency
     *  bins. The mapping is linear (1-1) until the resolution
     *  reaches 2 points per semitone, then logarithmic with a
     *  semitone resolution. e.g. for 44.1kHz sampling rate and
     *  fftSize of 2048 (46ms), bin spacing is 21.5Hz, which is mapped
     *  linearly for bins 0-34 (0 to 732Hz), and logarithmically for
     *  the remaining bins (midi notes 79 to 127, bins 35 to 83),
     *  where all energy above note 127 is mapped into the final
     *  bin. */
    std::vector<int> m_freqMap;

    /** The size of a returned feature. */
    int m_featureSize;

    /** The most recent frame; used for calculating the frame to frame
     *  spectral difference. This is therefore frequency warped but
     *  not yet normalised. */
    std::vector<double> m_prevFrame;
};
|
Chris@37
|
165
|
Chris@37
|
166 #endif
|
Chris@37
|
167
|