annotate BeatRootProcessor.h @ 3:a821f49c42f0

More pruning, etc
author Chris Cannam
date Mon, 20 Jun 2011 16:32:11 +0100
parents 7d4e6b1ff3d1
children c06cf6f7cb04
rev   line source
Chris@1 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@1 2
Chris@1 3 /*
Chris@3 4 Vamp feature extraction plugin for the BeatRoot beat tracker.
Chris@1 5
Chris@3 6 Centre for Digital Music, Queen Mary, University of London.
Chris@3 7 This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
Chris@1 8
Chris@3 9 This program is free software; you can redistribute it and/or
Chris@3 10 modify it under the terms of the GNU General Public License as
Chris@3 11 published by the Free Software Foundation; either version 2 of the
Chris@3 12 License, or (at your option) any later version. See the file
Chris@3 13 COPYING included with this distribution for more information.
Chris@1 14 */
Chris@1 15
Chris@1 16 #ifndef _BEATROOT_PROCESSOR_H_
Chris@1 17 #define _BEATROOT_PROCESSOR_H_
Chris@1 18
Chris@2 19 #include <vector>
Chris@3 20 #include <cmath>
Chris@2 21
Chris@2 22 using std::vector;
Chris@2 23
/**
 * Front end of the BeatRoot beat tracker: accumulates a spectral flux
 * onset detection function, plus a compressed part-linear,
 * part-logarithmic spectrogram, from successive frequency-domain frames.
 *
 * Typical use by a derived class: give sampleRate a positive value,
 * call init(), then call processFrame() once per frame.
 */
class BeatRootProcessor
{
protected:
    /** Sample rate of audio, in Hz.  Initialised to 0 by the
     *  constructor; must be given a positive value before init()
     *  is called. */
    float sampleRate;

    /** Spacing of audio frames (determines the amount of overlap or
     *  skip between frames).  This value is expressed in seconds.
     *  (The constructor sets it to 0.010s.) */
    double hopTime;

    /** The approximate size of an FFT frame in seconds.  (Default =
     *  0.04644s).  The value is adjusted so that <code>fftSize</code>
     *  is always power of 2. */
    double fftTime;

    /** Spacing of audio frames in samples (see <code>hopTime</code>) */
    int hopSize;

    /** The size of an FFT frame in samples (see <code>fftTime</code>) */
    int fftSize;

    /** The number of frames processed since the last init(). */
    int frameCount;

    /** RMS amplitude of the current frame. */
    double frameRMS;

    /** Long term average frame energy (in frequency domain representation). */
    double ltAverage;

    /** Spectral flux onset detection function, indexed by frame.
     *  Held as double: the flux is a sum of magnitude differences and
     *  would be truncated by an integer element type. */
    std::vector<double> spectralFlux;

    /** A mapping function for mapping FFT bins to final frequency bins.
     *  The mapping is linear (1-1) until the resolution reaches 2 points per
     *  semitone, then logarithmic with a semitone resolution.  e.g. for
     *  44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
     *  21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
     *  logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
     *  83), where all energy above note 127 is mapped into the final bin. */
    std::vector<int> freqMap;

    /** The number of distinct output bins that <code>freqMap</code> maps
     *  onto (one more than the largest value stored in it).  Note that
     *  <code>freqMap</code> itself has fftSize/2+1 entries. */
    int freqMapSize;

    /** The magnitude spectrum of the most recent frame, indexed by FFT
     *  bin (fftSize/2+1 entries).  Used for calculating the spectral
     *  flux. */
    std::vector<double> prevFrame;

    /** The mapped magnitude spectrum of the current frame
     *  (freqMapSize entries). */
    std::vector<double> newFrame;

    /** The magnitude spectra of all frames, used for plotting the spectrogram. */
    std::vector<std::vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't

    /** The RMS energy of all frames. */
//    vector<double> energy; //!!! unused in beat tracking?

    /** The estimated onset times from peak-picking the onset
     *  detection function(s). */
    std::vector<double> onsets;

    /** The estimated onset times and their saliences. */
    //!!!EventList onsetList;
    std::vector<double> onsetList; //!!! corresponding to keyDown member of events in list

    /** Total number of audio frames if known, or -1 for live or compressed input. */
    int totalFrames;

    /** Flag for enabling or disabling debugging output */
    static bool debug;

    /** Flag for suppressing all standard output messages except results. */
    static bool silent;

    /** RMS frame energy below this value results in the frame being
     *  set to zero, so that normalisation does not have undesired
     *  side-effects. */
    static double silenceThreshold; //!!!??? energy of what? should not be static?

    /** For dynamic range compression, this value is added to the log
     *  magnitude in each frequency bin and any remaining negative
     *  values are then set to zero.
     */
    static double rangeThreshold; //!!! sim

    /** Determines method of normalisation. Values can be:<ul>
     *  <li>0: no normalisation</li>
     *  <li>1: normalisation by current frame energy</li>
     *  <li>2: normalisation by exponential average of frame energy</li>
     *  </ul>
     */
    static int normaliseMode;

    /** Ratio between rate of sampling the signal energy (for the
     *  amplitude envelope) and the hop size */
//    static int energyOversampleFactor; //!!! not used?

public:

    /** Constructor: gives every scalar member a defined value.  No
     *  buffers are sized here; that happens in init(). */
    BeatRootProcessor() :
        sampleRate(0),
        hopTime(0.010),    // DEFAULT, overridden with -h
        fftTime(0.04644),  // DEFAULT, overridden with -f
        hopSize(0),
        fftSize(0),
        frameCount(0),
        frameRMS(0),
        ltAverage(0),
        freqMapSize(0),
        totalFrames(-1)    //!!! not needed?
    {
    } // constructor

protected:
    /** Derives hopSize and fftSize from the time-based settings, builds
     *  the frequency map and resets all per-run state.
     *
     *  If sampleRate, hopTime or fftTime is not positive the sizes are
     *  left at zero and the frequency map is emptied, so that
     *  processFrame() ignores its input rather than taking the
     *  logarithm of a non-positive number here. */
    void init() {
        frameCount = 0;
        frameRMS = 0;
        ltAverage = 0;
        spectralFlux.clear();
        newFrame.clear();
        // frames is indexed by frameCount, so it must restart with it.
        frames.clear();

        if (!(sampleRate > 0) || !(hopTime > 0) || !(fftTime > 0)) {
            hopSize = 0;
            fftSize = 0;
            freqMap.clear();
            freqMapSize = 0;
            prevFrame.clear();
            return;
        }

        hopSize = lrint(sampleRate * hopTime);
        // Round the frame length to the nearest power of two.
        fftSize = lrint(pow(2, lrint( log(fftTime * sampleRate) / log(2))));
        makeFreqMap(fftSize, sampleRate);

        // prevFrame holds raw FFT-bin magnitudes (it is indexed by FFT
        // bin in processFrame()), so it needs one entry per bin, not
        // one per mapped bin.
        prevFrame.assign(fftSize / 2 + 1, 0.0);
    } // init()

    /** Creates a map of FFT frequency bins to comparison bins.
     *  Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
     *  one to one. Where the spacing is greater than 0.5 semitones, the FFT
     *  energy is mapped into semitone-wide bins. No scaling is performed; that
     *  is the energy is summed into the comparison bins. See also
     *  processFrame()
     *
     *  @param fftSize    FFT frame length in samples; values <= 0 produce
     *                    an empty map.
     *  @param sampleRate audio sample rate in Hz.
     *
     *  Sets freqMap (fftSize/2+1 entries) and freqMapSize.
     */
    void makeFreqMap(int fftSize, float sampleRate) {
        if (fftSize <= 0) {
            freqMap.clear();
            freqMapSize = 0;
            return;
        }

        const int lastBin = fftSize / 2;
        freqMap.resize(lastBin + 1);

        double binWidth = sampleRate / fftSize;
        int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
        int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
                                       log(2) * 12 + 69);

        // Linear (identity) region.  Stop at the last bin so that a
        // short FFT cannot be overrun.
        const int linearEnd = (crossoverBin < lastBin ? crossoverBin : lastBin);
        int i = 0;
        while (i <= linearEnd) {
            // Index and value are written separately: "freqMap[i++] = i"
            // has no defined evaluation order before C++17.
            freqMap[i] = i;
            ++i;
        }

        // Logarithmic region: one output bin per semitone, with
        // everything above MIDI note 127 collected in the final bin.
        while (i <= lastBin) {
            double midi = log(i*binWidth/440) / log(2) * 12 + 69;
            if (midi > 127)
                midi = 127;
            freqMap[i] = crossoverBin + (int)lrint(midi) - crossoverMidi;
            ++i;
        }

        freqMapSize = freqMap[i-1] + 1;
    } // makeFreqMap()

    /** Processes one frequency-domain frame: takes the magnitude of
     *  each FFT bin, accumulates the spectral flux against the previous
     *  frame, maps the bins into the part-linear part-logarithmic
     *  array, then (optionally) normalises and log-compresses the
     *  mapped frame.
     *
     *  @param inputBuffers inputBuffers[0] is read as fftSize/2+1
     *         interleaved (real, imaginary) pairs.  NOTE(review):
     *         presumably a Vamp frequency-domain input block --
     *         confirm against the caller.
     *
     *  The frame is ignored if init() has not produced a frequency map
     *  matching fftSize.
     */
    void processFrame(const float *const *inputBuffers) {
        const int binCount = fftSize / 2 + 1;
        if (fftSize <= 0 || static_cast<int>(freqMap.size()) != binCount)
            return;

        newFrame.assign(freqMapSize, 0.0);
        if (static_cast<int>(prevFrame.size()) != binCount)
            prevFrame.assign(binCount, 0.0);

        double flux = 0;
        for (int i = 0; i < binCount; i++) {
            const double re = inputBuffers[0][i*2];
            const double im = inputBuffers[0][i*2+1];
            const double mag = sqrt(re * re + im * im);
            // Only increases in magnitude contribute to the flux.
            if (mag > prevFrame[i]) flux += mag - prevFrame[i];
            prevFrame[i] = mag;
            newFrame[freqMap[i]] += mag;
        }
        spectralFlux.push_back(flux);
        frames.push_back(newFrame);
        std::vector<double> &current = frames.back();
//        for (int i = 0; i < freqMapSize; i++)
//            [frameCount][i] = newFrame[i];
/*
        int index = cbIndex - (fftSize - hopSize);
        if (index < 0)
            index += fftSize;
        int sz = (fftSize - hopSize) / energyOversampleFactor;
        for (int j = 0; j < energyOversampleFactor; j++) {
            double newEnergy = 0;
            for (int i = 0; i < sz; i++) {
                newEnergy += circBuffer[index] * circBuffer[index];
                if (++index == fftSize)
                    index = 0;
            }
            energy[frameCount * energyOversampleFactor + j] =
                newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
        }*/

        // Averaging weight: 0 for the first 100 frames, ramping up
        // linearly over the next 100, then fixed at 0.99.
        double decay = frameCount >= 200? 0.99:
            (frameCount < 100? 0: (frameCount - 100) / 100.0);

        //!!! uh-oh -- frameRMS has not been calculated (it came from time-domain signal) -- will always appear silent

        if (ltAverage == 0)
            ltAverage = frameRMS;
        else
            ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);

        if (frameRMS <= silenceThreshold) {
            for (int i = 0; i < freqMapSize; i++)
                current[i] = 0;
        } else {
            if (normaliseMode == 1) {
                for (int i = 0; i < freqMapSize; i++)
                    current[i] /= frameRMS;
            } else if (normaliseMode == 2) {
                for (int i = 0; i < freqMapSize; i++)
                    current[i] /= ltAverage;
            }
            // Dynamic range compression; empty bins give log(0) = -inf,
            // which the clamp below turns into 0.
            for (int i = 0; i < freqMapSize; i++) {
                current[i] = log(current[i]) + rangeThreshold;
                if (current[i] < 0)
                    current[i] = 0;
            }
        }
//        weightedPhaseDeviation();
//        if (debug)
//            System.err.printf("PhaseDev:  t=%7.3f  phDev=%7.3f  RMS=%7.3f\n",
//                              frameCount * hopTime,
//                              phaseDeviation[frameCount],
//                              frameRMS);
        frameCount++;
    } // processFrame()

    /** Processes a complete file of audio data.
     *
     *  //!!! Not yet ported.  The body is still the Java original, which
     *  depends on Peaks, EventList, Event and BeatTrackDisplay; none of
     *  those is declared in this header.  It is kept verbatim under
     *  "#if 0" as a porting reference so that the header compiles;
     *  until it is ported this function does nothing and
     *  onsets/onsetList are left untouched.
     */
    void processFile() {
#if 0
/*
        while (pcmInputStream != null) {
            // Profile.start(0);
            processFrame();
            // Profile.log(0);
            if (Thread.currentThread().isInterrupted()) {
                System.err.println("info: INTERRUPTED in processFile()");
                return;
            }
        }
*/
//        double[] x1 = new double[phaseDeviation.length];
//        for (int i = 0; i < x1.length; i++) {
//            x1[i] = i * hopTime;
//            phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
//        }
//        double[] x2 = new double[energy.length];
//        for (int i = 0; i < x2.length; i++)
//            x2[i] = i * hopTime / energyOversampleFactor;
//        // plot.clear();
//        plot.addPlot(x1, phaseDeviation, Color.green, 7);
//        plot.addPlot(x2, energy, Color.red, 7);
//        plot.setTitle("Test phase deviation");
//        plot.fitAxes();

//        double[] slope = new double[energy.length];
//        double hop = hopTime / energyOversampleFactor;
//        Peaks.getSlope(energy, hop, 15, slope);
//        LinkedList<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);

        double hop = hopTime;
        Peaks.normalise(spectralFlux);
        LinkedList<Integer> peaks = Peaks.findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
        onsets = new double[peaks.size()];
        double[] y2 = new double[onsets.length];
        Iterator<Integer> it = peaks.iterator();
        onsetList = new EventList();
        double minSalience = Peaks.min(spectralFlux);
        for (int i = 0; i < onsets.length; i++) {
            int index = it.next();
            onsets[i] = index * hop;
            y2[i] = spectralFlux[index];
            Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
//            if (debug)
//                System.err.printf("Onset: %8.3f  %8.3f  %8.3f\n",
//                                  onsets[i], energy[index], slope[index]);
//            e.salience = slope[index];  // or combination of energy + slope??
            // Note that salience must be non-negative or the beat tracking system fails!
            e.salience = spectralFlux[index] - minSalience;
            onsetList.add(e);
        }

        //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack
#endif
    } // processFile()

}; // class BeatRootProcessor
Chris@1 309
Chris@1 310
Chris@1 311 #endif