annotate BeatRootProcessor.h @ 4:c06cf6f7cb04

Bring in Peaks code from BeatRoot
author Chris Cannam
date Mon, 19 Sep 2011 15:48:26 +0100
parents a821f49c42f0
children 2150607d4726
rev   line source
Chris@1 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@1 2
Chris@1 3 /*
Chris@3 4 Vamp feature extraction plugin for the BeatRoot beat tracker.
Chris@1 5
Chris@3 6 Centre for Digital Music, Queen Mary, University of London.
Chris@3 7 This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
Chris@1 8
Chris@3 9 This program is free software; you can redistribute it and/or
Chris@3 10 modify it under the terms of the GNU General Public License as
Chris@3 11 published by the Free Software Foundation; either version 2 of the
Chris@3 12 License, or (at your option) any later version. See the file
Chris@3 13 COPYING included with this distribution for more information.
Chris@1 14 */
Chris@1 15
Chris@1 16 #ifndef _BEATROOT_PROCESSOR_H_
Chris@1 17 #define _BEATROOT_PROCESSOR_H_
Chris@1 18
Chris@4 19 #include "Peaks.h"
Chris@4 20
Chris@2 21 #include <vector>
Chris@3 22 #include <cmath>
Chris@2 23
Chris@2 24 using std::vector;
Chris@2 25
Chris@1 26 class BeatRootProcessor
Chris@1 27 {
Chris@1 28 protected:
Chris@1 29 /** Sample rate of audio */
Chris@1 30 float sampleRate;
Chris@1 31
Chris@1 32 /** Spacing of audio frames (determines the amount of overlap or
Chris@1 33 * skip between frames). This value is expressed in
Chris@1 34 * seconds. (Default = 0.020s) */
Chris@1 35 double hopTime;
Chris@1 36
Chris@1 37 /** The approximate size of an FFT frame in seconds. (Default =
Chris@1 38 * 0.04644s). The value is adjusted so that <code>fftSize</code>
Chris@1 39 * is always power of 2. */
Chris@1 40 double fftTime;
Chris@1 41
Chris@1 42 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
Chris@1 43 int hopSize;
Chris@1 44
Chris@1 45 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
Chris@1 46 int fftSize;
Chris@1 47
Chris@1 48 /** The number of overlapping frames of audio data which have been read. */
Chris@1 49 int frameCount;
Chris@1 50
Chris@1 51 /** RMS amplitude of the current frame. */
Chris@1 52 double frameRMS;
Chris@1 53
Chris@1 54 /** Long term average frame energy (in frequency domain representation). */
Chris@1 55 double ltAverage;
Chris@1 56
Chris@1 57 /** Spectral flux onset detection function, indexed by frame. */
Chris@4 58 vector<double> spectralFlux;
Chris@1 59
Chris@1 60 /** A mapping function for mapping FFT bins to final frequency bins.
Chris@1 61 * The mapping is linear (1-1) until the resolution reaches 2 points per
Chris@1 62 * semitone, then logarithmic with a semitone resolution. e.g. for
Chris@1 63 * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
Chris@1 64 * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
Chris@1 65 * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
Chris@1 66 * 83), where all energy above note 127 is mapped into the final bin. */
Chris@1 67 vector<int> freqMap;
Chris@1 68
Chris@1 69 /** The number of entries in <code>freqMap</code>. Note that the length of
Chris@1 70 * the array is greater, because its size is not known at creation time. */
Chris@1 71 int freqMapSize;
Chris@1 72
Chris@1 73 /** The magnitude spectrum of the most recent frame. Used for
Chris@1 74 * calculating the spectral flux. */
Chris@1 75 vector<double> prevFrame;
Chris@1 76
Chris@1 77 /** The magnitude spectrum of the current frame. */
Chris@1 78 vector<double> newFrame;
Chris@1 79
Chris@1 80 /** The magnitude spectra of all frames, used for plotting the spectrogram. */
Chris@1 81 vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't
Chris@1 82
Chris@1 83 /** The RMS energy of all frames. */
Chris@3 84 // vector<double> energy; //!!! unused in beat tracking?
Chris@1 85
Chris@1 86 /** The estimated onset times from peak-picking the onset
Chris@1 87 * detection function(s). */
Chris@1 88 vector<double> onsets;
Chris@1 89
Chris@1 90 /** The estimated onset times and their saliences. */
Chris@1 91 //!!!EventList onsetList;
Chris@1 92 vector<double> onsetList; //!!! corresponding to keyDown member of events in list
Chris@1 93
Chris@1 94 /** Total number of audio frames if known, or -1 for live or compressed input. */
Chris@1 95 int totalFrames;
Chris@1 96
Chris@1 97 /** Flag for enabling or disabling debugging output */
Chris@2 98 static bool debug;
Chris@1 99
Chris@1 100 /** Flag for suppressing all standard output messages except results. */
Chris@2 101 static bool silent;
Chris@1 102
Chris@1 103 /** RMS frame energy below this value results in the frame being
Chris@1 104 * set to zero, so that normalisation does not have undesired
Chris@1 105 * side-effects. */
Chris@2 106 static double silenceThreshold; //!!!??? energy of what? should not be static?
Chris@1 107
Chris@1 108 /** For dynamic range compression, this value is added to the log
Chris@1 109 * magnitude in each frequency bin and any remaining negative
Chris@1 110 * values are then set to zero.
Chris@1 111 */
Chris@2 112 static double rangeThreshold; //!!! sim
Chris@1 113
Chris@1 114 /** Determines method of normalisation. Values can be:<ul>
Chris@1 115 * <li>0: no normalisation</li>
Chris@1 116 * <li>1: normalisation by current frame energy</li>
Chris@1 117 * <li>2: normalisation by exponential average of frame energy</li>
Chris@1 118 * </ul>
Chris@1 119 */
Chris@2 120 static int normaliseMode;
Chris@1 121
Chris@1 122 /** Ratio between rate of sampling the signal energy (for the
Chris@1 123 * amplitude envelope) and the hop size */
Chris@3 124 // static int energyOversampleFactor; //!!! not used?
Chris@1 125
Chris@1 126 public:
Chris@1 127
Chris@1 128 /** Constructor: note that streams are not opened until the input
Chris@1 129 * file is set (see <code>setInputFile()</code>). */
Chris@2 130 BeatRootProcessor() {
Chris@1 131 frameRMS = 0;
Chris@1 132 ltAverage = 0;
Chris@1 133 frameCount = 0;
Chris@1 134 hopSize = 0;
Chris@1 135 fftSize = 0;
Chris@1 136 hopTime = 0.010; // DEFAULT, overridden with -h
Chris@1 137 fftTime = 0.04644; // DEFAULT, overridden with -f
Chris@3 138 totalFrames = -1; //!!! not needed?
Chris@1 139 } // constructor
Chris@1 140
Chris@2 141 protected:
Chris@3 142 /** Allocates memory for arrays, based on parameter settings */
Chris@3 143 void init() {
Chris@3 144 hopSize = lrint(sampleRate * hopTime);
Chris@3 145 fftSize = lrint(pow(2, lrint( log(fftTime * sampleRate) / log(2))));
Chris@3 146 makeFreqMap(fftSize, sampleRate);
Chris@3 147 prevFrame.clear();
Chris@3 148 for (int i = 0; i < freqMapSize; i++) prevFrame.push_back(0);
Chris@3 149 frameCount = 0;
Chris@3 150 frameRMS = 0;
Chris@3 151 ltAverage = 0;
Chris@3 152 spectralFlux.clear();
Chris@3 153 } // init()
Chris@1 154
Chris@3 155 /** Creates a map of FFT frequency bins to comparison bins.
Chris@3 156 * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
Chris@3 157 * one to one. Where the spacing is greater than 0.5 semitones, the FFT
Chris@3 158 * energy is mapped into semitone-wide bins. No scaling is performed; that
Chris@3 159 * is the energy is summed into the comparison bins. See also
Chris@3 160 * processFrame()
Chris@3 161 */
Chris@3 162 void makeFreqMap(int fftSize, float sampleRate) {
Chris@3 163 freqMap.resize(fftSize/2+1);
Chris@3 164 double binWidth = sampleRate / fftSize;
Chris@3 165 int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
Chris@3 166 int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
Chris@3 167 log(2) * 12 + 69);
Chris@3 168 int i = 0;
Chris@3 169 while (i <= crossoverBin)
Chris@3 170 freqMap[i++] = i;
Chris@3 171 while (i <= fftSize/2) {
Chris@3 172 double midi = log(i*binWidth/440) / log(2) * 12 + 69;
Chris@3 173 if (midi > 127)
Chris@3 174 midi = 127;
Chris@3 175 freqMap[i++] = crossoverBin + (int)lrint(midi) - crossoverMidi;
Chris@3 176 }
Chris@3 177 freqMapSize = freqMap[i-1] + 1;
Chris@3 178 } // makeFreqMap()
Chris@1 179
Chris@3 180 /** Processes a frame of audio data by first computing the STFT with a
Chris@3 181 * Hamming window, then mapping the frequency bins into a part-linear
Chris@3 182 * part-logarithmic array, then computing the spectral flux
Chris@3 183 * then (optionally) normalising and calculating onsets.
Chris@3 184 */
Chris@3 185 void processFrame(const float *const *inputBuffers) {
Chris@3 186 newFrame.clear();
Chris@3 187 for (int i = 0; i < freqMapSize; i++) {
Chris@3 188 newFrame.push_back(0);
Chris@3 189 }
Chris@3 190 double flux = 0;
Chris@3 191 for (int i = 0; i <= fftSize/2; i++) {
Chris@3 192 double mag = sqrt(inputBuffers[0][i*2] * inputBuffers[0][i*2] +
Chris@3 193 inputBuffers[0][i*2+1] * inputBuffers[0][i*2+1]);
Chris@3 194 if (mag > prevFrame[i]) flux += mag - prevFrame[i];
Chris@3 195 prevFrame[i] = mag;
Chris@3 196 newFrame[freqMap[i]] += mag;
Chris@3 197 }
Chris@3 198 spectralFlux.push_back(flux);
Chris@3 199 frames.push_back(newFrame);
Chris@3 200 // for (int i = 0; i < freqMapSize; i++)
Chris@3 201 // [frameCount][i] = newFrame[i];
Chris@3 202 /*
Chris@3 203 int index = cbIndex - (fftSize - hopSize);
Chris@3 204 if (index < 0)
Chris@3 205 index += fftSize;
Chris@3 206 int sz = (fftSize - hopSize) / energyOversampleFactor;
Chris@3 207 for (int j = 0; j < energyOversampleFactor; j++) {
Chris@3 208 double newEnergy = 0;
Chris@3 209 for (int i = 0; i < sz; i++) {
Chris@3 210 newEnergy += circBuffer[index] * circBuffer[index];
Chris@3 211 if (++index == fftSize)
Chris@3 212 index = 0;
Chris@3 213 }
Chris@3 214 energy[frameCount * energyOversampleFactor + j] =
Chris@3 215 newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
Chris@3 216 }*/
Chris@1 217
Chris@3 218 double decay = frameCount >= 200? 0.99:
Chris@3 219 (frameCount < 100? 0: (frameCount - 100) / 100.0);
Chris@1 220
Chris@3 221 //!!! uh-oh -- frameRMS has not been calculated (it came from time-domain signal) -- will always appear silent
Chris@1 222
Chris@3 223 if (ltAverage == 0)
Chris@3 224 ltAverage = frameRMS;
Chris@3 225 else
Chris@3 226 ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
Chris@3 227 if (frameRMS <= silenceThreshold)
Chris@3 228 for (int i = 0; i < freqMapSize; i++)
Chris@3 229 frames[frameCount][i] = 0;
Chris@3 230 else {
Chris@3 231 if (normaliseMode == 1)
Chris@3 232 for (int i = 0; i < freqMapSize; i++)
Chris@3 233 frames[frameCount][i] /= frameRMS;
Chris@3 234 else if (normaliseMode == 2)
Chris@3 235 for (int i = 0; i < freqMapSize; i++)
Chris@3 236 frames[frameCount][i] /= ltAverage;
Chris@3 237 for (int i = 0; i < freqMapSize; i++) {
Chris@3 238 frames[frameCount][i] = log(frames[frameCount][i]) + rangeThreshold;
Chris@3 239 if (frames[frameCount][i] < 0)
Chris@3 240 frames[frameCount][i] = 0;
Chris@3 241 }
Chris@3 242 }
Chris@1 243 // weightedPhaseDeviation();
Chris@1 244 // if (debug)
Chris@1 245 // System.err.printf("PhaseDev: t=%7.3f phDev=%7.3f RMS=%7.3f\n",
Chris@1 246 // frameCount * hopTime,
Chris@1 247 // phaseDeviation[frameCount],
Chris@1 248 // frameRMS);
Chris@3 249 frameCount++;
Chris@3 250 } // processFrame()
Chris@1 251
Chris@3 252 /** Processes a complete file of audio data. */
Chris@3 253 void processFile() {
Chris@3 254 /*
Chris@3 255 while (pcmInputStream != null) {
Chris@3 256 // Profile.start(0);
Chris@3 257 processFrame();
Chris@3 258 // Profile.log(0);
Chris@3 259 if (Thread.currentThread().isInterrupted()) {
Chris@3 260 System.err.println("info: INTERRUPTED in processFile()");
Chris@3 261 return;
Chris@3 262 }
Chris@3 263 }
Chris@3 264 */
Chris@1 265 // double[] x1 = new double[phaseDeviation.length];
Chris@1 266 // for (int i = 0; i < x1.length; i++) {
Chris@1 267 // x1[i] = i * hopTime;
Chris@1 268 // phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
Chris@1 269 // }
Chris@1 270 // double[] x2 = new double[energy.length];
Chris@1 271 // for (int i = 0; i < x2.length; i++)
Chris@1 272 // x2[i] = i * hopTime / energyOversampleFactor;
Chris@1 273 // // plot.clear();
Chris@1 274 // plot.addPlot(x1, phaseDeviation, Color.green, 7);
Chris@1 275 // plot.addPlot(x2, energy, Color.red, 7);
Chris@1 276 // plot.setTitle("Test phase deviation");
Chris@1 277 // plot.fitAxes();
Chris@1 278
Chris@1 279 // double[] slope = new double[energy.length];
Chris@1 280 // double hop = hopTime / energyOversampleFactor;
Chris@1 281 // Peaks.getSlope(energy, hop, 15, slope);
Chris@4 282 // vector<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);
Chris@1 283
Chris@3 284 double hop = hopTime;
Chris@4 285 Peaks::normalise(spectralFlux);
Chris@4 286 vector<int> peaks = Peaks::findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
Chris@3 287 onsets = new double[peaks.size()];
Chris@3 288 double[] y2 = new double[onsets.length];
Chris@4 289 vector<int>::iterator it = peaks.begin();
Chris@3 290 onsetList = new EventList();
Chris@3 291 double minSalience = Peaks.min(spectralFlux);
Chris@3 292 for (int i = 0; i < onsets.length; i++) {
Chris@4 293 int index = *it;
Chris@4 294 ++it;
Chris@3 295 onsets[i] = index * hop;
Chris@3 296 y2[i] = spectralFlux[index];
Chris@3 297 Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
Chris@1 298 // if (debug)
Chris@1 299 // System.err.printf("Onset: %8.3f %8.3f %8.3f\n",
Chris@1 300 // onsets[i], energy[index], slope[index]);
Chris@1 301 // e.salience = slope[index]; // or combination of energy + slope??
Chris@3 302 // Note that salience must be non-negative or the beat tracking system fails!
Chris@3 303 e.salience = spectralFlux[index] - minSalience;
Chris@3 304 onsetList.add(e);
Chris@3 305 }
Chris@1 306
Chris@3 307 //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack
Chris@1 308
Chris@3 309 } // processFile()
Chris@3 310
Chris@3 311 }; // class AudioProcessor
Chris@1 312
Chris@1 313
Chris@1 314 #endif