annotate BeatRootProcessor.h @ 8:f04f87b5e643

Add agent list class, and continue plodding through
author Chris Cannam
date Fri, 30 Sep 2011 11:37:25 +0100
parents 02d388f98c23
children 4f6626f9ffac
rev   line source
Chris@1 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@1 2
Chris@1 3 /*
Chris@3 4 Vamp feature extraction plugin for the BeatRoot beat tracker.
Chris@1 5
Chris@3 6 Centre for Digital Music, Queen Mary, University of London.
Chris@3 7 This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
Chris@1 8
Chris@3 9 This program is free software; you can redistribute it and/or
Chris@3 10 modify it under the terms of the GNU General Public License as
Chris@3 11 published by the Free Software Foundation; either version 2 of the
Chris@3 12 License, or (at your option) any later version. See the file
Chris@3 13 COPYING included with this distribution for more information.
Chris@1 14 */
Chris@1 15
Chris@1 16 #ifndef _BEATROOT_PROCESSOR_H_
Chris@1 17 #define _BEATROOT_PROCESSOR_H_
Chris@1 18
Chris@4 19 #include "Peaks.h"
Chris@6 20 #include "Event.h"
Chris@6 21 #include "BeatTracker.h"
Chris@4 22
Chris@2 23 #include <vector>
Chris@3 24 #include <cmath>
Chris@2 25
Chris@2 26 using std::vector;
Chris@2 27
Chris@1 28 class BeatRootProcessor
Chris@1 29 {
Chris@1 30 protected:
Chris@1 31 /** Sample rate of audio */
Chris@1 32 float sampleRate;
Chris@1 33
Chris@1 34 /** Spacing of audio frames (determines the amount of overlap or
Chris@1 35 * skip between frames). This value is expressed in
Chris@1 36 * seconds. (Default = 0.020s) */
Chris@1 37 double hopTime;
Chris@1 38
Chris@1 39 /** The approximate size of an FFT frame in seconds. (Default =
Chris@1 40 * 0.04644s). The value is adjusted so that <code>fftSize</code>
Chris@1 41 * is always power of 2. */
Chris@1 42 double fftTime;
Chris@1 43
Chris@1 44 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
Chris@1 45 int hopSize;
Chris@1 46
Chris@1 47 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
Chris@1 48 int fftSize;
Chris@1 49
Chris@1 50 /** The number of overlapping frames of audio data which have been read. */
Chris@1 51 int frameCount;
Chris@1 52
Chris@1 53 /** RMS amplitude of the current frame. */
Chris@1 54 double frameRMS;
Chris@1 55
Chris@1 56 /** Long term average frame energy (in frequency domain representation). */
Chris@1 57 double ltAverage;
Chris@1 58
Chris@1 59 /** Spectral flux onset detection function, indexed by frame. */
Chris@4 60 vector<double> spectralFlux;
Chris@1 61
Chris@1 62 /** A mapping function for mapping FFT bins to final frequency bins.
Chris@1 63 * The mapping is linear (1-1) until the resolution reaches 2 points per
Chris@1 64 * semitone, then logarithmic with a semitone resolution. e.g. for
Chris@1 65 * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
Chris@1 66 * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
Chris@1 67 * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
Chris@1 68 * 83), where all energy above note 127 is mapped into the final bin. */
Chris@1 69 vector<int> freqMap;
Chris@1 70
Chris@1 71 /** The number of entries in <code>freqMap</code>. Note that the length of
Chris@1 72 * the array is greater, because its size is not known at creation time. */
Chris@1 73 int freqMapSize;
Chris@1 74
Chris@1 75 /** The magnitude spectrum of the most recent frame. Used for
Chris@1 76 * calculating the spectral flux. */
Chris@1 77 vector<double> prevFrame;
Chris@1 78
Chris@1 79 /** The magnitude spectrum of the current frame. */
Chris@1 80 vector<double> newFrame;
Chris@1 81
Chris@1 82 /** The magnitude spectra of all frames, used for plotting the spectrogram. */
Chris@1 83 vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't
Chris@1 84
Chris@1 85 /** The RMS energy of all frames. */
Chris@3 86 // vector<double> energy; //!!! unused in beat tracking?
Chris@1 87
Chris@1 88 /** The estimated onset times from peak-picking the onset
Chris@1 89 * detection function(s). */
Chris@1 90 vector<double> onsets;
Chris@1 91
Chris@1 92 /** The estimated onset times and their saliences. */
Chris@6 93 EventList onsetList;
Chris@1 94
Chris@1 95 /** Total number of audio frames if known, or -1 for live or compressed input. */
Chris@1 96 int totalFrames;
Chris@1 97
Chris@1 98 /** Flag for enabling or disabling debugging output */
Chris@2 99 static bool debug;
Chris@1 100
Chris@1 101 /** Flag for suppressing all standard output messages except results. */
Chris@2 102 static bool silent;
Chris@1 103
Chris@1 104 /** RMS frame energy below this value results in the frame being
Chris@1 105 * set to zero, so that normalisation does not have undesired
Chris@1 106 * side-effects. */
Chris@2 107 static double silenceThreshold; //!!!??? energy of what? should not be static?
Chris@1 108
Chris@1 109 /** For dynamic range compression, this value is added to the log
Chris@1 110 * magnitude in each frequency bin and any remaining negative
Chris@1 111 * values are then set to zero.
Chris@1 112 */
Chris@2 113 static double rangeThreshold; //!!! sim
Chris@1 114
Chris@1 115 /** Determines method of normalisation. Values can be:<ul>
Chris@1 116 * <li>0: no normalisation</li>
Chris@1 117 * <li>1: normalisation by current frame energy</li>
Chris@1 118 * <li>2: normalisation by exponential average of frame energy</li>
Chris@1 119 * </ul>
Chris@1 120 */
Chris@2 121 static int normaliseMode;
Chris@1 122
Chris@1 123 /** Ratio between rate of sampling the signal energy (for the
Chris@1 124 * amplitude envelope) and the hop size */
Chris@3 125 // static int energyOversampleFactor; //!!! not used?
Chris@1 126
Chris@1 127 public:
Chris@1 128
Chris@1 129 /** Constructor: note that streams are not opened until the input
Chris@1 130 * file is set (see <code>setInputFile()</code>). */
Chris@8 131 BeatRootProcessor(float sr) :
Chris@8 132 sampleRate(sr) {
Chris@1 133 frameRMS = 0;
Chris@1 134 ltAverage = 0;
Chris@1 135 frameCount = 0;
Chris@1 136 hopSize = 0;
Chris@1 137 fftSize = 0;
Chris@1 138 hopTime = 0.010; // DEFAULT, overridden with -h
Chris@1 139 fftTime = 0.04644; // DEFAULT, overridden with -f
Chris@3 140 totalFrames = -1; //!!! not needed?
Chris@1 141 } // constructor
Chris@1 142
Chris@2 143 protected:
Chris@3 144 /** Allocates memory for arrays, based on parameter settings */
Chris@3 145 void init() {
Chris@3 146 hopSize = lrint(sampleRate * hopTime);
Chris@3 147 fftSize = lrint(pow(2, lrint( log(fftTime * sampleRate) / log(2))));
Chris@3 148 makeFreqMap(fftSize, sampleRate);
Chris@3 149 prevFrame.clear();
Chris@3 150 for (int i = 0; i < freqMapSize; i++) prevFrame.push_back(0);
Chris@3 151 frameCount = 0;
Chris@3 152 frameRMS = 0;
Chris@3 153 ltAverage = 0;
Chris@3 154 spectralFlux.clear();
Chris@3 155 } // init()
Chris@1 156
Chris@3 157 /** Creates a map of FFT frequency bins to comparison bins.
Chris@3 158 * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
Chris@3 159 * one to one. Where the spacing is greater than 0.5 semitones, the FFT
Chris@3 160 * energy is mapped into semitone-wide bins. No scaling is performed; that
Chris@3 161 * is the energy is summed into the comparison bins. See also
Chris@3 162 * processFrame()
Chris@3 163 */
Chris@3 164 void makeFreqMap(int fftSize, float sampleRate) {
Chris@3 165 freqMap.resize(fftSize/2+1);
Chris@3 166 double binWidth = sampleRate / fftSize;
Chris@3 167 int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
Chris@3 168 int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
Chris@3 169 log(2) * 12 + 69);
Chris@3 170 int i = 0;
Chris@3 171 while (i <= crossoverBin)
Chris@3 172 freqMap[i++] = i;
Chris@3 173 while (i <= fftSize/2) {
Chris@3 174 double midi = log(i*binWidth/440) / log(2) * 12 + 69;
Chris@3 175 if (midi > 127)
Chris@3 176 midi = 127;
Chris@3 177 freqMap[i++] = crossoverBin + (int)lrint(midi) - crossoverMidi;
Chris@3 178 }
Chris@3 179 freqMapSize = freqMap[i-1] + 1;
Chris@3 180 } // makeFreqMap()
Chris@1 181
Chris@3 182 /** Processes a frame of audio data by first computing the STFT with a
Chris@3 183 * Hamming window, then mapping the frequency bins into a part-linear
Chris@3 184 * part-logarithmic array, then computing the spectral flux
Chris@3 185 * then (optionally) normalising and calculating onsets.
Chris@3 186 */
Chris@3 187 void processFrame(const float *const *inputBuffers) {
Chris@3 188 newFrame.clear();
Chris@3 189 for (int i = 0; i < freqMapSize; i++) {
Chris@3 190 newFrame.push_back(0);
Chris@3 191 }
Chris@3 192 double flux = 0;
Chris@3 193 for (int i = 0; i <= fftSize/2; i++) {
Chris@3 194 double mag = sqrt(inputBuffers[0][i*2] * inputBuffers[0][i*2] +
Chris@3 195 inputBuffers[0][i*2+1] * inputBuffers[0][i*2+1]);
Chris@3 196 if (mag > prevFrame[i]) flux += mag - prevFrame[i];
Chris@3 197 prevFrame[i] = mag;
Chris@3 198 newFrame[freqMap[i]] += mag;
Chris@3 199 }
Chris@3 200 spectralFlux.push_back(flux);
Chris@3 201 frames.push_back(newFrame);
Chris@3 202 // for (int i = 0; i < freqMapSize; i++)
Chris@3 203 // [frameCount][i] = newFrame[i];
Chris@3 204 /*
Chris@3 205 int index = cbIndex - (fftSize - hopSize);
Chris@3 206 if (index < 0)
Chris@3 207 index += fftSize;
Chris@3 208 int sz = (fftSize - hopSize) / energyOversampleFactor;
Chris@3 209 for (int j = 0; j < energyOversampleFactor; j++) {
Chris@3 210 double newEnergy = 0;
Chris@3 211 for (int i = 0; i < sz; i++) {
Chris@3 212 newEnergy += circBuffer[index] * circBuffer[index];
Chris@3 213 if (++index == fftSize)
Chris@3 214 index = 0;
Chris@3 215 }
Chris@3 216 energy[frameCount * energyOversampleFactor + j] =
Chris@3 217 newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
Chris@3 218 }*/
Chris@1 219
Chris@3 220 double decay = frameCount >= 200? 0.99:
Chris@3 221 (frameCount < 100? 0: (frameCount - 100) / 100.0);
Chris@1 222
Chris@3 223 //!!! uh-oh -- frameRMS has not been calculated (it came from time-domain signal) -- will always appear silent
Chris@1 224
Chris@3 225 if (ltAverage == 0)
Chris@3 226 ltAverage = frameRMS;
Chris@3 227 else
Chris@3 228 ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
Chris@3 229 if (frameRMS <= silenceThreshold)
Chris@3 230 for (int i = 0; i < freqMapSize; i++)
Chris@3 231 frames[frameCount][i] = 0;
Chris@3 232 else {
Chris@3 233 if (normaliseMode == 1)
Chris@3 234 for (int i = 0; i < freqMapSize; i++)
Chris@3 235 frames[frameCount][i] /= frameRMS;
Chris@3 236 else if (normaliseMode == 2)
Chris@3 237 for (int i = 0; i < freqMapSize; i++)
Chris@3 238 frames[frameCount][i] /= ltAverage;
Chris@3 239 for (int i = 0; i < freqMapSize; i++) {
Chris@3 240 frames[frameCount][i] = log(frames[frameCount][i]) + rangeThreshold;
Chris@3 241 if (frames[frameCount][i] < 0)
Chris@3 242 frames[frameCount][i] = 0;
Chris@3 243 }
Chris@3 244 }
Chris@1 245 // weightedPhaseDeviation();
Chris@1 246 // if (debug)
Chris@1 247 // System.err.printf("PhaseDev: t=%7.3f phDev=%7.3f RMS=%7.3f\n",
Chris@1 248 // frameCount * hopTime,
Chris@1 249 // phaseDeviation[frameCount],
Chris@1 250 // frameRMS);
Chris@3 251 frameCount++;
Chris@3 252 } // processFrame()
Chris@1 253
Chris@3 254 /** Processes a complete file of audio data. */
Chris@3 255 void processFile() {
Chris@3 256 /*
Chris@3 257 while (pcmInputStream != null) {
Chris@3 258 // Profile.start(0);
Chris@3 259 processFrame();
Chris@3 260 // Profile.log(0);
Chris@3 261 if (Thread.currentThread().isInterrupted()) {
Chris@3 262 System.err.println("info: INTERRUPTED in processFile()");
Chris@3 263 return;
Chris@3 264 }
Chris@3 265 }
Chris@3 266 */
Chris@1 267 // double[] x1 = new double[phaseDeviation.length];
Chris@1 268 // for (int i = 0; i < x1.length; i++) {
Chris@1 269 // x1[i] = i * hopTime;
Chris@1 270 // phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
Chris@1 271 // }
Chris@1 272 // double[] x2 = new double[energy.length];
Chris@1 273 // for (int i = 0; i < x2.length; i++)
Chris@1 274 // x2[i] = i * hopTime / energyOversampleFactor;
Chris@1 275 // // plot.clear();
Chris@1 276 // plot.addPlot(x1, phaseDeviation, Color.green, 7);
Chris@1 277 // plot.addPlot(x2, energy, Color.red, 7);
Chris@1 278 // plot.setTitle("Test phase deviation");
Chris@1 279 // plot.fitAxes();
Chris@1 280
Chris@1 281 // double[] slope = new double[energy.length];
Chris@1 282 // double hop = hopTime / energyOversampleFactor;
Chris@1 283 // Peaks.getSlope(energy, hop, 15, slope);
Chris@4 284 // vector<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);
Chris@1 285
Chris@3 286 double hop = hopTime;
Chris@4 287 Peaks::normalise(spectralFlux);
Chris@4 288 vector<int> peaks = Peaks::findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
Chris@5 289 onsets.clear();
Chris@5 290 onsets.resize(peaks.size(), 0);
Chris@4 291 vector<int>::iterator it = peaks.begin();
Chris@6 292 onsetList.clear();
Chris@6 293 double minSalience = Peaks::min(spectralFlux);
Chris@6 294 for (int i = 0; i < onsets.size(); i++) {
Chris@4 295 int index = *it;
Chris@4 296 ++it;
Chris@3 297 onsets[i] = index * hop;
Chris@6 298 Event e = BeatTracker::newBeat(onsets[i], 0);
Chris@1 299 // if (debug)
Chris@1 300 // System.err.printf("Onset: %8.3f %8.3f %8.3f\n",
Chris@1 301 // onsets[i], energy[index], slope[index]);
Chris@1 302 // e.salience = slope[index]; // or combination of energy + slope??
Chris@3 303 // Note that salience must be non-negative or the beat tracking system fails!
Chris@3 304 e.salience = spectralFlux[index] - minSalience;
Chris@6 305 onsetList.push_back(e);
Chris@3 306 }
Chris@1 307
Chris@3 308 //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack
Chris@1 309
Chris@3 310 } // processFile()
Chris@3 311
Chris@3 312 }; // class AudioProcessor
Chris@1 313
Chris@1 314
Chris@1 315 #endif