beatroot-vamp: BeatRootProcessor.h annotate

annotate BeatRootProcessor.h @ 4:c06cf6f7cb04

Bring in Peaks code from BeatRoot

author	Chris Cannam
date	Mon, 19 Sep 2011 15:48:26 +0100
parents	a821f49c42f0
children	2150607d4726

rev	line source
Chris@1	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
Chris@1	2
Chris@1	3 /*
Chris@3	4 Vamp feature extraction plugin for the BeatRoot beat tracker.
Chris@1	5
Chris@3	6 Centre for Digital Music, Queen Mary, University of London.
Chris@3	7 This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
Chris@1	8
Chris@3	9 This program is free software; you can redistribute it and/or
Chris@3	10 modify it under the terms of the GNU General Public License as
Chris@3	11 published by the Free Software Foundation; either version 2 of the
Chris@3	12 License, or (at your option) any later version. See the file
Chris@3	13 COPYING included with this distribution for more information.
Chris@1	14 */
Chris@1	15
Chris@1	16 #ifndef _BEATROOT_PROCESSOR_H_
Chris@1	17 #define _BEATROOT_PROCESSOR_H_
Chris@1	18
Chris@4	19 #include "Peaks.h"
Chris@4	20
Chris@2	21 #include <vector>
Chris@3	22 #include <cmath>
Chris@2	23
Chris@2	24 using std::vector;
Chris@2	25
Chris@1	26 class BeatRootProcessor
Chris@1	27 {
Chris@1	28 protected:
Chris@1	29 /** Sample rate of audio */
Chris@1	30 float sampleRate;
Chris@1	31
Chris@1	32 /** Spacing of audio frames (determines the amount of overlap or
Chris@1	33 * skip between frames). This value is expressed in
Chris@1	34 * seconds. (Default = 0.020s) */
Chris@1	35 double hopTime;
Chris@1	36
Chris@1	37 /** The approximate size of an FFT frame in seconds. (Default =
Chris@1	38 * 0.04644s). The value is adjusted so that <code>fftSize</code>
Chris@1	39 * is always power of 2. */
Chris@1	40 double fftTime;
Chris@1	41
Chris@1	42 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
Chris@1	43 int hopSize;
Chris@1	44
Chris@1	45 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
Chris@1	46 int fftSize;
Chris@1	47
Chris@1	48 /** The number of overlapping frames of audio data which have been read. */
Chris@1	49 int frameCount;
Chris@1	50
Chris@1	51 /** RMS amplitude of the current frame. */
Chris@1	52 double frameRMS;
Chris@1	53
Chris@1	54 /** Long term average frame energy (in frequency domain representation). */
Chris@1	55 double ltAverage;
Chris@1	56
Chris@1	57 /** Spectral flux onset detection function, indexed by frame. */
Chris@4	58 vector<double> spectralFlux;
Chris@1	59
Chris@1	60 /** A mapping function for mapping FFT bins to final frequency bins.
Chris@1	61 * The mapping is linear (1-1) until the resolution reaches 2 points per
Chris@1	62 * semitone, then logarithmic with a semitone resolution. e.g. for
Chris@1	63 * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
Chris@1	64 * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
Chris@1	65 * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
Chris@1	66 * 83), where all energy above note 127 is mapped into the final bin. */
Chris@1	67 vector<int> freqMap;
Chris@1	68
Chris@1	69 /** The number of entries in <code>freqMap</code>. Note that the length of
Chris@1	70 * the array is greater, because its size is not known at creation time. */
Chris@1	71 int freqMapSize;
Chris@1	72
Chris@1	73 /** The magnitude spectrum of the most recent frame. Used for
Chris@1	74 * calculating the spectral flux. */
Chris@1	75 vector<double> prevFrame;
Chris@1	76
Chris@1	77 /** The magnitude spectrum of the current frame. */
Chris@1	78 vector<double> newFrame;
Chris@1	79
Chris@1	80 /** The magnitude spectra of all frames, used for plotting the spectrogram. */
Chris@1	81 vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't
Chris@1	82
Chris@1	83 /** The RMS energy of all frames. */
Chris@3	84 // vector<double> energy; //!!! unused in beat tracking?
Chris@1	85
Chris@1	86 /** The estimated onset times from peak-picking the onset
Chris@1	87 * detection function(s). */
Chris@1	88 vector<double> onsets;
Chris@1	89
Chris@1	90 /** The estimated onset times and their saliences. */
Chris@1	91 //!!!EventList onsetList;
Chris@1	92 vector<double> onsetList; //!!! corresponding to keyDown member of events in list
Chris@1	93
Chris@1	94 /** Total number of audio frames if known, or -1 for live or compressed input. */
Chris@1	95 int totalFrames;
Chris@1	96
Chris@1	97 /** Flag for enabling or disabling debugging output */
Chris@2	98 static bool debug;
Chris@1	99
Chris@1	100 /** Flag for suppressing all standard output messages except results. */
Chris@2	101 static bool silent;
Chris@1	102
Chris@1	103 /** RMS frame energy below this value results in the frame being
Chris@1	104 * set to zero, so that normalisation does not have undesired
Chris@1	105 * side-effects. */
Chris@2	106 static double silenceThreshold; //!!!??? energy of what? should not be static?
Chris@1	107
Chris@1	108 /** For dynamic range compression, this value is added to the log
Chris@1	109 * magnitude in each frequency bin and any remaining negative
Chris@1	110 * values are then set to zero.
Chris@1	111 */
Chris@2	112 static double rangeThreshold; //!!! sim
Chris@1	113
Chris@1	114 /** Determines method of normalisation. Values can be:<ul>
Chris@1	115 * <li>0: no normalisation</li>
Chris@1	116 * <li>1: normalisation by current frame energy</li>
Chris@1	117 * <li>2: normalisation by exponential average of frame energy</li>
Chris@1	118 * </ul>
Chris@1	119 */
Chris@2	120 static int normaliseMode;
Chris@1	121
Chris@1	122 /** Ratio between rate of sampling the signal energy (for the
Chris@1	123 * amplitude envelope) and the hop size */
Chris@3	124 // static int energyOversampleFactor; //!!! not used?
Chris@1	125
Chris@1	126 public:
Chris@1	127
Chris@1	128 /** Constructor: note that streams are not opened until the input
Chris@1	129 * file is set (see <code>setInputFile()</code>). */
Chris@2	130 BeatRootProcessor() {
Chris@1	131 frameRMS = 0;
Chris@1	132 ltAverage = 0;
Chris@1	133 frameCount = 0;
Chris@1	134 hopSize = 0;
Chris@1	135 fftSize = 0;
Chris@1	136 hopTime = 0.010; // DEFAULT, overridden with -h
Chris@1	137 fftTime = 0.04644; // DEFAULT, overridden with -f
Chris@3	138 totalFrames = -1; //!!! not needed?
Chris@1	139 } // constructor
Chris@1	140
Chris@2	141 protected:
Chris@3	142 /** Allocates memory for arrays, based on parameter settings */
Chris@3	143 void init() {
Chris@3	144 hopSize = lrint(sampleRate * hopTime);
Chris@3	145 fftSize = lrint(pow(2, lrint( log(fftTime * sampleRate) / log(2))));
Chris@3	146 makeFreqMap(fftSize, sampleRate);
Chris@3	147 prevFrame.clear();
Chris@3	148 for (int i = 0; i < freqMapSize; i++) prevFrame.push_back(0);
Chris@3	149 frameCount = 0;
Chris@3	150 frameRMS = 0;
Chris@3	151 ltAverage = 0;
Chris@3	152 spectralFlux.clear();
Chris@3	153 } // init()
Chris@1	154
Chris@3	155 /** Creates a map of FFT frequency bins to comparison bins.
Chris@3	156 * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
Chris@3	157 * one to one. Where the spacing is greater than 0.5 semitones, the FFT
Chris@3	158 * energy is mapped into semitone-wide bins. No scaling is performed; that
Chris@3	159 * is the energy is summed into the comparison bins. See also
Chris@3	160 * processFrame()
Chris@3	161 */
Chris@3	162 void makeFreqMap(int fftSize, float sampleRate) {
Chris@3	163 freqMap.resize(fftSize/2+1);
Chris@3	164 double binWidth = sampleRate / fftSize;
Chris@3	165 int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
Chris@3	166 int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
Chris@3	167 log(2) * 12 + 69);
Chris@3	168 int i = 0;
Chris@3	169 while (i <= crossoverBin)
Chris@3	170 freqMap[i++] = i;
Chris@3	171 while (i <= fftSize/2) {
Chris@3	172 double midi = log(ibinWidth/440) / log(2) 12 + 69;
Chris@3	173 if (midi > 127)
Chris@3	174 midi = 127;
Chris@3	175 freqMap[i++] = crossoverBin + (int)lrint(midi) - crossoverMidi;
Chris@3	176 }
Chris@3	177 freqMapSize = freqMap[i-1] + 1;
Chris@3	178 } // makeFreqMap()
Chris@1	179
Chris@3	180 /** Processes a frame of audio data by first computing the STFT with a
Chris@3	181 * Hamming window, then mapping the frequency bins into a part-linear
Chris@3	182 * part-logarithmic array, then computing the spectral flux
Chris@3	183 * then (optionally) normalising and calculating onsets.
Chris@3	184 */
Chris@3	185 void processFrame(const float const inputBuffers) {
Chris@3	186 newFrame.clear();
Chris@3	187 for (int i = 0; i < freqMapSize; i++) {
Chris@3	188 newFrame.push_back(0);
Chris@3	189 }
Chris@3	190 double flux = 0;
Chris@3	191 for (int i = 0; i <= fftSize/2; i++) {
Chris@3	192 double mag = sqrt(inputBuffers[0][i2] inputBuffers[0][i*2] +
Chris@3	193 inputBuffers[0][i2+1] inputBuffers[0][i*2+1]);
Chris@3	194 if (mag > prevFrame[i]) flux += mag - prevFrame[i];
Chris@3	195 prevFrame[i] = mag;
Chris@3	196 newFrame[freqMap[i]] += mag;
Chris@3	197 }
Chris@3	198 spectralFlux.push_back(flux);
Chris@3	199 frames.push_back(newFrame);
Chris@3	200 // for (int i = 0; i < freqMapSize; i++)
Chris@3	201 // [frameCount][i] = newFrame[i];
Chris@3	202 /*
Chris@3	203 int index = cbIndex - (fftSize - hopSize);
Chris@3	204 if (index < 0)
Chris@3	205 index += fftSize;
Chris@3	206 int sz = (fftSize - hopSize) / energyOversampleFactor;
Chris@3	207 for (int j = 0; j < energyOversampleFactor; j++) {
Chris@3	208 double newEnergy = 0;
Chris@3	209 for (int i = 0; i < sz; i++) {
Chris@3	210 newEnergy += circBuffer[index] * circBuffer[index];
Chris@3	211 if (++index == fftSize)
Chris@3	212 index = 0;
Chris@3	213 }
Chris@3	214 energy[frameCount * energyOversampleFactor + j] =
Chris@3	215 newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
Chris@3	216 }*/
Chris@1	217
Chris@3	218 double decay = frameCount >= 200? 0.99:
Chris@3	219 (frameCount < 100? 0: (frameCount - 100) / 100.0);
Chris@1	220
Chris@3	221 //!!! uh-oh -- frameRMS has not been calculated (it came from time-domain signal) -- will always appear silent
Chris@1	222
Chris@3	223 if (ltAverage == 0)
Chris@3	224 ltAverage = frameRMS;
Chris@3	225 else
Chris@3	226 ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
Chris@3	227 if (frameRMS <= silenceThreshold)
Chris@3	228 for (int i = 0; i < freqMapSize; i++)
Chris@3	229 frames[frameCount][i] = 0;
Chris@3	230 else {
Chris@3	231 if (normaliseMode == 1)
Chris@3	232 for (int i = 0; i < freqMapSize; i++)
Chris@3	233 frames[frameCount][i] /= frameRMS;
Chris@3	234 else if (normaliseMode == 2)
Chris@3	235 for (int i = 0; i < freqMapSize; i++)
Chris@3	236 frames[frameCount][i] /= ltAverage;
Chris@3	237 for (int i = 0; i < freqMapSize; i++) {
Chris@3	238 frames[frameCount][i] = log(frames[frameCount][i]) + rangeThreshold;
Chris@3	239 if (frames[frameCount][i] < 0)
Chris@3	240 frames[frameCount][i] = 0;
Chris@3	241 }
Chris@3	242 }
Chris@1	243 // weightedPhaseDeviation();
Chris@1	244 // if (debug)
Chris@1	245 // System.err.printf("PhaseDev: t=%7.3f phDev=%7.3f RMS=%7.3f\n",
Chris@1	246 // frameCount * hopTime,
Chris@1	247 // phaseDeviation[frameCount],
Chris@1	248 // frameRMS);
Chris@3	249 frameCount++;
Chris@3	250 } // processFrame()
Chris@1	251
Chris@3	252 /** Processes a complete file of audio data. */
Chris@3	253 void processFile() {
Chris@3	254 /*
Chris@3	255 while (pcmInputStream != null) {
Chris@3	256 // Profile.start(0);
Chris@3	257 processFrame();
Chris@3	258 // Profile.log(0);
Chris@3	259 if (Thread.currentThread().isInterrupted()) {
Chris@3	260 System.err.println("info: INTERRUPTED in processFile()");
Chris@3	261 return;
Chris@3	262 }
Chris@3	263 }
Chris@3	264 */
Chris@1	265 // double[] x1 = new double[phaseDeviation.length];
Chris@1	266 // for (int i = 0; i < x1.length; i++) {
Chris@1	267 // x1[i] = i * hopTime;
Chris@1	268 // phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
Chris@1	269 // }
Chris@1	270 // double[] x2 = new double[energy.length];
Chris@1	271 // for (int i = 0; i < x2.length; i++)
Chris@1	272 // x2[i] = i * hopTime / energyOversampleFactor;
Chris@1	273 // // plot.clear();
Chris@1	274 // plot.addPlot(x1, phaseDeviation, Color.green, 7);
Chris@1	275 // plot.addPlot(x2, energy, Color.red, 7);
Chris@1	276 // plot.setTitle("Test phase deviation");
Chris@1	277 // plot.fitAxes();
Chris@1	278
Chris@1	279 // double[] slope = new double[energy.length];
Chris@1	280 // double hop = hopTime / energyOversampleFactor;
Chris@1	281 // Peaks.getSlope(energy, hop, 15, slope);
Chris@4	282 // vector<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);
Chris@1	283
Chris@3	284 double hop = hopTime;
Chris@4	285 Peaks::normalise(spectralFlux);
Chris@4	286 vector<int> peaks = Peaks::findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
Chris@3	287 onsets = new double[peaks.size()];
Chris@3	288 double[] y2 = new double[onsets.length];
Chris@4	289 vector<int>::iterator it = peaks.begin();
Chris@3	290 onsetList = new EventList();
Chris@3	291 double minSalience = Peaks.min(spectralFlux);
Chris@3	292 for (int i = 0; i < onsets.length; i++) {
Chris@4	293 int index = *it;
Chris@4	294 ++it;
Chris@3	295 onsets[i] = index * hop;
Chris@3	296 y2[i] = spectralFlux[index];
Chris@3	297 Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
Chris@1	298 // if (debug)
Chris@1	299 // System.err.printf("Onset: %8.3f %8.3f %8.3f\n",
Chris@1	300 // onsets[i], energy[index], slope[index]);
Chris@1	301 // e.salience = slope[index]; // or combination of energy + slope??
Chris@3	302 // Note that salience must be non-negative or the beat tracking system fails!
Chris@3	303 e.salience = spectralFlux[index] - minSalience;
Chris@3	304 onsetList.add(e);
Chris@3	305 }
Chris@1	306
Chris@3	307 //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack
Chris@1	308
Chris@3	309 } // processFile()
Chris@3	310
Chris@3	311 }; // class AudioProcessor
Chris@1	312
Chris@1	313
Chris@1	314 #endif

Mercurial > hg > beatroot-vamp

annotate BeatRootProcessor.h @ 4:c06cf6f7cb04