beatroot-vamp: BeatRootProcessor.h annotate

annotate BeatRootProcessor.h @ 8:f04f87b5e643

Add agent list class, and continue plodding through

author	Chris Cannam
date	Fri, 30 Sep 2011 11:37:25 +0100
parents	02d388f98c23
children	4f6626f9ffac

rev	line source
Chris@1	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
Chris@1	2
Chris@1	3 /*
Chris@3	4 Vamp feature extraction plugin for the BeatRoot beat tracker.
Chris@1	5
Chris@3	6 Centre for Digital Music, Queen Mary, University of London.
Chris@3	7 This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
Chris@1	8
Chris@3	9 This program is free software; you can redistribute it and/or
Chris@3	10 modify it under the terms of the GNU General Public License as
Chris@3	11 published by the Free Software Foundation; either version 2 of the
Chris@3	12 License, or (at your option) any later version. See the file
Chris@3	13 COPYING included with this distribution for more information.
Chris@1	14 */
Chris@1	15
Chris@1	16 #ifndef _BEATROOT_PROCESSOR_H_
Chris@1	17 #define _BEATROOT_PROCESSOR_H_
Chris@1	18
Chris@4	19 #include "Peaks.h"
Chris@6	20 #include "Event.h"
Chris@6	21 #include "BeatTracker.h"
Chris@4	22
Chris@2	23 #include <vector>
Chris@3	24 #include <cmath>
Chris@2	25
Chris@2	26 using std::vector;
Chris@2	27
Chris@1	28 class BeatRootProcessor
Chris@1	29 {
Chris@1	30 protected:
Chris@1	31 /** Sample rate of audio */
Chris@1	32 float sampleRate;
Chris@1	33
Chris@1	34 /** Spacing of audio frames (determines the amount of overlap or
Chris@1	35 * skip between frames). This value is expressed in
Chris@1	36 * seconds. (Default = 0.020s) */
Chris@1	37 double hopTime;
Chris@1	38
Chris@1	39 /** The approximate size of an FFT frame in seconds. (Default =
Chris@1	40 * 0.04644s). The value is adjusted so that <code>fftSize</code>
Chris@1	41 * is always power of 2. */
Chris@1	42 double fftTime;
Chris@1	43
Chris@1	44 /** Spacing of audio frames in samples (see <code>hopTime</code>) */
Chris@1	45 int hopSize;
Chris@1	46
Chris@1	47 /** The size of an FFT frame in samples (see <code>fftTime</code>) */
Chris@1	48 int fftSize;
Chris@1	49
Chris@1	50 /** The number of overlapping frames of audio data which have been read. */
Chris@1	51 int frameCount;
Chris@1	52
Chris@1	53 /** RMS amplitude of the current frame. */
Chris@1	54 double frameRMS;
Chris@1	55
Chris@1	56 /** Long term average frame energy (in frequency domain representation). */
Chris@1	57 double ltAverage;
Chris@1	58
Chris@1	59 /** Spectral flux onset detection function, indexed by frame. */
Chris@4	60 vector<double> spectralFlux;
Chris@1	61
Chris@1	62 /** A mapping function for mapping FFT bins to final frequency bins.
Chris@1	63 * The mapping is linear (1-1) until the resolution reaches 2 points per
Chris@1	64 * semitone, then logarithmic with a semitone resolution. e.g. for
Chris@1	65 * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
Chris@1	66 * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
Chris@1	67 * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
Chris@1	68 * 83), where all energy above note 127 is mapped into the final bin. */
Chris@1	69 vector<int> freqMap;
Chris@1	70
Chris@1	71 /** The number of entries in <code>freqMap</code>. Note that the length of
Chris@1	72 * the array is greater, because its size is not known at creation time. */
Chris@1	73 int freqMapSize;
Chris@1	74
Chris@1	75 /** The magnitude spectrum of the most recent frame. Used for
Chris@1	76 * calculating the spectral flux. */
Chris@1	77 vector<double> prevFrame;
Chris@1	78
Chris@1	79 /** The magnitude spectrum of the current frame. */
Chris@1	80 vector<double> newFrame;
Chris@1	81
Chris@1	82 /** The magnitude spectra of all frames, used for plotting the spectrogram. */
Chris@1	83 vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't
Chris@1	84
Chris@1	85 /** The RMS energy of all frames. */
Chris@3	86 // vector<double> energy; //!!! unused in beat tracking?
Chris@1	87
Chris@1	88 /** The estimated onset times from peak-picking the onset
Chris@1	89 * detection function(s). */
Chris@1	90 vector<double> onsets;
Chris@1	91
Chris@1	92 /** The estimated onset times and their saliences. */
Chris@6	93 EventList onsetList;
Chris@1	94
Chris@1	95 /** Total number of audio frames if known, or -1 for live or compressed input. */
Chris@1	96 int totalFrames;
Chris@1	97
Chris@1	98 /** Flag for enabling or disabling debugging output */
Chris@2	99 static bool debug;
Chris@1	100
Chris@1	101 /** Flag for suppressing all standard output messages except results. */
Chris@2	102 static bool silent;
Chris@1	103
Chris@1	104 /** RMS frame energy below this value results in the frame being
Chris@1	105 * set to zero, so that normalisation does not have undesired
Chris@1	106 * side-effects. */
Chris@2	107 static double silenceThreshold; //!!!??? energy of what? should not be static?
Chris@1	108
Chris@1	109 /** For dynamic range compression, this value is added to the log
Chris@1	110 * magnitude in each frequency bin and any remaining negative
Chris@1	111 * values are then set to zero.
Chris@1	112 */
Chris@2	113 static double rangeThreshold; //!!! sim
Chris@1	114
Chris@1	115 /** Determines method of normalisation. Values can be:<ul>
Chris@1	116 * <li>0: no normalisation</li>
Chris@1	117 * <li>1: normalisation by current frame energy</li>
Chris@1	118 * <li>2: normalisation by exponential average of frame energy</li>
Chris@1	119 * </ul>
Chris@1	120 */
Chris@2	121 static int normaliseMode;
Chris@1	122
Chris@1	123 /** Ratio between rate of sampling the signal energy (for the
Chris@1	124 * amplitude envelope) and the hop size */
Chris@3	125 // static int energyOversampleFactor; //!!! not used?
Chris@1	126
Chris@1	127 public:
Chris@1	128
Chris@1	129 /** Constructor: note that streams are not opened until the input
Chris@1	130 * file is set (see <code>setInputFile()</code>). */
Chris@8	131 BeatRootProcessor(float sr) :
Chris@8	132 sampleRate(sr) {
Chris@1	133 frameRMS = 0;
Chris@1	134 ltAverage = 0;
Chris@1	135 frameCount = 0;
Chris@1	136 hopSize = 0;
Chris@1	137 fftSize = 0;
Chris@1	138 hopTime = 0.010; // DEFAULT, overridden with -h
Chris@1	139 fftTime = 0.04644; // DEFAULT, overridden with -f
Chris@3	140 totalFrames = -1; //!!! not needed?
Chris@1	141 } // constructor
Chris@1	142
Chris@2	143 protected:
Chris@3	144 /** Allocates memory for arrays, based on parameter settings */
Chris@3	145 void init() {
Chris@3	146 hopSize = lrint(sampleRate * hopTime);
Chris@3	147 fftSize = lrint(pow(2, lrint( log(fftTime * sampleRate) / log(2))));
Chris@3	148 makeFreqMap(fftSize, sampleRate);
Chris@3	149 prevFrame.clear();
Chris@3	150 for (int i = 0; i < freqMapSize; i++) prevFrame.push_back(0);
Chris@3	151 frameCount = 0;
Chris@3	152 frameRMS = 0;
Chris@3	153 ltAverage = 0;
Chris@3	154 spectralFlux.clear();
Chris@3	155 } // init()
Chris@1	156
Chris@3	157 /** Creates a map of FFT frequency bins to comparison bins.
Chris@3	158 * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
Chris@3	159 * one to one. Where the spacing is greater than 0.5 semitones, the FFT
Chris@3	160 * energy is mapped into semitone-wide bins. No scaling is performed; that
Chris@3	161 * is the energy is summed into the comparison bins. See also
Chris@3	162 * processFrame()
Chris@3	163 */
Chris@3	164 void makeFreqMap(int fftSize, float sampleRate) {
Chris@3	165 freqMap.resize(fftSize/2+1);
Chris@3	166 double binWidth = sampleRate / fftSize;
Chris@3	167 int crossoverBin = (int)(2 / (pow(2, 1/12.0) - 1));
Chris@3	168 int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
Chris@3	169 log(2) * 12 + 69);
Chris@3	170 int i = 0;
Chris@3	171 while (i <= crossoverBin)
Chris@3	172 freqMap[i++] = i;
Chris@3	173 while (i <= fftSize/2) {
Chris@3	174 double midi = log(ibinWidth/440) / log(2) 12 + 69;
Chris@3	175 if (midi > 127)
Chris@3	176 midi = 127;
Chris@3	177 freqMap[i++] = crossoverBin + (int)lrint(midi) - crossoverMidi;
Chris@3	178 }
Chris@3	179 freqMapSize = freqMap[i-1] + 1;
Chris@3	180 } // makeFreqMap()
Chris@1	181
Chris@3	182 /** Processes a frame of audio data by first computing the STFT with a
Chris@3	183 * Hamming window, then mapping the frequency bins into a part-linear
Chris@3	184 * part-logarithmic array, then computing the spectral flux
Chris@3	185 * then (optionally) normalising and calculating onsets.
Chris@3	186 */
Chris@3	187 void processFrame(const float const inputBuffers) {
Chris@3	188 newFrame.clear();
Chris@3	189 for (int i = 0; i < freqMapSize; i++) {
Chris@3	190 newFrame.push_back(0);
Chris@3	191 }
Chris@3	192 double flux = 0;
Chris@3	193 for (int i = 0; i <= fftSize/2; i++) {
Chris@3	194 double mag = sqrt(inputBuffers[0][i2] inputBuffers[0][i*2] +
Chris@3	195 inputBuffers[0][i2+1] inputBuffers[0][i*2+1]);
Chris@3	196 if (mag > prevFrame[i]) flux += mag - prevFrame[i];
Chris@3	197 prevFrame[i] = mag;
Chris@3	198 newFrame[freqMap[i]] += mag;
Chris@3	199 }
Chris@3	200 spectralFlux.push_back(flux);
Chris@3	201 frames.push_back(newFrame);
Chris@3	202 // for (int i = 0; i < freqMapSize; i++)
Chris@3	203 // [frameCount][i] = newFrame[i];
Chris@3	204 /*
Chris@3	205 int index = cbIndex - (fftSize - hopSize);
Chris@3	206 if (index < 0)
Chris@3	207 index += fftSize;
Chris@3	208 int sz = (fftSize - hopSize) / energyOversampleFactor;
Chris@3	209 for (int j = 0; j < energyOversampleFactor; j++) {
Chris@3	210 double newEnergy = 0;
Chris@3	211 for (int i = 0; i < sz; i++) {
Chris@3	212 newEnergy += circBuffer[index] * circBuffer[index];
Chris@3	213 if (++index == fftSize)
Chris@3	214 index = 0;
Chris@3	215 }
Chris@3	216 energy[frameCount * energyOversampleFactor + j] =
Chris@3	217 newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
Chris@3	218 }*/
Chris@1	219
Chris@3	220 double decay = frameCount >= 200? 0.99:
Chris@3	221 (frameCount < 100? 0: (frameCount - 100) / 100.0);
Chris@1	222
Chris@3	223 //!!! uh-oh -- frameRMS has not been calculated (it came from time-domain signal) -- will always appear silent
Chris@1	224
Chris@3	225 if (ltAverage == 0)
Chris@3	226 ltAverage = frameRMS;
Chris@3	227 else
Chris@3	228 ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
Chris@3	229 if (frameRMS <= silenceThreshold)
Chris@3	230 for (int i = 0; i < freqMapSize; i++)
Chris@3	231 frames[frameCount][i] = 0;
Chris@3	232 else {
Chris@3	233 if (normaliseMode == 1)
Chris@3	234 for (int i = 0; i < freqMapSize; i++)
Chris@3	235 frames[frameCount][i] /= frameRMS;
Chris@3	236 else if (normaliseMode == 2)
Chris@3	237 for (int i = 0; i < freqMapSize; i++)
Chris@3	238 frames[frameCount][i] /= ltAverage;
Chris@3	239 for (int i = 0; i < freqMapSize; i++) {
Chris@3	240 frames[frameCount][i] = log(frames[frameCount][i]) + rangeThreshold;
Chris@3	241 if (frames[frameCount][i] < 0)
Chris@3	242 frames[frameCount][i] = 0;
Chris@3	243 }
Chris@3	244 }
Chris@1	245 // weightedPhaseDeviation();
Chris@1	246 // if (debug)
Chris@1	247 // System.err.printf("PhaseDev: t=%7.3f phDev=%7.3f RMS=%7.3f\n",
Chris@1	248 // frameCount * hopTime,
Chris@1	249 // phaseDeviation[frameCount],
Chris@1	250 // frameRMS);
Chris@3	251 frameCount++;
Chris@3	252 } // processFrame()
Chris@1	253
Chris@3	254 /** Processes a complete file of audio data. */
Chris@3	255 void processFile() {
Chris@3	256 /*
Chris@3	257 while (pcmInputStream != null) {
Chris@3	258 // Profile.start(0);
Chris@3	259 processFrame();
Chris@3	260 // Profile.log(0);
Chris@3	261 if (Thread.currentThread().isInterrupted()) {
Chris@3	262 System.err.println("info: INTERRUPTED in processFile()");
Chris@3	263 return;
Chris@3	264 }
Chris@3	265 }
Chris@3	266 */
Chris@1	267 // double[] x1 = new double[phaseDeviation.length];
Chris@1	268 // for (int i = 0; i < x1.length; i++) {
Chris@1	269 // x1[i] = i * hopTime;
Chris@1	270 // phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
Chris@1	271 // }
Chris@1	272 // double[] x2 = new double[energy.length];
Chris@1	273 // for (int i = 0; i < x2.length; i++)
Chris@1	274 // x2[i] = i * hopTime / energyOversampleFactor;
Chris@1	275 // // plot.clear();
Chris@1	276 // plot.addPlot(x1, phaseDeviation, Color.green, 7);
Chris@1	277 // plot.addPlot(x2, energy, Color.red, 7);
Chris@1	278 // plot.setTitle("Test phase deviation");
Chris@1	279 // plot.fitAxes();
Chris@1	280
Chris@1	281 // double[] slope = new double[energy.length];
Chris@1	282 // double hop = hopTime / energyOversampleFactor;
Chris@1	283 // Peaks.getSlope(energy, hop, 15, slope);
Chris@4	284 // vector<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);
Chris@1	285
Chris@3	286 double hop = hopTime;
Chris@4	287 Peaks::normalise(spectralFlux);
Chris@4	288 vector<int> peaks = Peaks::findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
Chris@5	289 onsets.clear();
Chris@5	290 onsets.resize(peaks.size(), 0);
Chris@4	291 vector<int>::iterator it = peaks.begin();
Chris@6	292 onsetList.clear();
Chris@6	293 double minSalience = Peaks::min(spectralFlux);
Chris@6	294 for (int i = 0; i < onsets.size(); i++) {
Chris@4	295 int index = *it;
Chris@4	296 ++it;
Chris@3	297 onsets[i] = index * hop;
Chris@6	298 Event e = BeatTracker::newBeat(onsets[i], 0);
Chris@1	299 // if (debug)
Chris@1	300 // System.err.printf("Onset: %8.3f %8.3f %8.3f\n",
Chris@1	301 // onsets[i], energy[index], slope[index]);
Chris@1	302 // e.salience = slope[index]; // or combination of energy + slope??
Chris@3	303 // Note that salience must be non-negative or the beat tracking system fails!
Chris@3	304 e.salience = spectralFlux[index] - minSalience;
Chris@6	305 onsetList.push_back(e);
Chris@3	306 }
Chris@1	307
Chris@3	308 //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack
Chris@1	309
Chris@3	310 } // processFile()
Chris@3	311
Chris@3	312 }; // class AudioProcessor
Chris@1	313
Chris@1	314
Chris@1	315 #endif

Mercurial > hg > beatroot-vamp

annotate BeatRootProcessor.h @ 8:f04f87b5e643