/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Vamp feature extraction plugin for the BeatRoot beat tracker.

    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#ifndef _BEATROOT_PROCESSOR_H_
#define _BEATROOT_PROCESSOR_H_

// NOTE(review): the include names and the vector element types below were
// missing from the reviewed copy of this file and have been reconstructed
// from how each name is used in the code -- confirm against the repository.
#include <vector>
#include <cmath>

using std::vector;

/**
 * Onset-detection front end of the BeatRoot beat tracker.
 *
 * Takes frequency-domain frames (see processFrame()), accumulates a
 * spectral flux onset detection function, and keeps a log-magnitude
 * spectrogram folded onto a part-linear, part-logarithmic frequency
 * axis (see makeFreqMap()).
 *
 * sampleRate must be set to a positive value before init() is called;
 * init() must be called before processFrame().
 */
class BeatRootProcessor
{
protected:
    /** Sample rate of audio */
    float sampleRate;

    /** Spacing of audio frames (determines the amount of overlap or
     *  skip between frames). This value is expressed in
     *  seconds. (The constructor sets 0.010s.) */
    double hopTime;

    /** The approximate size of an FFT frame in seconds. (Default =
     *  0.04644s).  The value is adjusted so that fftSize
     *  is always power of 2. */
    double fftTime;

    /** Spacing of audio frames in samples (see hopTime) */
    int hopSize;

    /** The size of an FFT frame in samples (see fftTime) */
    int fftSize;

    /** The number of overlapping frames of audio data which have been read. */
    int frameCount;

    /** RMS amplitude of the current frame.
     *  NOTE: nothing in this class updates this value; see processFrame(). */
    double frameRMS;

    /** Long term average frame energy (in frequency domain representation). */
    double ltAverage;

    /** Spectral flux onset detection function, indexed by frame. */
    vector<double> spectralFlux;

    /** A mapping function for mapping FFT bins to final frequency bins.
     *  The mapping is linear (1-1) until the resolution reaches 2 points per
     *  semitone, then logarithmic with a semitone resolution.  e.g. for
     *  44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
     *  21.5Hz, which is mapped linearly for the lowest bins (up to roughly
     *  0.7kHz), and logarithmically for the remaining bins (up to midi note
     *  127, output bin 83), where all energy above note 127 is mapped into
     *  the final bin. */
    vector<int> freqMap;

    /** The number of distinct output bins produced by freqMap, i.e. the
     *  largest value stored in freqMap plus one.  freqMap itself has one
     *  entry per FFT bin (fftSize/2 + 1 entries), which is usually more. */
    int freqMapSize;

    /** The magnitude spectrum of the most recent frame, indexed by FFT
     *  bin (fftSize/2 + 1 entries).  Used for calculating the spectral
     *  flux. */
    vector<double> prevFrame;

    /** The magnitude spectrum of the current frame, folded onto the
     *  freqMap bins (freqMapSize entries). */
    vector<double> newFrame;

    /** The magnitude spectra of all frames, used for plotting the spectrogram. */
    vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't

    /** The RMS energy of all frames. */
//    vector<double> energy; //!!! unused in beat tracking?

    /** The estimated onset times from peak-picking the onset
     *  detection function(s). */
    vector<double> onsets;

    /** The estimated onset times and their saliences. */
    //!!!EventList onsetList;
    vector<double> onsetList; //!!! corresponding to keyDown member of events in list

    /** Total number of audio frames if known, or -1 for live or compressed input. */
    int totalFrames;

    /** Flag for enabling or disabling debugging output */
    static bool debug;

    /** Flag for suppressing all standard output messages except results. */
    static bool silent;

    /** RMS frame energy below this value results in the frame being
     *  set to zero, so that normalisation does not have undesired
     *  side-effects. */
    static double silenceThreshold; //!!!??? energy of what? should not be static?

    /** For dynamic range compression, this value is added to the log
     *  magnitude in each frequency bin and any remaining negative
     *  values are then set to zero.
     */
    static double rangeThreshold; //!!! sim

    /** Determines method of normalisation. Values can be:
     *   - 0: no normalisation
     *   - 1: normalisation by current frame energy
     *   - 2: normalisation by exponential average of frame energy
     */
    static int normaliseMode;

    /** Ratio between rate of sampling the signal energy (for the
     *  amplitude envelope) and the hop size */
//    static int energyOversampleFactor; //!!! not used?

public:

    /** Constructor: sets the default hop and FFT durations and zeroes
     *  every counter and size.  No buffers are allocated here; that
     *  happens in init(), once sampleRate has been set. */
    BeatRootProcessor() :
        sampleRate(0),        // must be set by the owner before init()
        hopTime(0.010),       // DEFAULT, overridden with -h
        fftTime(0.04644),     // DEFAULT, overridden with -f
        hopSize(0),
        fftSize(0),
        frameCount(0),
        frameRMS(0),
        ltAverage(0),
        freqMapSize(0),
        totalFrames(-1)       //!!! not needed?
    {
    } // constructor

protected:
    /** Allocates memory for arrays, based on parameter settings.
     *
     *  Derives hopSize and fftSize (rounded to the nearest power of two)
     *  from sampleRate, hopTime and fftTime, rebuilds the frequency map
     *  and resets all per-stream state.  sampleRate must be positive. */
    void init() {
        hopSize = static_cast<int>(lrint(sampleRate * hopTime));
        const long exponent = lrint(log(fftTime * sampleRate) / log(2));
        fftSize = static_cast<int>(lrint(pow(2.0, static_cast<double>(exponent))));
        makeFreqMap(fftSize, sampleRate);
        // prevFrame is indexed by FFT bin in processFrame(), so it needs
        // one slot per FFT bin, not one per mapped output bin.
        prevFrame.assign(fftSize / 2 + 1, 0.0);
        // frames is addressed relative to frameCount, so the two must be
        // reset together.
        frames.clear();
        frameCount = 0;
        frameRMS = 0;
        ltAverage = 0;
        spectralFlux.clear();
    } // init()

    /** Creates a map of FFT frequency bins to comparison bins.
     *  Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
     *  one to one. Where the spacing is greater than 0.5 semitones, the FFT
     *  energy is mapped into semitone-wide bins. No scaling is performed; that
     *  is the energy is summed into the comparison bins. See also
     *  processFrame()
     *
     *  @param fftSize    FFT frame size in samples; the map gets
     *                    fftSize/2 + 1 entries.
     *  @param sampleRate audio sample rate in Hz.
     *
     *  If either argument is not positive the map is emptied and
     *  freqMapSize is set to 0.
     */
    void makeFreqMap(int fftSize, float sampleRate) {
        if (fftSize <= 0 || !(sampleRate > 0)) {
            freqMap.clear();
            freqMapSize = 0;
            return;
        }
        const int lastBin = fftSize / 2;
        freqMap.assign(lastBin + 1, 0);
        const double binWidth = sampleRate / fftSize;
        // First FFT bin index at which neighbouring bins are closer than
        // half a semitone apart.
        const int crossoverBin = (int)(2 / (pow(2.0, 1/12.0) - 1));
        const int crossoverMidi = (int)lrint(log(crossoverBin*binWidth/440)/
                                             log(2) * 12 + 69);
        // A very short FFT may have fewer bins than the crossover point;
        // never write beyond the end of the map.
        const int linearEnd = (crossoverBin < lastBin) ? crossoverBin : lastBin;
        int i = 0;
        for (; i <= linearEnd; ++i) {
            freqMap[i] = i;
        }
        for (; i <= lastBin; ++i) {
            double midi = log(i*binWidth/440) / log(2) * 12 + 69;
            if (midi > 127)
                midi = 127;
            freqMap[i] = crossoverBin + (int)lrint(midi) - crossoverMidi;
        }
        freqMapSize = freqMap[lastBin] + 1;
    } // makeFreqMap()

    /** Processes one frequency-domain frame: computes the magnitude of
     *  each FFT bin, folds it into the part-linear part-logarithmic
     *  array given by freqMap, accumulates the spectral flux (sum of
     *  magnitude increases since the previous frame), then stores an
     *  (optionally normalised) log-magnitude copy of the frame.
     *
     *  @param inputBuffers inputBuffers[0] is read as fftSize/2 + 1
     *                      interleaved (real, imaginary) float pairs.
     *
     *  Does nothing if init() has not set up tables large enough for
     *  the current fftSize.
     */
    void processFrame(const float *const *inputBuffers) {
        const int bins = fftSize / 2 + 1;
        if ((int)freqMap.size() < bins || (int)prevFrame.size() < bins) {
            return; // not initialised for this fftSize
        }

        newFrame.assign(freqMapSize, 0.0);
        const float *const spectrum = inputBuffers[0];
        double flux = 0;
        for (int i = 0; i < bins; i++) {
            const double mag = sqrt(spectrum[i*2] * spectrum[i*2] +
                                    spectrum[i*2+1] * spectrum[i*2+1]);
            if (mag > prevFrame[i]) flux += mag - prevFrame[i];
            prevFrame[i] = mag;
            newFrame[freqMap[i]] += mag;
        }
        spectralFlux.push_back(flux);
        frames.push_back(newFrame);
        // All further processing is applied to the stored copy only;
        // newFrame keeps the raw mapped magnitudes.
        vector<double> &stored = frames.back();
        const int storedSize = (int)stored.size();
//        for (int i = 0; i < freqMapSize; i++)
//            [frameCount][i] = newFrame[i];
        /*
        int index = cbIndex - (fftSize - hopSize);
        if (index < 0)
            index += fftSize;
        int sz = (fftSize - hopSize) / energyOversampleFactor;
        for (int j = 0; j < energyOversampleFactor; j++) {
            double newEnergy = 0;
            for (int i = 0; i < sz; i++) {
                newEnergy += circBuffer[index] * circBuffer[index];
                if (++index == fftSize)
                    index = 0;
            }
            energy[frameCount * energyOversampleFactor + j] =
                    newEnergy / sz <= 1e-6? 0: log(newEnergy / sz) + 13.816;
        }*/

        // Weight of the running average: 0 for the first 100 frames,
        // ramping linearly to 1 over the next 100, then fixed at 0.99.
        const double decay = frameCount >= 200? 0.99:
            (frameCount < 100? 0: (frameCount - 100) / 100.0);

        //!!! uh-oh -- frameRMS has not been calculated (it came from
        //!!! time-domain signal) -- will always appear silent.
        // i.e. frameRMS is still the 0 assigned by the constructor/init(),
        // so unless silenceThreshold is negative every stored frame is
        // zeroed below.

        if (ltAverage == 0)
            ltAverage = frameRMS;
        else
            ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);

        if (frameRMS <= silenceThreshold) {
            for (int i = 0; i < storedSize; i++)
                stored[i] = 0;
        } else {
            if (normaliseMode == 1) {
                for (int i = 0; i < storedSize; i++)
                    stored[i] /= frameRMS;
            } else if (normaliseMode == 2) {
                for (int i = 0; i < storedSize; i++)
                    stored[i] /= ltAverage;
            }
            // Dynamic range compression: shifted log magnitude, floored at 0.
            for (int i = 0; i < storedSize; i++) {
                stored[i] = log(stored[i]) + rangeThreshold;
                if (stored[i] < 0)
                    stored[i] = 0;
            }
        }
//        weightedPhaseDeviation();
//        if (debug)
//            System.err.printf("PhaseDev:  t=%7.3f  phDev=%7.3f  RMS=%7.3f\n",
//                              frameCount * hopTime,
//                              phaseDeviation[frameCount],
//                              frameRMS);
        frameCount++;
    } // processFrame()

    /** Processes a complete file of audio data.
     *
     *  NOT YET PORTED.  The body below is the original Java, kept
     *  verbatim for reference and excluded from compilation because it
     *  is not valid C++; until it is translated this function does
     *  nothing.  The Java normalises spectralFlux, peak-picks it, and
     *  builds the onset list with salience = flux - minimum flux.
     */
    void processFile() {
#if 0
        /*
        while (pcmInputStream != null) {
            // Profile.start(0);
            processFrame();
            // Profile.log(0);
            if (Thread.currentThread().isInterrupted()) {
                System.err.println("info: INTERRUPTED in processFile()");
                return;
            }
        }
        */
//        double[] x1 = new double[phaseDeviation.length];
//        for (int i = 0; i < x1.length; i++) {
//            x1[i] = i * hopTime;
//            phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
//        }
//        double[] x2 = new double[energy.length];
//        for (int i = 0; i < x2.length; i++)
//            x2[i] = i * hopTime / energyOversampleFactor;
//        // plot.clear();
//        plot.addPlot(x1, phaseDeviation, Color.green, 7);
//        plot.addPlot(x2, energy, Color.red, 7);
//        plot.setTitle("Test phase deviation");
//        plot.fitAxes();

//        double[] slope = new double[energy.length];
//        double hop = hopTime / energyOversampleFactor;
//        Peaks.getSlope(energy, hop, 15, slope);
//        LinkedList<Integer> peaks = Peaks.findPeaks(slope, (int)lrint(0.06 / hop), 10);

        double hop = hopTime;
        Peaks.normalise(spectralFlux);
        LinkedList<Integer> peaks = Peaks.findPeaks(spectralFlux, (int)lrint(0.06 / hop), 0.35, 0.84, true);
        onsets = new double[peaks.size()];
        double[] y2 = new double[onsets.length];
        Iterator<Integer> it = peaks.iterator();
        onsetList = new EventList();
        double minSalience = Peaks.min(spectralFlux);
        for (int i = 0; i < onsets.length; i++) {
            int index = it.next();
            onsets[i] = index * hop;
            y2[i] = spectralFlux[index];
            Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
//            if (debug)
//                System.err.printf("Onset: %8.3f  %8.3f  %8.3f\n",
//                                  onsets[i], energy[index], slope[index]);
//            e.salience = slope[index];  // or combination of energy + slope??
            // Note that salience must be non-negative or the beat tracking system fails!
            e.salience = spectralFlux[index] - minSalience;
            onsetList.add(e);
        }
#endif

        //!!! This onsetList is then fed in to BeatTrackDisplay::beatTrack

    } // processFile()

}; // class BeatRootProcessor


#endif