beatroot-vamp: diff BeatRootProcessor.h @ 1:791398eaf639
* Some half-digested Java/C++ mishmash
author:   Chris Cannam
date:     Mon, 24 Jan 2011 16:44:27 +0000
parents:
children: 7d4e6b1ff3d1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BeatRootProcessor.h	Mon Jan 24 16:44:27 2011 +0000
@@ -0,0 +1,647 @@
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  Vamp feature extraction plugin for the BeatRoot beat tracker.

  Centre for Digital Music, Queen Mary, University of London.
  This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#ifndef _BEATROOT_PROCESSOR_H_
#define _BEATROOT_PROCESSOR_H_

class BeatRootProcessor
{
protected:
    /** Sample rate of audio */
    float sampleRate;

    /** Spacing of audio frames (determines the amount of overlap or
     * skip between frames). This value is expressed in
     * seconds. (Default = 0.010s) */
    double hopTime;

    /** The approximate size of an FFT frame in seconds. (Default =
     * 0.04644s). The value is adjusted so that <code>fftSize</code>
     * is always a power of 2. */
    double fftTime;

    /** Spacing of audio frames in samples (see <code>hopTime</code>) */
    int hopSize;

    /** The size of an FFT frame in samples (see <code>fftTime</code>) */
    int fftSize;

    /** The number of overlapping frames of audio data which have been read. */
    int frameCount;

    /** RMS amplitude of the current frame. */
    double frameRMS;

    /** Long term average frame energy (in frequency domain representation). */
    double ltAverage;

    /** Spectral flux onset detection function, indexed by frame. */
    vector<double> spectralFlux;

    /** A mapping function for mapping FFT bins to final frequency bins.
     * The mapping is linear (1-1) until the resolution reaches 2 points per
     * semitone, then logarithmic with a semitone resolution.  e.g. for
     * 44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
     * 21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
     * logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
     * 83), where all energy above note 127 is mapped into the final bin. */
    vector<int> freqMap;

    /** The number of entries in <code>freqMap</code>. Note that the length of
     * the array is greater, because its size is not known at creation time. */
    int freqMapSize;

    /** The magnitude spectrum of the most recent frame.  Used for
     * calculating the spectral flux. */
    vector<double> prevFrame;

    /** The magnitude spectrum of the current frame. */
    vector<double> newFrame;

    /** The magnitude spectra of all frames, used for plotting the spectrogram. */
    vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't

    /** The RMS energy of all frames. */
    vector<double> energy; //!!! unused in beat tracking?

    /** The estimated onset times from peak-picking the onset
     * detection function(s). */
    vector<double> onsets;

    /** The estimated onset times and their saliences. */
    //!!!EventList onsetList;
    vector<double> onsetList; //!!! corresponding to keyDown member of events in list

    /** Total number of audio frames if known, or -1 for live or compressed input. */
    int totalFrames;

    /** Flag for enabling or disabling debugging output */
    static bool debug = false;

    /** Flag for suppressing all standard output messages except results. */
    static bool silent = true;

    /** RMS frame energy below this value results in the frame being
     * set to zero, so that normalisation does not have undesired
     * side-effects. */
    static double silenceThreshold = 0.0004; //!!!??? energy of what? should not be static?

    /** For dynamic range compression, this value is added to the log
     * magnitude in each frequency bin and any remaining negative
     * values are then set to zero.
     */
    static double rangeThreshold = 10; //!!! sim

    /** Determines the method of normalisation. Values can be:<ul>
     * <li>0: no normalisation</li>
     * <li>1: normalisation by current frame energy</li>
     * <li>2: normalisation by exponential average of frame energy</li>
     * </ul>
     */
    static int normaliseMode = 2;

    /** Ratio between the rate of sampling the signal energy (for the
     * amplitude envelope) and the hop size. */
    static int energyOversampleFactor = 2; //!!! not used?
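    // A worked illustration of the freqMap scheme described above, assuming
    // 44.1kHz input and a 2048-point FFT (illustrative values only; nothing
    // in this class fixes them). Sketch, excluded from compilation:
#if 0
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        double sampleRate = 44100.0;
        int fftSize = 2048;
        double binWidth = sampleRate / fftSize;    // ~21.5 Hz per FFT bin

        // The linear region ends where bins become finer than half a
        // semitone, i.e. at bin 2 / (2^(1/12) - 1), about 33.
        int crossoverBin = (int)(2 / (std::pow(2.0, 1/12.0) - 1));
        int crossoverMidi = (int)std::floor(
            std::log(crossoverBin * binWidth / 440.0) / std::log(2.0) * 12 + 69 + 0.5);

        std::vector<int> freqMap(fftSize / 2 + 1);
        int i;
        for (i = 0; i <= crossoverBin; ++i)
            freqMap[i] = i;                        // 1-1 below the crossover
        for (; i <= fftSize / 2; ++i) {            // semitone-wide bins above it
            double midi = std::log(i * binWidth / 440.0) / std::log(2.0) * 12 + 69;
            if (midi > 127) midi = 127;            // energy above note 127 goes
            freqMap[i] = crossoverBin              //   into the final bin
                + (int)std::floor(midi + 0.5) - crossoverMidi;
        }
        std::printf("linear to bin %d (%.0f Hz); %d comparison bins in total\n",
                    crossoverBin, crossoverBin * binWidth, freqMap[fftSize/2] + 1);
        return 0;
    }
#endif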
public:

    /** Constructor: note that streams are not opened until the input
     * file is set (see <code>setInputFile()</code>). */
    BeatRootProcessor() {
        cbIndex = 0;
        frameRMS = 0;
        ltAverage = 0;
        frameCount = 0;
        hopSize = 0;
        fftSize = 0;
        hopTime = 0.010;    // DEFAULT, overridden with -h
        fftTime = 0.04644;  // DEFAULT, overridden with -f
        progressCallback = null;
        stdIn = new BufferedReader(new InputStreamReader(System.in));
        if (doOnsetPlot)
            plot = new Plot();
    } // constructor

    /** For debugging, outputs information about the AudioProcessor to
     * standard error. */
    public void print() {
        System.err.println(this);
    } // print()

    /** For interactive pause - wait for user to hit Enter */
    public String readLine() {
        try { return stdIn.readLine(); } catch (Exception e) { return null; }
    } // readLine()

    /** Gives some basic information about the audio being processed. */
    public String toString() {
        return "AudioProcessor\n" +
            String.format("\tFile: %s (%3.1f kHz, %1d channels)\n",
                          audioFileName, sampleRate/1000, channels) +
            String.format("\tHop / FFT sizes: %5.3f / %5.3f",
                          hopTime, hopTime * fftSize / hopSize);
    } // toString()

    /** Adds a link to the GUI component which shows the progress of matching.
     * @param c the AudioProcessor representing the other performance */
    public void setProgressCallback(ProgressIndicator c) {
        progressCallback = c;
    } // setProgressCallback()

    /** Sets up the streams and buffers for live audio input (CD quality).
     * If any Exception is thrown within this method, it is caught, any
     * opened streams are closed, and <code>pcmInputStream</code> is set to
     * <code>null</code>, indicating that the method did not complete
     * successfully. */
    public void setLiveInput() {
        try {
            channels = 2;
            sampleRate = 44100;
            AudioFormat desiredFormat = new AudioFormat(
                    AudioFormat.Encoding.PCM_SIGNED, sampleRate, 16,
                    channels, channels * 2, sampleRate, false);
            TargetDataLine tdl = AudioSystem.getTargetDataLine(desiredFormat);
            tdl.open(desiredFormat, liveInputBufferSize);
            pcmInputStream = new AudioInputStream(tdl);
            audioFormat = pcmInputStream.getFormat();
            init();
            tdl.start();
        } catch (Exception e) {
            e.printStackTrace();
            closeStreams(); // make sure it exits in a consistent state
        }
    } // setLiveInput()
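    // With the constructor defaults above and CD-rate input, the sizes that
    // init() below derives come out as hop 441 samples and FFT 2048 samples.
    // A minimal sketch of that computation (assuming 44.1kHz input; sketch
    // only, excluded from compilation):
#if 0
    #include <cmath>
    #include <cstdio>

    int main()
    {
        float sampleRate = 44100;
        double hopTime = 0.010;    // constructor default
        double fftTime = 0.04644;  // constructor default

        // Hop size is rounded directly; FFT size is rounded to the nearest
        // power of two in the log domain.
        int hopSize = (int)std::floor(sampleRate * hopTime + 0.5);              // 441
        int fftSize = (int)std::pow(2.0,
            std::floor(std::log(fftTime * sampleRate) / std::log(2.0) + 0.5));  // 2048

        std::printf("hop %d samples, fft %d samples, overlap %.1f%%\n",
                    hopSize, fftSize, 100.0 * (fftSize - hopSize) / fftSize);
        return 0;
    }
#endif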
    /** Sets up the streams and buffers for audio file input.
     * If any Exception is thrown within this method, it is caught, any
     * opened streams are closed, and <code>pcmInputStream</code> is set to
     * <code>null</code>, indicating that the method did not complete
     * successfully.
     * @param fileName The path name of the input audio file. */
    public void setInputFile(String fileName) {
        closeStreams(); // release previously allocated resources
        audioFileName = fileName;
        try {
            if (audioFileName == null)
                throw new Exception("No input file specified");
            File audioFile = new File(audioFileName);
            if (!audioFile.isFile())
                throw new FileNotFoundException(
                        "Requested file does not exist: " + audioFileName);
            rawInputStream = AudioSystem.getAudioInputStream(audioFile);
            audioFormat = rawInputStream.getFormat();
            channels = audioFormat.getChannels();
            sampleRate = audioFormat.getSampleRate();
            pcmInputStream = rawInputStream;
            if ((audioFormat.getEncoding() != AudioFormat.Encoding.PCM_SIGNED) ||
                    (audioFormat.getFrameSize() != channels * 2) ||
                    audioFormat.isBigEndian()) {
                AudioFormat desiredFormat = new AudioFormat(
                        AudioFormat.Encoding.PCM_SIGNED, sampleRate, 16,
                        channels, channels * 2, sampleRate, false);
                pcmInputStream = AudioSystem.getAudioInputStream(desiredFormat,
                                                                 rawInputStream);
                audioFormat = desiredFormat;
            }
            init();
        } catch (Exception e) {
            e.printStackTrace();
            closeStreams(); // make sure it exits in a consistent state
        }
    } // setInputFile()

    /** Allocates memory for arrays, based on parameter settings. */
    protected void init() {
        hopSize = (int) Math.round(sampleRate * hopTime);
        fftSize = (int) Math.round(Math.pow(2,
                Math.round(Math.log(fftTime * sampleRate) / Math.log(2))));
        makeFreqMap(fftSize, sampleRate);
        int buffSize = hopSize * channels * 2;
        if ((inputBuffer == null) || (inputBuffer.length != buffSize))
            inputBuffer = new byte[buffSize];
        if ((circBuffer == null) || (circBuffer.length != fftSize)) {
            circBuffer = new double[fftSize];
            reBuffer = new double[fftSize];
            imBuffer = new double[fftSize];
            prevPhase = new double[fftSize];
            prevPrevPhase = new double[fftSize];
            prevFrame = new double[fftSize];
            window = FFT.makeWindow(FFT.HAMMING, fftSize, fftSize);
            for (int i = 0; i < fftSize; i++)
                window[i] *= Math.sqrt(fftSize);
        }
        if (pcmInputStream == rawInputStream)
            totalFrames = (int) (pcmInputStream.getFrameLength() / hopSize);
        else
            totalFrames = (int) (MAX_LENGTH / hopTime);
        if ((newFrame == null) || (newFrame.length != freqMapSize)) {
            newFrame = new double[freqMapSize];
            frames = new double[totalFrames][freqMapSize];
        } else if (frames.length != totalFrames)
            frames = new double[totalFrames][freqMapSize];
        energy = new double[totalFrames * energyOversampleFactor];
        phaseDeviation = new double[totalFrames];
        spectralFlux = new double[totalFrames];
        frameCount = 0;
        cbIndex = 0;
        frameRMS = 0;
        ltAverage = 0;
        progressCallback = null;
    } // init()

    /** Closes the input stream(s) associated with this object. */
    public void closeStreams() {
        if (pcmInputStream != null) {
            try {
                pcmInputStream.close();
                if (pcmInputStream != rawInputStream)
                    rawInputStream.close();
                if (audioOut != null) {
                    audioOut.drain();
                    audioOut.close();
                }
            } catch (Exception e) {}
            pcmInputStream = null;
            audioOut = null;
        }
    } // closeStreams()
    /** Creates a map of FFT frequency bins to comparison bins.
     * Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
     * one to one. Where the spacing is greater than 0.5 semitones, the FFT
     * energy is mapped into semitone-wide bins. No scaling is performed; that
     * is, the energy is summed into the comparison bins. See also
     * processFrame(). */
    protected void makeFreqMap(int fftSize, float sampleRate) {
        freqMap = new int[fftSize/2+1];
        double binWidth = sampleRate / fftSize;
        int crossoverBin = (int)(2 / (Math.pow(2, 1/12.0) - 1));
        int crossoverMidi = (int)Math.round(Math.log(crossoverBin*binWidth/440)/
                                            Math.log(2) * 12 + 69);
        // freq = 440 * Math.pow(2, (midi-69)/12.0) / binWidth;
        int i = 0;
        while (i <= crossoverBin)
            freqMap[i++] = i;
        while (i <= fftSize/2) {
            double midi = Math.log(i*binWidth/440) / Math.log(2) * 12 + 69;
            if (midi > 127)
                midi = 127;
            freqMap[i++] = crossoverBin + (int)Math.round(midi) - crossoverMidi;
        }
        freqMapSize = freqMap[i-1] + 1;
    } // makeFreqMap()

    /** Calculates the weighted phase deviation onset detection function.
     * Not used.
     * TODO: Test the change to WPD fn */
    protected void weightedPhaseDeviation() {
        if (frameCount < 2)
            phaseDeviation[frameCount] = 0;
        else {
            for (int i = 0; i < fftSize; i++) {
                double pd = imBuffer[i] - 2 * prevPhase[i] + prevPrevPhase[i];
                double pd1 = Math.abs(Math.IEEEremainder(pd, 2 * Math.PI));
                phaseDeviation[frameCount] += pd1 * reBuffer[i];
                // System.err.printf("%7.3f %7.3f\n", pd/Math.PI, pd1/Math.PI);
            }
        }
        phaseDeviation[frameCount] /= fftSize * Math.PI;
        double[] tmp = prevPrevPhase;
        prevPrevPhase = prevPhase;
        prevPhase = imBuffer;
        imBuffer = tmp;
    } // weightedPhaseDeviation()
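    // The measure computed above, stated compactly: per bin, take the second
    // difference of phase (a deviation from constant frequency), wrap it into
    // (-pi, pi], weight by magnitude, and normalise. A sketch assuming
    // magnitude and phase arrays of equal length (excluded from compilation):
#if 0
    #include <cmath>
    #include <vector>

    double wpd(const std::vector<double> &mag,           // |X[n,k]|
               const std::vector<double> &phase,         // arg X[n,k]
               const std::vector<double> &prevPhase,     // arg X[n-1,k]
               const std::vector<double> &prevPrevPhase) // arg X[n-2,k]
    {
        const double pi = 3.141592653589793;
        double sum = 0;
        for (size_t k = 0; k < mag.size(); ++k) {
            double pd = phase[k] - 2 * prevPhase[k] + prevPrevPhase[k];
            pd = std::fabs(std::remainder(pd, 2 * pi)); // like Java's IEEEremainder
            sum += pd * mag[k];
        }
        return sum / (mag.size() * pi);
    }
#endif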
    /** Reads a frame of input data, averages the channels to mono, scales
     * to a maximum possible absolute value of 1, and stores the audio data
     * in a circular input buffer.
     * @return true if a frame (or part of a frame, if it is the final frame)
     * is read. If a complete frame cannot be read, the InputStream is set
     * to null. */
    public boolean getFrame() {
        if (pcmInputStream == null)
            return false;
        try {
            int bytesRead = (int) pcmInputStream.read(inputBuffer);
            if ((audioOut != null) && (bytesRead > 0))
                if (audioOut.write(inputBuffer, 0, bytesRead) != bytesRead)
                    System.err.println("Error writing to audio device");
            if (bytesRead < inputBuffer.length) {
                if (!silent)
                    System.err.println("End of input: " + audioFileName);
                closeStreams();
                return false;
            }
        } catch (IOException e) {
            e.printStackTrace();
            closeStreams();
            return false;
        }
        frameRMS = 0;
        double sample;
        switch (channels) {
        case 1:
            for (int i = 0; i < inputBuffer.length; i += 2) {
                sample = ((inputBuffer[i+1]<<8) |
                          (inputBuffer[i]&0xff)) / 32768.0;
                frameRMS += sample * sample;
                circBuffer[cbIndex++] = sample;
                if (cbIndex == fftSize)
                    cbIndex = 0;
            }
            break;
        case 2: // saves ~0.1% of RT (total input overhead ~0.4%) :)
            for (int i = 0; i < inputBuffer.length; i += 4) {
                sample = (((inputBuffer[i+1]<<8) | (inputBuffer[i]&0xff)) +
                          ((inputBuffer[i+3]<<8) | (inputBuffer[i+2]&0xff)))
                         / 65536.0;
                frameRMS += sample * sample;
                circBuffer[cbIndex++] = sample;
                if (cbIndex == fftSize)
                    cbIndex = 0;
            }
            break;
        default:
            for (int i = 0; i < inputBuffer.length; ) {
                sample = 0;
                for (int j = 0; j < channels; j++, i += 2)
                    sample += (inputBuffer[i+1]<<8) | (inputBuffer[i]&0xff);
                sample /= 32768.0 * channels;
                frameRMS += sample * sample;
                circBuffer[cbIndex++] = sample;
                if (cbIndex == fftSize)
                    cbIndex = 0;
            }
        }
        frameRMS = Math.sqrt(frameRMS / inputBuffer.length * 2 * channels);
        return true;
    } // getFrame()
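    // The byte-twiddling above in one place: each 16-bit little-endian PCM
    // sample is assembled from a signed high byte and a masked (unsigned)
    // low byte, then scaled to [-1, 1). A mono-only sketch of the decode and
    // RMS accumulation (excluded from compilation):
#if 0
    #include <cmath>
    #include <vector>

    double rmsOfFrame(const std::vector<signed char> &buf) // raw PCM bytes
    {
        double sumSq = 0;
        for (size_t i = 0; i + 1 < buf.size(); i += 2) {
            // (high << 8) keeps the sign; (low & 0xff) must not sign-extend.
            double sample = ((buf[i+1] << 8) | (buf[i] & 0xff)) / 32768.0;
            sumSq += sample * sample;
        }
        return std::sqrt(sumSq / (buf.size() / 2));  // mean over sample count
    }
#endif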
    /** Processes a frame of audio data by first computing the STFT with a
     * Hamming window, then mapping the frequency bins into a part-linear,
     * part-logarithmic array, then computing the spectral flux, and then
     * (optionally) normalising and calculating onsets. */
    protected void processFrame() {
        if (getFrame()) {
            for (int i = 0; i < fftSize; i++) {
                reBuffer[i] = window[i] * circBuffer[cbIndex];
                if (++cbIndex == fftSize)
                    cbIndex = 0;
            }
            Arrays.fill(imBuffer, 0);
            FFT.magnitudePhaseFFT(reBuffer, imBuffer);
            Arrays.fill(newFrame, 0);
            double flux = 0;
            for (int i = 0; i <= fftSize/2; i++) {
                if (reBuffer[i] > prevFrame[i])
                    flux += reBuffer[i] - prevFrame[i];
                newFrame[freqMap[i]] += reBuffer[i];
            }
            spectralFlux[frameCount] = flux;
            for (int i = 0; i < freqMapSize; i++)
                frames[frameCount][i] = newFrame[i];
            int index = cbIndex - (fftSize - hopSize);
            if (index < 0)
                index += fftSize;
            int sz = (fftSize - hopSize) / energyOversampleFactor;
            for (int j = 0; j < energyOversampleFactor; j++) {
                double newEnergy = 0;
                for (int i = 0; i < sz; i++) {
                    newEnergy += circBuffer[index] * circBuffer[index];
                    if (++index == fftSize)
                        index = 0;
                }
                energy[frameCount * energyOversampleFactor + j] =
                    newEnergy / sz <= 1e-6 ? 0 :
                    Math.log(newEnergy / sz) + 13.816; // 13.816 = -ln(1e-6)
            }
            double decay = frameCount >= 200 ? 0.99 :
                           (frameCount < 100 ? 0 : (frameCount - 100) / 100.0);
            if (ltAverage == 0)
                ltAverage = frameRMS;
            else
                ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
            if (frameRMS <= silenceThreshold)
                for (int i = 0; i < freqMapSize; i++)
                    frames[frameCount][i] = 0;
            else {
                if (normaliseMode == 1)
                    for (int i = 0; i < freqMapSize; i++)
                        frames[frameCount][i] /= frameRMS;
                else if (normaliseMode == 2)
                    for (int i = 0; i < freqMapSize; i++)
                        frames[frameCount][i] /= ltAverage;
                for (int i = 0; i < freqMapSize; i++) {
                    frames[frameCount][i] = Math.log(frames[frameCount][i]) + rangeThreshold;
                    if (frames[frameCount][i] < 0)
                        frames[frameCount][i] = 0;
                }
            }
//          weightedPhaseDeviation();
//          if (debug)
//              System.err.printf("PhaseDev: t=%7.3f phDev=%7.3f RMS=%7.3f\n",
//                                frameCount * hopTime,
//                                phaseDeviation[frameCount],
//                                frameRMS);
            double[] tmp = prevFrame;
            prevFrame = reBuffer;
            reBuffer = tmp;
            frameCount++;
            if ((frameCount % 100) == 0) {
                if (!silent) {
                    System.err.printf("Progress: %1d %5.3f %5.3f\n",
                                      frameCount, frameRMS, ltAverage);
                    Profile.report();
                }
                if ((progressCallback != null) && (totalFrames > 0))
                    progressCallback.setFraction((double)frameCount/totalFrames);
            }
        }
    } // processFrame()
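    // The onset detection function used above, in isolation: half-wave
    // rectified spectral flux, summing only the bins whose magnitude rose
    // since the previous frame. Sketch (excluded from compilation):
#if 0
    #include <vector>

    double spectralFluxOf(const std::vector<double> &mag,
                          const std::vector<double> &prevMag)
    {
        double flux = 0;
        for (size_t k = 0; k < mag.size(); ++k)
            if (mag[k] > prevMag[k])
                flux += mag[k] - prevMag[k];  // rises count, falls do not
        return flux;
    }
#endif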
    /** Processes a complete file of audio data. */
    public void processFile() {
        while (pcmInputStream != null) {
            // Profile.start(0);
            processFrame();
            // Profile.log(0);
            if (Thread.currentThread().isInterrupted()) {
                System.err.println("info: INTERRUPTED in processFile()");
                return;
            }
        }

//      double[] x1 = new double[phaseDeviation.length];
//      for (int i = 0; i < x1.length; i++) {
//          x1[i] = i * hopTime;
//          phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
//      }
//      double[] x2 = new double[energy.length];
//      for (int i = 0; i < x2.length; i++)
//          x2[i] = i * hopTime / energyOversampleFactor;
//      // plot.clear();
//      plot.addPlot(x1, phaseDeviation, Color.green, 7);
//      plot.addPlot(x2, energy, Color.red, 7);
//      plot.setTitle("Test phase deviation");
//      plot.fitAxes();

//      double[] slope = new double[energy.length];
//      double hop = hopTime / energyOversampleFactor;
//      Peaks.getSlope(energy, hop, 15, slope);
//      LinkedList<Integer> peaks = Peaks.findPeaks(slope, (int)Math.round(0.06 / hop), 10);

        double hop = hopTime;
        Peaks.normalise(spectralFlux);
        LinkedList<Integer> peaks = Peaks.findPeaks(spectralFlux,
                (int)Math.round(0.06 / hop), 0.35, 0.84, true);
        onsets = new double[peaks.size()];
        double[] y2 = new double[onsets.length];
        Iterator<Integer> it = peaks.iterator();
        onsetList = new EventList();
        double minSalience = Peaks.min(spectralFlux);
        for (int i = 0; i < onsets.length; i++) {
            int index = it.next();
            onsets[i] = index * hop;
            y2[i] = spectralFlux[index];
            Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
//          if (debug)
//              System.err.printf("Onset: %8.3f %8.3f %8.3f\n",
//                                onsets[i], energy[index], slope[index]);
//          e.salience = slope[index]; // or combination of energy + slope??
            // Note that salience must be non-negative or the beat tracking
            // system fails!
            e.salience = spectralFlux[index] - minSalience;
            onsetList.add(e);
        }
        if (progressCallback != null)
            progressCallback.setFraction(1.0);
        if (doOnsetPlot) {
            double[] x1 = new double[spectralFlux.length];
            for (int i = 0; i < x1.length; i++)
                x1[i] = i * hopTime;
            plot.addPlot(x1, spectralFlux, Color.red, 4);
            plot.addPlot(onsets, y2, Color.green, 3);
            plot.setTitle("Spectral flux and onsets");
            plot.fitAxes();
        }
        if (debug) {
            System.err.printf("Onsets: %d\nContinue? ", onsets.length);
            readLine();
        }
    } // processFile()

    /** Reads a text file containing a list of whitespace-separated feature values.
     * Created for a paper submitted to ICASSP'07.
     * @param fileName File containing the data
     * @return An array containing the feature values */
    public static double[] getFeatures(String fileName) {
        ArrayList<Double> l = new ArrayList<Double>();
        try {
            BufferedReader b = new BufferedReader(new FileReader(fileName));
            while (true) {
                String s = b.readLine();
                if (s == null)
                    break;
                int start = 0;
                while (start < s.length()) {
                    int len = s.substring(start).indexOf(' ');
                    String t = null;
                    if (len < 0)
                        t = s.substring(start);
                    else if (len > 0)
                        t = s.substring(start, start + len);
                    if (t != null)
                        try {
                            l.add(Double.parseDouble(t));
                        } catch (NumberFormatException e) {
                            System.err.println(e);
                            if (l.size() == 0)
                                l.add(new Double(0));
                            else
                                l.add(new Double(l.get(l.size()-1)));
                        }
                    start += len + 1;
                    if (len < 0)
                        break;
                }
            }
            double[] features = new double[l.size()];
            Iterator<Double> it = l.iterator();
            for (int i = 0; it.hasNext(); i++)
                features[i] = it.next().doubleValue();
            return features;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            return null;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } catch (NumberFormatException e) {
            e.printStackTrace();
            return null;
        }
    } // getFeatures()

    /** Reads a file of feature values, treated as an onset detection function,
     * and finds peaks, which are stored in <code>onsetList</code> and
     * <code>onsets</code>.
     * @param fileName The file of feature values
     * @param hopTime The spacing of feature values in time */
    public void processFeatures(String fileName, double hopTime) {
        double hop = hopTime;
        double[] features = getFeatures(fileName);
        Peaks.normalise(features);
        LinkedList<Integer> peaks = Peaks.findPeaks(features,
                (int)Math.round(0.06 / hop), 0.35, 0.84, true);
        onsets = new double[peaks.size()];
        double[] y2 = new double[onsets.length];
        Iterator<Integer> it = peaks.iterator();
        onsetList = new EventList();
        double minSalience = Peaks.min(features);
        for (int i = 0; i < onsets.length; i++) {
            int index = it.next();
            onsets[i] = index * hop;
            y2[i] = features[index];
            Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
            e.salience = features[index] - minSalience;
            onsetList.add(e);
        }
    } // processFeatures()
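    // Peaks.findPeaks() lives outside this file; the 0.35 and 0.84 arguments
    // look like thresholds applied to the normalised function, but only the
    // local-maximum idea is visible here. As a loose sketch of that idea, an
    // onset is kept where the detection function is a strict local maximum
    // within +/- width frames, width being about 60ms / hopTime. Excluded
    // from compilation:
#if 0
    #include <vector>

    std::vector<int> localMaxima(const std::vector<double> &odf, int width)
    {
        std::vector<int> peaks;
        for (int i = 0; i < (int)odf.size(); ++i) {
            bool best = true;
            for (int j = i - width; j <= i + width && best; ++j)
                if (j != i && j >= 0 && j < (int)odf.size() && odf[j] >= odf[i])
                    best = false;
            if (best) peaks.push_back(i);  // onset time = i * hopTime
        }
        return peaks;
    }
#endif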
    /** Copies output of audio processing to the display panel. */
    public void setDisplay(BeatTrackDisplay btd) {
        int energy2[] = new int[totalFrames*energyOversampleFactor];
        double time[] = new double[totalFrames*energyOversampleFactor];
        for (int i = 0; i < totalFrames*energyOversampleFactor; i++) {
            energy2[i] = (int) (energy[i] * 4 * energyOversampleFactor);
            time[i] = i * hopTime / energyOversampleFactor;
        }
        btd.setMagnitudes(energy2);
        btd.setEnvTimes(time);
        btd.setSpectro(frames, totalFrames, hopTime, 0); //fftTime/hopTime);
        btd.setOnsets(onsets);
        btd.setOnsetList(onsetList);
    } // setDisplay()

}; // class BeatRootProcessor

#endif