To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / BeatRootProcessor.h @ 2:7d4e6b1ff3d1

History | View | Annotate | Download (18.6 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
/*
4
    Vamp feature extraction plugin for the BeatRoot beat tracker.
5

6
    Centre for Digital Music, Queen Mary, University of London.
7
    This file copyright 2011 Simon Dixon, Chris Cannam and QMUL.
8
    
9
    This program is free software; you can redistribute it and/or
10
    modify it under the terms of the GNU General Public License as
11
    published by the Free Software Foundation; either version 2 of the
12
    License, or (at your option) any later version.  See the file
13
    COPYING included with this distribution for more information.
14
*/
15

    
16
#ifndef _BEATROOT_PROCESSOR_H_
17
#define _BEATROOT_PROCESSOR_H_
18

    
19
#include <cmath>
#include <vector>
20

    
21
using std::vector;
22

    
23
class BeatRootProcessor
24
{
25
protected:
26
    /** Sample rate of audio */
27
    float sampleRate;
28
        
29
    /** Spacing of audio frames (determines the amount of overlap or
30
     *  skip between frames). This value is expressed in
31
     *  seconds. (Default = 0.020s) */
32
    double hopTime;
33

    
34
    /** The approximate size of an FFT frame in seconds. (Default =
35
     *  0.04644s).  The value is adjusted so that <code>fftSize</code>
36
     *  is always power of 2. */
37
    double fftTime;
38

    
39
    /** Spacing of audio frames in samples (see <code>hopTime</code>) */
40
    int hopSize;
41

    
42
    /** The size of an FFT frame in samples (see <code>fftTime</code>) */
43
    int fftSize;
44

    
45
    /** The number of overlapping frames of audio data which have been read. */
46
    int frameCount;
47

    
48
    /** RMS amplitude of the current frame. */
49
    double frameRMS;
50

    
51
    /** Long term average frame energy (in frequency domain representation). */
52
    double ltAverage;
53

    
54
    /** Spectral flux onset detection function, indexed by frame. */
55
    vector<int> spectralFlux;
56
        
57
    /** A mapping function for mapping FFT bins to final frequency bins.
58
     *  The mapping is linear (1-1) until the resolution reaches 2 points per
59
     *  semitone, then logarithmic with a semitone resolution.  e.g. for
60
     *  44.1kHz sampling rate and fftSize of 2048 (46ms), bin spacing is
61
     *  21.5Hz, which is mapped linearly for bins 0-34 (0 to 732Hz), and
62
     *  logarithmically for the remaining bins (midi notes 79 to 127, bins 35 to
63
     *  83), where all energy above note 127 is mapped into the final bin. */
64
    vector<int> freqMap;
65

    
66
    /** The number of entries in <code>freqMap</code>. Note that the length of
67
     *  the array is greater, because its size is not known at creation time. */
68
    int freqMapSize;
69

    
70
    /** The magnitude spectrum of the most recent frame.  Used for
71
     *  calculating the spectral flux. */
72
    vector<double> prevFrame;
73
        
74
    /** The magnitude spectrum of the current frame. */
75
    vector<double> newFrame;
76

    
77
    /** The magnitude spectra of all frames, used for plotting the spectrogram. */
78
    vector<vector<double> > frames; //!!! do we need this? much cheaper to lose it if we don't
79
        
80
    /** The RMS energy of all frames. */
81
    vector<double> energy; //!!! unused in beat tracking?
82
        
83
    /** The estimated onset times from peak-picking the onset
84
     * detection function(s). */
85
    vector<double> onsets;
86
        
87
    /** The estimated onset times and their saliences. */        
88
    //!!!EventList onsetList;
89
    vector<double> onsetList; //!!! corresponding to keyDown member of events in list
90

    
91
    /** Total number of audio frames if known, or -1 for live or compressed input. */
92
    int totalFrames;
93
        
94
    /** Flag for enabling or disabling debugging output */
95
    static bool debug;
96
        
97
    /** Flag for suppressing all standard output messages except results. */
98
    static bool silent;
99
        
100
    /** RMS frame energy below this value results in the frame being
101
     *  set to zero, so that normalisation does not have undesired
102
     *  side-effects. */
103
    static double silenceThreshold; //!!!??? energy of what? should not be static?
104
        
105
    /** For dynamic range compression, this value is added to the log
106
     *  magnitude in each frequency bin and any remaining negative
107
     *  values are then set to zero.
108
     */
109
    static double rangeThreshold; //!!! sim
110
        
111
    /** Determines method of normalisation. Values can be:<ul>
112
     *  <li>0: no normalisation</li>
113
     *  <li>1: normalisation by current frame energy</li>
114
     *  <li>2: normalisation by exponential average of frame energy</li>
115
     *  </ul>
116
     */
117
    static int normaliseMode;
118
        
119
    /** Ratio between rate of sampling the signal energy (for the
120
     * amplitude envelope) and the hop size */
121
    static int energyOversampleFactor; //!!! not used?
122
        
123
public:
124

    
125
    /** Constructor: note that streams are not opened until the input
126
     *  file is set (see <code>setInputFile()</code>). */
127
    BeatRootProcessor() {
128
        cbIndex = 0;
129
        frameRMS = 0;
130
        ltAverage = 0;
131
        frameCount = 0;
132
        hopSize = 0;
133
        fftSize = 0;
134
        hopTime = 0.010;        // DEFAULT, overridden with -h
135
        fftTime = 0.04644;        // DEFAULT, overridden with -f
136
    } // constructor
137

    
138
protected:
139
        /** Allocates memory for arrays, based on parameter settings */
140
        void init() {
141
                hopSize = (int) Math.round(sampleRate * hopTime);
142
                fftSize = (int) Math.round(Math.pow(2,
143
                                Math.round( Math.log(fftTime * sampleRate) / Math.log(2))));
144
                makeFreqMap(fftSize, sampleRate);
145
                int buffSize = hopSize * channels * 2;
146
                if ((inputBuffer == null) || (inputBuffer.length != buffSize))
147
                        inputBuffer = new byte[buffSize];
148
                if ((circBuffer == null) || (circBuffer.length != fftSize)) {
149
                        circBuffer = new double[fftSize];
150
                        reBuffer = new double[fftSize];
151
                        imBuffer = new double[fftSize];
152
                        prevPhase = new double[fftSize];
153
                        prevPrevPhase = new double[fftSize];
154
                        prevFrame = new double[fftSize];
155
                        window = FFT.makeWindow(FFT.HAMMING, fftSize, fftSize);
156
                        for (int i=0; i < fftSize; i++)
157
                                window[i] *= Math.sqrt(fftSize);
158
                }
159
                if (pcmInputStream == rawInputStream)
160
                        totalFrames = (int)(pcmInputStream.getFrameLength() / hopSize);
161
                else
162
                        totalFrames = (int) (MAX_LENGTH / hopTime);
163
                if ((newFrame == null) || (newFrame.length != freqMapSize)) {
164
                        newFrame = new double[freqMapSize];
165
                        frames = new double[totalFrames][freqMapSize];
166
                } else if (frames.length != totalFrames)
167
                        frames = new double[totalFrames][freqMapSize];
168
                energy = new double[totalFrames*energyOversampleFactor];
169
                phaseDeviation = new double[totalFrames];
170
                spectralFlux = new double[totalFrames];
171
                frameCount = 0;
172
                cbIndex = 0;
173
                frameRMS = 0;
174
                ltAverage = 0;
175
        } // init()
176

    
177
        /** Closes the input stream(s) associated with this object. */
178
        void closeStreams() {
179
                if (pcmInputStream != null) {
180
                        try {
181
                                pcmInputStream.close();
182
                                if (pcmInputStream != rawInputStream)
183
                                        rawInputStream.close();
184
                                if (audioOut != null) {
185
                                        audioOut.drain();
186
                                        audioOut.close();
187
                                }
188
                        } catch (Exception e) {}
189
                        pcmInputStream = null;
190
                        audioOut = null;
191
                }
192
        } // closeStreams()
193

    
194
        /** Creates a map of FFT frequency bins to comparison bins.
195
         *  Where the spacing of FFT bins is less than 0.5 semitones, the mapping is
196
         *  one to one. Where the spacing is greater than 0.5 semitones, the FFT
197
         *  energy is mapped into semitone-wide bins. No scaling is performed; that
198
         *  is the energy is summed into the comparison bins. See also
199
         *  processFrame()
200
         */
201
        void makeFreqMap(int fftSize, float sampleRate) {
202
                freqMap = new int[fftSize/2+1];
203
                double binWidth = sampleRate / fftSize;
204
                int crossoverBin = (int)(2 / (Math.pow(2, 1/12.0) - 1));
205
                int crossoverMidi = (int)Math.round(Math.log(crossoverBin*binWidth/440)/
206
                                                                                                                Math.log(2) * 12 + 69);
207
                // freq = 440 * Math.pow(2, (midi-69)/12.0) / binWidth;
208
                int i = 0;
209
                while (i <= crossoverBin)
210
                        freqMap[i++] = i;
211
                while (i <= fftSize/2) {
212
                        double midi = Math.log(i*binWidth/440) / Math.log(2) * 12 + 69;
213
                        if (midi > 127)
214
                                midi = 127;
215
                        freqMap[i++] = crossoverBin + (int)Math.round(midi) - crossoverMidi;
216
                }
217
                freqMapSize = freqMap[i-1] + 1;
218
        } // makeFreqMap()
219

    
220
        /** Calculates the weighted phase deviation onset detection function.
221
         *  Not used.
222
         *  TODO: Test the change to WPD fn */
223
        void weightedPhaseDeviation() {
224
                if (frameCount < 2)
225
                        phaseDeviation[frameCount] = 0;
226
                else {
227
                        for (int i = 0; i < fftSize; i++) {
228
                                double pd = imBuffer[i] - 2 * prevPhase[i] + prevPrevPhase[i];
229
                                double pd1 = Math.abs(Math.IEEEremainder(pd, 2 * Math.PI));
230
                                phaseDeviation[frameCount] += pd1 * reBuffer[i];
231
                                // System.err.printf("%7.3f   %7.3f\n", pd/Math.PI, pd1/Math.PI);
232
                        }
233
                }
234
                phaseDeviation[frameCount] /= fftSize * Math.PI;
235
                double[] tmp = prevPrevPhase;
236
                prevPrevPhase = prevPhase;
237
                prevPhase = imBuffer;
238
                imBuffer = tmp;
239
        } // weightedPhaseDeviation()
240

    
241
        /** Reads a frame of input data, averages the channels to mono, scales
242
         *  to a maximum possible absolute value of 1, and stores the audio data
243
         *  in a circular input buffer.
244
         *  @return true if a frame (or part of a frame, if it is the final frame)
245
         *  is read. If a complete frame cannot be read, the InputStream is set
246
         *  to null.
247
         */
248
        bool getFrame() {
249
                if (pcmInputStream == null)
250
                        return false;
251
                try {
252
                        int bytesRead = (int) pcmInputStream.read(inputBuffer);
253
                        if ((audioOut != null) && (bytesRead > 0))
254
                                if (audioOut.write(inputBuffer, 0, bytesRead) != bytesRead)
255
                                        System.err.println("Error writing to audio device");
256
                        if (bytesRead < inputBuffer.length) {
257
                                if (!silent)
258
                                        System.err.println("End of input: " + audioFileName);
259
                                closeStreams();
260
                                return false;
261
                        }
262
                } catch (IOException e) {
263
                        e.printStackTrace();
264
                        closeStreams();
265
                        return false;
266
                }
267
                frameRMS = 0;
268
                double sample;
269
                switch(channels) {
270
                        case 1:
271
                                for (int i = 0; i < inputBuffer.length; i += 2) {
272
                                        sample = ((inputBuffer[i+1]<<8) |
273
                                                          (inputBuffer[i]&0xff)) / 32768.0;
274
                                        frameRMS += sample * sample;
275
                                        circBuffer[cbIndex++] = sample;
276
                                        if (cbIndex == fftSize)
277
                                                cbIndex = 0;
278
                                }
279
                                break;
280
                        case 2: // saves ~0.1% of RT (total input overhead ~0.4%) :)
281
                                for (int i = 0; i < inputBuffer.length; i += 4) {
282
                                        sample = (((inputBuffer[i+1]<<8) | (inputBuffer[i]&0xff)) +
283
                                                          ((inputBuffer[i+3]<<8) | (inputBuffer[i+2]&0xff)))
284
                                                                / 65536.0;
285
                                        frameRMS += sample * sample;
286
                                        circBuffer[cbIndex++] = sample;
287
                                        if (cbIndex == fftSize)
288
                                                cbIndex = 0;
289
                                }
290
                                break;
291
                        default:
292
                                for (int i = 0; i < inputBuffer.length; ) {
293
                                        sample = 0;
294
                                        for (int j = 0; j < channels; j++, i+=2)
295
                                                sample += (inputBuffer[i+1]<<8) | (inputBuffer[i]&0xff);
296
                                        sample /= 32768.0 * channels;
297
                                        frameRMS += sample * sample;
298
                                        circBuffer[cbIndex++] = sample;
299
                                        if (cbIndex == fftSize)
300
                                                cbIndex = 0;
301
                                }
302
                }
303
                frameRMS = Math.sqrt(frameRMS / inputBuffer.length * 2 * channels);
304
                return true;
305
        } // getFrame()
306

    
307
        /** Processes a frame of audio data by first computing the STFT with a
308
         *  Hamming window, then mapping the frequency bins into a part-linear
309
         *  part-logarithmic array, then computing the spectral flux 
310
         *  then (optionally) normalising and calculating onsets.
311
         */
312
        void processFrame() {
313
                if (getFrame()) {
314
                        for (int i = 0; i < fftSize; i++) {
315
                                reBuffer[i] = window[i] * circBuffer[cbIndex];
316
                                if (++cbIndex == fftSize)
317
                                        cbIndex = 0;
318
                        }
319
                        Arrays.fill(imBuffer, 0);
320
                        FFT.magnitudePhaseFFT(reBuffer, imBuffer);
321
                        Arrays.fill(newFrame, 0);
322
                        double flux = 0;
323
                        for (int i = 0; i <= fftSize/2; i++) {
324
                                if (reBuffer[i] > prevFrame[i])
325
                                        flux += reBuffer[i] - prevFrame[i];
326
                                newFrame[freqMap[i]] += reBuffer[i];
327
                        }
328
                        spectralFlux[frameCount] = flux;
329
                        for (int i = 0; i < freqMapSize; i++)
330
                                frames[frameCount][i] = newFrame[i];
331
                        int index = cbIndex - (fftSize - hopSize);
332
                        if (index < 0)
333
                                index += fftSize;
334
                        int sz = (fftSize - hopSize) / energyOversampleFactor;
335
                        for (int j = 0; j < energyOversampleFactor; j++) {
336
                                double newEnergy = 0;
337
                                for (int i = 0; i < sz; i++) {
338
                                        newEnergy += circBuffer[index] * circBuffer[index];
339
                                        if (++index == fftSize)
340
                                                index = 0;
341
                                }
342
                                energy[frameCount * energyOversampleFactor + j] =
343
                                                newEnergy / sz <= 1e-6? 0: Math.log(newEnergy / sz) + 13.816;
344
                        }
345
                        double decay = frameCount >= 200? 0.99:
346
                                                (frameCount < 100? 0: (frameCount - 100) / 100.0);
347
                        if (ltAverage == 0)
348
                                ltAverage = frameRMS;
349
                        else
350
                                ltAverage = ltAverage * decay + frameRMS * (1.0 - decay);
351
                        if (frameRMS <= silenceThreshold)
352
                                for (int i = 0; i < freqMapSize; i++)
353
                                        frames[frameCount][i] = 0;
354
                        else {
355
                                if (normaliseMode == 1)
356
                                        for (int i = 0; i < freqMapSize; i++)
357
                                                frames[frameCount][i] /= frameRMS;
358
                                else if (normaliseMode == 2)
359
                                        for (int i = 0; i < freqMapSize; i++)
360
                                                frames[frameCount][i] /= ltAverage;
361
                                for (int i = 0; i < freqMapSize; i++) {
362
                                        frames[frameCount][i] = Math.log(frames[frameCount][i]) + rangeThreshold;
363
                                        if (frames[frameCount][i] < 0)
364
                                                frames[frameCount][i] = 0;
365
                                }
366
                        }
367
//                        weightedPhaseDeviation();
368
//                        if (debug)
369
//                                System.err.printf("PhaseDev:  t=%7.3f  phDev=%7.3f  RMS=%7.3f\n",
370
//                                                frameCount * hopTime,
371
//                                                phaseDeviation[frameCount],
372
//                                                frameRMS);
373
                        double[] tmp = prevFrame;
374
                        prevFrame = reBuffer;
375
                        reBuffer = tmp;
376
                        frameCount++;
377
                        if ((frameCount % 100) == 0) {
378
                                if (!silent) {
379
                                        System.err.printf("Progress: %1d %5.3f %5.3f\n", 
380
                                                        frameCount, frameRMS, ltAverage);
381
                                        Profile.report();
382
                                }
383
                                if ((progressCallback != null) && (totalFrames > 0))
384
                                        progressCallback.setFraction((double)frameCount/totalFrames);
385
                        }
386
                }
387
        } // processFrame()
388

    
389
        /** Processes a complete file of audio data. */
390
        void processFile() {
391
                while (pcmInputStream != null) {
392
                        // Profile.start(0);
393
                        processFrame();
394
                        // Profile.log(0);
395
                        if (Thread.currentThread().isInterrupted()) {
396
                                System.err.println("info: INTERRUPTED in processFile()");
397
                                return;
398
                        }
399
                }
400

    
401
//                double[] x1 = new double[phaseDeviation.length];
402
//                for (int i = 0; i < x1.length; i++) {
403
//                        x1[i] = i * hopTime;
404
//                        phaseDeviation[i] = (phaseDeviation[i] - 0.4) * 100;
405
//                }
406
//                double[] x2 = new double[energy.length];
407
//                for (int i = 0; i < x2.length; i++)
408
//                        x2[i] = i * hopTime / energyOversampleFactor;
409
//                // plot.clear();
410
//                plot.addPlot(x1, phaseDeviation, Color.green, 7);
411
//                plot.addPlot(x2, energy, Color.red, 7);
412
//                plot.setTitle("Test phase deviation");
413
//                plot.fitAxes();
414

    
415
//                double[] slope = new double[energy.length];
416
//                double hop = hopTime / energyOversampleFactor;
417
//                Peaks.getSlope(energy, hop, 15, slope);
418
//                LinkedList<Integer> peaks = Peaks.findPeaks(slope, (int)Math.round(0.06 / hop), 10);
419
                
420
                double hop = hopTime;
421
                Peaks.normalise(spectralFlux);
422
                LinkedList<Integer> peaks = Peaks.findPeaks(spectralFlux, (int)Math.round(0.06 / hop), 0.35, 0.84, true);
423
                onsets = new double[peaks.size()];
424
                double[] y2 = new double[onsets.length];
425
                Iterator<Integer> it = peaks.iterator();
426
                onsetList = new EventList();
427
                double minSalience = Peaks.min(spectralFlux);
428
                for (int i = 0; i < onsets.length; i++) {
429
                        int index = it.next();
430
                        onsets[i] = index * hop;
431
                        y2[i] = spectralFlux[index];
432
                        Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
433
//                        if (debug)
434
//                                System.err.printf("Onset: %8.3f  %8.3f  %8.3f\n",
435
//                                                onsets[i], energy[index], slope[index]);
436
//                        e.salience = slope[index];        // or combination of energy + slope??
437
                        // Note that salience must be non-negative or the beat tracking system fails!
438
                        e.salience = spectralFlux[index] - minSalience;
439
                        onsetList.add(e);
440
                }
441
                if (progressCallback != null)
442
                        progressCallback.setFraction(1.0);
443
                if (doOnsetPlot) {
444
                        double[] x1 = new double[spectralFlux.length];
445
                        for (int i = 0; i < x1.length; i++)
446
                                x1[i] = i * hopTime;
447
                        plot.addPlot(x1, spectralFlux, Color.red, 4);
448
                        plot.addPlot(onsets, y2, Color.green, 3);
449
                        plot.setTitle("Spectral flux and onsets");
450
                        plot.fitAxes();
451
                }
452
                if (debug) {
453
                        System.err.printf("Onsets: %d\nContinue? ", onsets.length);
454
                        readLine();
455
                }
456
        } // processFile()
457

    
458
        /** Reads a text file containing a list of whitespace-separated feature values.
459
         *  Created for paper submitted to ICASSP'07.
460
         *  @param fileName File containing the data
461
         *  @return An array containing the feature values
462
         */
463
        static double[] getFeatures(String fileName) {
464
                ArrayList<Double> l = new ArrayList<Double>();
465
                try {
466
                        BufferedReader b = new BufferedReader(new FileReader(fileName));
467
                        while (true) {
468
                                String s = b.readLine();
469
                                if (s == null)
470
                                        break;
471
                                int start = 0;
472
                                while (start < s.length()) {
473
                                        int len = s.substring(start).indexOf(' ');
474
                                        String t = null;
475
                                        if (len < 0)
476
                                                t = s.substring(start);
477
                                        else if (len > 0) {
478
                                                t = s.substring(start, start + len);
479
                                        }
480
                                        if (t != null)
481
                                                try {
482
                                                        l.add(Double.parseDouble(t));
483
                                                } catch (NumberFormatException e) {
484
                                                        System.err.println(e);
485
                                                        if (l.size() == 0)
486
                                                                l.add(new Double(0));
487
                                                        else
488
                                                                l.add(new Double(l.get(l.size()-1)));
489
                                                }
490
                                        start += len + 1;
491
                                        if (len < 0)
492
                                                break;
493
                                }
494
                        }
495
                        double[] features = new double[l.size()];
496
                        Iterator<Double> it = l.iterator();
497
                        for (int i = 0; it.hasNext(); i++)
498
                                features[i] = it.next().doubleValue();
499
                        return features;
500
                } catch (FileNotFoundException e) {
501
                        e.printStackTrace();
502
                        return null;
503
                } catch (IOException e) {
504
                        e.printStackTrace();
505
                        return null;
506
                } catch (NumberFormatException e) {
507
                        e.printStackTrace();
508
                        return null;
509
                }
510
        } // getFeatures()
511
        
512
        /** Reads a file of feature values, treated as an onset detection function,
513
         *  and finds peaks, which are stored in <code>onsetList</code> and <code>onsets</code>.
514
         * @param fileName The file of feature values
515
         * @param hopTime The spacing of feature values in time
516
         */
517
        void processFeatures(String fileName, double hopTime) {
518
                double hop = hopTime;
519
                double[] features = getFeatures(fileName);
520
                Peaks.normalise(features);
521
                LinkedList<Integer> peaks = Peaks.findPeaks(features, (int)Math.round(0.06 / hop), 0.35, 0.84, true);
522
                onsets = new double[peaks.size()];
523
                double[] y2 = new double[onsets.length];
524
                Iterator<Integer> it = peaks.iterator();
525
                onsetList = new EventList();
526
                double minSalience = Peaks.min(features);
527
                for (int i = 0; i < onsets.length; i++) {
528
                        int index = it.next();
529
                        onsets[i] = index * hop;
530
                        y2[i] = features[index];
531
                        Event e = BeatTrackDisplay.newBeat(onsets[i], 0);
532
                        e.salience = features[index] - minSalience;
533
                        onsetList.add(e);
534
                }
535
        } // processFeatures()
536

    
537
        /** Copies output of audio processing to the display panel. */
538
        void setDisplay(BeatTrackDisplay btd) {
539
                int energy2[] = new int[totalFrames*energyOversampleFactor];
540
                double time[] = new double[totalFrames*energyOversampleFactor];
541
                for (int i = 0; i < totalFrames*energyOversampleFactor; i++) {
542
                        energy2[i] = (int) (energy[i] * 4 * energyOversampleFactor);
543
                        time[i] = i * hopTime / energyOversampleFactor;
544
                }
545
                btd.setMagnitudes(energy2);
546
                btd.setEnvTimes(time);
547
                btd.setSpectro(frames, totalFrames, hopTime, 0);//fftTime/hopTime);
548
                btd.setOnsets(onsets);
549
                btd.setOnsetList(onsetList);
550
        } // setDisplay()
551
        
552
} // class BeatRootProcessor
553

    
554

    
555
#endif