annotate dsp/tempotracking/DownBeat.cpp @ 321:f1e6be2de9a5

A threshold (delta) is added in the peak picking parameters structure (PPickParams). It is used as an offset when computing the smoothed detection function. A constructor for the structure PPickParams is also added to set the parameters to 0 when a structure instance is created. Hence programmes using the peak picking parameter structure and which do not set the delta parameter (e.g. QM Vamp note onset detector) won't be affected by the modifications. Functions modified: - dsp/onsets/PeakPicking.cpp - dsp/onsets/PeakPicking.h - dsp/signalconditioning/DFProcess.cpp - dsp/signalconditioning/DFProcess.h
author mathieub <mathieu.barthet@eecs.qmul.ac.uk>
date Mon, 20 Jun 2011 19:01:48 +0100
parents d5014ab8b0e5
children f6ccde089491
rev   line source
c@279 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
c@279 2
c@279 3 /*
c@279 4 QM DSP Library
c@279 5
c@279 6 Centre for Digital Music, Queen Mary, University of London.
c@279 7 This file copyright 2008-2009 Matthew Davies and QMUL.
c@309 8
c@309 9 This program is free software; you can redistribute it and/or
c@309 10 modify it under the terms of the GNU General Public License as
c@309 11 published by the Free Software Foundation; either version 2 of the
c@309 12 License, or (at your option) any later version. See the file
c@309 13 COPYING included with this distribution for more information.
c@279 14 */
c@279 15
c@279 16 #include "DownBeat.h"
c@279 17
c@279 18 #include "maths/MathAliases.h"
c@279 19 #include "maths/MathUtilities.h"
c@280 20 #include "maths/KLDivergence.h"
c@279 21 #include "dsp/transforms/FFT.h"
c@279 22
c@279 23 #include <iostream>
c@279 24 #include <cstdlib>
c@279 25
c@279 26 DownBeat::DownBeat(float originalSampleRate,
c@279 27 size_t decimationFactor,
c@279 28 size_t dfIncrement) :
c@280 29 m_bpb(0),
c@279 30 m_rate(originalSampleRate),
c@279 31 m_factor(decimationFactor),
c@279 32 m_increment(dfIncrement),
c@279 33 m_decimator1(0),
c@279 34 m_decimator2(0),
c@279 35 m_buffer(0),
c@283 36 m_decbuf(0),
c@279 37 m_bufsiz(0),
c@279 38 m_buffill(0),
c@279 39 m_beatframesize(0),
c@279 40 m_beatframe(0)
c@279 41 {
c@279 42 // beat frame size is next power of two up from 1.3 seconds at the
c@279 43 // downsampled rate (happens to produce 4096 for 44100 or 48000 at
c@279 44 // 16x decimation, which is our expected normal situation)
c@280 45 m_beatframesize = MathUtilities::nextPowerOfTwo
c@280 46 (int((m_rate / decimationFactor) * 1.3));
c@282 47 // std::cerr << "rate = " << m_rate << ", bfs = " << m_beatframesize << std::endl;
c@279 48 m_beatframe = new double[m_beatframesize];
c@279 49 m_fftRealOut = new double[m_beatframesize];
c@279 50 m_fftImagOut = new double[m_beatframesize];
c@289 51 m_fft = new FFTReal(m_beatframesize);
c@279 52 }
c@279 53
c@279 54 DownBeat::~DownBeat()
c@279 55 {
c@279 56 delete m_decimator1;
c@279 57 delete m_decimator2;
c@279 58 if (m_buffer) free(m_buffer);
c@279 59 delete[] m_decbuf;
c@279 60 delete[] m_beatframe;
c@279 61 delete[] m_fftRealOut;
c@279 62 delete[] m_fftImagOut;
c@289 63 delete m_fft;
c@279 64 }
c@279 65
c@279 66 void
c@280 67 DownBeat::setBeatsPerBar(int bpb)
c@280 68 {
c@280 69 m_bpb = bpb;
c@280 70 }
c@280 71
c@280 72 void
c@279 73 DownBeat::makeDecimators()
c@279 74 {
c@283 75 // std::cerr << "m_factor = " << m_factor << std::endl;
c@279 76 if (m_factor < 2) return;
c@302 77 size_t highest = Decimator::getHighestSupportedFactor();
c@279 78 if (m_factor <= highest) {
c@279 79 m_decimator1 = new Decimator(m_increment, m_factor);
c@282 80 // std::cerr << "DownBeat: decimator 1 factor " << m_factor << ", size " << m_increment << std::endl;
c@279 81 return;
c@279 82 }
c@279 83 m_decimator1 = new Decimator(m_increment, highest);
c@282 84 // std::cerr << "DownBeat: decimator 1 factor " << highest << ", size " << m_increment << std::endl;
c@279 85 m_decimator2 = new Decimator(m_increment / highest, m_factor / highest);
c@282 86 // std::cerr << "DownBeat: decimator 2 factor " << m_factor / highest << ", size " << m_increment / highest << std::endl;
c@280 87 m_decbuf = new float[m_increment / highest];
c@279 88 }
c@279 89
c@279 90 void
c@280 91 DownBeat::pushAudioBlock(const float *audio)
c@279 92 {
c@279 93 if (m_buffill + (m_increment / m_factor) > m_bufsiz) {
c@279 94 if (m_bufsiz == 0) m_bufsiz = m_increment * 16;
c@279 95 else m_bufsiz = m_bufsiz * 2;
c@279 96 if (!m_buffer) {
c@280 97 m_buffer = (float *)malloc(m_bufsiz * sizeof(float));
c@279 98 } else {
c@282 99 // std::cerr << "DownBeat::pushAudioBlock: realloc m_buffer to " << m_bufsiz << std::endl;
c@280 100 m_buffer = (float *)realloc(m_buffer, m_bufsiz * sizeof(float));
c@279 101 }
c@279 102 }
c@283 103 if (!m_decimator1 && m_factor > 1) makeDecimators();
c@283 104 // float rmsin = 0, rmsout = 0;
c@283 105 // for (int i = 0; i < m_increment; ++i) {
c@283 106 // rmsin += audio[i] * audio[i];
c@283 107 // }
c@279 108 if (m_decimator2) {
c@279 109 m_decimator1->process(audio, m_decbuf);
c@279 110 m_decimator2->process(m_decbuf, m_buffer + m_buffill);
c@283 111 } else if (m_decimator1) {
c@283 112 m_decimator1->process(audio, m_buffer + m_buffill);
c@279 113 } else {
c@283 114 // just copy across (m_factor is presumably 1)
c@302 115 for (size_t i = 0; i < m_increment; ++i) {
c@283 116 (m_buffer + m_buffill)[i] = audio[i];
c@283 117 }
c@279 118 }
c@283 119 // for (int i = 0; i < m_increment / m_factor; ++i) {
c@283 120 // rmsout += m_buffer[m_buffill + i] * m_buffer[m_buffill + i];
c@283 121 // }
c@282 122 // std::cerr << "pushAudioBlock: rms in " << sqrt(rmsin) << ", out " << sqrt(rmsout) << std::endl;
c@279 123 m_buffill += m_increment / m_factor;
c@279 124 }
c@279 125
c@280 126 const float *
c@279 127 DownBeat::getBufferedAudio(size_t &length) const
c@279 128 {
c@279 129 length = m_buffill;
c@279 130 return m_buffer;
c@279 131 }
c@279 132
c@279 133 void
c@280 134 DownBeat::resetAudioBuffer()
c@280 135 {
c@280 136 if (m_buffer) free(m_buffer);
c@283 137 m_buffer = 0;
c@280 138 m_buffill = 0;
c@280 139 m_bufsiz = 0;
c@280 140 }
c@280 141
c@280 142 void
c@280 143 DownBeat::findDownBeats(const float *audio,
c@279 144 size_t audioLength,
c@279 145 const d_vec_t &beats,
c@279 146 i_vec_t &downbeats)
c@279 147 {
c@279 148 // FIND DOWNBEATS BY PARTITIONING THE INPUT AUDIO FILE INTO BEAT SEGMENTS
c@279 149 // WHERE THE AUDIO FRAMES ARE DOWNSAMPLED BY A FACTOR OF 16 (fs ~= 2700Hz)
c@279 150 // THEN TAKING THE JENSEN-SHANNON DIVERGENCE BETWEEN BEAT SYNCHRONOUS SPECTRAL FRAMES
c@279 151
c@279 152 // IMPLEMENTATION (MOSTLY) FOLLOWS:
c@279 153 // DAVIES AND PLUMBLEY "A SPECTRAL DIFFERENCE APPROACH TO EXTRACTING DOWNBEATS IN MUSICAL AUDIO"
c@279 154 // EUSIPCO 2006, FLORENCE, ITALY
c@279 155
c@279 156 d_vec_t newspec(m_beatframesize / 2); // magnitude spectrum of current beat
c@279 157 d_vec_t oldspec(m_beatframesize / 2); // magnitude spectrum of previous beat
c@281 158
c@281 159 m_beatsd.clear();
c@279 160
c@279 161 if (audioLength == 0) return;
c@279 162
c@279 163 for (size_t i = 0; i + 1 < beats.size(); ++i) {
c@279 164
c@279 165 // Copy the extents of the current beat from downsampled array
c@279 166 // into beat frame buffer
c@279 167
c@279 168 size_t beatstart = (beats[i] * m_increment) / m_factor;
c@280 169 size_t beatend = (beats[i+1] * m_increment) / m_factor;
c@279 170 if (beatend >= audioLength) beatend = audioLength - 1;
c@279 171 if (beatend < beatstart) beatend = beatstart;
c@279 172 size_t beatlen = beatend - beatstart;
c@279 173
c@279 174 // Also apply a Hanning window to the beat frame buffer, sized
c@279 175 // to the beat extents rather than the frame size. (Because
c@279 176 // the size varies, it's easier to do this by hand than use
c@279 177 // our Window abstraction.)
c@279 178
c@283 179 // std::cerr << "beatlen = " << beatlen << std::endl;
c@283 180
c@283 181 // float rms = 0;
c@283 182 for (size_t j = 0; j < beatlen && j < m_beatframesize; ++j) {
c@279 183 double mul = 0.5 * (1.0 - cos(TWO_PI * (double(j) / double(beatlen))));
c@279 184 m_beatframe[j] = audio[beatstart + j] * mul;
c@283 185 // rms += m_beatframe[j] * m_beatframe[j];
c@279 186 }
c@283 187 // rms = sqrt(rms);
c@282 188 // std::cerr << "beat " << i << ": audio rms " << rms << std::endl;
c@279 189
c@279 190 for (size_t j = beatlen; j < m_beatframesize; ++j) {
c@279 191 m_beatframe[j] = 0.0;
c@279 192 }
c@279 193
c@279 194 // Now FFT beat frame
c@279 195
c@289 196 m_fft->process(false, m_beatframe, m_fftRealOut, m_fftImagOut);
c@279 197
c@279 198 // Calculate magnitudes
c@279 199
c@279 200 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279 201 newspec[j] = sqrt(m_fftRealOut[j] * m_fftRealOut[j] +
c@279 202 m_fftImagOut[j] * m_fftImagOut[j]);
c@279 203 }
c@279 204
c@279 205 // Preserve peaks by applying adaptive threshold
c@279 206
c@279 207 MathUtilities::adaptiveThreshold(newspec);
c@279 208
c@279 209 // Calculate JS divergence between new and old spectral frames
c@279 210
c@281 211 if (i > 0) { // otherwise we have no previous frame
c@281 212 m_beatsd.push_back(measureSpecDiff(oldspec, newspec));
c@282 213 // std::cerr << "specdiff: " << m_beatsd[m_beatsd.size()-1] << std::endl;
c@281 214 }
c@279 215
c@279 216 // Copy newspec across to old
c@279 217
c@279 218 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279 219 oldspec[j] = newspec[j];
c@279 220 }
c@279 221 }
c@279 222
c@279 223 // We now have all spectral difference measures in specdiff
c@279 224
c@302 225 int timesig = m_bpb;
c@280 226 if (timesig == 0) timesig = 4;
c@280 227
c@279 228 d_vec_t dbcand(timesig); // downbeat candidates
c@279 229
c@280 230 for (int beat = 0; beat < timesig; ++beat) {
c@280 231 dbcand[beat] = 0;
c@280 232 }
c@280 233
c@301 234 // look for beat transition which leads to greatest spectral change
c@301 235 for (int beat = 0; beat < timesig; ++beat) {
c@301 236 int count = 0;
c@302 237 for (int example = beat-1; example < (int)m_beatsd.size(); example += timesig) {
c@301 238 if (example < 0) continue;
c@301 239 dbcand[beat] += (m_beatsd[example]) / timesig;
c@301 240 ++count;
c@301 241 }
c@301 242 if (count > 0) dbcand[beat] /= count;
c@282 243 // std::cerr << "dbcand[" << beat << "] = " << dbcand[beat] << std::endl;
c@301 244 }
c@280 245
c@279 246 // first downbeat is beat at index of maximum value of dbcand
c@279 247 int dbind = MathUtilities::getMax(dbcand);
c@279 248
c@279 249 // remaining downbeats are at timesig intervals from the first
c@302 250 for (int i = dbind; i < (int)beats.size(); i += timesig) {
c@279 251 downbeats.push_back(i);
c@279 252 }
c@279 253 }
c@279 254
c@279 255 double
c@279 256 DownBeat::measureSpecDiff(d_vec_t oldspec, d_vec_t newspec)
c@279 257 {
c@279 258 // JENSEN-SHANNON DIVERGENCE BETWEEN SPECTRAL FRAMES
c@279 259
c@295 260 unsigned int SPECSIZE = 512; // ONLY LOOK AT FIRST 512 SAMPLES OF SPECTRUM.
c@279 261 if (SPECSIZE > oldspec.size()/4) {
c@279 262 SPECSIZE = oldspec.size()/4;
c@279 263 }
c@279 264 double SD = 0.;
c@279 265 double sd1 = 0.;
c@279 266
c@279 267 double sumnew = 0.;
c@279 268 double sumold = 0.;
c@279 269
c@295 270 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279 271 {
c@279 272 newspec[i] +=EPS;
c@279 273 oldspec[i] +=EPS;
c@279 274
c@279 275 sumnew+=newspec[i];
c@279 276 sumold+=oldspec[i];
c@279 277 }
c@279 278
c@295 279 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279 280 {
c@279 281 newspec[i] /= (sumnew);
c@279 282 oldspec[i] /= (sumold);
c@279 283
c@279 284 // IF ANY SPECTRAL VALUES ARE 0 (SHOULDN'T BE ANY!) SET THEM TO 1
c@279 285 if (newspec[i] == 0)
c@279 286 {
c@279 287 newspec[i] = 1.;
c@279 288 }
c@279 289
c@279 290 if (oldspec[i] == 0)
c@279 291 {
c@279 292 oldspec[i] = 1.;
c@279 293 }
c@279 294
c@279 295 // JENSEN-SHANNON CALCULATION
c@279 296 sd1 = 0.5*oldspec[i] + 0.5*newspec[i];
c@279 297 SD = SD + (-sd1*log(sd1)) + (0.5*(oldspec[i]*log(oldspec[i]))) + (0.5*(newspec[i]*log(newspec[i])));
c@279 298 }
c@279 299
c@279 300 return SD;
c@279 301 }
c@279 302
c@281 303 void
c@281 304 DownBeat::getBeatSD(vector<double> &beatsd) const
c@281 305 {
c@302 306 for (int i = 0; i < (int)m_beatsd.size(); ++i) beatsd.push_back(m_beatsd[i]);
c@281 307 }
c@281 308