annotate dsp/tempotracking/DownBeat.cpp @ 298:255e431ae3d4

* Key detector: when returning key strengths, use the peak value of the three underlying chromagram correlations (from 36-bin chromagram) corresponding to each key, instead of the mean. Rationale: This is the same method as used when returning the key value, and it's nice to have the same results in both returned value and plot. The peak performed better than the sum with a simple test set of triads, so it seems reasonable to change the plot to match the key output rather than the other way around. * FFT: kiss_fftr returns only the non-conjugate bins, synthesise the rest rather than leaving them (perhaps dangerously) undefined. Fixes an uninitialised data error in chromagram that could cause garbage results from key detector. * Constant Q: remove precalculated values again, I reckon they're not proving such a good tradeoff.
author Chris Cannam <c.cannam@qmul.ac.uk>
date Fri, 05 Jun 2009 15:12:39 +0000
parents 1c9258dd155e
children 4fada56adbb8
rev   line source
c@279 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
c@279 2
c@279 3 /*
c@279 4 QM DSP Library
c@279 5
c@279 6 Centre for Digital Music, Queen Mary, University of London.
c@279 7 This file copyright 2008-2009 Matthew Davies and QMUL.
c@279 8 All rights reserved.
c@279 9 */
c@279 10
c@279 11 #include "DownBeat.h"
c@279 12
c@279 13 #include "maths/MathAliases.h"
c@279 14 #include "maths/MathUtilities.h"
c@280 15 #include "maths/KLDivergence.h"
c@279 16 #include "dsp/transforms/FFT.h"
c@279 17
c@279 18 #include <iostream>
c@279 19 #include <cstdlib>
c@279 20
c@279 21 DownBeat::DownBeat(float originalSampleRate,
c@279 22 size_t decimationFactor,
c@279 23 size_t dfIncrement) :
c@280 24 m_bpb(0),
c@279 25 m_rate(originalSampleRate),
c@279 26 m_factor(decimationFactor),
c@279 27 m_increment(dfIncrement),
c@279 28 m_decimator1(0),
c@279 29 m_decimator2(0),
c@279 30 m_buffer(0),
c@283 31 m_decbuf(0),
c@279 32 m_bufsiz(0),
c@279 33 m_buffill(0),
c@279 34 m_beatframesize(0),
c@279 35 m_beatframe(0)
c@279 36 {
c@279 37 // beat frame size is next power of two up from 1.3 seconds at the
c@279 38 // downsampled rate (happens to produce 4096 for 44100 or 48000 at
c@279 39 // 16x decimation, which is our expected normal situation)
c@280 40 m_beatframesize = MathUtilities::nextPowerOfTwo
c@280 41 (int((m_rate / decimationFactor) * 1.3));
c@282 42 // std::cerr << "rate = " << m_rate << ", bfs = " << m_beatframesize << std::endl;
c@279 43 m_beatframe = new double[m_beatframesize];
c@279 44 m_fftRealOut = new double[m_beatframesize];
c@279 45 m_fftImagOut = new double[m_beatframesize];
c@289 46 m_fft = new FFTReal(m_beatframesize);
c@279 47 }
c@279 48
c@279 49 DownBeat::~DownBeat()
c@279 50 {
c@279 51 delete m_decimator1;
c@279 52 delete m_decimator2;
c@279 53 if (m_buffer) free(m_buffer);
c@279 54 delete[] m_decbuf;
c@279 55 delete[] m_beatframe;
c@279 56 delete[] m_fftRealOut;
c@279 57 delete[] m_fftImagOut;
c@289 58 delete m_fft;
c@279 59 }
c@279 60
c@279 61 void
c@280 62 DownBeat::setBeatsPerBar(int bpb)
c@280 63 {
c@280 64 m_bpb = bpb;
c@280 65 }
c@280 66
c@280 67 void
c@279 68 DownBeat::makeDecimators()
c@279 69 {
c@283 70 // std::cerr << "m_factor = " << m_factor << std::endl;
c@279 71 if (m_factor < 2) return;
c@279 72 int highest = Decimator::getHighestSupportedFactor();
c@279 73 if (m_factor <= highest) {
c@279 74 m_decimator1 = new Decimator(m_increment, m_factor);
c@282 75 // std::cerr << "DownBeat: decimator 1 factor " << m_factor << ", size " << m_increment << std::endl;
c@279 76 return;
c@279 77 }
c@279 78 m_decimator1 = new Decimator(m_increment, highest);
c@282 79 // std::cerr << "DownBeat: decimator 1 factor " << highest << ", size " << m_increment << std::endl;
c@279 80 m_decimator2 = new Decimator(m_increment / highest, m_factor / highest);
c@282 81 // std::cerr << "DownBeat: decimator 2 factor " << m_factor / highest << ", size " << m_increment / highest << std::endl;
c@280 82 m_decbuf = new float[m_increment / highest];
c@279 83 }
c@279 84
c@279 85 void
c@280 86 DownBeat::pushAudioBlock(const float *audio)
c@279 87 {
c@279 88 if (m_buffill + (m_increment / m_factor) > m_bufsiz) {
c@279 89 if (m_bufsiz == 0) m_bufsiz = m_increment * 16;
c@279 90 else m_bufsiz = m_bufsiz * 2;
c@279 91 if (!m_buffer) {
c@280 92 m_buffer = (float *)malloc(m_bufsiz * sizeof(float));
c@279 93 } else {
c@282 94 // std::cerr << "DownBeat::pushAudioBlock: realloc m_buffer to " << m_bufsiz << std::endl;
c@280 95 m_buffer = (float *)realloc(m_buffer, m_bufsiz * sizeof(float));
c@279 96 }
c@279 97 }
c@283 98 if (!m_decimator1 && m_factor > 1) makeDecimators();
c@283 99 // float rmsin = 0, rmsout = 0;
c@283 100 // for (int i = 0; i < m_increment; ++i) {
c@283 101 // rmsin += audio[i] * audio[i];
c@283 102 // }
c@279 103 if (m_decimator2) {
c@279 104 m_decimator1->process(audio, m_decbuf);
c@279 105 m_decimator2->process(m_decbuf, m_buffer + m_buffill);
c@283 106 } else if (m_decimator1) {
c@283 107 m_decimator1->process(audio, m_buffer + m_buffill);
c@279 108 } else {
c@283 109 // just copy across (m_factor is presumably 1)
c@283 110 for (int i = 0; i < m_increment; ++i) {
c@283 111 (m_buffer + m_buffill)[i] = audio[i];
c@283 112 }
c@279 113 }
c@283 114 // for (int i = 0; i < m_increment / m_factor; ++i) {
c@283 115 // rmsout += m_buffer[m_buffill + i] * m_buffer[m_buffill + i];
c@283 116 // }
c@282 117 // std::cerr << "pushAudioBlock: rms in " << sqrt(rmsin) << ", out " << sqrt(rmsout) << std::endl;
c@279 118 m_buffill += m_increment / m_factor;
c@279 119 }
c@279 120
c@280 121 const float *
c@279 122 DownBeat::getBufferedAudio(size_t &length) const
c@279 123 {
c@279 124 length = m_buffill;
c@279 125 return m_buffer;
c@279 126 }
c@279 127
c@279 128 void
c@280 129 DownBeat::resetAudioBuffer()
c@280 130 {
c@280 131 if (m_buffer) free(m_buffer);
c@283 132 m_buffer = 0;
c@280 133 m_buffill = 0;
c@280 134 m_bufsiz = 0;
c@280 135 }
c@280 136
c@280 137 void
c@280 138 DownBeat::findDownBeats(const float *audio,
c@279 139 size_t audioLength,
c@279 140 const d_vec_t &beats,
c@279 141 i_vec_t &downbeats)
c@279 142 {
c@279 143 // FIND DOWNBEATS BY PARTITIONING THE INPUT AUDIO FILE INTO BEAT SEGMENTS
c@279 144 // WHERE THE AUDIO FRAMES ARE DOWNSAMPLED BY A FACTOR OF 16 (fs ~= 2700Hz)
c@279 145 // THEN TAKING THE JENSEN-SHANNON DIVERGENCE BETWEEN BEAT SYNCHRONOUS SPECTRAL FRAMES
c@279 146
c@279 147 // IMPLEMENTATION (MOSTLY) FOLLOWS:
c@279 148 // DAVIES AND PLUMBLEY "A SPECTRAL DIFFERENCE APPROACH TO EXTRACTING DOWNBEATS IN MUSICAL AUDIO"
c@279 149 // EUSIPCO 2006, FLORENCE, ITALY
c@279 150
c@279 151 d_vec_t newspec(m_beatframesize / 2); // magnitude spectrum of current beat
c@279 152 d_vec_t oldspec(m_beatframesize / 2); // magnitude spectrum of previous beat
c@281 153
c@281 154 m_beatsd.clear();
c@279 155
c@279 156 if (audioLength == 0) return;
c@279 157
c@279 158 for (size_t i = 0; i + 1 < beats.size(); ++i) {
c@279 159
c@279 160 // Copy the extents of the current beat from downsampled array
c@279 161 // into beat frame buffer
c@279 162
c@279 163 size_t beatstart = (beats[i] * m_increment) / m_factor;
c@280 164 size_t beatend = (beats[i+1] * m_increment) / m_factor;
c@279 165 if (beatend >= audioLength) beatend = audioLength - 1;
c@279 166 if (beatend < beatstart) beatend = beatstart;
c@279 167 size_t beatlen = beatend - beatstart;
c@279 168
c@279 169 // Also apply a Hanning window to the beat frame buffer, sized
c@279 170 // to the beat extents rather than the frame size. (Because
c@279 171 // the size varies, it's easier to do this by hand than use
c@279 172 // our Window abstraction.)
c@279 173
c@283 174 // std::cerr << "beatlen = " << beatlen << std::endl;
c@283 175
c@283 176 // float rms = 0;
c@283 177 for (size_t j = 0; j < beatlen && j < m_beatframesize; ++j) {
c@279 178 double mul = 0.5 * (1.0 - cos(TWO_PI * (double(j) / double(beatlen))));
c@279 179 m_beatframe[j] = audio[beatstart + j] * mul;
c@283 180 // rms += m_beatframe[j] * m_beatframe[j];
c@279 181 }
c@283 182 // rms = sqrt(rms);
c@282 183 // std::cerr << "beat " << i << ": audio rms " << rms << std::endl;
c@279 184
c@279 185 for (size_t j = beatlen; j < m_beatframesize; ++j) {
c@279 186 m_beatframe[j] = 0.0;
c@279 187 }
c@279 188
c@279 189 // Now FFT beat frame
c@279 190
c@289 191 m_fft->process(false, m_beatframe, m_fftRealOut, m_fftImagOut);
c@279 192
c@279 193 // Calculate magnitudes
c@279 194
c@279 195 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279 196 newspec[j] = sqrt(m_fftRealOut[j] * m_fftRealOut[j] +
c@279 197 m_fftImagOut[j] * m_fftImagOut[j]);
c@279 198 }
c@279 199
c@279 200 // Preserve peaks by applying adaptive threshold
c@279 201
c@279 202 MathUtilities::adaptiveThreshold(newspec);
c@279 203
c@279 204 // Calculate JS divergence between new and old spectral frames
c@279 205
c@281 206 if (i > 0) { // otherwise we have no previous frame
c@281 207 m_beatsd.push_back(measureSpecDiff(oldspec, newspec));
c@282 208 // std::cerr << "specdiff: " << m_beatsd[m_beatsd.size()-1] << std::endl;
c@281 209 }
c@279 210
c@279 211 // Copy newspec across to old
c@279 212
c@279 213 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279 214 oldspec[j] = newspec[j];
c@279 215 }
c@279 216 }
c@279 217
c@279 218 // We now have all spectral difference measures in specdiff
c@279 219
c@295 220 unsigned int timesig = m_bpb;
c@280 221 if (timesig == 0) timesig = 4;
c@280 222
c@279 223 d_vec_t dbcand(timesig); // downbeat candidates
c@279 224
c@280 225 for (int beat = 0; beat < timesig; ++beat) {
c@280 226 dbcand[beat] = 0;
c@280 227 }
c@280 228
c@279 229 // look for beat transition which leads to greatest spectral change
c@279 230 for (int beat = 0; beat < timesig; ++beat) {
c@281 231 int count = 0;
c@281 232 for (int example = beat - 1; example < m_beatsd.size(); example += timesig) {
c@281 233 if (example < 0) continue;
c@281 234 dbcand[beat] += (m_beatsd[example]) / timesig;
c@281 235 ++count;
c@279 236 }
c@281 237 if (count > 0) m_beatsd[beat] /= count;
c@282 238 // std::cerr << "dbcand[" << beat << "] = " << dbcand[beat] << std::endl;
c@279 239 }
c@279 240
c@280 241
c@279 242 // first downbeat is beat at index of maximum value of dbcand
c@279 243 int dbind = MathUtilities::getMax(dbcand);
c@279 244
c@279 245 // remaining downbeats are at timesig intervals from the first
c@279 246 for (int i = dbind; i < beats.size(); i += timesig) {
c@279 247 downbeats.push_back(i);
c@279 248 }
c@279 249 }
c@279 250
c@279 251 double
c@279 252 DownBeat::measureSpecDiff(d_vec_t oldspec, d_vec_t newspec)
c@279 253 {
c@279 254 // JENSEN-SHANNON DIVERGENCE BETWEEN SPECTRAL FRAMES
c@279 255
c@295 256 unsigned int SPECSIZE = 512; // ONLY LOOK AT FIRST 512 SAMPLES OF SPECTRUM.
c@279 257 if (SPECSIZE > oldspec.size()/4) {
c@279 258 SPECSIZE = oldspec.size()/4;
c@279 259 }
c@279 260 double SD = 0.;
c@279 261 double sd1 = 0.;
c@279 262
c@279 263 double sumnew = 0.;
c@279 264 double sumold = 0.;
c@279 265
c@295 266 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279 267 {
c@279 268 newspec[i] +=EPS;
c@279 269 oldspec[i] +=EPS;
c@279 270
c@279 271 sumnew+=newspec[i];
c@279 272 sumold+=oldspec[i];
c@279 273 }
c@279 274
c@295 275 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279 276 {
c@279 277 newspec[i] /= (sumnew);
c@279 278 oldspec[i] /= (sumold);
c@279 279
c@279 280 // IF ANY SPECTRAL VALUES ARE 0 (SHOULDN'T BE ANY!) SET THEM TO 1
c@279 281 if (newspec[i] == 0)
c@279 282 {
c@279 283 newspec[i] = 1.;
c@279 284 }
c@279 285
c@279 286 if (oldspec[i] == 0)
c@279 287 {
c@279 288 oldspec[i] = 1.;
c@279 289 }
c@279 290
c@279 291 // JENSEN-SHANNON CALCULATION
c@279 292 sd1 = 0.5*oldspec[i] + 0.5*newspec[i];
c@279 293 SD = SD + (-sd1*log(sd1)) + (0.5*(oldspec[i]*log(oldspec[i]))) + (0.5*(newspec[i]*log(newspec[i])));
c@279 294 }
c@279 295
c@279 296 return SD;
c@279 297 }
c@279 298
c@281 299 void
c@281 300 DownBeat::getBeatSD(vector<double> &beatsd) const
c@281 301 {
c@281 302 for (int i = 0; i < m_beatsd.size(); ++i) beatsd.push_back(m_beatsd[i]);
c@281 303 }
c@281 304