annotate dsp/tempotracking/DownBeat.cpp @ 73:dcb555b90924

* Key detector: when returning key strengths, use the peak value of the three underlying chromagram correlations (from 36-bin chromagram) corresponding to each key, instead of the mean. Rationale: This is the same method as used when returning the key value, and it's nice to have the same results in both returned value and plot. The peak performed better than the sum with a simple test set of triads, so it seems reasonable to change the plot to match the key output rather than the other way around. * FFT: kiss_fftr returns only the non-conjugate bins, synthesise the rest rather than leaving them (perhaps dangerously) undefined. Fixes an uninitialised data error in chromagram that could cause garbage results from key detector. * Constant Q: remove precalculated values again, I reckon they're not proving such a good tradeoff.
author cannam
date Fri, 05 Jun 2009 15:12:39 +0000
parents c3cdb404f807
children 4fada56adbb8
rev   line source
cannam@54 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
cannam@54 2
cannam@54 3 /*
cannam@54 4 QM DSP Library
cannam@54 5
cannam@54 6 Centre for Digital Music, Queen Mary, University of London.
cannam@54 7 This file copyright 2008-2009 Matthew Davies and QMUL.
cannam@54 8 All rights reserved.
cannam@54 9 */
cannam@54 10
cannam@54 11 #include "DownBeat.h"
cannam@54 12
cannam@54 13 #include "maths/MathAliases.h"
cannam@54 14 #include "maths/MathUtilities.h"
cannam@55 15 #include "maths/KLDivergence.h"
cannam@54 16 #include "dsp/transforms/FFT.h"
cannam@54 17
cannam@54 18 #include <iostream>
cannam@54 19 #include <cstdlib>
cannam@54 20
cannam@54 21 DownBeat::DownBeat(float originalSampleRate,
cannam@54 22 size_t decimationFactor,
cannam@54 23 size_t dfIncrement) :
cannam@55 24 m_bpb(0),
cannam@54 25 m_rate(originalSampleRate),
cannam@54 26 m_factor(decimationFactor),
cannam@54 27 m_increment(dfIncrement),
cannam@54 28 m_decimator1(0),
cannam@54 29 m_decimator2(0),
cannam@54 30 m_buffer(0),
cannam@58 31 m_decbuf(0),
cannam@54 32 m_bufsiz(0),
cannam@54 33 m_buffill(0),
cannam@54 34 m_beatframesize(0),
cannam@54 35 m_beatframe(0)
cannam@54 36 {
cannam@54 37 // beat frame size is next power of two up from 1.3 seconds at the
cannam@54 38 // downsampled rate (happens to produce 4096 for 44100 or 48000 at
cannam@54 39 // 16x decimation, which is our expected normal situation)
cannam@55 40 m_beatframesize = MathUtilities::nextPowerOfTwo
cannam@55 41 (int((m_rate / decimationFactor) * 1.3));
cannam@57 42 // std::cerr << "rate = " << m_rate << ", bfs = " << m_beatframesize << std::endl;
cannam@54 43 m_beatframe = new double[m_beatframesize];
cannam@54 44 m_fftRealOut = new double[m_beatframesize];
cannam@54 45 m_fftImagOut = new double[m_beatframesize];
cannam@64 46 m_fft = new FFTReal(m_beatframesize);
cannam@54 47 }
cannam@54 48
cannam@54 49 DownBeat::~DownBeat()
cannam@54 50 {
cannam@54 51 delete m_decimator1;
cannam@54 52 delete m_decimator2;
cannam@54 53 if (m_buffer) free(m_buffer);
cannam@54 54 delete[] m_decbuf;
cannam@54 55 delete[] m_beatframe;
cannam@54 56 delete[] m_fftRealOut;
cannam@54 57 delete[] m_fftImagOut;
cannam@64 58 delete m_fft;
cannam@54 59 }
cannam@54 60
cannam@54 61 void
cannam@55 62 DownBeat::setBeatsPerBar(int bpb)
cannam@55 63 {
cannam@55 64 m_bpb = bpb;
cannam@55 65 }
cannam@55 66
cannam@55 67 void
cannam@54 68 DownBeat::makeDecimators()
cannam@54 69 {
cannam@58 70 // std::cerr << "m_factor = " << m_factor << std::endl;
cannam@54 71 if (m_factor < 2) return;
cannam@54 72 int highest = Decimator::getHighestSupportedFactor();
cannam@54 73 if (m_factor <= highest) {
cannam@54 74 m_decimator1 = new Decimator(m_increment, m_factor);
cannam@57 75 // std::cerr << "DownBeat: decimator 1 factor " << m_factor << ", size " << m_increment << std::endl;
cannam@54 76 return;
cannam@54 77 }
cannam@54 78 m_decimator1 = new Decimator(m_increment, highest);
cannam@57 79 // std::cerr << "DownBeat: decimator 1 factor " << highest << ", size " << m_increment << std::endl;
cannam@54 80 m_decimator2 = new Decimator(m_increment / highest, m_factor / highest);
cannam@57 81 // std::cerr << "DownBeat: decimator 2 factor " << m_factor / highest << ", size " << m_increment / highest << std::endl;
cannam@55 82 m_decbuf = new float[m_increment / highest];
cannam@54 83 }
cannam@54 84
cannam@54 85 void
cannam@55 86 DownBeat::pushAudioBlock(const float *audio)
cannam@54 87 {
cannam@54 88 if (m_buffill + (m_increment / m_factor) > m_bufsiz) {
cannam@54 89 if (m_bufsiz == 0) m_bufsiz = m_increment * 16;
cannam@54 90 else m_bufsiz = m_bufsiz * 2;
cannam@54 91 if (!m_buffer) {
cannam@55 92 m_buffer = (float *)malloc(m_bufsiz * sizeof(float));
cannam@54 93 } else {
cannam@57 94 // std::cerr << "DownBeat::pushAudioBlock: realloc m_buffer to " << m_bufsiz << std::endl;
cannam@55 95 m_buffer = (float *)realloc(m_buffer, m_bufsiz * sizeof(float));
cannam@54 96 }
cannam@54 97 }
cannam@58 98 if (!m_decimator1 && m_factor > 1) makeDecimators();
cannam@58 99 // float rmsin = 0, rmsout = 0;
cannam@58 100 // for (int i = 0; i < m_increment; ++i) {
cannam@58 101 // rmsin += audio[i] * audio[i];
cannam@58 102 // }
cannam@54 103 if (m_decimator2) {
cannam@54 104 m_decimator1->process(audio, m_decbuf);
cannam@54 105 m_decimator2->process(m_decbuf, m_buffer + m_buffill);
cannam@58 106 } else if (m_decimator1) {
cannam@58 107 m_decimator1->process(audio, m_buffer + m_buffill);
cannam@54 108 } else {
cannam@58 109 // just copy across (m_factor is presumably 1)
cannam@58 110 for (int i = 0; i < m_increment; ++i) {
cannam@58 111 (m_buffer + m_buffill)[i] = audio[i];
cannam@58 112 }
cannam@54 113 }
cannam@58 114 // for (int i = 0; i < m_increment / m_factor; ++i) {
cannam@58 115 // rmsout += m_buffer[m_buffill + i] * m_buffer[m_buffill + i];
cannam@58 116 // }
cannam@57 117 // std::cerr << "pushAudioBlock: rms in " << sqrt(rmsin) << ", out " << sqrt(rmsout) << std::endl;
cannam@54 118 m_buffill += m_increment / m_factor;
cannam@54 119 }
cannam@54 120
cannam@55 121 const float *
cannam@54 122 DownBeat::getBufferedAudio(size_t &length) const
cannam@54 123 {
cannam@54 124 length = m_buffill;
cannam@54 125 return m_buffer;
cannam@54 126 }
cannam@54 127
cannam@54 128 void
cannam@55 129 DownBeat::resetAudioBuffer()
cannam@55 130 {
cannam@55 131 if (m_buffer) free(m_buffer);
cannam@58 132 m_buffer = 0;
cannam@55 133 m_buffill = 0;
cannam@55 134 m_bufsiz = 0;
cannam@55 135 }
cannam@55 136
cannam@55 137 void
cannam@55 138 DownBeat::findDownBeats(const float *audio,
cannam@54 139 size_t audioLength,
cannam@54 140 const d_vec_t &beats,
cannam@54 141 i_vec_t &downbeats)
cannam@54 142 {
cannam@54 143 // FIND DOWNBEATS BY PARTITIONING THE INPUT AUDIO FILE INTO BEAT SEGMENTS
cannam@54 144 // WHERE THE AUDIO FRAMES ARE DOWNSAMPLED BY A FACTOR OF 16 (fs ~= 2700Hz)
cannam@54 145 // THEN TAKING THE JENSEN-SHANNON DIVERGENCE BETWEEN BEAT SYNCHRONOUS SPECTRAL FRAMES
cannam@54 146
cannam@54 147 // IMPLEMENTATION (MOSTLY) FOLLOWS:
cannam@54 148 // DAVIES AND PLUMBLEY "A SPECTRAL DIFFERENCE APPROACH TO EXTRACTING DOWNBEATS IN MUSICAL AUDIO"
cannam@54 149 // EUSIPCO 2006, FLORENCE, ITALY
cannam@54 150
cannam@54 151 d_vec_t newspec(m_beatframesize / 2); // magnitude spectrum of current beat
cannam@54 152 d_vec_t oldspec(m_beatframesize / 2); // magnitude spectrum of previous beat
cannam@56 153
cannam@56 154 m_beatsd.clear();
cannam@54 155
cannam@54 156 if (audioLength == 0) return;
cannam@54 157
cannam@54 158 for (size_t i = 0; i + 1 < beats.size(); ++i) {
cannam@54 159
cannam@54 160 // Copy the extents of the current beat from downsampled array
cannam@54 161 // into beat frame buffer
cannam@54 162
cannam@54 163 size_t beatstart = (beats[i] * m_increment) / m_factor;
cannam@55 164 size_t beatend = (beats[i+1] * m_increment) / m_factor;
cannam@54 165 if (beatend >= audioLength) beatend = audioLength - 1;
cannam@54 166 if (beatend < beatstart) beatend = beatstart;
cannam@54 167 size_t beatlen = beatend - beatstart;
cannam@54 168
cannam@54 169 // Also apply a Hanning window to the beat frame buffer, sized
cannam@54 170 // to the beat extents rather than the frame size. (Because
cannam@54 171 // the size varies, it's easier to do this by hand than use
cannam@54 172 // our Window abstraction.)
cannam@54 173
cannam@58 174 // std::cerr << "beatlen = " << beatlen << std::endl;
cannam@58 175
cannam@58 176 // float rms = 0;
cannam@58 177 for (size_t j = 0; j < beatlen && j < m_beatframesize; ++j) {
cannam@54 178 double mul = 0.5 * (1.0 - cos(TWO_PI * (double(j) / double(beatlen))));
cannam@54 179 m_beatframe[j] = audio[beatstart + j] * mul;
cannam@58 180 // rms += m_beatframe[j] * m_beatframe[j];
cannam@54 181 }
cannam@58 182 // rms = sqrt(rms);
cannam@57 183 // std::cerr << "beat " << i << ": audio rms " << rms << std::endl;
cannam@54 184
cannam@54 185 for (size_t j = beatlen; j < m_beatframesize; ++j) {
cannam@54 186 m_beatframe[j] = 0.0;
cannam@54 187 }
cannam@54 188
cannam@54 189 // Now FFT beat frame
cannam@54 190
cannam@64 191 m_fft->process(false, m_beatframe, m_fftRealOut, m_fftImagOut);
cannam@54 192
cannam@54 193 // Calculate magnitudes
cannam@54 194
cannam@54 195 for (size_t j = 0; j < m_beatframesize/2; ++j) {
cannam@54 196 newspec[j] = sqrt(m_fftRealOut[j] * m_fftRealOut[j] +
cannam@54 197 m_fftImagOut[j] * m_fftImagOut[j]);
cannam@54 198 }
cannam@54 199
cannam@54 200 // Preserve peaks by applying adaptive threshold
cannam@54 201
cannam@54 202 MathUtilities::adaptiveThreshold(newspec);
cannam@54 203
cannam@54 204 // Calculate JS divergence between new and old spectral frames
cannam@54 205
cannam@56 206 if (i > 0) { // otherwise we have no previous frame
cannam@56 207 m_beatsd.push_back(measureSpecDiff(oldspec, newspec));
cannam@57 208 // std::cerr << "specdiff: " << m_beatsd[m_beatsd.size()-1] << std::endl;
cannam@56 209 }
cannam@54 210
cannam@54 211 // Copy newspec across to old
cannam@54 212
cannam@54 213 for (size_t j = 0; j < m_beatframesize/2; ++j) {
cannam@54 214 oldspec[j] = newspec[j];
cannam@54 215 }
cannam@54 216 }
cannam@54 217
cannam@54 218 // We now have all spectral difference measures in specdiff
cannam@54 219
cannam@70 220 unsigned int timesig = m_bpb;
cannam@55 221 if (timesig == 0) timesig = 4;
cannam@55 222
cannam@54 223 d_vec_t dbcand(timesig); // downbeat candidates
cannam@54 224
cannam@55 225 for (int beat = 0; beat < timesig; ++beat) {
cannam@55 226 dbcand[beat] = 0;
cannam@55 227 }
cannam@55 228
cannam@54 229 // look for beat transition which leads to greatest spectral change
cannam@54 230 for (int beat = 0; beat < timesig; ++beat) {
cannam@56 231 int count = 0;
cannam@56 232 for (int example = beat - 1; example < m_beatsd.size(); example += timesig) {
cannam@56 233 if (example < 0) continue;
cannam@56 234 dbcand[beat] += (m_beatsd[example]) / timesig;
cannam@56 235 ++count;
cannam@54 236 }
cannam@56 237 if (count > 0) m_beatsd[beat] /= count;
cannam@57 238 // std::cerr << "dbcand[" << beat << "] = " << dbcand[beat] << std::endl;
cannam@54 239 }
cannam@54 240
cannam@55 241
cannam@54 242 // first downbeat is beat at index of maximum value of dbcand
cannam@54 243 int dbind = MathUtilities::getMax(dbcand);
cannam@54 244
cannam@54 245 // remaining downbeats are at timesig intervals from the first
cannam@54 246 for (int i = dbind; i < beats.size(); i += timesig) {
cannam@54 247 downbeats.push_back(i);
cannam@54 248 }
cannam@54 249 }
cannam@54 250
cannam@54 251 double
cannam@54 252 DownBeat::measureSpecDiff(d_vec_t oldspec, d_vec_t newspec)
cannam@54 253 {
cannam@54 254 // JENSEN-SHANNON DIVERGENCE BETWEEN SPECTRAL FRAMES
cannam@54 255
cannam@70 256 unsigned int SPECSIZE = 512; // ONLY LOOK AT FIRST 512 SAMPLES OF SPECTRUM.
cannam@54 257 if (SPECSIZE > oldspec.size()/4) {
cannam@54 258 SPECSIZE = oldspec.size()/4;
cannam@54 259 }
cannam@54 260 double SD = 0.;
cannam@54 261 double sd1 = 0.;
cannam@54 262
cannam@54 263 double sumnew = 0.;
cannam@54 264 double sumold = 0.;
cannam@54 265
cannam@70 266 for (unsigned int i = 0;i < SPECSIZE;i++)
cannam@54 267 {
cannam@54 268 newspec[i] +=EPS;
cannam@54 269 oldspec[i] +=EPS;
cannam@54 270
cannam@54 271 sumnew+=newspec[i];
cannam@54 272 sumold+=oldspec[i];
cannam@54 273 }
cannam@54 274
cannam@70 275 for (unsigned int i = 0;i < SPECSIZE;i++)
cannam@54 276 {
cannam@54 277 newspec[i] /= (sumnew);
cannam@54 278 oldspec[i] /= (sumold);
cannam@54 279
cannam@54 280 // IF ANY SPECTRAL VALUES ARE 0 (SHOULDN'T BE ANY!) SET THEM TO 1
cannam@54 281 if (newspec[i] == 0)
cannam@54 282 {
cannam@54 283 newspec[i] = 1.;
cannam@54 284 }
cannam@54 285
cannam@54 286 if (oldspec[i] == 0)
cannam@54 287 {
cannam@54 288 oldspec[i] = 1.;
cannam@54 289 }
cannam@54 290
cannam@54 291 // JENSEN-SHANNON CALCULATION
cannam@54 292 sd1 = 0.5*oldspec[i] + 0.5*newspec[i];
cannam@54 293 SD = SD + (-sd1*log(sd1)) + (0.5*(oldspec[i]*log(oldspec[i]))) + (0.5*(newspec[i]*log(newspec[i])));
cannam@54 294 }
cannam@54 295
cannam@54 296 return SD;
cannam@54 297 }
cannam@54 298
cannam@56 299 void
cannam@56 300 DownBeat::getBeatSD(vector<double> &beatsd) const
cannam@56 301 {
cannam@56 302 for (int i = 0; i < m_beatsd.size(); ++i) beatsd.push_back(m_beatsd[i]);
cannam@56 303 }
cannam@56 304