qm-dsp: dsp/tempotracking/DownBeat.cpp annotate

annotate dsp/tempotracking/DownBeat.cpp @ 298:255e431ae3d4

* Key detector: when returning key strengths, use the peak value of the three underlying chromagram correlations (from 36-bin chromagram) corresponding to each key, instead of the mean. Rationale: This is the same method as used when returning the key value, and it's nice to have the same results in both returned value and plot. The peak performed better than the sum with a simple test set of triads, so it seems reasonable to change the plot to match the key output rather than the other way around. * FFT: kiss_fftr returns only the non-conjugate bins, synthesise the rest rather than leaving them (perhaps dangerously) undefined. Fixes an uninitialised data error in chromagram that could cause garbage results from key detector. * Constant Q: remove precalculated values again, I reckon they're not proving such a good tradeoff.

author	Chris Cannam <c.cannam@qmul.ac.uk>
date	Fri, 05 Jun 2009 15:12:39 +0000
parents	1c9258dd155e
children	4fada56adbb8

rev	line source
c@279	1 /* -- c-basic-offset: 4 indent-tabs-mode: nil -- vi:set ts=8 sts=4 sw=4: */
c@279	2
c@279	3 /*
c@279	4 QM DSP Library
c@279	5
c@279	6 Centre for Digital Music, Queen Mary, University of London.
c@279	7 This file copyright 2008-2009 Matthew Davies and QMUL.
c@279	8 All rights reserved.
c@279	9 */
c@279	10
c@279	11 #include "DownBeat.h"
c@279	12
c@279	13 #include "maths/MathAliases.h"
c@279	14 #include "maths/MathUtilities.h"
c@280	15 #include "maths/KLDivergence.h"
c@279	16 #include "dsp/transforms/FFT.h"
c@279	17
c@279	18 #include <iostream>
c@279	19 #include <cstdlib>
c@279	20
c@279	21 DownBeat::DownBeat(float originalSampleRate,
c@279	22 size_t decimationFactor,
c@279	23 size_t dfIncrement) :
c@280	24 m_bpb(0),
c@279	25 m_rate(originalSampleRate),
c@279	26 m_factor(decimationFactor),
c@279	27 m_increment(dfIncrement),
c@279	28 m_decimator1(0),
c@279	29 m_decimator2(0),
c@279	30 m_buffer(0),
c@283	31 m_decbuf(0),
c@279	32 m_bufsiz(0),
c@279	33 m_buffill(0),
c@279	34 m_beatframesize(0),
c@279	35 m_beatframe(0)
c@279	36 {
c@279	37 // beat frame size is next power of two up from 1.3 seconds at the
c@279	38 // downsampled rate (happens to produce 4096 for 44100 or 48000 at
c@279	39 // 16x decimation, which is our expected normal situation)
c@280	40 m_beatframesize = MathUtilities::nextPowerOfTwo
c@280	41 (int((m_rate / decimationFactor) * 1.3));
c@282	42 // std::cerr << "rate = " << m_rate << ", bfs = " << m_beatframesize << std::endl;
c@279	43 m_beatframe = new double[m_beatframesize];
c@279	44 m_fftRealOut = new double[m_beatframesize];
c@279	45 m_fftImagOut = new double[m_beatframesize];
c@289	46 m_fft = new FFTReal(m_beatframesize);
c@279	47 }
c@279	48
c@279	49 DownBeat::~DownBeat()
c@279	50 {
c@279	51 delete m_decimator1;
c@279	52 delete m_decimator2;
c@279	53 if (m_buffer) free(m_buffer);
c@279	54 delete[] m_decbuf;
c@279	55 delete[] m_beatframe;
c@279	56 delete[] m_fftRealOut;
c@279	57 delete[] m_fftImagOut;
c@289	58 delete m_fft;
c@279	59 }
c@279	60
c@279	61 void
c@280	62 DownBeat::setBeatsPerBar(int bpb)
c@280	63 {
c@280	64 m_bpb = bpb;
c@280	65 }
c@280	66
c@280	67 void
c@279	68 DownBeat::makeDecimators()
c@279	69 {
c@283	70 // std::cerr << "m_factor = " << m_factor << std::endl;
c@279	71 if (m_factor < 2) return;
c@279	72 int highest = Decimator::getHighestSupportedFactor();
c@279	73 if (m_factor <= highest) {
c@279	74 m_decimator1 = new Decimator(m_increment, m_factor);
c@282	75 // std::cerr << "DownBeat: decimator 1 factor " << m_factor << ", size " << m_increment << std::endl;
c@279	76 return;
c@279	77 }
c@279	78 m_decimator1 = new Decimator(m_increment, highest);
c@282	79 // std::cerr << "DownBeat: decimator 1 factor " << highest << ", size " << m_increment << std::endl;
c@279	80 m_decimator2 = new Decimator(m_increment / highest, m_factor / highest);
c@282	81 // std::cerr << "DownBeat: decimator 2 factor " << m_factor / highest << ", size " << m_increment / highest << std::endl;
c@280	82 m_decbuf = new float[m_increment / highest];
c@279	83 }
c@279	84
c@279	85 void
c@280	86 DownBeat::pushAudioBlock(const float *audio)
c@279	87 {
c@279	88 if (m_buffill + (m_increment / m_factor) > m_bufsiz) {
c@279	89 if (m_bufsiz == 0) m_bufsiz = m_increment * 16;
c@279	90 else m_bufsiz = m_bufsiz * 2;
c@279	91 if (!m_buffer) {
c@280	92 m_buffer = (float )malloc(m_bufsiz sizeof(float));
c@279	93 } else {
c@282	94 // std::cerr << "DownBeat::pushAudioBlock: realloc m_buffer to " << m_bufsiz << std::endl;
c@280	95 m_buffer = (float )realloc(m_buffer, m_bufsiz sizeof(float));
c@279	96 }
c@279	97 }
c@283	98 if (!m_decimator1 && m_factor > 1) makeDecimators();
c@283	99 // float rmsin = 0, rmsout = 0;
c@283	100 // for (int i = 0; i < m_increment; ++i) {
c@283	101 // rmsin += audio[i] * audio[i];
c@283	102 // }
c@279	103 if (m_decimator2) {
c@279	104 m_decimator1->process(audio, m_decbuf);
c@279	105 m_decimator2->process(m_decbuf, m_buffer + m_buffill);
c@283	106 } else if (m_decimator1) {
c@283	107 m_decimator1->process(audio, m_buffer + m_buffill);
c@279	108 } else {
c@283	109 // just copy across (m_factor is presumably 1)
c@283	110 for (int i = 0; i < m_increment; ++i) {
c@283	111 (m_buffer + m_buffill)[i] = audio[i];
c@283	112 }
c@279	113 }
c@283	114 // for (int i = 0; i < m_increment / m_factor; ++i) {
c@283	115 // rmsout += m_buffer[m_buffill + i] * m_buffer[m_buffill + i];
c@283	116 // }
c@282	117 // std::cerr << "pushAudioBlock: rms in " << sqrt(rmsin) << ", out " << sqrt(rmsout) << std::endl;
c@279	118 m_buffill += m_increment / m_factor;
c@279	119 }
c@279	120
c@280	121 const float *
c@279	122 DownBeat::getBufferedAudio(size_t &length) const
c@279	123 {
c@279	124 length = m_buffill;
c@279	125 return m_buffer;
c@279	126 }
c@279	127
c@279	128 void
c@280	129 DownBeat::resetAudioBuffer()
c@280	130 {
c@280	131 if (m_buffer) free(m_buffer);
c@283	132 m_buffer = 0;
c@280	133 m_buffill = 0;
c@280	134 m_bufsiz = 0;
c@280	135 }
c@280	136
c@280	137 void
c@280	138 DownBeat::findDownBeats(const float *audio,
c@279	139 size_t audioLength,
c@279	140 const d_vec_t &beats,
c@279	141 i_vec_t &downbeats)
c@279	142 {
c@279	143 // FIND DOWNBEATS BY PARTITIONING THE INPUT AUDIO FILE INTO BEAT SEGMENTS
c@279	144 // WHERE THE AUDIO FRAMES ARE DOWNSAMPLED BY A FACTOR OF 16 (fs ~= 2700Hz)
c@279	145 // THEN TAKING THE JENSEN-SHANNON DIVERGENCE BETWEEN BEAT SYNCHRONOUS SPECTRAL FRAMES
c@279	146
c@279	147 // IMPLEMENTATION (MOSTLY) FOLLOWS:
c@279	148 // DAVIES AND PLUMBLEY "A SPECTRAL DIFFERENCE APPROACH TO EXTRACTING DOWNBEATS IN MUSICAL AUDIO"
c@279	149 // EUSIPCO 2006, FLORENCE, ITALY
c@279	150
c@279	151 d_vec_t newspec(m_beatframesize / 2); // magnitude spectrum of current beat
c@279	152 d_vec_t oldspec(m_beatframesize / 2); // magnitude spectrum of previous beat
c@281	153
c@281	154 m_beatsd.clear();
c@279	155
c@279	156 if (audioLength == 0) return;
c@279	157
c@279	158 for (size_t i = 0; i + 1 < beats.size(); ++i) {
c@279	159
c@279	160 // Copy the extents of the current beat from downsampled array
c@279	161 // into beat frame buffer
c@279	162
c@279	163 size_t beatstart = (beats[i] * m_increment) / m_factor;
c@280	164 size_t beatend = (beats[i+1] * m_increment) / m_factor;
c@279	165 if (beatend >= audioLength) beatend = audioLength - 1;
c@279	166 if (beatend < beatstart) beatend = beatstart;
c@279	167 size_t beatlen = beatend - beatstart;
c@279	168
c@279	169 // Also apply a Hanning window to the beat frame buffer, sized
c@279	170 // to the beat extents rather than the frame size. (Because
c@279	171 // the size varies, it's easier to do this by hand than use
c@279	172 // our Window abstraction.)
c@279	173
c@283	174 // std::cerr << "beatlen = " << beatlen << std::endl;
c@283	175
c@283	176 // float rms = 0;
c@283	177 for (size_t j = 0; j < beatlen && j < m_beatframesize; ++j) {
c@279	178 double mul = 0.5 * (1.0 - cos(TWO_PI * (double(j) / double(beatlen))));
c@279	179 m_beatframe[j] = audio[beatstart + j] * mul;
c@283	180 // rms += m_beatframe[j] * m_beatframe[j];
c@279	181 }
c@283	182 // rms = sqrt(rms);
c@282	183 // std::cerr << "beat " << i << ": audio rms " << rms << std::endl;
c@279	184
c@279	185 for (size_t j = beatlen; j < m_beatframesize; ++j) {
c@279	186 m_beatframe[j] = 0.0;
c@279	187 }
c@279	188
c@279	189 // Now FFT beat frame
c@279	190
c@289	191 m_fft->process(false, m_beatframe, m_fftRealOut, m_fftImagOut);
c@279	192
c@279	193 // Calculate magnitudes
c@279	194
c@279	195 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279	196 newspec[j] = sqrt(m_fftRealOut[j] * m_fftRealOut[j] +
c@279	197 m_fftImagOut[j] * m_fftImagOut[j]);
c@279	198 }
c@279	199
c@279	200 // Preserve peaks by applying adaptive threshold
c@279	201
c@279	202 MathUtilities::adaptiveThreshold(newspec);
c@279	203
c@279	204 // Calculate JS divergence between new and old spectral frames
c@279	205
c@281	206 if (i > 0) { // otherwise we have no previous frame
c@281	207 m_beatsd.push_back(measureSpecDiff(oldspec, newspec));
c@282	208 // std::cerr << "specdiff: " << m_beatsd[m_beatsd.size()-1] << std::endl;
c@281	209 }
c@279	210
c@279	211 // Copy newspec across to old
c@279	212
c@279	213 for (size_t j = 0; j < m_beatframesize/2; ++j) {
c@279	214 oldspec[j] = newspec[j];
c@279	215 }
c@279	216 }
c@279	217
c@279	218 // We now have all spectral difference measures in specdiff
c@279	219
c@295	220 unsigned int timesig = m_bpb;
c@280	221 if (timesig == 0) timesig = 4;
c@280	222
c@279	223 d_vec_t dbcand(timesig); // downbeat candidates
c@279	224
c@280	225 for (int beat = 0; beat < timesig; ++beat) {
c@280	226 dbcand[beat] = 0;
c@280	227 }
c@280	228
c@279	229 // look for beat transition which leads to greatest spectral change
c@279	230 for (int beat = 0; beat < timesig; ++beat) {
c@281	231 int count = 0;
c@281	232 for (int example = beat - 1; example < m_beatsd.size(); example += timesig) {
c@281	233 if (example < 0) continue;
c@281	234 dbcand[beat] += (m_beatsd[example]) / timesig;
c@281	235 ++count;
c@279	236 }
c@281	237 if (count > 0) m_beatsd[beat] /= count;
c@282	238 // std::cerr << "dbcand[" << beat << "] = " << dbcand[beat] << std::endl;
c@279	239 }
c@279	240
c@280	241
c@279	242 // first downbeat is beat at index of maximum value of dbcand
c@279	243 int dbind = MathUtilities::getMax(dbcand);
c@279	244
c@279	245 // remaining downbeats are at timesig intervals from the first
c@279	246 for (int i = dbind; i < beats.size(); i += timesig) {
c@279	247 downbeats.push_back(i);
c@279	248 }
c@279	249 }
c@279	250
c@279	251 double
c@279	252 DownBeat::measureSpecDiff(d_vec_t oldspec, d_vec_t newspec)
c@279	253 {
c@279	254 // JENSEN-SHANNON DIVERGENCE BETWEEN SPECTRAL FRAMES
c@279	255
c@295	256 unsigned int SPECSIZE = 512; // ONLY LOOK AT FIRST 512 SAMPLES OF SPECTRUM.
c@279	257 if (SPECSIZE > oldspec.size()/4) {
c@279	258 SPECSIZE = oldspec.size()/4;
c@279	259 }
c@279	260 double SD = 0.;
c@279	261 double sd1 = 0.;
c@279	262
c@279	263 double sumnew = 0.;
c@279	264 double sumold = 0.;
c@279	265
c@295	266 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279	267 {
c@279	268 newspec[i] +=EPS;
c@279	269 oldspec[i] +=EPS;
c@279	270
c@279	271 sumnew+=newspec[i];
c@279	272 sumold+=oldspec[i];
c@279	273 }
c@279	274
c@295	275 for (unsigned int i = 0;i < SPECSIZE;i++)
c@279	276 {
c@279	277 newspec[i] /= (sumnew);
c@279	278 oldspec[i] /= (sumold);
c@279	279
c@279	280 // IF ANY SPECTRAL VALUES ARE 0 (SHOULDN'T BE ANY!) SET THEM TO 1
c@279	281 if (newspec[i] == 0)
c@279	282 {
c@279	283 newspec[i] = 1.;
c@279	284 }
c@279	285
c@279	286 if (oldspec[i] == 0)
c@279	287 {
c@279	288 oldspec[i] = 1.;
c@279	289 }
c@279	290
c@279	291 // JENSEN-SHANNON CALCULATION
c@279	292 sd1 = 0.5oldspec[i] + 0.5newspec[i];
c@279	293 SD = SD + (-sd1log(sd1)) + (0.5(oldspec[i]log(oldspec[i]))) + (0.5(newspec[i]*log(newspec[i])));
c@279	294 }
c@279	295
c@279	296 return SD;
c@279	297 }
c@279	298
c@281	299 void
c@281	300 DownBeat::getBeatSD(vector<double> &beatsd) const
c@281	301 {
c@281	302 for (int i = 0; i < m_beatsd.size(); ++i) beatsd.push_back(m_beatsd[i]);
c@281	303 }
c@281	304

Mercurial > hg > qm-dsp

annotate dsp/tempotracking/DownBeat.cpp @ 298:255e431ae3d4