comparison examples/FixedTempoEstimator.cpp @ 255:88ef5ffdbe8d

* docs
author cannam
date Wed, 12 Nov 2008 14:11:01 +0000
parents 5bfed156b45d
children 23352e424631
comparison
equal deleted inserted replaced
254:e02c93c4de8f 255:88ef5ffdbe8d
45 45
46 #include <cmath> 46 #include <cmath>
47 47
48 48
49 class FixedTempoEstimator::D 49 class FixedTempoEstimator::D
50 // this class just avoids us having to declare any data members in the header
50 { 51 {
51 public: 52 public:
52 D(float inputSampleRate); 53 D(float inputSampleRate);
53 ~D(); 54 ~D();
54 55
312 313
313 if (m_n == 0) m_start = ts; 314 if (m_n == 0) m_start = ts;
314 m_lasttime = ts; 315 m_lasttime = ts;
315 316
316 if (m_n == m_dfsize) { 317 if (m_n == m_dfsize) {
318 // If we have seen enough input, do the estimation and return
317 calculate(); 319 calculate();
318 fs = assembleFeatures(); 320 fs = assembleFeatures();
319 ++m_n; 321 ++m_n;
320 return fs; 322 return fs;
321 } 323 }
322 324
325 // If we have seen more than enough, just discard and return!
323 if (m_n > m_dfsize) return FeatureSet(); 326 if (m_n > m_dfsize) return FeatureSet();
324 327
325 float value = 0.f; 328 float value = 0.f;
329
330 // m_df will contain an onset detection function based on the rise
331 // in overall power from one spectral frame to the next --
332 // simplistic but reasonably effective for our purposes.
326 333
327 for (size_t i = 1; i < m_blockSize/2; ++i) { 334 for (size_t i = 1; i < m_blockSize/2; ++i) {
328 335
329 float real = inputBuffers[0][i*2]; 336 float real = inputBuffers[0][i*2];
330 float imag = inputBuffers[0][i*2 + 1]; 337 float imag = inputBuffers[0][i*2 + 1];
376 m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second 383 m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second
377 cerr << "FixedTempoEstimator::calculate: Input is too short" << endl; 384 cerr << "FixedTempoEstimator::calculate: Input is too short" << endl;
378 return; 385 return;
379 } 386 }
380 387
381 int n = m_n; 388 // This function takes m_df (the detection function array filled
382 389 // out in process()) and calculates m_r (the raw autocorrelation)
383 m_r = new float[n/2]; 390 // and m_fr (the filtered autocorrelation from whose peaks tempo
384 m_fr = new float[n/2]; 391 // estimates will be taken).
385 m_t = new float[n/2]; 392
393 int n = m_n; // length of actual df array (m_dfsize is the theoretical max)
394
395 m_r = new float[n/2]; // raw autocorrelation
396 m_fr = new float[n/2]; // filtered autocorrelation
397 m_t = new float[n/2]; // averaged tempo estimate for each lag value
386 398
387 for (int i = 0; i < n/2; ++i) { 399 for (int i = 0; i < n/2; ++i) {
388 m_r[i] = 0.f; 400 m_r[i] = 0.f;
389 m_fr[i] = 0.f; 401 m_fr[i] = 0.f;
390 m_t[i] = lag2tempo(i); 402 m_t[i] = lag2tempo(i);
391 } 403 }
404
405 // Calculate the raw autocorrelation of the detection function
392 406
393 for (int i = 0; i < n/2; ++i) { 407 for (int i = 0; i < n/2; ++i) {
394 408
395 for (int j = i; j < n-1; ++j) { 409 for (int j = i; j < n-1; ++j) {
396 m_r[i] += m_df[j] * m_df[j - i]; 410 m_r[i] += m_df[j] * m_df[j - i];
397 } 411 }
398 412
399 m_r[i] /= n - i - 1; 413 m_r[i] /= n - i - 1;
400 } 414 }
401 415
416 // Filter the autocorrelation and average out the tempo estimates
417
402 float related[] = { 0.5, 2, 4, 8 }; 418 float related[] = { 0.5, 2, 4, 8 };
403 419
404 for (int i = 1; i < n/2-1; ++i) { 420 for (int i = 1; i < n/2-1; ++i) {
405 421
406 float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
407 if (weight < 0.f) weight = 0.f;
408 weight = weight * weight * weight;
409
410 m_fr[i] = m_r[i]; 422 m_fr[i] = m_r[i];
411 423
412 int div = 1; 424 int div = 1;
413 425
414 for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) { 426 for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) {
427
428 // Check for an obvious peak at each metrically related lag
415 429
416 int k0 = int(i * related[j] + 0.5); 430 int k0 = int(i * related[j] + 0.5);
417 431
418 if (k0 >= 0 && k0 < int(n/2)) { 432 if (k0 >= 0 && k0 < int(n/2)) {
419 433
429 if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; } 443 if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; }
430 444
431 have = true; 445 have = true;
432 } 446 }
433 447
448 // Boost the original lag according to the strongest
449 // value found close to this related lag
450
434 m_fr[i] += m_r[kmax] / 5; 451 m_fr[i] += m_r[kmax] / 5;
435 452
436 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) && 453 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) &&
437 (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) && 454 (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) &&
438 kvmax > kvmin * 1.05) { 455 kvmax > kvmin * 1.05) {
456
457 // The strongest value close to the related lag is
458 // also a pretty good looking peak, so use it to
459 // improve our tempo estimate for the original lag
439 460
440 m_t[i] = m_t[i] + lag2tempo(kmax) * related[j]; 461 m_t[i] = m_t[i] + lag2tempo(kmax) * related[j];
441 ++div; 462 ++div;
442 } 463 }
443 } 464 }
444 } 465 }
445 466
446 m_t[i] /= div; 467 m_t[i] /= div;
447 468
469 // Finally apply a primitive perceptual weighting (to prefer
470 // tempi of around 120-130)
471
472 float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
473 if (weight < 0.f) weight = 0.f;
474 weight = weight * weight * weight;
475
448 m_fr[i] += m_fr[i] * (weight / 3); 476 m_fr[i] += m_fr[i] * (weight / 3);
449 } 477 }
450 } 478 }
451 479
452 FixedTempoEstimator::FeatureSet 480 FixedTempoEstimator::FeatureSet
453 FixedTempoEstimator::D::assembleFeatures() 481 FixedTempoEstimator::D::assembleFeatures()
454 { 482 {
455 FeatureSet fs; 483 FeatureSet fs;
456 if (!m_r) return fs; // No results 484 if (!m_r) return fs; // No autocorrelation: no results
457 485
458 Feature feature; 486 Feature feature;
459 feature.hasTimestamp = true; 487 feature.hasTimestamp = true;
460 feature.hasDuration = false; 488 feature.hasDuration = false;
461 feature.label = ""; 489 feature.label = "";
465 char buffer[40]; 493 char buffer[40];
466 494
467 int n = m_n; 495 int n = m_n;
468 496
469 for (int i = 0; i < n; ++i) { 497 for (int i = 0; i < n; ++i) {
498
499 // Return the detection function in the DF output
500
470 feature.timestamp = m_start + 501 feature.timestamp = m_start +
471 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 502 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
472 feature.values[0] = m_df[i]; 503 feature.values[0] = m_df[i];
473 feature.label = ""; 504 feature.label = "";
474 fs[DFOutput].push_back(feature); 505 fs[DFOutput].push_back(feature);
475 } 506 }
476 507
477 for (int i = 1; i < n/2; ++i) { 508 for (int i = 1; i < n/2; ++i) {
509
510 // Return the raw autocorrelation in the ACF output, each
511 // value labelled according to its corresponding tempo
512
478 feature.timestamp = m_start + 513 feature.timestamp = m_start +
479 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 514 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
480 feature.values[0] = m_r[i]; 515 feature.values[0] = m_r[i];
481 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 516 sprintf(buffer, "%.1f bpm", lag2tempo(i));
482 if (i == n/2-1) feature.label = ""; 517 if (i == n/2-1) feature.label = "";
494 529
495 for (int i = p0; i <= p1 && i < n/2-1; ++i) { 530 for (int i = p0; i <= p1 && i < n/2-1; ++i) {
496 531
497 if (m_fr[i] > m_fr[i-1] && 532 if (m_fr[i] > m_fr[i-1] &&
498 m_fr[i] > m_fr[i+1]) { 533 m_fr[i] > m_fr[i+1]) {
534
535 // This is a peak in the filtered autocorrelation: stick
536 // it into the map from filtered autocorrelation to lag
537 // index -- this sorts our peaks by filtered acf value
538
499 candidates[m_fr[i]] = i; 539 candidates[m_fr[i]] = i;
500 } 540 }
541
542 // Also return the filtered autocorrelation in its own output
501 543
502 feature.timestamp = m_start + 544 feature.timestamp = m_start +
503 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 545 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
504 feature.values[0] = m_fr[i]; 546 feature.values[0] = m_fr[i];
505 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 547 sprintf(buffer, "%.1f bpm", lag2tempo(i));
517 feature.timestamp = m_start; 559 feature.timestamp = m_start;
518 560
519 feature.hasDuration = true; 561 feature.hasDuration = true;
520 feature.duration = m_lasttime - m_start; 562 feature.duration = m_lasttime - m_start;
521 563
564 // The map contains only peaks and is sorted by filtered acf
565 // value, so the final element in it is our "best" tempo guess
566
522 std::map<float, int>::const_iterator ci = candidates.end(); 567 std::map<float, int>::const_iterator ci = candidates.end();
523 --ci; 568 --ci;
524 int maxpi = ci->second; 569 int maxpi = ci->second;
525 570
526 if (m_t[maxpi] > 0) { 571 if (m_t[maxpi] > 0) {
527 cerr << "*** Using adjusted tempo " << m_t[maxpi] << " instead of lag tempo " << lag2tempo(maxpi) << endl; 572
573 // This lag has an adjusted tempo from the averaging process:
574 // use it
575
528 feature.values[0] = m_t[maxpi]; 576 feature.values[0] = m_t[maxpi];
577
529 } else { 578 } else {
530 // shouldn't happen -- it would imply that this high value was not a peak! 579
580 // shouldn't happen -- it would imply that this high value was
581 // not a peak!
582
531 feature.values[0] = lag2tempo(maxpi); 583 feature.values[0] = lag2tempo(maxpi);
532 cerr << "WARNING: No stored tempo for index " << maxpi << endl; 584 cerr << "WARNING: No stored tempo for index " << maxpi << endl;
533 } 585 }
534 586
535 sprintf(buffer, "%.1f bpm", feature.values[0]); 587 sprintf(buffer, "%.1f bpm", feature.values[0]);
536 feature.label = buffer; 588 feature.label = buffer;
537 589
590 // Return the best tempo in the main output
591
538 fs[TempoOutput].push_back(feature); 592 fs[TempoOutput].push_back(feature);
593
594 // And return the other estimates (up to the arbitrarily chosen
595 // number of 10 of them) in the candidates output
539 596
540 feature.values.clear(); 597 feature.values.clear();
541 feature.label = ""; 598 feature.label = "";
542 599
543 while (feature.values.size() < 8) { 600 while (feature.values.size() < 10) {
544 if (m_t[ci->second] > 0) { 601 if (m_t[ci->second] > 0) {
545 feature.values.push_back(m_t[ci->second]); 602 feature.values.push_back(m_t[ci->second]);
546 } else { 603 } else {
547 feature.values.push_back(lag2tempo(ci->second)); 604 feature.values.push_back(lag2tempo(ci->second));
548 } 605 }