FixedTempoEstimator.cpp
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
2 
3 /*
4  Vamp
5 
6  An API for audio analysis and feature extraction plugins.
7 
8  Centre for Digital Music, Queen Mary, University of London.
9  Copyright 2006-2009 Chris Cannam and QMUL.
10 
11  Permission is hereby granted, free of charge, to any person
12  obtaining a copy of this software and associated documentation
13  files (the "Software"), to deal in the Software without
14  restriction, including without limitation the rights to use, copy,
15  modify, merge, publish, distribute, sublicense, and/or sell copies
16  of the Software, and to permit persons to whom the Software is
17  furnished to do so, subject to the following conditions:
18 
19  The above copyright notice and this permission notice shall be
20  included in all copies or substantial portions of the Software.
21 
22  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
26  ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
27  CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
28  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 
30  Except as contained in this notice, the names of the Centre for
31  Digital Music; Queen Mary, University of London; and Chris Cannam
32  shall not be used in advertising or otherwise to promote the sale,
33  use or other dealings in this Software without prior written
34  authorization.
35 */
36 
37 #include "FixedTempoEstimator.h"
38 
39 using std::string;
40 using std::vector;
41 using std::cerr;
42 using std::endl;
43 
44 using Vamp::RealTime;
45 
46 #include <cmath>
47 #include <cstdio>
48 
49 
51 // this class just avoids us having to declare any data members in the header
52 {
53 public:
54  D(float inputSampleRate);
55  ~D();
56 
57  size_t getPreferredStepSize() const { return 64; }
58  size_t getPreferredBlockSize() const { return 256; }
59 
61  float getParameter(string id) const;
62  void setParameter(string id, float value);
63 
65 
66  bool initialise(size_t channels, size_t stepSize, size_t blockSize);
67  void reset();
68  FeatureSet process(const float *const *, RealTime);
70 
71 private:
72  void calculate();
74 
75  float lag2tempo(int);
76  int tempo2lag(float);
77 
79  size_t m_stepSize;
80  size_t m_blockSize;
81 
82  float m_minbpm;
83  float m_maxbpm;
84  float m_maxdflen;
85 
87 
88  size_t m_dfsize;
89  float *m_df;
90  float *m_r;
91  float *m_fr;
92  float *m_t;
93  size_t m_n;
94 
97 };
98 
99 FixedTempoEstimator::D::D(float inputSampleRate) :
100  m_inputSampleRate(inputSampleRate),
101  m_stepSize(0),
102  m_blockSize(0),
103  m_minbpm(50),
104  m_maxbpm(190),
105  m_maxdflen(10),
107  m_df(0),
108  m_r(0),
109  m_fr(0),
110  m_t(0),
111  m_n(0)
112 {
113 }
114 
116 {
117  delete[] m_priorMagnitudes;
118  delete[] m_df;
119  delete[] m_r;
120  delete[] m_fr;
121  delete[] m_t;
122 }
123 
126 {
127  ParameterList list;
128 
130  d.identifier = "minbpm";
131  d.name = "Minimum estimated tempo";
132  d.description = "Minimum beat-per-minute value which the tempo estimator is able to return";
133  d.unit = "bpm";
134  d.minValue = 10;
135  d.maxValue = 360;
136  d.defaultValue = 50;
137  d.isQuantized = false;
138  list.push_back(d);
139 
140  d.identifier = "maxbpm";
141  d.name = "Maximum estimated tempo";
142  d.description = "Maximum beat-per-minute value which the tempo estimator is able to return";
143  d.defaultValue = 190;
144  list.push_back(d);
145 
146  d.identifier = "maxdflen";
147  d.name = "Input duration to study";
148  d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo. There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio. The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results.";
149  d.unit = "s";
150  d.minValue = 2;
151  d.maxValue = 40;
152  d.defaultValue = 10;
153  list.push_back(d);
154 
155  return list;
156 }
157 
158 float
160 {
161  if (id == "minbpm") {
162  return m_minbpm;
163  } else if (id == "maxbpm") {
164  return m_maxbpm;
165  } else if (id == "maxdflen") {
166  return m_maxdflen;
167  }
168  return 0.f;
169 }
170 
171 void
173 {
174  if (id == "minbpm") {
175  m_minbpm = value;
176  } else if (id == "maxbpm") {
177  m_maxbpm = value;
178  } else if (id == "maxdflen") {
179  m_maxdflen = value;
180  }
181 }
182 
183 static int TempoOutput = 0;
184 static int CandidatesOutput = 1;
185 static int DFOutput = 2;
186 static int ACFOutput = 3;
187 static int FilteredACFOutput = 4;
188 
191 {
192  OutputList list;
193 
195  d.identifier = "tempo";
196  d.name = "Tempo";
197  d.description = "Estimated tempo";
198  d.unit = "bpm";
199  d.hasFixedBinCount = true;
200  d.binCount = 1;
201  d.hasKnownExtents = false;
202  d.isQuantized = false;
205  d.hasDuration = true; // our returned tempo spans a certain range
206  list.push_back(d);
207 
208  d.identifier = "candidates";
209  d.name = "Tempo candidates";
210  d.description = "Possible tempo estimates, one per bin with the most likely in the first bin";
211  d.unit = "bpm";
212  d.hasFixedBinCount = false;
213  list.push_back(d);
214 
215  d.identifier = "detectionfunction";
216  d.name = "Detection Function";
217  d.description = "Onset detection function";
218  d.unit = "";
219  d.hasFixedBinCount = 1;
220  d.binCount = 1;
221  d.hasKnownExtents = true;
222  d.minValue = 0.0;
223  d.maxValue = 1.0;
224  d.isQuantized = false;
225  d.quantizeStep = 0.0;
227  if (m_stepSize) {
229  } else {
231  }
232  d.hasDuration = false;
233  list.push_back(d);
234 
235  d.identifier = "acf";
236  d.name = "Autocorrelation Function";
237  d.description = "Autocorrelation of onset detection function";
238  d.hasKnownExtents = false;
239  d.unit = "r";
240  list.push_back(d);
241 
242  d.identifier = "filtered_acf";
243  d.name = "Filtered Autocorrelation";
244  d.description = "Filtered autocorrelation of onset detection function";
245  d.unit = "r";
246  list.push_back(d);
247 
248  return list;
249 }
250 
251 bool
252 FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize)
253 {
254  m_stepSize = stepSize;
255  m_blockSize = blockSize;
256 
257  float dfLengthSecs = m_maxdflen;
258  m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize;
259 
260  m_priorMagnitudes = new float[m_blockSize/2];
261  m_df = new float[m_dfsize];
262 
263  for (size_t i = 0; i < m_blockSize/2; ++i) {
264  m_priorMagnitudes[i] = 0.f;
265  }
266  for (size_t i = 0; i < m_dfsize; ++i) {
267  m_df[i] = 0.f;
268  }
269 
270  m_n = 0;
271 
272  return true;
273 }
274 
275 void
277 {
278  if (!m_priorMagnitudes) return;
279 
280  for (size_t i = 0; i < m_blockSize/2; ++i) {
281  m_priorMagnitudes[i] = 0.f;
282  }
283  for (size_t i = 0; i < m_dfsize; ++i) {
284  m_df[i] = 0.f;
285  }
286 
287  delete[] m_r;
288  m_r = 0;
289 
290  delete[] m_fr;
291  m_fr = 0;
292 
293  delete[] m_t;
294  m_t = 0;
295 
296  m_n = 0;
297 
298  m_start = RealTime::zeroTime;
299  m_lasttime = RealTime::zeroTime;
300 }
301 
303 FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts)
304 {
305  FeatureSet fs;
306 
307  if (m_stepSize == 0) {
308  cerr << "ERROR: FixedTempoEstimator::process: "
309  << "FixedTempoEstimator has not been initialised"
310  << endl;
311  return fs;
312  }
313 
314  if (m_n == 0) m_start = ts;
315  m_lasttime = ts;
316 
317  if (m_n == m_dfsize) {
318  // If we have seen enough input, do the estimation and return
319  calculate();
320  fs = assembleFeatures();
321  ++m_n;
322  return fs;
323  }
324 
325  // If we have seen more than enough, just discard and return!
326  if (m_n > m_dfsize) return FeatureSet();
327 
328  float value = 0.f;
329 
330  // m_df will contain an onset detection function based on the rise
331  // in overall power from one spectral frame to the next --
332  // simplistic but reasonably effective for our purposes.
333 
334  for (size_t i = 1; i < m_blockSize/2; ++i) {
335 
336  float real = inputBuffers[0][i*2];
337  float imag = inputBuffers[0][i*2 + 1];
338 
339  float sqrmag = real * real + imag * imag;
340  value += fabsf(sqrmag - m_priorMagnitudes[i]);
341 
342  m_priorMagnitudes[i] = sqrmag;
343  }
344 
345  m_df[m_n] = value;
346 
347  ++m_n;
348  return fs;
349 }
350 
353 {
354  FeatureSet fs;
355  if (m_n > m_dfsize) return fs;
356  calculate();
357  fs = assembleFeatures();
358  ++m_n;
359  return fs;
360 }
361 
362 float
364 {
365  return 60.f / ((lag * m_stepSize) / m_inputSampleRate);
366 }
367 
368 int
370 {
371  return ((60.f / tempo) * m_inputSampleRate) / m_stepSize;
372 }
373 
374 void
376 {
377  if (m_r) {
378  cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl;
379  return;
380  }
381 
382  if (m_n < m_dfsize / 9 &&
383  m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second
384  cerr << "FixedTempoEstimator::calculate: Input is too short" << endl;
385  return;
386  }
387 
388  // This function takes m_df (the detection function array filled
389  // out in process()) and calculates m_r (the raw autocorrelation)
390  // and m_fr (the filtered autocorrelation from whose peaks tempo
391  // estimates will be taken).
392 
393  int n = m_n; // length of actual df array (m_dfsize is the theoretical max)
394 
395  m_r = new float[n/2]; // raw autocorrelation
396  m_fr = new float[n/2]; // filtered autocorrelation
397  m_t = new float[n/2]; // averaged tempo estimate for each lag value
398 
399  for (int i = 0; i < n/2; ++i) {
400  m_r[i] = 0.f;
401  m_fr[i] = 0.f;
402  m_t[i] = lag2tempo(i);
403  }
404 
405  // Calculate the raw autocorrelation of the detection function
406 
407  for (int i = 0; i < n/2; ++i) {
408 
409  for (int j = i; j < n; ++j) {
410  m_r[i] += m_df[j] * m_df[j - i];
411  }
412 
413  m_r[i] /= n - i - 1;
414  }
415 
416  // Filter the autocorrelation and average out the tempo estimates
417 
418  float related[] = { 0.5, 2, 4, 8 };
419 
420  for (int i = 1; i < n/2-1; ++i) {
421 
422  m_fr[i] = m_r[i];
423 
424  int div = 1;
425 
426  for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) {
427 
428  // Check for an obvious peak at each metrically related lag
429 
430  int k0 = int(i * related[j] + 0.5);
431 
432  if (k0 >= 0 && k0 < int(n/2)) {
433 
434  int kmax = 0;
435  float kvmax = 0, kvmin = 0;
436  bool have = false;
437 
438  for (int k = k0 - 1; k <= k0 + 1; ++k) {
439 
440  if (k < 0 || k >= n/2) continue;
441 
442  if (!have || (m_r[k] > kvmax)) { kvmax = m_r[k]; kmax = k; }
443  if (!have || (m_r[k] < kvmin)) { kvmin = m_r[k]; }
444 
445  have = true;
446  }
447 
448  // Boost the original lag according to the strongest
449  // value found close to this related lag
450 
451  m_fr[i] += m_r[kmax] / 5;
452 
453  if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) &&
454  (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) &&
455  kvmax > kvmin * 1.05) {
456 
457  // The strongest value close to the related lag is
458  // also a pretty good looking peak, so use it to
459  // improve our tempo estimate for the original lag
460 
461  m_t[i] = m_t[i] + lag2tempo(kmax) * related[j];
462  ++div;
463  }
464  }
465  }
466 
467  m_t[i] /= div;
468 
469  // Finally apply a primitive perceptual weighting (to prefer
470  // tempi of around 120-130)
471 
472  float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
473  if (weight < 0.f) weight = 0.f;
474  weight = weight * weight * weight;
475 
476  m_fr[i] += m_fr[i] * (weight / 3);
477  }
478 }
479 
482 {
483  FeatureSet fs;
484  if (!m_r) return fs; // No autocorrelation: no results
485 
486  Feature feature;
487  feature.hasTimestamp = true;
488  feature.hasDuration = false;
489  feature.label = "";
490  feature.values.clear();
491  feature.values.push_back(0.f);
492 
493  char buffer[40];
494 
495  int n = m_n;
496 
497  for (int i = 0; i < n; ++i) {
498 
499  // Return the detection function in the DF output
500 
501  feature.timestamp = m_start +
502  RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
503  feature.values[0] = m_df[i];
504  feature.label = "";
505  fs[DFOutput].push_back(feature);
506  }
507 
508  for (int i = 1; i < n/2; ++i) {
509 
510  // Return the raw autocorrelation in the ACF output, each
511  // value labelled according to its corresponding tempo
512 
513  feature.timestamp = m_start +
514  RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
515  feature.values[0] = m_r[i];
516  sprintf(buffer, "%.1f bpm", lag2tempo(i));
517  if (i == n/2-1) feature.label = "";
518  else feature.label = buffer;
519  fs[ACFOutput].push_back(feature);
520  }
521 
522  float t0 = m_minbpm; // our minimum detected tempo
523  float t1 = m_maxbpm; // our maximum detected tempo
524 
525  int p0 = tempo2lag(t1);
526  int p1 = tempo2lag(t0);
527 
528  std::map<float, int> candidates;
529 
530  for (int i = p0; i <= p1 && i+1 < n/2; ++i) {
531 
532  if (m_fr[i] > m_fr[i-1] &&
533  m_fr[i] > m_fr[i+1]) {
534 
535  // This is a peak in the filtered autocorrelation: stick
536  // it into the map from filtered autocorrelation to lag
537  // index -- this sorts our peaks by filtered acf value
538 
539  candidates[m_fr[i]] = i;
540  }
541 
542  // Also return the filtered autocorrelation in its own output
543 
544  feature.timestamp = m_start +
545  RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
546  feature.values[0] = m_fr[i];
547  sprintf(buffer, "%.1f bpm", lag2tempo(i));
548  if (i == p1 || i == n/2-2) feature.label = "";
549  else feature.label = buffer;
550  fs[FilteredACFOutput].push_back(feature);
551  }
552 
553  if (candidates.empty()) {
554  cerr << "No tempo candidates!" << endl;
555  return fs;
556  }
557 
558  feature.hasTimestamp = true;
559  feature.timestamp = m_start;
560 
561  feature.hasDuration = true;
562  feature.duration = m_lasttime - m_start;
563 
564  // The map contains only peaks and is sorted by filtered acf
565  // value, so the final element in it is our "best" tempo guess
566 
567  std::map<float, int>::const_iterator ci = candidates.end();
568  --ci;
569  int maxpi = ci->second;
570 
571  if (m_t[maxpi] > 0) {
572 
573  // This lag has an adjusted tempo from the averaging process:
574  // use it
575 
576  feature.values[0] = m_t[maxpi];
577 
578  } else {
579 
580  // shouldn't happen -- it would imply that this high value was
581  // not a peak!
582 
583  feature.values[0] = lag2tempo(maxpi);
584  cerr << "WARNING: No stored tempo for index " << maxpi << endl;
585  }
586 
587  sprintf(buffer, "%.1f bpm", feature.values[0]);
588  feature.label = buffer;
589 
590  // Return the best tempo in the main output
591 
592  fs[TempoOutput].push_back(feature);
593 
594  // And return the other estimates (up to the arbitrarily chosen
595  // number of 10 of them) in the candidates output
596 
597  feature.values.clear();
598  feature.label = "";
599 
600  while (feature.values.size() < 10) {
601  if (m_t[ci->second] > 0) {
602  feature.values.push_back(m_t[ci->second]);
603  } else {
604  feature.values.push_back(lag2tempo(ci->second));
605  }
606  if (ci == candidates.begin()) break;
607  --ci;
608  }
609 
610  fs[CandidatesOutput].push_back(feature);
611 
612  return fs;
613 }
614 
615 
616 
618  Plugin(inputSampleRate),
619  m_d(new D(inputSampleRate))
620 {
621 }
622 
624 {
625  delete m_d;
626 }
627 
628 string
630 {
631  return "fixedtempo";
632 }
633 
634 string
636 {
637  return "Simple Fixed Tempo Estimator";
638 }
639 
640 string
642 {
643  return "Study a short section of audio and estimate its tempo, assuming the tempo is constant";
644 }
645 
646 string
648 {
649  return "Vamp SDK Example Plugins";
650 }
651 
652 int
654 {
655  return 1;
656 }
657 
658 string
660 {
661  return "Code copyright 2008 Queen Mary, University of London. Freely redistributable (BSD license)";
662 }
663 
664 size_t
666 {
667  return m_d->getPreferredStepSize();
668 }
669 
670 size_t
672 {
673  return m_d->getPreferredBlockSize();
674 }
675 
676 bool
677 FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize)
678 {
679  if (channels < getMinChannelCount() ||
680  channels > getMaxChannelCount()) return false;
681 
682  return m_d->initialise(channels, stepSize, blockSize);
683 }
684 
685 void
687 {
688  return m_d->reset();
689 }
690 
693 {
694  return m_d->getParameterDescriptors();
695 }
696 
697 float
699 {
700  return m_d->getParameter(id);
701 }
702 
703 void
704 FixedTempoEstimator::setParameter(std::string id, float value)
705 {
706  m_d->setParameter(id, value);
707 }
708 
711 {
712  return m_d->getOutputDescriptors();
713 }
714 
716 FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts)
717 {
718  return m_d->process(inputBuffers, ts);
719 }
720 
723 {
724  return m_d->getRemainingFeatures();
725 }
std::vector< OutputDescriptor > OutputList
bool hasDuration
True if the returned results for this output are known to have a duration field.
ParameterList getParameterDescriptors() const
static int ACFOutput
std::string label
Label for the sample of this feature.
FeatureSet process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Process a single block of input data.
float sampleRate
Sample rate of the output results, as samples per second.
size_t getPreferredStepSize() const
bool hasFixedBinCount
True if the output has the same number of values per sample for every output sample.
std::vector< float > values
Results for a single sample of this feature.
Results are evenly spaced in time (sampleRate specified below)
std::map< int, FeatureList > FeatureSet
float quantizeStep
Quantization resolution of the output values (e.g.
RealTime timestamp
Timestamp of the output feature.
std::string getName() const
Get a human-readable name or title of the plugin.
std::string identifier
The name of the parameter, in computer-usable form.
std::string description
A human-readable short text describing the output.
Plugin(float inputSampleRate)
void reset()
Reset the plugin after use, to prepare it for another clean run.
std::string getDescription() const
Get a human-readable description for the plugin, typically a line of text that may optionally be disp...
std::string identifier
The name of the output, in computer-usable form.
float minValue
Minimum value of the results in the output.
float getParameter(std::string id) const
Get the value of a named parameter.
OutputList getOutputDescriptors() const
std::string getIdentifier() const
Get the computer-usable name of the plugin.
std::string name
The human-readable name of the parameter.
float maxValue
Maximum value of the results in the output.
static int DFOutput
size_t getPreferredBlockSize() const
Get the preferred block size (window size – the number of sample frames passed in each block to the ...
RealTime duration
Duration of the output feature.
float minValue
The minimum value of the parameter.
std::string getMaker() const
Get the name of the author or vendor of the plugin in human-readable form.
float getParameter(string id) const
std::string unit
The unit of the parameter, in human-readable form.
std::string unit
The unit of the output, in human-readable form.
std::string name
The human-readable name of the output.
RealTime represents time values to nanosecond precision with accurate arithmetic and frame-rate conve...
bool hasTimestamp
True if an output feature has its own timestamp.
FeatureSet getRemainingFeatures()
After all blocks have been processed, calculate and return any remaining features derived from the co...
void setParameter(std::string id, float value)
Set a named parameter.
std::string description
A human-readable short text describing the parameter.
float maxValue
The maximum value of the parameter.
static int CandidatesOutput
ParameterList getParameterDescriptors() const
Get the controllable parameters of this plugin.
FeatureSet process(const float *const *, RealTime)
bool hasDuration
True if an output feature has a specified duration.
std::string getCopyright() const
Get the copyright statement or licensing summary for the plugin.
size_t getPreferredBlockSize() const
virtual size_t getMaxChannelCount() const
Get the maximum supported number of input channels.
static int FilteredACFOutput
size_t binCount
The number of values per result of the output.
void setParameter(string id, float value)
size_t getPreferredStepSize() const
Get the preferred step size (window increment – the distance in sample frames between the start fram...
bool initialise(size_t channels, size_t stepSize, size_t blockSize)
Initialise a plugin to prepare it for use with the given number of input channels, step size (window increment, in sample frames) and block size (window size, in sample frames).
bool initialise(size_t channels, size_t stepSize, size_t blockSize)
bool isQuantized
True if the output values are quantized to a particular resolution.
static int TempoOutput
float defaultValue
The default value of the parameter.
OutputList getOutputDescriptors() const
Get the outputs of this plugin.
virtual size_t getMinChannelCount() const
Get the minimum supported number of input channels.
int getPluginVersion() const
Get the version number of the plugin.
bool isQuantized
True if the parameter values are quantized to a particular resolution.
FixedTempoEstimator(float inputSampleRate)
SampleType sampleType
Positioning in time of the output results.
D(float inputSampleRate)
Results are unevenly spaced and have individual timestamps.
bool hasKnownExtents
True if the results in each output bin fall within a fixed numeric range (minimum and maximum values)...
std::vector< ParameterDescriptor > ParameterList