comparison Chordino.cpp @ 35:cf8898a0174c matthiasm-plugin

* Split out NNLSChroma plugin into three plugins (chroma, chordino, tuning) with a common base class. There's still quite a lot of duplication between the getRemainingFeatures functions. Also add copyright / copying headers, etc.
author Chris Cannam
date Fri, 22 Oct 2010 11:30:21 +0100
parents NNLSChroma.cpp@da3195577172
children 7409ab74c63b
comparison
equal deleted inserted replaced
34:8edcf48f4031 35:cf8898a0174c
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
2
3 /*
4 NNLS-Chroma / Chordino
5
6 Audio feature extraction plugins for chromagram and chord
7 estimation.
8
9 Centre for Digital Music, Queen Mary University of London.
10 This file copyright 2008-2010 Matthias Mauch and QMUL.
11
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of the
15 License, or (at your option) any later version. See the file
16 COPYING included with this distribution for more information.
17 */
18
19 #include "Chordino.h"
20
21 #include "chromamethods.h"
22
23 #include <cstdlib>
24 #include <fstream>
25 #include <cmath>
26
27 #include <algorithm>
28
29 const bool debug_on = false;
30
31 const vector<float> hw(hammingwind, hammingwind+19);
32
33 Chordino::Chordino(float inputSampleRate) :
34 NNLSBase(inputSampleRate)
35 {
36 if (debug_on) cerr << "--> Chordino" << endl;
37 }
38
39 Chordino::~Chordino()
40 {
41 if (debug_on) cerr << "--> ~Chordino" << endl;
42 }
43
44 string
45 Chordino::getIdentifier() const
46 {
47 if (debug_on) cerr << "--> getIdentifier" << endl;
48 return "chordino";
49 }
50
51 string
52 Chordino::getName() const
53 {
54 if (debug_on) cerr << "--> getName" << endl;
55 return "Chordino";
56 }
57
58 string
59 Chordino::getDescription() const
60 {
61 if (debug_on) cerr << "--> getDescription" << endl;
62 return "This plugin provides a number of features derived from a log-frequency amplitude spectrum of the DFT: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; based on this semitone spectrum, chroma features and a simple chord estimate.";
63 }
64
65 Chordino::OutputList
66 Chordino::getOutputDescriptors() const
67 {
68 if (debug_on) cerr << "--> getOutputDescriptors" << endl;
69 OutputList list;
70
71 int index = 0;
72
73 OutputDescriptor d7;
74 d7.identifier = "simplechord";
75 d7.name = "Simple Chord Estimate";
76 d7.description = "A simple chord estimate based on the inner product of chord templates with the smoothed chroma.";
77 d7.unit = "";
78 d7.hasFixedBinCount = true;
79 d7.binCount = 0;
80 d7.hasKnownExtents = false;
81 d7.isQuantized = false;
82 d7.sampleType = OutputDescriptor::VariableSampleRate;
83 d7.hasDuration = false;
84 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
85 list.push_back(d7);
86 m_outputChords = index++;
87
88 OutputDescriptor d8;
89 d8.identifier = "harmonicchange";
90 d8.name = "Harmonic change value";
91 d8.description = "Harmonic change.";
92 d8.unit = "";
93 d8.hasFixedBinCount = true;
94 d8.binCount = 1;
95 d8.hasKnownExtents = true;
96 d8.minValue = 0.0;
97 d8.maxValue = 0.999;
98 d8.isQuantized = false;
99 d8.sampleType = OutputDescriptor::FixedSampleRate;
100 d8.hasDuration = false;
101 // d8.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
102 list.push_back(d8);
103 m_outputHarmonicChange = index++;
104
105 return list;
106 }
107
108 bool
109 Chordino::initialise(size_t channels, size_t stepSize, size_t blockSize)
110 {
111 if (debug_on) {
112 cerr << "--> initialise";
113 }
114
115 if (!NNLSBase::initialise(channels, stepSize, blockSize)) {
116 return false;
117 }
118
119 return true;
120 }
121
122 void
123 Chordino::reset()
124 {
125 if (debug_on) cerr << "--> reset";
126 NNLSBase::reset();
127 }
128
129 Chordino::FeatureSet
130 Chordino::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
131 {
132 if (debug_on) cerr << "--> process" << endl;
133
134 NNLSBase::baseProcess(inputBuffers, timestamp);
135
136 return FeatureSet();
137 }
138
139 Chordino::FeatureSet
140 Chordino::getRemainingFeatures()
141 {
142 if (debug_on) cerr << "--> getRemainingFeatures" << endl;
143 FeatureSet fsOut;
144 if (m_logSpectrum.size() == 0) return fsOut;
145 int nChord = m_chordnames.size();
146 //
147 /** Calculate Tuning
148 calculate tuning from (using the angle of the complex number defined by the
149 cumulative mean real and imag values)
150 **/
151 float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2;
152 float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2;
153 float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
154 float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
155 int intShift = floor(normalisedtuning * 3);
156 float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this
157
158 char buffer0 [50];
159
160 sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
161
162
163 /** Tune Log-Frequency Spectrogram
164 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to
165 perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
166 **/
167 cerr << endl << "[Chordino Plugin] Tuning Log-Frequency Spectrogram ... ";
168
169 float tempValue = 0;
170 float dbThreshold = 0; // relative to the background spectrum
171 float thresh = pow(10,dbThreshold/20);
172 // cerr << "tune local ? " << m_tuneLocal << endl;
173 int count = 0;
174
175 FeatureList tunedSpec;
176
177 for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
178 Feature f1 = *i;
179 Feature f2; // tuned log-frequency spectrum
180 f2.hasTimestamp = true;
181 f2.timestamp = f1.timestamp;
182 f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero
183
184 if (m_tuneLocal) {
185 intShift = floor(m_localTuning[count] * 3);
186 intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this
187 }
188
189 // cerr << intShift << " " << intFactor << endl;
190
191 for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
192 tempValue = f1.values[k + intShift] * (1-intFactor) + f1.values[k+intShift+1] * intFactor;
193 f2.values.push_back(tempValue);
194 }
195
196 f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
197 vector<float> runningmean = SpecialConvolution(f2.values,hw);
198 vector<float> runningstd;
199 for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance)
200 runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
201 }
202 runningstd = SpecialConvolution(runningstd,hw); // second step convolve
203 for (int i = 0; i < 256; i++) {
204 runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
205 if (runningstd[i] > 0) {
206 // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ?
207 // (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
208 f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
209 (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
210 }
211 if (f2.values[i] < 0) {
212 cerr << "ERROR: negative value in logfreq spectrum" << endl;
213 }
214 }
215 tunedSpec.push_back(f2);
216 count++;
217 }
218 cerr << "done." << endl;
219
220 /** Semitone spectrum and chromagrams
221 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
222 is inferred using a non-negative least squares algorithm.
223 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
224 bass and treble stacked onto each other).
225 **/
226 if (m_dictID == 1) {
227 cerr << "[Chordino Plugin] Mapping to semitone spectrum and chroma ... ";
228 } else {
229 cerr << "[Chordino Plugin] Performing NNLS and mapping to chroma ... ";
230 }
231
232
233 vector<vector<float> > chordogram;
234 vector<vector<int> > scoreChordogram;
235 vector<float> chordchange = vector<float>(tunedSpec.size(),0);
236 count = 0;
237
238 FeatureList chromaList;
239
240 for (FeatureList::iterator it = tunedSpec.begin(); it != tunedSpec.end(); ++it) {
241 Feature f2 = *it; // logfreq spectrum
242 Feature f6; // treble and bass chromagram
243
244 f6.hasTimestamp = true;
245 f6.timestamp = f2.timestamp;
246
247 float b[256];
248
249 bool some_b_greater_zero = false;
250 float sumb = 0;
251 for (int i = 0; i < 256; i++) {
252 // b[i] = m_dict[(256 * count + i) % (256 * 84)];
253 b[i] = f2.values[i];
254 sumb += b[i];
255 if (b[i] > 0) {
256 some_b_greater_zero = true;
257 }
258 }
259
260 // here's where the non-negative least squares algorithm calculates the note activation x
261
262 vector<float> chroma = vector<float>(12, 0);
263 vector<float> basschroma = vector<float>(12, 0);
264 float currval;
265 unsigned iSemitone = 0;
266
267 if (some_b_greater_zero) {
268 if (m_dictID == 1) {
269 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
270 currval = 0;
271 currval += b[iNote + 1 + -1] * 0.5;
272 currval += b[iNote + 1 + 0] * 1.0;
273 currval += b[iNote + 1 + 1] * 0.5;
274 chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
275 basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
276 iSemitone++;
277 }
278
279 } else {
280 float x[84+1000];
281 for (int i = 1; i < 1084; ++i) x[i] = 1.0;
282 vector<int> signifIndex;
283 int index=0;
284 sumb /= 84.0;
285 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
286 float currval = 0;
287 currval += b[iNote + 1 + -1];
288 currval += b[iNote + 1 + 0];
289 currval += b[iNote + 1 + 1];
290 if (currval > 0) signifIndex.push_back(index);
291 index++;
292 }
293 float rnorm;
294 float w[84+1000];
295 float zz[84+1000];
296 int indx[84+1000];
297 int mode;
298 int dictsize = 256*signifIndex.size();
299 float *curr_dict = new float[dictsize];
300 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
301 for (unsigned iBin = 0; iBin < 256; iBin++) {
302 curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin];
303 }
304 }
305 nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
306 delete [] curr_dict;
307 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
308 // cerr << mode << endl;
309 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
310 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
311 }
312 }
313 }
314
315 vector<float> origchroma = chroma;
316 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
317 f6.values = chroma;
318
319 if (m_doNormalizeChroma > 0) {
320 vector<float> chromanorm = vector<float>(3,0);
321 switch (int(m_doNormalizeChroma)) {
322 case 0: // should never end up here
323 break;
324 case 1:
325 chromanorm[0] = *max_element(origchroma.begin(), origchroma.end());
326 chromanorm[1] = *max_element(basschroma.begin(), basschroma.end());
327 chromanorm[2] = max(chromanorm[0], chromanorm[1]);
328 break;
329 case 2:
330 for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
331 chromanorm[2] += *it;
332 }
333 break;
334 case 3:
335 for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
336 chromanorm[2] += pow(*it,2);
337 }
338 chromanorm[2] = sqrt(chromanorm[2]);
339 break;
340 }
341 if (chromanorm[2] > 0) {
342 for (int i = 0; i < chroma.size(); i++) {
343 f6.values[i] /= chromanorm[2];
344 }
345 }
346 }
347
348 chromaList.push_back(f6);
349
350 // local chord estimation
351 vector<float> currentChordSalience;
352 float tempchordvalue = 0;
353 float sumchordvalue = 0;
354
355 for (int iChord = 0; iChord < nChord; iChord++) {
356 tempchordvalue = 0;
357 for (int iBin = 0; iBin < 12; iBin++) {
358 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
359 }
360 for (int iBin = 12; iBin < 24; iBin++) {
361 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
362 }
363 sumchordvalue+=tempchordvalue;
364 currentChordSalience.push_back(tempchordvalue);
365 }
366 if (sumchordvalue > 0) {
367 for (int iChord = 0; iChord < nChord; iChord++) {
368 currentChordSalience[iChord] /= sumchordvalue;
369 }
370 } else {
371 currentChordSalience[nChord-1] = 1.0;
372 }
373 chordogram.push_back(currentChordSalience);
374
375 count++;
376 }
377 cerr << "done." << endl;
378
379
380 /* Simple chord estimation
381 I just take the local chord estimates ("currentChordSalience") and average them over time, then
382 take the maximum. Very simple, don't do this at home...
383 */
384 cerr << "[Chordino Plugin] Chord Estimation ... ";
385 count = 0;
386 int halfwindowlength = m_inputSampleRate / m_stepSize;
387 vector<int> chordSequence;
388
389 for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) { // initialise the score chordogram
390 vector<int> temp = vector<int>(nChord,0);
391 scoreChordogram.push_back(temp);
392 }
393
394 for (FeatureList::iterator it = chromaList.begin(); it < chromaList.end()-2*halfwindowlength-1; ++it) {
395 int startIndex = count + 1;
396 int endIndex = count + 2 * halfwindowlength;
397
398 float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);
399
400 vector<int> chordCandidates;
401 for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
402 // float currsum = 0;
403 // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
404 // currsum += chordogram[iFrame][iChord];
405 // }
406 // if (currsum > chordThreshold) chordCandidates.push_back(iChord);
407 for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
408 if (chordogram[iFrame][iChord] > chordThreshold) {
409 chordCandidates.push_back(iChord);
410 break;
411 }
412 }
413 }
414 chordCandidates.push_back(nChord-1);
415 // cerr << chordCandidates.size() << endl;
416
417 float maxval = 0; // will be the value of the most salient *chord change* in this frame
418 float maxindex = 0; //... and the index thereof
419 unsigned bestchordL = nChord-1; // index of the best "left" chord
420 unsigned bestchordR = nChord-1; // index of the best "right" chord
421
422 for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
423 // now find the max values on both sides of iWF
424 // left side:
425 float maxL = 0;
426 unsigned maxindL = nChord-1;
427 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
428 unsigned iChord = chordCandidates[kChord];
429 float currsum = 0;
430 for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
431 currsum += chordogram[count+iFrame][iChord];
432 }
433 if (iChord == nChord-1) currsum *= 0.8;
434 if (currsum > maxL) {
435 maxL = currsum;
436 maxindL = iChord;
437 }
438 }
439 // right side:
440 float maxR = 0;
441 unsigned maxindR = nChord-1;
442 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
443 unsigned iChord = chordCandidates[kChord];
444 float currsum = 0;
445 for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
446 currsum += chordogram[count+iFrame][iChord];
447 }
448 if (iChord == nChord-1) currsum *= 0.8;
449 if (currsum > maxR) {
450 maxR = currsum;
451 maxindR = iChord;
452 }
453 }
454 if (maxL+maxR > maxval) {
455 maxval = maxL+maxR;
456 maxindex = iWF;
457 bestchordL = maxindL;
458 bestchordR = maxindR;
459 }
460
461 }
462 // cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
463 // add a score to every chord-frame-point that was part of a maximum
464 for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
465 scoreChordogram[iFrame+count][bestchordL]++;
466 }
467 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
468 scoreChordogram[iFrame+count][bestchordR]++;
469 }
470 if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
471 count++;
472 }
473 // cerr << "******* agent finished *******" << endl;
474 count = 0;
475 for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) {
476 float maxval = 0; // will be the value of the most salient chord in this frame
477 float maxindex = 0; //... and the index thereof
478 for (unsigned iChord = 0; iChord < nChord; iChord++) {
479 if (scoreChordogram[count][iChord] > maxval) {
480 maxval = scoreChordogram[count][iChord];
481 maxindex = iChord;
482 // cerr << iChord << endl;
483 }
484 }
485 chordSequence.push_back(maxindex);
486 // cerr << "before modefilter, maxindex: " << maxindex << endl;
487 count++;
488 }
489 // cerr << "******* mode filter done *******" << endl;
490
491
492 // mode filter on chordSequence
493 count = 0;
494 string oldChord = "";
495 for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) {
496 Feature f6 = *it;
497 Feature f7; // chord estimate
498 f7.hasTimestamp = true;
499 f7.timestamp = f6.timestamp;
500 Feature f8; // chord estimate
501 f8.hasTimestamp = true;
502 f8.timestamp = f6.timestamp;
503
504 vector<int> chordCount = vector<int>(nChord,0);
505 int maxChordCount = 0;
506 int maxChordIndex = nChord-1;
507 string maxChord;
508 int startIndex = max(count - halfwindowlength/2,0);
509 int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
510 for (int i = startIndex; i < endIndex; i++) {
511 chordCount[chordSequence[i]]++;
512 if (chordCount[chordSequence[i]] > maxChordCount) {
513 // cerr << "start index " << startIndex << endl;
514 maxChordCount++;
515 maxChordIndex = chordSequence[i];
516 maxChord = m_chordnames[maxChordIndex];
517 }
518 }
519 // chordSequence[count] = maxChordIndex;
520 // cerr << maxChordIndex << endl;
521 f8.values.push_back(chordchange[count]/(halfwindowlength*2));
522 // cerr << chordchange[count] << endl;
523 fsOut[m_outputHarmonicChange].push_back(f8);
524 if (oldChord != maxChord) {
525 oldChord = maxChord;
526 f7.label = m_chordnames[maxChordIndex];
527 fsOut[m_outputChords].push_back(f7);
528 }
529 count++;
530 }
531 Feature f7; // last chord estimate
532 f7.hasTimestamp = true;
533 f7.timestamp = chromaList[chromaList.size()-1].timestamp;
534 f7.label = "N";
535 fsOut[m_outputChords].push_back(f7);
536 cerr << "done." << endl;
537
538 return fsOut;
539
540 }
541