comparison NNLSBase.cpp @ 35:cf8898a0174c matthiasm-plugin

* Split out NNLSChroma plugin into three plugins (chroma, chordino, tuning) with a common base class. There's still quite a lot of duplication between the getRemainingFeatures functions. Also add copyright / copying headers, etc.
author Chris Cannam
date Fri, 22 Oct 2010 11:30:21 +0100
parents NNLSChroma.cpp@da3195577172
children d6bb9b43ac1c
comparison
equal deleted inserted replaced
34:8edcf48f4031 35:cf8898a0174c
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
2
3 /*
4 NNLS-Chroma / Chordino
5
6 Audio feature extraction plugins for chromagram and chord
7 estimation.
8
9 Centre for Digital Music, Queen Mary University of London.
10 This file copyright 2008-2010 Matthias Mauch and QMUL.
11
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of the
15 License, or (at your option) any later version. See the file
16 COPYING included with this distribution for more information.
17 */
18
19 #include "NNLSBase.h"
20
21 #include "chromamethods.h"
22
23 #include <cstdlib>
24 #include <fstream>
25 #include <cmath>
26
27 #include <algorithm>
28
29 const bool debug_on = false;
30
31 const vector<float> hw(hammingwind, hammingwind+19);
32
33 NNLSBase::NNLSBase(float inputSampleRate) :
34 Plugin(inputSampleRate),
35 m_logSpectrum(0),
36 m_blockSize(0),
37 m_stepSize(0),
38 m_lengthOfNoteIndex(0),
39 m_meanTuning0(0),
40 m_meanTuning1(0),
41 m_meanTuning2(0),
42 m_localTuning0(0),
43 m_localTuning1(0),
44 m_localTuning2(0),
45 m_paling(1.0),
46 m_preset(0.0),
47 m_localTuning(0),
48 m_kernelValue(0),
49 m_kernelFftIndex(0),
50 m_kernelNoteIndex(0),
51 m_dict(0),
52 m_tuneLocal(false),
53 m_dictID(0),
54 m_chorddict(0),
55 m_chordnames(0),
56 m_doNormalizeChroma(0),
57 m_rollon(0.01)
58 {
59 if (debug_on) cerr << "--> NNLSBase" << endl;
60
61 // make the *note* dictionary matrix
62 m_dict = new float[nNote * 84];
63 for (unsigned i = 0; i < nNote * 84; ++i) m_dict[i] = 0.0;
64 dictionaryMatrix(m_dict);
65
66 // get the *chord* dictionary from file (if the file exists)
67 m_chordnames = chordDictionary(&m_chorddict);
68 }
69
70
71 NNLSBase::~NNLSBase()
72 {
73 if (debug_on) cerr << "--> ~NNLSBase" << endl;
74 delete [] m_dict;
75 }
76
77 string
78 NNLSBase::getMaker() const
79 {
80 if (debug_on) cerr << "--> getMaker" << endl;
81 // Your name here
82 return "Matthias Mauch";
83 }
84
85 int
86 NNLSBase::getPluginVersion() const
87 {
88 if (debug_on) cerr << "--> getPluginVersion" << endl;
89 // Increment this each time you release a version that behaves
90 // differently from the previous one
91 return 1;
92 }
93
94 string
95 NNLSBase::getCopyright() const
96 {
97 if (debug_on) cerr << "--> getCopyright" << endl;
98 // This function is not ideally named. It does not necessarily
99 // need to say who made the plugin -- getMaker does that -- but it
100 // should indicate the terms under which it is distributed. For
101 // example, "Copyright (year). All Rights Reserved", or "GPL"
102 return "GPL";
103 }
104
105 NNLSBase::InputDomain
106 NNLSBase::getInputDomain() const
107 {
108 if (debug_on) cerr << "--> getInputDomain" << endl;
109 return FrequencyDomain;
110 }
111
112 size_t
113 NNLSBase::getPreferredBlockSize() const
114 {
115 if (debug_on) cerr << "--> getPreferredBlockSize" << endl;
116 return 16384; // 0 means "I can handle any block size"
117 }
118
119 size_t
120 NNLSBase::getPreferredStepSize() const
121 {
122 if (debug_on) cerr << "--> getPreferredStepSize" << endl;
123 return 2048; // 0 means "anything sensible"; in practice this
124 // means the same as the block size for TimeDomain
125 // plugins, or half of it for FrequencyDomain plugins
126 }
127
128 size_t
129 NNLSBase::getMinChannelCount() const
130 {
131 if (debug_on) cerr << "--> getMinChannelCount" << endl;
132 return 1;
133 }
134
135 size_t
136 NNLSBase::getMaxChannelCount() const
137 {
138 if (debug_on) cerr << "--> getMaxChannelCount" << endl;
139 return 1;
140 }
141
142 NNLSBase::ParameterList
143 NNLSBase::getParameterDescriptors() const
144 {
145 if (debug_on) cerr << "--> getParameterDescriptors" << endl;
146 ParameterList list;
147
148 ParameterDescriptor d3;
149 d3.identifier = "preset";
150 d3.name = "preset";
151 d3.description = "Spectral paling: no paling - 0; whitening - 1.";
152 d3.unit = "";
153 d3.isQuantized = true;
154 d3.quantizeStep = 1;
155 d3.minValue = 0.0;
156 d3.maxValue = 3.0;
157 d3.defaultValue = 0.0;
158 d3.valueNames.push_back("polyphonic pop");
159 d3.valueNames.push_back("polyphonic pop (fast)");
160 d3.valueNames.push_back("solo keyboard");
161 d3.valueNames.push_back("manual");
162 list.push_back(d3);
163
164 ParameterDescriptor d5;
165 d5.identifier = "rollon";
166 d5.name = "spectral roll-on";
167 d5.description = "The bins below the spectral roll-on quantile will be set to 0.";
168 d5.unit = "";
169 d5.minValue = 0;
170 d5.maxValue = 1;
171 d5.defaultValue = 0;
172 d5.isQuantized = false;
173 list.push_back(d5);
174
175 // ParameterDescriptor d0;
176 // d0.identifier = "notedict";
177 // d0.name = "note dictionary";
178 // d0.description = "Notes in different note dictionaries differ by their spectral shapes.";
179 // d0.unit = "";
180 // d0.minValue = 0;
181 // d0.maxValue = 1;
182 // d0.defaultValue = 0;
183 // d0.isQuantized = true;
184 // d0.valueNames.push_back("s = 0.6");
185 // d0.valueNames.push_back("no NNLS");
186 // d0.quantizeStep = 1.0;
187 // list.push_back(d0);
188
189 ParameterDescriptor d1;
190 d1.identifier = "tuningmode";
191 d1.name = "tuning mode";
192 d1.description = "Tuning can be performed locally or on the whole extraction segment. Local tuning is only advisable when the tuning is likely to change over the audio, for example in podcasts, or in a cappella singing.";
193 d1.unit = "";
194 d1.minValue = 0;
195 d1.maxValue = 1;
196 d1.defaultValue = 0;
197 d1.isQuantized = true;
198 d1.valueNames.push_back("global tuning");
199 d1.valueNames.push_back("local tuning");
200 d1.quantizeStep = 1.0;
201 list.push_back(d1);
202
203 // ParameterDescriptor d2;
204 // d2.identifier = "paling";
205 // d2.name = "spectral paling";
206 // d2.description = "Spectral paling: no paling - 0; whitening - 1.";
207 // d2.unit = "";
208 // d2.isQuantized = true;
209 // // d2.quantizeStep = 0.1;
210 // d2.minValue = 0.0;
211 // d2.maxValue = 1.0;
212 // d2.defaultValue = 1.0;
213 // d2.isQuantized = false;
214 // list.push_back(d2);
215 ParameterDescriptor d4;
216 d4.identifier = "chromanormalize";
217 d4.name = "chroma normalization";
218 d4.description = "How shall the chroma vector be normalized?";
219 d4.unit = "";
220 d4.minValue = 0;
221 d4.maxValue = 3;
222 d4.defaultValue = 0;
223 d4.isQuantized = true;
224 d4.valueNames.push_back("none");
225 d4.valueNames.push_back("maximum norm");
226 d4.valueNames.push_back("L1 norm");
227 d4.valueNames.push_back("L2 norm");
228 d4.quantizeStep = 1.0;
229 list.push_back(d4);
230
231 return list;
232 }
233
234 float
235 NNLSBase::getParameter(string identifier) const
236 {
237 if (debug_on) cerr << "--> getParameter" << endl;
238 if (identifier == "notedict") {
239 return m_dictID;
240 }
241
242 if (identifier == "paling") {
243 return m_paling;
244 }
245
246 if (identifier == "rollon") {
247 return m_rollon;
248 }
249
250 if (identifier == "tuningmode") {
251 if (m_tuneLocal) {
252 return 1.0;
253 } else {
254 return 0.0;
255 }
256 }
257 if (identifier == "preset") {
258 return m_preset;
259 }
260 if (identifier == "chromanormalize") {
261 return m_doNormalizeChroma;
262 }
263 return 0;
264
265 }
266
267 void
268 NNLSBase::setParameter(string identifier, float value)
269 {
270 if (debug_on) cerr << "--> setParameter" << endl;
271 if (identifier == "notedict") {
272 m_dictID = (int) value;
273 }
274
275 if (identifier == "paling") {
276 m_paling = value;
277 }
278
279 if (identifier == "tuningmode") {
280 m_tuneLocal = (value > 0) ? true : false;
281 // cerr << "m_tuneLocal :" << m_tuneLocal << endl;
282 }
283 if (identifier == "preset") {
284 m_preset = value;
285 if (m_preset == 0.0) {
286 m_tuneLocal = false;
287 m_paling = 1.0;
288 m_dictID = 0.0;
289 }
290 if (m_preset == 1.0) {
291 m_tuneLocal = false;
292 m_paling = 1.0;
293 m_dictID = 1.0;
294 }
295 if (m_preset == 2.0) {
296 m_tuneLocal = false;
297 m_paling = 0.7;
298 m_dictID = 0.0;
299 }
300 }
301 if (identifier == "chromanormalize") {
302 m_doNormalizeChroma = value;
303 }
304
305 if (identifier == "rollon") {
306 m_rollon = value;
307 }
308 }
309
310 NNLSBase::ProgramList
311 NNLSBase::getPrograms() const
312 {
313 if (debug_on) cerr << "--> getPrograms" << endl;
314 ProgramList list;
315
316 // If you have no programs, return an empty list (or simply don't
317 // implement this function or getCurrentProgram/selectProgram)
318
319 return list;
320 }
321
322 string
323 NNLSBase::getCurrentProgram() const
324 {
325 if (debug_on) cerr << "--> getCurrentProgram" << endl;
326 return ""; // no programs
327 }
328
329 void
330 NNLSBase::selectProgram(string name)
331 {
332 if (debug_on) cerr << "--> selectProgram" << endl;
333 }
334
335
336 bool
337 NNLSBase::initialise(size_t channels, size_t stepSize, size_t blockSize)
338 {
339 if (debug_on) {
340 cerr << "--> initialise";
341 }
342
343 if (channels < getMinChannelCount() ||
344 channels > getMaxChannelCount()) return false;
345 m_blockSize = blockSize;
346 m_stepSize = stepSize;
347 m_frameCount = 0;
348 int tempn = 256 * m_blockSize/2;
349 // cerr << "length of tempkernel : " << tempn << endl;
350 float *tempkernel;
351
352 tempkernel = new float[tempn];
353
354 logFreqMatrix(m_inputSampleRate, m_blockSize, tempkernel);
355 m_kernelValue.clear();
356 m_kernelFftIndex.clear();
357 m_kernelNoteIndex.clear();
358 int countNonzero = 0;
359 for (unsigned iNote = 0; iNote < nNote; ++iNote) { // I don't know if this is wise: manually making a sparse matrix
360 for (unsigned iFFT = 0; iFFT < blockSize/2; ++iFFT) {
361 if (tempkernel[iFFT + blockSize/2 * iNote] > 0) {
362 m_kernelValue.push_back(tempkernel[iFFT + blockSize/2 * iNote]);
363 if (tempkernel[iFFT + blockSize/2 * iNote] > 0) {
364 countNonzero++;
365 }
366 m_kernelFftIndex.push_back(iFFT);
367 m_kernelNoteIndex.push_back(iNote);
368 }
369 }
370 }
371 // cerr << "nonzero count : " << countNonzero << endl;
372 delete [] tempkernel;
373 /*
374 ofstream myfile;
375 myfile.open ("matrix.txt");
376 // myfile << "Writing this to a file.\n";
377 for (int i = 0; i < nNote * 84; ++i) {
378 myfile << m_dict[i] << endl;
379 }
380 myfile.close();
381 */
382 return true;
383 }
384
385 void
386 NNLSBase::reset()
387 {
388 if (debug_on) cerr << "--> reset";
389
390 // Clear buffers, reset stored values, etc
391 m_frameCount = 0;
392 m_dictID = 0;
393 m_logSpectrum.clear();
394 m_meanTuning0 = 0;
395 m_meanTuning1 = 0;
396 m_meanTuning2 = 0;
397 m_localTuning0 = 0;
398 m_localTuning1 = 0;
399 m_localTuning2 = 0;
400 m_localTuning.clear();
401 }
402
403 void
404 NNLSBase::baseProcess(const float *const *inputBuffers, Vamp::RealTime timestamp)
405 {
406 m_frameCount++;
407 float *magnitude = new float[m_blockSize/2];
408
409 const float *fbuf = inputBuffers[0];
410 float energysum = 0;
411 // make magnitude
412 float maxmag = -10000;
413 for (size_t iBin = 0; iBin < m_blockSize/2; iBin++) {
414 magnitude[iBin] = sqrt(fbuf[2 * iBin] * fbuf[2 * iBin] +
415 fbuf[2 * iBin + 1] * fbuf[2 * iBin + 1]);
416 if (maxmag < magnitude[iBin]) maxmag = magnitude[iBin];
417 if (m_rollon > 0) {
418 energysum += pow(magnitude[iBin],2);
419 }
420 }
421
422 float cumenergy = 0;
423 if (m_rollon > 0) {
424 for (size_t iBin = 2; iBin < m_blockSize/2; iBin++) {
425 cumenergy += pow(magnitude[iBin],2);
426 if (cumenergy < energysum * m_rollon) magnitude[iBin-2] = 0;
427 else break;
428 }
429 }
430
431 if (maxmag < 2) {
432 // cerr << "timestamp " << timestamp << ": very low magnitude, setting magnitude to all zeros" << endl;
433 for (size_t iBin = 0; iBin < m_blockSize/2; iBin++) {
434 magnitude[iBin] = 0;
435 }
436 }
437
438 // note magnitude mapping using pre-calculated matrix
439 float *nm = new float[nNote]; // note magnitude
440 for (size_t iNote = 0; iNote < nNote; iNote++) {
441 nm[iNote] = 0; // initialise as 0
442 }
443 int binCount = 0;
444 for (vector<float>::iterator it = m_kernelValue.begin(); it != m_kernelValue.end(); ++it) {
445 // cerr << ".";
446 nm[m_kernelNoteIndex[binCount]] += magnitude[m_kernelFftIndex[binCount]] * m_kernelValue[binCount];
447 // cerr << m_kernelFftIndex[binCount] << " -- " << magnitude[m_kernelFftIndex[binCount]] << " -- "<< m_kernelValue[binCount] << endl;
448 binCount++;
449 }
450 // cerr << nm[20];
451 // cerr << endl;
452
453
454 float one_over_N = 1.0/m_frameCount;
455 // update means of complex tuning variables
456 m_meanTuning0 *= float(m_frameCount-1)*one_over_N;
457 m_meanTuning1 *= float(m_frameCount-1)*one_over_N;
458 m_meanTuning2 *= float(m_frameCount-1)*one_over_N;
459
460 for (int iTone = 0; iTone < 160; iTone = iTone + 3) {
461 m_meanTuning0 += nm[iTone + 0]*one_over_N;
462 m_meanTuning1 += nm[iTone + 1]*one_over_N;
463 m_meanTuning2 += nm[iTone + 2]*one_over_N;
464 float ratioOld = 0.997;
465 m_localTuning0 *= ratioOld; m_localTuning0 += nm[iTone + 0] * (1 - ratioOld);
466 m_localTuning1 *= ratioOld; m_localTuning1 += nm[iTone + 1] * (1 - ratioOld);
467 m_localTuning2 *= ratioOld; m_localTuning2 += nm[iTone + 2] * (1 - ratioOld);
468 }
469
470 // if (m_tuneLocal) {
471 // local tuning
472 float localTuningImag = sinvalue * m_localTuning1 - sinvalue * m_localTuning2;
473 float localTuningReal = m_localTuning0 + cosvalue * m_localTuning1 + cosvalue * m_localTuning2;
474 float normalisedtuning = atan2(localTuningImag, localTuningReal)/(2*M_PI);
475 m_localTuning.push_back(normalisedtuning);
476
477 Feature f1; // logfreqspec
478 f1.hasTimestamp = true;
479 f1.timestamp = timestamp;
480 for (size_t iNote = 0; iNote < nNote; iNote++) {
481 f1.values.push_back(nm[iNote]);
482 }
483
484 // deletes
485 delete[] magnitude;
486 delete[] nm;
487
488 m_logSpectrum.push_back(f1); // remember note magnitude
489 }
490
491
492 #ifdef NOT_DEFINED
493
494 NNLSBase::FeatureSet
495 NNLSBase::getRemainingFeatures()
496 {
497 if (debug_on) cerr << "--> getRemainingFeatures" << endl;
498 FeatureSet fsOut;
499 if (m_logSpectrum.size() == 0) return fsOut;
500 int nChord = m_chordnames.size();
501 //
502 /** Calculate Tuning
503 calculate tuning from (using the angle of the complex number defined by the
504 cumulative mean real and imag values)
505 **/
506 float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2;
507 float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2;
508 float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
509 float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
510 int intShift = floor(normalisedtuning * 3);
511 float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this
512
513 char buffer0 [50];
514
515 sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
516
517 // cerr << "normalisedtuning: " << normalisedtuning << '\n';
518
519 // push tuning to FeatureSet fsOut
520 Feature f0; // tuning
521 f0.hasTimestamp = true;
522 f0.timestamp = Vamp::RealTime::frame2RealTime(0, lrintf(m_inputSampleRate));;
523 f0.label = buffer0;
524 fsOut[0].push_back(f0);
525
526 /** Tune Log-Frequency Spectrogram
527 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to
528 perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
529 **/
530 cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... ";
531
532 float tempValue = 0;
533 float dbThreshold = 0; // relative to the background spectrum
534 float thresh = pow(10,dbThreshold/20);
535 // cerr << "tune local ? " << m_tuneLocal << endl;
536 int count = 0;
537
538 for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
539 Feature f1 = *i;
540 Feature f2; // tuned log-frequency spectrum
541 f2.hasTimestamp = true;
542 f2.timestamp = f1.timestamp;
543 f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero
544
545 if (m_tuneLocal) {
546 intShift = floor(m_localTuning[count] * 3);
547 intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this
548 }
549
550 // cerr << intShift << " " << intFactor << endl;
551
552 for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
553 tempValue = f1.values[k + intShift] * (1-intFactor) + f1.values[k+intShift+1] * intFactor;
554 f2.values.push_back(tempValue);
555 }
556
557 f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
558 vector<float> runningmean = SpecialConvolution(f2.values,hw);
559 vector<float> runningstd;
560 for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance)
561 runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
562 }
563 runningstd = SpecialConvolution(runningstd,hw); // second step convolve
564 for (int i = 0; i < 256; i++) {
565 runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
566 if (runningstd[i] > 0) {
567 // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ?
568 // (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
569 f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
570 (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
571 }
572 if (f2.values[i] < 0) {
573 cerr << "ERROR: negative value in logfreq spectrum" << endl;
574 }
575 }
576 fsOut[2].push_back(f2);
577 count++;
578 }
579 cerr << "done." << endl;
580
581 /** Semitone spectrum and chromagrams
582 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
583 is inferred using a non-negative least squares algorithm.
584 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
585 bass and treble stacked onto each other).
586 **/
587 if (m_dictID == 1) {
588 cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... ";
589 } else {
590 cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... ";
591 }
592
593
594 vector<vector<float> > chordogram;
595 vector<vector<int> > scoreChordogram;
596 vector<float> chordchange = vector<float>(fsOut[2].size(),0);
597 vector<float> oldchroma = vector<float>(12,0);
598 vector<float> oldbasschroma = vector<float>(12,0);
599 count = 0;
600
601 for (FeatureList::iterator it = fsOut[2].begin(); it != fsOut[2].end(); ++it) {
602 Feature f2 = *it; // logfreq spectrum
603 Feature f3; // semitone spectrum
604 Feature f4; // treble chromagram
605 Feature f5; // bass chromagram
606 Feature f6; // treble and bass chromagram
607
608 f3.hasTimestamp = true;
609 f3.timestamp = f2.timestamp;
610
611 f4.hasTimestamp = true;
612 f4.timestamp = f2.timestamp;
613
614 f5.hasTimestamp = true;
615 f5.timestamp = f2.timestamp;
616
617 f6.hasTimestamp = true;
618 f6.timestamp = f2.timestamp;
619
620 float b[256];
621
622 bool some_b_greater_zero = false;
623 float sumb = 0;
624 for (int i = 0; i < 256; i++) {
625 // b[i] = m_dict[(256 * count + i) % (256 * 84)];
626 b[i] = f2.values[i];
627 sumb += b[i];
628 if (b[i] > 0) {
629 some_b_greater_zero = true;
630 }
631 }
632
633 // here's where the non-negative least squares algorithm calculates the note activation x
634
635 vector<float> chroma = vector<float>(12, 0);
636 vector<float> basschroma = vector<float>(12, 0);
637 float currval;
638 unsigned iSemitone = 0;
639
640 if (some_b_greater_zero) {
641 if (m_dictID == 1) {
642 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
643 currval = 0;
644 currval += b[iNote + 1 + -1] * 0.5;
645 currval += b[iNote + 1 + 0] * 1.0;
646 currval += b[iNote + 1 + 1] * 0.5;
647 f3.values.push_back(currval);
648 chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
649 basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
650 iSemitone++;
651 }
652
653 } else {
654 float x[84+1000];
655 for (int i = 1; i < 1084; ++i) x[i] = 1.0;
656 vector<int> signifIndex;
657 int index=0;
658 sumb /= 84.0;
659 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
660 float currval = 0;
661 currval += b[iNote + 1 + -1];
662 currval += b[iNote + 1 + 0];
663 currval += b[iNote + 1 + 1];
664 if (currval > 0) signifIndex.push_back(index);
665 f3.values.push_back(0); // fill the values, change later
666 index++;
667 }
668 float rnorm;
669 float w[84+1000];
670 float zz[84+1000];
671 int indx[84+1000];
672 int mode;
673 int dictsize = 256*signifIndex.size();
674 // cerr << "dictsize is " << dictsize << "and values size" << f3.values.size()<< endl;
675 float *curr_dict = new float[dictsize];
676 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
677 for (unsigned iBin = 0; iBin < 256; iBin++) {
678 curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin];
679 }
680 }
681 nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
682 delete [] curr_dict;
683 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
684 f3.values[signifIndex[iNote]] = x[iNote];
685 // cerr << mode << endl;
686 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
687 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
688 }
689 }
690 }
691
692
693
694
695 f4.values = chroma;
696 f5.values = basschroma;
697 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
698 f6.values = chroma;
699
700 if (m_doNormalizeChroma > 0) {
701 vector<float> chromanorm = vector<float>(3,0);
702 switch (int(m_doNormalizeChroma)) {
703 case 0: // should never end up here
704 break;
705 case 1:
706 chromanorm[0] = *max_element(f4.values.begin(), f4.values.end());
707 chromanorm[1] = *max_element(f5.values.begin(), f5.values.end());
708 chromanorm[2] = max(chromanorm[0], chromanorm[1]);
709 break;
710 case 2:
711 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
712 chromanorm[0] += *it;
713 }
714 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
715 chromanorm[1] += *it;
716 }
717 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
718 chromanorm[2] += *it;
719 }
720 break;
721 case 3:
722 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
723 chromanorm[0] += pow(*it,2);
724 }
725 chromanorm[0] = sqrt(chromanorm[0]);
726 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
727 chromanorm[1] += pow(*it,2);
728 }
729 chromanorm[1] = sqrt(chromanorm[1]);
730 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
731 chromanorm[2] += pow(*it,2);
732 }
733 chromanorm[2] = sqrt(chromanorm[2]);
734 break;
735 }
736 if (chromanorm[0] > 0) {
737 for (int i = 0; i < f4.values.size(); i++) {
738 f4.values[i] /= chromanorm[0];
739 }
740 }
741 if (chromanorm[1] > 0) {
742 for (int i = 0; i < f5.values.size(); i++) {
743 f5.values[i] /= chromanorm[1];
744 }
745 }
746 if (chromanorm[2] > 0) {
747 for (int i = 0; i < f6.values.size(); i++) {
748 f6.values[i] /= chromanorm[2];
749 }
750 }
751
752 }
753
754 // local chord estimation
755 vector<float> currentChordSalience;
756 float tempchordvalue = 0;
757 float sumchordvalue = 0;
758
759 for (int iChord = 0; iChord < nChord; iChord++) {
760 tempchordvalue = 0;
761 for (int iBin = 0; iBin < 12; iBin++) {
762 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
763 }
764 for (int iBin = 12; iBin < 24; iBin++) {
765 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
766 }
767 sumchordvalue+=tempchordvalue;
768 currentChordSalience.push_back(tempchordvalue);
769 }
770 if (sumchordvalue > 0) {
771 for (int iChord = 0; iChord < nChord; iChord++) {
772 currentChordSalience[iChord] /= sumchordvalue;
773 }
774 } else {
775 currentChordSalience[nChord-1] = 1.0;
776 }
777 chordogram.push_back(currentChordSalience);
778
779 fsOut[3].push_back(f3);
780 fsOut[4].push_back(f4);
781 fsOut[5].push_back(f5);
782 fsOut[6].push_back(f6);
783 count++;
784 }
785 cerr << "done." << endl;
786
787
788 /* Simple chord estimation
789 I just take the local chord estimates ("currentChordSalience") and average them over time, then
790 take the maximum. Very simple, don't do this at home...
791 */
792 cerr << "[NNLS Chroma Plugin] Chord Estimation ... ";
793 count = 0;
794 int halfwindowlength = m_inputSampleRate / m_stepSize;
795 vector<int> chordSequence;
796 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { // initialise the score chordogram
797 vector<int> temp = vector<int>(nChord,0);
798 scoreChordogram.push_back(temp);
799 }
800 for (FeatureList::iterator it = fsOut[6].begin(); it < fsOut[6].end()-2*halfwindowlength-1; ++it) {
801 int startIndex = count + 1;
802 int endIndex = count + 2 * halfwindowlength;
803
804 float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);
805
806 vector<int> chordCandidates;
807 for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
808 // float currsum = 0;
809 // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
810 // currsum += chordogram[iFrame][iChord];
811 // }
812 // if (currsum > chordThreshold) chordCandidates.push_back(iChord);
813 for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
814 if (chordogram[iFrame][iChord] > chordThreshold) {
815 chordCandidates.push_back(iChord);
816 break;
817 }
818 }
819 }
820 chordCandidates.push_back(nChord-1);
821 // cerr << chordCandidates.size() << endl;
822
823 float maxval = 0; // will be the value of the most salient *chord change* in this frame
824 float maxindex = 0; //... and the index thereof
825 unsigned bestchordL = nChord-1; // index of the best "left" chord
826 unsigned bestchordR = nChord-1; // index of the best "right" chord
827
828 for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
829 // now find the max values on both sides of iWF
830 // left side:
831 float maxL = 0;
832 unsigned maxindL = nChord-1;
833 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
834 unsigned iChord = chordCandidates[kChord];
835 float currsum = 0;
836 for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
837 currsum += chordogram[count+iFrame][iChord];
838 }
839 if (iChord == nChord-1) currsum *= 0.8;
840 if (currsum > maxL) {
841 maxL = currsum;
842 maxindL = iChord;
843 }
844 }
845 // right side:
846 float maxR = 0;
847 unsigned maxindR = nChord-1;
848 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
849 unsigned iChord = chordCandidates[kChord];
850 float currsum = 0;
851 for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
852 currsum += chordogram[count+iFrame][iChord];
853 }
854 if (iChord == nChord-1) currsum *= 0.8;
855 if (currsum > maxR) {
856 maxR = currsum;
857 maxindR = iChord;
858 }
859 }
860 if (maxL+maxR > maxval) {
861 maxval = maxL+maxR;
862 maxindex = iWF;
863 bestchordL = maxindL;
864 bestchordR = maxindR;
865 }
866
867 }
868 // cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
869 // add a score to every chord-frame-point that was part of a maximum
870 for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
871 scoreChordogram[iFrame+count][bestchordL]++;
872 }
873 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
874 scoreChordogram[iFrame+count][bestchordR]++;
875 }
876 if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
877 count++;
878 }
879 // cerr << "******* agent finished *******" << endl;
880 count = 0;
881 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
882 float maxval = 0; // will be the value of the most salient chord in this frame
883 float maxindex = 0; //... and the index thereof
884 for (unsigned iChord = 0; iChord < nChord; iChord++) {
885 if (scoreChordogram[count][iChord] > maxval) {
886 maxval = scoreChordogram[count][iChord];
887 maxindex = iChord;
888 // cerr << iChord << endl;
889 }
890 }
891 chordSequence.push_back(maxindex);
892 // cerr << "before modefilter, maxindex: " << maxindex << endl;
893 count++;
894 }
895 // cerr << "******* mode filter done *******" << endl;
896
897
898 // mode filter on chordSequence
899 count = 0;
900 string oldChord = "";
901 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
902 Feature f6 = *it;
903 Feature f7; // chord estimate
904 f7.hasTimestamp = true;
905 f7.timestamp = f6.timestamp;
906 Feature f8; // chord estimate
907 f8.hasTimestamp = true;
908 f8.timestamp = f6.timestamp;
909
910 vector<int> chordCount = vector<int>(nChord,0);
911 int maxChordCount = 0;
912 int maxChordIndex = nChord-1;
913 string maxChord;
914 int startIndex = max(count - halfwindowlength/2,0);
915 int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
916 for (int i = startIndex; i < endIndex; i++) {
917 chordCount[chordSequence[i]]++;
918 if (chordCount[chordSequence[i]] > maxChordCount) {
919 // cerr << "start index " << startIndex << endl;
920 maxChordCount++;
921 maxChordIndex = chordSequence[i];
922 maxChord = m_chordnames[maxChordIndex];
923 }
924 }
925 // chordSequence[count] = maxChordIndex;
926 // cerr << maxChordIndex << endl;
927 f8.values.push_back(chordchange[count]/(halfwindowlength*2));
928 // cerr << chordchange[count] << endl;
929 fsOut[9].push_back(f8);
930 if (oldChord != maxChord) {
931 oldChord = maxChord;
932
933 // char buffer1 [50];
934 // if (maxChordIndex < nChord - 1) {
935 // sprintf(buffer1, "%s%s", notenames[maxChordIndex % 12 + 12], chordtypes[maxChordIndex]);
936 // } else {
937 // sprintf(buffer1, "N");
938 // }
939 // f7.label = buffer1;
940 f7.label = m_chordnames[maxChordIndex];
941 fsOut[7].push_back(f7);
942 }
943 count++;
944 }
945 Feature f7; // last chord estimate
946 f7.hasTimestamp = true;
947 f7.timestamp = fsOut[6][fsOut[6].size()-1].timestamp;
948 f7.label = "N";
949 fsOut[7].push_back(f7);
950 cerr << "done." << endl;
951 // // musicity
952 // count = 0;
953 // int oldlabeltype = 0; // start value is 0, music is 1, speech is 2
954 // vector<float> musicityValue;
955 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
956 // Feature f4 = *it;
957 //
958 // int startIndex = max(count - musicitykernelwidth/2,0);
959 // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
960 // float chromasum = 0;
961 // float diffsum = 0;
962 // for (int k = 0; k < 12; k++) {
963 // for (int i = startIndex + 1; i < endIndex; i++) {
964 // chromasum += pow(fsOut[4][i].values[k],2);
965 // diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]);
966 // }
967 // }
968 // diffsum /= chromasum;
969 // musicityValue.push_back(diffsum);
970 // count++;
971 // }
972 //
973 // float musicityThreshold = 0.44;
974 // if (m_stepSize == 4096) {
975 // musicityThreshold = 0.74;
976 // }
977 // if (m_stepSize == 4410) {
978 // musicityThreshold = 0.77;
979 // }
980 //
981 // count = 0;
982 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
983 // Feature f4 = *it;
984 // Feature f8; // musicity
985 // Feature f9; // musicity segmenter
986 //
987 // f8.hasTimestamp = true;
988 // f8.timestamp = f4.timestamp;
989 // f9.hasTimestamp = true;
990 // f9.timestamp = f4.timestamp;
991 //
992 // int startIndex = max(count - musicitykernelwidth/2,0);
993 // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
994 // int musicityCount = 0;
995 // for (int i = startIndex; i <= endIndex; i++) {
996 // if (musicityValue[i] > musicityThreshold) musicityCount++;
997 // }
998 // bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1);
999 //
1000 // if (isSpeech) {
1001 // if (oldlabeltype != 2) {
1002 // f9.label = "Speech";
1003 // fsOut[9].push_back(f9);
1004 // oldlabeltype = 2;
1005 // }
1006 // } else {
1007 // if (oldlabeltype != 1) {
1008 // f9.label = "Music";
1009 // fsOut[9].push_back(f9);
1010 // oldlabeltype = 1;
1011 // }
1012 // }
1013 // f8.values.push_back(musicityValue[count]);
1014 // fsOut[8].push_back(f8);
1015 // count++;
1016 // }
1017 return fsOut;
1018
1019 }
1020
1021 #endif