qm-vamp-plugins: comparison of plugins/SimilarityPlugin.cpp @ 47:f8c5f11e60a6
* Add rhythmic similarity to similarity plugin.
Needs testing, and the current weighting parameter (not properly used)
needs revision.
| author | Chris Cannam <c.cannam@qmul.ac.uk> |
|---|---|
| date | Fri, 18 Jan 2008 18:11:01 +0000 |
| parents | 26a2e341d358 |
| children | 3b4572153ce3 |
46:26a2e341d358 (previous revision) | 47:f8c5f11e60a6 (this revision) |
---|---|
13 #include "SimilarityPlugin.h" | 13 #include "SimilarityPlugin.h" |
14 #include "base/Pitch.h" | 14 #include "base/Pitch.h" |
15 #include "dsp/mfcc/MFCC.h" | 15 #include "dsp/mfcc/MFCC.h" |
16 #include "dsp/chromagram/Chromagram.h" | 16 #include "dsp/chromagram/Chromagram.h" |
17 #include "dsp/rateconversion/Decimator.h" | 17 #include "dsp/rateconversion/Decimator.h" |
18 #include "maths/KLDivergence.cpp" | 18 #include "dsp/rhythm/BeatSpectrum.h" |
19 #include "maths/KLDivergence.h" | |
20 #include "maths/CosineDistance.h" | |
19 | 21 |
20 using std::string; | 22 using std::string; |
21 using std::vector; | 23 using std::vector; |
22 using std::cerr; | 24 using std::cerr; |
23 using std::endl; | 25 using std::endl; |
24 using std::ostringstream; | 26 using std::ostringstream; |
25 | 27 |
28 const float | |
29 SimilarityPlugin::m_noRhythm = 0.009; | |
30 | |
31 const float | |
32 SimilarityPlugin::m_allRhythm = 0.991; | |
33 | |
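The header is not included in this changeset view, so the role of the two new constants above is not spelt out here; judging by the needRhythm()/needTimbre() guards used further down, they look like cut-off values for the rhythm weighting. A minimal sketch of that presumed relationship (the method bodies below are guessed, not taken from the source):

```cpp
// Assumed, not from this changeset: how the cut-off constants above are
// presumably meant to interact with the rhythm weighting parameter.
bool
SimilarityPlugin::needRhythm()
{
    return m_rhythmWeighting > m_noRhythm;   // weighting of ~0% disables rhythm
}

bool
SimilarityPlugin::needTimbre()
{
    return m_rhythmWeighting < m_allRhythm;  // weighting of ~100% disables timbre
}
```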
26 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) : | 34 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) : |
27 Plugin(inputSampleRate), | 35 Plugin(inputSampleRate), |
28 m_type(TypeMFCC), | 36 m_type(TypeMFCC), |
29 m_mfcc(0), | 37 m_mfcc(0), |
38 m_rhythmfcc(0), | |
30 m_chromagram(0), | 39 m_chromagram(0), |
31 m_decimator(0), | 40 m_decimator(0), |
32 m_featureColumnSize(20), | 41 m_featureColumnSize(20), |
42 m_rhythmWeighting(0.f), | |
43 m_rhythmClipDuration(4.f), // seconds | |
44 m_rhythmClipOrigin(40.f), // seconds | |
45 m_rhythmClipFrameSize(0), | |
46 m_rhythmClipFrames(0), | |
47 m_rhythmColumnSize(20), | |
33 m_blockSize(0), | 48 m_blockSize(0), |
34 m_channels(0) | 49 m_channels(0), |
35 { | 50 m_processRate(0), |
36 | 51 m_frameNo(0), |
52 m_done(false) | |
53 { | |
54 int rate = lrintf(m_inputSampleRate); | |
55 int internalRate = 22050; | |
56 int decimationFactor = rate / internalRate; | |
57 if (decimationFactor < 1) decimationFactor = 1; | |
58 | |
59 // must be a power of two | |
60 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor; | |
61 | |
62 m_processRate = rate / decimationFactor; // may be 22050, 24000 etc | |
37 } | 63 } |
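The constructor above decimates towards an internal rate of 22050 Hz, but only by a power-of-two factor, so m_processRate can come out at 22050, 24000, 32000 and so on depending on the input rate. A standalone sketch of the same arithmetic (the function name here is illustrative, not part of the plugin):

```cpp
#include <cmath>

// Mirrors the constructor logic: take rate / 22050 and round it up to a
// power of two, so the processing rate lands near (not exactly at) 22050 Hz.
static int processRateFor(float inputSampleRate)
{
    int rate = lrintf(inputSampleRate);
    int decimationFactor = rate / 22050;
    if (decimationFactor < 1) decimationFactor = 1;

    // A power of two shares no bits with (itself - 1), so bump the factor
    // up until that test passes.
    while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;

    return rate / decimationFactor;   // 44100 -> 22050, 48000 -> 24000, 96000 -> 24000
}
```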
38 | 64 |
39 SimilarityPlugin::~SimilarityPlugin() | 65 SimilarityPlugin::~SimilarityPlugin() |
40 { | 66 { |
41 delete m_mfcc; | 67 delete m_mfcc; |
68 delete m_rhythmfcc; | |
42 delete m_chromagram; | 69 delete m_chromagram; |
43 delete m_decimator; | 70 delete m_decimator; |
44 } | 71 } |
45 | 72 |
46 string | 73 string |
62 } | 89 } |
63 | 90 |
64 string | 91 string |
65 SimilarityPlugin::getMaker() const | 92 SimilarityPlugin::getMaker() const |
66 { | 93 { |
67 return "Mark Levy and Chris Cannam, Queen Mary, University of London"; | 94 return "Mark Levy, Kurt Jacobson and Chris Cannam, Queen Mary, University of London"; |
68 } | 95 } |
69 | 96 |
70 int | 97 int |
71 SimilarityPlugin::getPluginVersion() const | 98 SimilarityPlugin::getPluginVersion() const |
72 { | 99 { |
127 | 154 |
128 if (m_type == TypeMFCC) { | 155 if (m_type == TypeMFCC) { |
129 | 156 |
130 m_featureColumnSize = 20; | 157 m_featureColumnSize = 20; |
131 | 158 |
132 MFCCConfig config(lrintf(m_inputSampleRate) / decimationFactor); | 159 MFCCConfig config(m_processRate); |
133 config.fftsize = 2048; | 160 config.fftsize = 2048; |
134 config.nceps = m_featureColumnSize - 1; | 161 config.nceps = m_featureColumnSize - 1; |
135 config.want_c0 = true; | 162 config.want_c0 = true; |
136 config.logpower = 1; | 163 config.logpower = 1; |
137 m_mfcc = new MFCC(config); | 164 m_mfcc = new MFCC(config); |
138 m_fftSize = m_mfcc->getfftlength(); | 165 m_fftSize = m_mfcc->getfftlength(); |
166 m_rhythmClipFrameSize = m_fftSize / 4; | |
139 | 167 |
140 std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl; | 168 std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl; |
141 | 169 |
142 } else if (m_type == TypeChroma) { | 170 } else if (m_type == TypeChroma) { |
143 | 171 |
144 m_featureColumnSize = 12; | 172 m_featureColumnSize = 12; |
145 | 173 |
146 ChromaConfig config; | 174 ChromaConfig config; |
147 config.FS = lrintf(m_inputSampleRate) / decimationFactor; | 175 config.FS = m_processRate; |
148 config.min = Pitch::getFrequencyForPitch(24, 0, 440); | 176 config.min = Pitch::getFrequencyForPitch(24, 0, 440); |
149 config.max = Pitch::getFrequencyForPitch(96, 0, 440); | 177 config.max = Pitch::getFrequencyForPitch(96, 0, 440); |
150 config.BPO = 12; | 178 config.BPO = 12; |
151 config.CQThresh = 0.0054; | 179 config.CQThresh = 0.0054; |
152 config.isNormalised = true; | 180 config.isNormalised = true; |
153 m_chromagram = new Chromagram(config); | 181 m_chromagram = new Chromagram(config); |
154 m_fftSize = m_chromagram->getFrameSize(); | 182 m_fftSize = m_chromagram->getFrameSize(); |
155 | 183 |
184 std::cerr << "fftsize = " << m_fftSize << std::endl; | |
185 | |
186 m_rhythmClipFrameSize = m_fftSize / 16; | |
187 while (m_rhythmClipFrameSize < 512) m_rhythmClipFrameSize *= 2; | |
188 std::cerr << "m_rhythmClipFrameSize = " << m_rhythmClipFrameSize << std::endl; | |
189 | |
156 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl; | 190 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl; |
157 | 191 |
158 } else { | 192 } else { |
159 | 193 |
160 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl; | 194 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl; |
161 return false; | 195 return false; |
162 } | 196 } |
163 | 197 |
198 if (needRhythm()) { | |
199 m_rhythmClipFrames = | |
200 int(ceil((m_rhythmClipDuration * m_processRate) | |
201 / m_rhythmClipFrameSize)); | |
202 std::cerr << "SimilarityPlugin::initialise: rhythm clip is " | |
203 << m_rhythmClipFrames << " frames of size " | |
204 << m_rhythmClipFrameSize << " at process rate " | |
205 << m_processRate << " ( = " | |
206 << (float(m_rhythmClipFrames * m_rhythmClipFrameSize) / m_processRate) << " sec )" | |
207 << std::endl; | |
208 | |
209 MFCCConfig config(m_processRate); | |
210 config.fftsize = m_rhythmClipFrameSize; | |
211 config.nceps = m_featureColumnSize - 1; | |
212 config.want_c0 = true; | |
213 config.logpower = 1; | |
214 config.window = RectangularWindow; // because no overlap | |
215 m_rhythmfcc = new MFCC(config); | |
216 } | |
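Working the numbers for the MFCC path with a 44.1 kHz input (so m_processRate is 22050): m_fftSize is 2048, giving m_rhythmClipFrameSize = 2048 / 4 = 512, and m_rhythmClipFrames = ceil((4.0 * 22050) / 512) = ceil(172.27) = 173, i.e. just over four seconds of non-overlapped 512-sample frames per channel for the rhythm analysis.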
217 | |
164 for (int i = 0; i < m_channels; ++i) { | 218 for (int i = 0; i < m_channels; ++i) { |
219 | |
165 m_values.push_back(FeatureMatrix()); | 220 m_values.push_back(FeatureMatrix()); |
166 } | 221 |
222 if (needRhythm()) { | |
223 m_rhythmValues.push_back(FeatureColumnQueue()); | |
224 } | |
225 } | |
226 | |
227 m_done = false; | |
167 | 228 |
168 return true; | 229 return true; |
169 } | 230 } |
170 | 231 |
171 void | 232 void |
172 SimilarityPlugin::reset() | 233 SimilarityPlugin::reset() |
173 { | 234 { |
174 //!!! | 235 //!!! |
236 m_done = false; | |
175 } | 237 } |
176 | 238 |
177 int | 239 int |
178 SimilarityPlugin::getDecimationFactor() const | 240 SimilarityPlugin::getDecimationFactor() const |
179 { | 241 { |
180 int rate = lrintf(m_inputSampleRate); | 242 int rate = lrintf(m_inputSampleRate); |
181 int internalRate = 22050; | 243 return rate / m_processRate; |
182 int decimationFactor = rate / internalRate; | |
183 if (decimationFactor < 1) decimationFactor = 1; | |
184 | |
185 // must be a power of two | |
186 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor; | |
187 | |
188 return decimationFactor; | |
189 } | 244 } |
190 | 245 |
191 size_t | 246 size_t |
192 SimilarityPlugin::getPreferredStepSize() const | 247 SimilarityPlugin::getPreferredStepSize() const |
193 { | 248 { |
194 if (m_blockSize == 0) calculateBlockSize(); | 249 if (m_blockSize == 0) calculateBlockSize(); |
250 | |
251 // there is also an assumption to this effect in process() | |
252 // (referring to m_fftSize/2 instead of a literal post-decimation | |
253 // step size): | |
195 return m_blockSize/2; | 254 return m_blockSize/2; |
196 } | 255 } |
197 | 256 |
198 size_t | 257 size_t |
199 SimilarityPlugin::getPreferredBlockSize() const | 258 SimilarityPlugin::getPreferredBlockSize() const |
207 { | 266 { |
208 if (m_blockSize != 0) return; | 267 if (m_blockSize != 0) return; |
209 int decimationFactor = getDecimationFactor(); | 268 int decimationFactor = getDecimationFactor(); |
210 if (m_type == TypeChroma) { | 269 if (m_type == TypeChroma) { |
211 ChromaConfig config; | 270 ChromaConfig config; |
212 config.FS = lrintf(m_inputSampleRate) / decimationFactor; | 271 config.FS = m_processRate; |
213 config.min = Pitch::getFrequencyForPitch(24, 0, 440); | 272 config.min = Pitch::getFrequencyForPitch(24, 0, 440); |
214 config.max = Pitch::getFrequencyForPitch(96, 0, 440); | 273 config.max = Pitch::getFrequencyForPitch(96, 0, 440); |
215 config.BPO = 12; | 274 config.BPO = 12; |
216 config.CQThresh = 0.0054; | 275 config.CQThresh = 0.0054; |
217 config.isNormalised = false; | 276 config.isNormalised = false; |
240 desc.quantizeStep = 1; | 299 desc.quantizeStep = 1; |
241 desc.valueNames.push_back("Timbral (MFCC)"); | 300 desc.valueNames.push_back("Timbral (MFCC)"); |
242 desc.valueNames.push_back("Chromatic (Chroma)"); | 301 desc.valueNames.push_back("Chromatic (Chroma)"); |
243 list.push_back(desc); | 302 list.push_back(desc); |
244 | 303 |
304 desc.identifier = "rhythmWeighting"; | |
305 desc.name = "Influence of Rhythm"; | |
306 desc.description = "Proportion of similarity measure made up from rhythmic similarity component, from 0 (entirely timbral or chromatic) to 100 (entirely rhythmic)."; | |
307 desc.unit = "%"; | |
308 desc.minValue = 0; | |
309 desc.maxValue = 100; | |
310 desc.defaultValue = 0; | |
311 desc.isQuantized = true; | |
312 desc.quantizeStep = 1; | |
313 desc.valueNames.clear(); | |
314 list.push_back(desc); | |
315 | |
245 return list; | 316 return list; |
246 } | 317 } |
247 | 318 |
248 float | 319 float |
249 SimilarityPlugin::getParameter(std::string param) const | 320 SimilarityPlugin::getParameter(std::string param) const |
250 { | 321 { |
251 if (param == "featureType") { | 322 if (param == "featureType") { |
252 if (m_type == TypeMFCC) return 0; | 323 if (m_type == TypeMFCC) return 0; |
253 else if (m_type == TypeChroma) return 1; | 324 else if (m_type == TypeChroma) return 1; |
254 else return 0; | 325 else return 0; |
326 } else if (param == "rhythmWeighting") { | |
327 return nearbyint(m_rhythmWeighting * 100.0); | |
255 } | 328 } |
256 | 329 |
257 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \"" | 330 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \"" |
258 << param << "\"" << std::endl; | 331 << param << "\"" << std::endl; |
259 return 0.0; | 332 return 0.0; |
266 int v = int(value + 0.1); | 339 int v = int(value + 0.1); |
267 Type prevType = m_type; | 340 Type prevType = m_type; |
268 if (v == 0) m_type = TypeMFCC; | 341 if (v == 0) m_type = TypeMFCC; |
269 else if (v == 1) m_type = TypeChroma; | 342 else if (v == 1) m_type = TypeChroma; |
270 if (m_type != prevType) m_blockSize = 0; | 343 if (m_type != prevType) m_blockSize = 0; |
344 return; | |
345 } else if (param == "rhythmWeighting") { | |
346 m_rhythmWeighting = value / 100; | |
271 return; | 347 return; |
272 } | 348 } |
273 | 349 |
274 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \"" | 350 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \"" |
275 << param << "\"" << std::endl; | 351 << param << "\"" << std::endl; |
353 variances.sampleRate = 1; | 429 variances.sampleRate = 1; |
354 | 430 |
355 m_variancesOutput = list.size(); | 431 m_variancesOutput = list.size(); |
356 list.push_back(variances); | 432 list.push_back(variances); |
357 | 433 |
434 OutputDescriptor beatspectrum; | |
435 beatspectrum.identifier = "beatspectrum"; | |
436 beatspectrum.name = "Beat Spectra"; | |
437 beatspectrum.description = "Rhythmic self-similarity vectors (beat spectra) for the input channels. Feature time (sec) corresponds to input channel. Not returned if rhythm weighting is zero."; | |
438 beatspectrum.unit = ""; | |
439 if (m_rhythmClipFrames > 0) { | |
440 beatspectrum.hasFixedBinCount = true; | |
441 beatspectrum.binCount = m_rhythmClipFrames / 2; | |
442 } else { | |
443 beatspectrum.hasFixedBinCount = false; | |
444 } | |
445 beatspectrum.hasKnownExtents = false; | |
446 beatspectrum.isQuantized = false; | |
447 beatspectrum.sampleType = OutputDescriptor::FixedSampleRate; | |
448 beatspectrum.sampleRate = 1; | |
449 | |
450 m_beatSpectraOutput = list.size(); | |
451 list.push_back(beatspectrum); | |
452 | |
358 return list; | 453 return list; |
359 } | 454 } |
360 | 455 |
361 SimilarityPlugin::FeatureSet | 456 SimilarityPlugin::FeatureSet |
362 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */) | 457 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */) |
363 { | 458 { |
459 if (m_done) { | |
460 return FeatureSet(); | |
461 } | |
462 | |
364 double *dblbuf = new double[m_blockSize]; | 463 double *dblbuf = new double[m_blockSize]; |
365 double *decbuf = dblbuf; | 464 double *decbuf = dblbuf; |
366 if (m_decimator) decbuf = new double[m_fftSize]; | 465 if (m_decimator) decbuf = new double[m_fftSize]; |
367 | 466 |
368 double *raw = 0; | 467 double *raw = new double[std::max(m_featureColumnSize, |
369 bool ownRaw = false; | 468 m_rhythmColumnSize)]; |
370 | |
371 if (m_type == TypeMFCC) { | |
372 raw = new double[m_featureColumnSize]; | |
373 ownRaw = true; | |
374 } | |
375 | 469 |
376 float threshold = 1e-10; | 470 float threshold = 1e-10; |
471 | |
472 bool someRhythmFrameNeeded = false; | |
377 | 473 |
378 for (size_t c = 0; c < m_channels; ++c) { | 474 for (size_t c = 0; c < m_channels; ++c) { |
379 | 475 |
380 bool empty = true; | 476 bool empty = true; |
381 | 477 |
383 float val = inputBuffers[c][i]; | 479 float val = inputBuffers[c][i]; |
384 if (fabs(val) > threshold) empty = false; | 480 if (fabs(val) > threshold) empty = false; |
385 dblbuf[i] = val; | 481 dblbuf[i] = val; |
386 } | 482 } |
387 | 483 |
388 if (empty) continue; | 484 if (empty) { |
485 if (needRhythm() && ((m_frameNo % 2) == 0)) { | |
486 for (int i = 0; i < m_fftSize / m_rhythmClipFrameSize; ++i) { | |
487 if (m_rhythmValues[c].size() < m_rhythmClipFrames) { | |
488 FeatureColumn mf(m_rhythmColumnSize); | |
489 for (int i = 0; i < m_rhythmColumnSize; ++i) { | |
490 mf[i] = 0.0; | |
491 } | |
492 m_rhythmValues[c].push_back(mf); | |
493 } | |
494 } | |
495 } | |
496 continue; | |
497 } | |
498 | |
389 m_lastNonEmptyFrame[c] = m_frameNo; | 499 m_lastNonEmptyFrame[c] = m_frameNo; |
390 | 500 |
391 if (m_decimator) { | 501 if (m_decimator) { |
392 m_decimator->process(dblbuf, decbuf); | 502 m_decimator->process(dblbuf, decbuf); |
393 } | 503 } |
394 | 504 |
395 if (m_type == TypeMFCC) { | 505 if (needTimbre()) { |
396 m_mfcc->process(decbuf, raw); | 506 |
397 } else if (m_type == TypeChroma) { | 507 if (m_type == TypeMFCC) { |
398 raw = m_chromagram->process(decbuf); | 508 m_mfcc->process(decbuf, raw); |
399 } | 509 } else if (m_type == TypeChroma) { |
510 raw = m_chromagram->process(decbuf); | |
511 } | |
400 | 512 |
401 FeatureColumn mf(m_featureColumnSize); | 513 FeatureColumn mf(m_featureColumnSize); |
402 // std::cout << m_frameNo << ":" << c << ": "; | 514 for (int i = 0; i < m_featureColumnSize; ++i) { |
403 for (int i = 0; i < m_featureColumnSize; ++i) { | 515 mf[i] = raw[i]; |
404 mf[i] = raw[i]; | 516 } |
405 // std::cout << raw[i] << " "; | 517 |
406 } | 518 m_values[c].push_back(mf); |
407 // std::cout << std::endl; | 519 } |
408 | 520 |
409 m_values[c].push_back(mf); | 521 // std::cerr << "needRhythm = " << needRhythm() << ", frame = " << m_frameNo << std::endl; |
522 | |
523 if (needRhythm() && ((m_frameNo % 2) == 0)) { | |
524 | |
525 // The incoming frames are overlapping; we only use every | |
526 // other one, because we don't want the overlap (it would | |
527 // screw up the rhythm) | |
528 | |
529 int frameOffset = 0; | |
530 | |
531 while (frameOffset + m_rhythmClipFrameSize <= m_fftSize) { | |
532 | |
533 bool needRhythmFrame = true; | |
534 | |
535 if (m_rhythmValues[c].size() >= m_rhythmClipFrames) { | |
536 | |
537 needRhythmFrame = false; | |
538 | |
539 // assumes hopsize = framesize/2 | |
540 float current = m_frameNo * (m_fftSize/2) + frameOffset; | |
541 current = current / m_processRate; | |
542 if (current - m_rhythmClipDuration < m_rhythmClipOrigin) { | |
543 needRhythmFrame = true; | |
544 m_rhythmValues[c].pop_front(); | |
545 } | |
546 | |
547 if (needRhythmFrame) { | |
548 std::cerr << "at current = " <<current << " (frame = " << m_frameNo << "), have " << m_rhythmValues[c].size() << ", need rhythm = " << needRhythmFrame << std::endl; | |
549 } | |
550 | |
551 } | |
552 | |
553 if (needRhythmFrame) { | |
554 | |
555 someRhythmFrameNeeded = true; | |
556 | |
557 m_rhythmfcc->process(decbuf + frameOffset, raw); | |
558 | |
559 FeatureColumn mf(m_rhythmColumnSize); | |
560 for (int i = 0; i < m_rhythmColumnSize; ++i) { | |
561 mf[i] = raw[i]; | |
562 } | |
563 | |
564 m_rhythmValues[c].push_back(mf); | |
565 } | |
566 | |
567 frameOffset += m_rhythmClipFrameSize; | |
568 } | |
569 } | |
570 } | |
571 | |
572 if (!needTimbre() && !someRhythmFrameNeeded && ((m_frameNo % 2) == 0)) { | |
573 std::cerr << "done!" << std::endl; | |
574 m_done = true; | |
410 } | 575 } |
411 | 576 |
412 if (m_decimator) delete[] decbuf; | 577 if (m_decimator) delete[] decbuf; |
413 delete[] dblbuf; | 578 delete[] dblbuf; |
414 | 579 delete[] raw; |
415 if (ownRaw) delete[] raw; | |
416 | 580 |
417 ++m_frameNo; | 581 ++m_frameNo; |
418 | 582 |
419 return FeatureSet(); | 583 return FeatureSet(); |
420 } | 584 } |
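A note on the rhythm branch of process() above: only every other incoming block is used (the m_frameNo % 2 test), because consecutive blocks overlap by half and including the overlap would distort the rhythm measure; each used block is then cut into m_fftSize / m_rhythmClipFrameSize non-overlapping sub-frames (four, in the MFCC case), so the rhythm MFCC still sees a contiguous sample stream. Blocks that are effectively silent are padded into the queue as all-zero rhythm frames until it is full. Once a channel's queue is full, older frames keep being replaced until the running time reaches m_rhythmClipOrigin + m_rhythmClipDuration, so what is finally retained is roughly the four-second clip ending 44 seconds into each channel; when the timbral pass is not needed and no channel wanted a new rhythm frame, m_done short-circuits any further processing.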
421 | 585 |
422 SimilarityPlugin::FeatureSet | 586 SimilarityPlugin::FeatureMatrix |
423 SimilarityPlugin::getRemainingFeatures() | 587 SimilarityPlugin::calculateTimbral(FeatureSet &returnFeatures) |
424 { | 588 { |
425 std::vector<FeatureColumn> m(m_channels); | 589 FeatureMatrix m(m_channels); // means |
426 std::vector<FeatureColumn> v(m_channels); | 590 FeatureMatrix v(m_channels); // variances |
427 | 591 |
428 for (int i = 0; i < m_channels; ++i) { | 592 for (int i = 0; i < m_channels; ++i) { |
429 | 593 |
430 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize); | 594 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize); |
431 | 595 |
439 // last non-empty frame (which may be partial) | 603 // last non-empty frame (which may be partial) |
440 | 604 |
441 int sz = m_lastNonEmptyFrame[i]; | 605 int sz = m_lastNonEmptyFrame[i]; |
442 if (sz < 0) sz = 0; | 606 if (sz < 0) sz = 0; |
443 | 607 |
444 // std::cout << "\nBin " << j << ":" << std::endl; | |
445 | |
446 count = 0; | 608 count = 0; |
447 for (int k = 0; k < sz; ++k) { | 609 for (int k = 0; k < sz; ++k) { |
448 double val = m_values[i][k][j]; | 610 double val = m_values[i][k][j]; |
449 // std::cout << val << " "; | |
450 if (isnan(val) || isinf(val)) continue; | 611 if (isnan(val) || isinf(val)) continue; |
451 mean[j] += val; | 612 mean[j] += val; |
452 ++count; | 613 ++count; |
453 } | 614 } |
454 if (count > 0) mean[j] /= count; | 615 if (count > 0) mean[j] /= count; |
455 // std::cout << "\n" << count << " non-NaN non-inf values, so mean = " << mean[j] << std::endl; | |
456 | 616 |
457 count = 0; | 617 count = 0; |
458 for (int k = 0; k < sz; ++k) { | 618 for (int k = 0; k < sz; ++k) { |
459 double val = ((m_values[i][k][j] - mean[j]) * | 619 double val = ((m_values[i][k][j] - mean[j]) * |
460 (m_values[i][k][j] - mean[j])); | 620 (m_values[i][k][j] - mean[j])); |
461 if (isnan(val) || isinf(val)) continue; | 621 if (isnan(val) || isinf(val)) continue; |
462 variance[j] += val; | 622 variance[j] += val; |
463 ++count; | 623 ++count; |
464 } | 624 } |
465 if (count > 0) variance[j] /= count; | 625 if (count > 0) variance[j] /= count; |
466 // std::cout << "... and variance = " << variance[j] << std::endl; | |
467 } | 626 } |
468 | 627 |
469 m[i] = mean; | 628 m[i] = mean; |
470 v[i] = variance; | 629 v[i] = variance; |
471 } | 630 } |
472 | |
473 // we want to return a matrix of the distances between channels, | |
474 // but Vamp doesn't have a matrix return type so we actually | |
475 // return a series of vectors | |
476 | |
477 std::vector<std::vector<double> > distances; | |
478 | 631 |
479 // "Despite the fact that MFCCs extracted from music are clearly | 632 // "Despite the fact that MFCCs extracted from music are clearly |
480 // not Gaussian, [14] showed, somewhat surprisingly, that a | 633 // not Gaussian, [14] showed, somewhat surprisingly, that a |
481 // similarity function comparing single Gaussians modelling MFCCs | 634 // similarity function comparing single Gaussians modelling MFCCs |
482 // for each track can perform as well as mixture models. A great | 635 // for each track can perform as well as mixture models. A great |
484 // form exists for the KL divergence." -- Mark Levy, "Lightweight | 637 // form exists for the KL divergence." -- Mark Levy, "Lightweight |
485 // measures for timbral similarity of musical audio" | 638 // measures for timbral similarity of musical audio" |
486 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf) | 639 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf) |
487 | 640 |
488 KLDivergence kld; | 641 KLDivergence kld; |
642 FeatureMatrix distances(m_channels); | |
489 | 643 |
490 for (int i = 0; i < m_channels; ++i) { | 644 for (int i = 0; i < m_channels; ++i) { |
491 distances.push_back(std::vector<double>()); | |
492 for (int j = 0; j < m_channels; ++j) { | 645 for (int j = 0; j < m_channels; ++j) { |
493 double d = kld.distance(m[i], v[i], m[j], v[j]); | 646 double d = kld.distance(m[i], v[i], m[j], v[j]); |
494 distances[i].push_back(d); | 647 distances[i].push_back(d); |
495 } | 648 } |
496 } | 649 } |
497 | 650 |
651 Feature feature; | |
652 feature.hasTimestamp = true; | |
653 | |
654 char labelBuffer[100]; | |
655 | |
656 for (int i = 0; i < m_channels; ++i) { | |
657 | |
658 feature.timestamp = Vamp::RealTime(i, 0); | |
659 | |
660 sprintf(labelBuffer, "Means for channel %d", i+1); | |
661 feature.label = labelBuffer; | |
662 | |
663 feature.values.clear(); | |
664 for (int k = 0; k < m_featureColumnSize; ++k) { | |
665 feature.values.push_back(m[i][k]); | |
666 } | |
667 | |
668 returnFeatures[m_meansOutput].push_back(feature); | |
669 | |
670 sprintf(labelBuffer, "Variances for channel %d", i+1); | |
671 feature.label = labelBuffer; | |
672 | |
673 feature.values.clear(); | |
674 for (int k = 0; k < m_featureColumnSize; ++k) { | |
675 feature.values.push_back(v[i][k]); | |
676 } | |
677 | |
678 returnFeatures[m_variancesOutput].push_back(feature); | |
679 } | |
680 | |
681 return distances; | |
682 } | |
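The KLDivergence class itself is outside this diff. For orientation, the closed form alluded to in the quoted comment, for two single Gaussians with diagonal covariances summarised by per-bin means and variances, is usually taken to be the symmetrised KL divergence; a sketch of that form follows (an assumption about the convention in use, since the library's distance() may differ in scaling or guarding):

```cpp
#include <vector>

// Symmetrised KL divergence between two diagonal-covariance Gaussians
// (means m1, m2 and variances v1, v2 per bin).  One common closed form;
// whether KLDivergence::distance uses exactly this normalisation is not
// visible in this file.
static double symmetrisedKL(const std::vector<double> &m1, const std::vector<double> &v1,
                            const std::vector<double> &m2, const std::vector<double> &v2)
{
    const double eps = 1e-12;   // guard against zero variances
    double d = 0.0;
    for (size_t k = 0; k < m1.size(); ++k) {
        double va = v1[k] + eps, vb = v2[k] + eps;
        double dm = m1[k] - m2[k];
        // KL(a||b) + KL(b||a); the log-determinant terms cancel.
        d += 0.5 * (va / vb + vb / va + dm * dm * (1.0 / va + 1.0 / vb) - 2.0);
    }
    return d;
}
```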
683 | |
684 SimilarityPlugin::FeatureMatrix | |
685 SimilarityPlugin::calculateRhythmic(FeatureSet &returnFeatures) | |
686 { | |
687 if (!needRhythm()) return FeatureMatrix(); | |
688 | |
689 BeatSpectrum bscalc; | |
690 CosineDistance cd; | |
691 | |
692 // Our rhythm feature matrix is a deque of vectors for practical | |
693 // reasons, but BeatSpectrum::process wants a vector of vectors | |
694 // (which is what FeatureMatrix happens to be). | |
695 | |
696 FeatureMatrixSet bsinput(m_channels); | |
697 for (int i = 0; i < m_channels; ++i) { | |
698 for (int j = 0; j < m_rhythmValues[i].size(); ++j) { | |
699 bsinput[i].push_back(m_rhythmValues[i][j]); | |
700 } | |
701 } | |
702 | |
703 FeatureMatrix bs(m_channels); | |
704 for (int i = 0; i < m_channels; ++i) { | |
705 bs[i] = bscalc.process(bsinput[i]); | |
706 } | |
707 | |
708 FeatureMatrix distances(m_channels); | |
709 for (int i = 0; i < m_channels; ++i) { | |
710 for (int j = 0; j < m_channels; ++j) { | |
711 double d = cd.distance(bs[i], bs[j]); | |
712 distances[i].push_back(d); | |
713 } | |
714 } | |
715 | |
716 Feature feature; | |
717 feature.hasTimestamp = true; | |
718 | |
719 char labelBuffer[100]; | |
720 | |
721 for (int i = 0; i < m_channels; ++i) { | |
722 | |
723 feature.timestamp = Vamp::RealTime(i, 0); | |
724 | |
725 sprintf(labelBuffer, "Beat spectrum for channel %d", i+1); | |
726 feature.label = labelBuffer; | |
727 | |
728 feature.values.clear(); | |
729 for (int j = 0; j < bs[i].size(); ++j) { | |
730 feature.values.push_back(bs[i][j]); | |
731 } | |
732 | |
733 returnFeatures[m_beatSpectraOutput].push_back(feature); | |
734 } | |
735 | |
736 return distances; | |
737 } | |
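BeatSpectrum::process is likewise defined elsewhere. The construction the output description refers to is the standard Foote-style beat spectrum: average the cosine similarity between feature frames that lie a given lag apart, for each lag up to half the clip length (which matches the binCount of m_rhythmClipFrames / 2 declared above). A self-contained sketch of that construction, under the assumption that this is roughly what the library computes:

```cpp
#include <cmath>
#include <vector>

typedef std::vector<double> Frame;

static double cosineSimilarity(const Frame &a, const Frame &b)
{
    double num = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size(); ++i) {
        num += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) return 0.0;
    return num / (std::sqrt(na) * std::sqrt(nb));
}

// Foote-style beat spectrum sketch: per-lag average of cosine similarity
// between feature frames lag steps apart.  Windowing, lag count and
// normalisation in the library's BeatSpectrum may differ.
static std::vector<double> beatSpectrumSketch(const std::vector<Frame> &frames)
{
    size_t n = frames.size();
    std::vector<double> bs(n / 2, 0.0);   // lags 0 .. n/2 - 1, cf. binCount above
    for (size_t lag = 0; lag < bs.size(); ++lag) {
        size_t count = 0;
        for (size_t i = 0; i + lag < n; ++i) {
            bs[lag] += cosineSimilarity(frames[i], frames[i + lag]);
            ++count;
        }
        if (count > 0) bs[lag] /= count;
    }
    return bs;
}
```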
738 | |
739 double | |
740 SimilarityPlugin::getDistance(const FeatureMatrix &timbral, | |
741 const FeatureMatrix &rhythmic, | |
742 int i, int j) | |
743 { | |
744 double distance = 1.0; | |
745 if (needTimbre()) distance *= timbral[i][j]; | |
746 if (needRhythm()) distance *= rhythmic[i][j]; | |
747 return distance; | |
748 } | |
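As the commit note says, m_rhythmWeighting is not actually applied yet: getDistance just multiplies whichever component distances are in play. Purely as an illustration of the kind of revision flagged in the commit message (not what this changeset does), the weighting could instead blend the two components, e.g.:

```cpp
// Illustrative only, not part of this changeset: one way the weighting
// could be brought into the combined distance.  In practice the two
// components live on very different scales (a KL divergence versus a
// cosine distance), so some normalisation would also be needed.
static double blendedDistance(double timbralDistance, double rhythmicDistance,
                              double rhythmWeighting /* 0..1 */)
{
    return (1.0 - rhythmWeighting) * timbralDistance
         + rhythmWeighting * rhythmicDistance;
}
```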
749 | |
750 SimilarityPlugin::FeatureSet | |
751 SimilarityPlugin::getRemainingFeatures() | |
752 { | |
753 FeatureSet returnFeatures; | |
754 | |
755 // We want to return a matrix of the distances between channels, | |
756 // but Vamp doesn't have a matrix return type so we will actually | |
757 // return a series of vectors | |
758 | |
759 FeatureMatrix timbralDistances, rhythmicDistances; | |
760 | |
761 if (needTimbre()) { | |
762 timbralDistances = calculateTimbral(returnFeatures); | |
763 } | |
764 | |
765 if (needRhythm()) { | |
766 rhythmicDistances = calculateRhythmic(returnFeatures); | |
767 } | |
768 | |
498 // We give all features a timestamp, otherwise hosts will tend to | 769 // We give all features a timestamp, otherwise hosts will tend to |
499 // stamp them at the end of the file, which is annoying | 770 // stamp them at the end of the file, which is annoying |
500 | |
501 FeatureSet returnFeatures; | |
502 | 771 |
503 Feature feature; | 772 Feature feature; |
504 feature.hasTimestamp = true; | 773 feature.hasTimestamp = true; |
505 | 774 |
506 Feature distanceVectorFeature; | 775 Feature distanceVectorFeature; |
514 | 783 |
515 for (int i = 0; i < m_channels; ++i) { | 784 for (int i = 0; i < m_channels; ++i) { |
516 | 785 |
517 feature.timestamp = Vamp::RealTime(i, 0); | 786 feature.timestamp = Vamp::RealTime(i, 0); |
518 | 787 |
519 sprintf(labelBuffer, "Means for channel %d", i+1); | |
520 feature.label = labelBuffer; | |
521 | |
522 feature.values.clear(); | |
523 for (int k = 0; k < m_featureColumnSize; ++k) { | |
524 feature.values.push_back(m[i][k]); | |
525 } | |
526 | |
527 returnFeatures[m_meansOutput].push_back(feature); | |
528 | |
529 sprintf(labelBuffer, "Variances for channel %d", i+1); | |
530 feature.label = labelBuffer; | |
531 | |
532 feature.values.clear(); | |
533 for (int k = 0; k < m_featureColumnSize; ++k) { | |
534 feature.values.push_back(v[i][k]); | |
535 } | |
536 | |
537 returnFeatures[m_variancesOutput].push_back(feature); | |
538 | |
539 feature.values.clear(); | 788 feature.values.clear(); |
540 for (int j = 0; j < m_channels; ++j) { | 789 for (int j = 0; j < m_channels; ++j) { |
541 feature.values.push_back(distances[i][j]); | 790 double dist = getDistance(timbralDistances, rhythmicDistances, i, j); |
791 feature.values.push_back(dist); | |
542 } | 792 } |
543 | 793 |
544 sprintf(labelBuffer, "Distances from channel %d", i+1); | 794 sprintf(labelBuffer, "Distances from channel %d", i+1); |
545 feature.label = labelBuffer; | 795 feature.label = labelBuffer; |
546 | 796 |
547 returnFeatures[m_distanceMatrixOutput].push_back(feature); | 797 returnFeatures[m_distanceMatrixOutput].push_back(feature); |
548 | 798 |
549 distanceVectorFeature.values.push_back(distances[0][i]); | 799 double fromFirst = |
550 | 800 getDistance(timbralDistances, rhythmicDistances, 0, i); |
551 sorted[distances[0][i]] = i; | 801 |
802 distanceVectorFeature.values.push_back(fromFirst); | |
803 sorted[fromFirst] = i; | |
552 } | 804 } |
553 | 805 |
554 returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature); | 806 returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature); |
555 | 807 |
556 feature.label = "Order of channels by similarity to first channel"; | 808 feature.label = "Order of channels by similarity to first channel"; |