comparison plugins/SimilarityPlugin.cpp @ 48:3b4572153ce3

* Similarity -> single user control rather than separate weighting
* Key detector -> correct reported min/max values for outputs
* Start some documentation
author Chris Cannam <c.cannam@qmul.ac.uk>
date Mon, 21 Jan 2008 18:05:28 +0000
parents f8c5f11e60a6
children fc88b465548a
comparison
equal deleted inserted replaced
47:f8c5f11e60a6 48:3b4572153ce3
37 m_mfcc(0), 37 m_mfcc(0),
38 m_rhythmfcc(0), 38 m_rhythmfcc(0),
39 m_chromagram(0), 39 m_chromagram(0),
40 m_decimator(0), 40 m_decimator(0),
41 m_featureColumnSize(20), 41 m_featureColumnSize(20),
42 m_rhythmWeighting(0.f), 42 m_rhythmWeighting(0.5f),
43 m_rhythmClipDuration(4.f), // seconds 43 m_rhythmClipDuration(4.f), // seconds
44 m_rhythmClipOrigin(40.f), // seconds 44 m_rhythmClipOrigin(40.f), // seconds
45 m_rhythmClipFrameSize(0), 45 m_rhythmClipFrameSize(0),
46 m_rhythmClipFrames(0), 46 m_rhythmClipFrames(0),
47 m_rhythmColumnSize(20), 47 m_rhythmColumnSize(20),
288 ParameterList list; 288 ParameterList list;
289 289
290 ParameterDescriptor desc; 290 ParameterDescriptor desc;
291 desc.identifier = "featureType"; 291 desc.identifier = "featureType";
292 desc.name = "Feature Type"; 292 desc.name = "Feature Type";
293 desc.description = "Audio feature used for similarity measure. Timbral: use the first 20 MFCCs (19 plus C0). Chromatic: use 12 bin-per-octave chroma."; 293 desc.description = "Audio feature used for similarity measure. Timbral: use the first 20 MFCCs (19 plus C0). Chromatic: use 12 bin-per-octave chroma. Rhythmic: compare beat spectra of short regions.";
294 desc.unit = ""; 294 desc.unit = "";
295 desc.minValue = 0; 295 desc.minValue = 0;
296 desc.maxValue = 1; 296 desc.maxValue = 4;
297 desc.defaultValue = 0; 297 desc.defaultValue = 1;
298 desc.isQuantized = true; 298 desc.isQuantized = true;
299 desc.quantizeStep = 1; 299 desc.quantizeStep = 1;
300 desc.valueNames.push_back("Timbral (MFCC)"); 300 desc.valueNames.push_back("Timbre");
301 desc.valueNames.push_back("Chromatic (Chroma)"); 301 desc.valueNames.push_back("Timbre and Rhythm");
302 desc.valueNames.push_back("Chroma");
303 desc.valueNames.push_back("Chroma and Rhythm");
304 desc.valueNames.push_back("Rhythm only");
302 list.push_back(desc); 305 list.push_back(desc);
303 306 /*
304 desc.identifier = "rhythmWeighting"; 307 desc.identifier = "rhythmWeighting";
305 desc.name = "Influence of Rhythm"; 308 desc.name = "Influence of Rhythm";
306 desc.description = "Proportion of similarity measure made up from rhythmic similarity component, from 0 (entirely timbral or chromatic) to 100 (entirely rhythmic)."; 309 desc.description = "Proportion of similarity measure made up from rhythmic similarity component, from 0 (entirely timbral or chromatic) to 100 (entirely rhythmic).";
307 desc.unit = "%"; 310 desc.unit = "%";
308 desc.minValue = 0; 311 desc.minValue = 0;
309 desc.maxValue = 100; 312 desc.maxValue = 100;
310 desc.defaultValue = 0; 313 desc.defaultValue = 0;
311 desc.isQuantized = true; 314 desc.isQuantized = false;
312 desc.quantizeStep = 1;
313 desc.valueNames.clear(); 315 desc.valueNames.clear();
314 list.push_back(desc); 316 list.push_back(desc);
315 317 */
316 return list; 318 return list;
317 } 319 }
318 320
319 float 321 float
320 SimilarityPlugin::getParameter(std::string param) const 322 SimilarityPlugin::getParameter(std::string param) const
321 { 323 {
322 if (param == "featureType") { 324 if (param == "featureType") {
323 if (m_type == TypeMFCC) return 0; 325
324 else if (m_type == TypeChroma) return 1; 326 if (m_rhythmWeighting > m_allRhythm) {
325 else return 0; 327 return 4;
326 } else if (param == "rhythmWeighting") { 328 }
327 return nearbyint(m_rhythmWeighting * 100.0); 329
330 switch (m_type) {
331
332 case TypeMFCC:
333 if (m_rhythmWeighting < m_noRhythm) return 0;
334 else return 1;
335 break;
336
337 case TypeChroma:
338 if (m_rhythmWeighting < m_noRhythm) return 2;
339 else return 3;
340 break;
341 }
342
343 return 1;
344
345 // } else if (param == "rhythmWeighting") {
346 // return nearbyint(m_rhythmWeighting * 100.0);
328 } 347 }
329 348
330 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \"" 349 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
331 << param << "\"" << std::endl; 350 << param << "\"" << std::endl;
332 return 0.0; 351 return 0.0;
334 353
335 void 354 void
336 SimilarityPlugin::setParameter(std::string param, float value) 355 SimilarityPlugin::setParameter(std::string param, float value)
337 { 356 {
338 if (param == "featureType") { 357 if (param == "featureType") {
358
339 int v = int(value + 0.1); 359 int v = int(value + 0.1);
340 Type prevType = m_type; 360
341 if (v == 0) m_type = TypeMFCC; 361 Type newType = m_type;
342 else if (v == 1) m_type = TypeChroma; 362
343 if (m_type != prevType) m_blockSize = 0; 363 switch (v) {
364 case 0: newType = TypeMFCC; m_rhythmWeighting = 0.0f; break;
365 case 1: newType = TypeMFCC; m_rhythmWeighting = 0.5f; break;
366 case 2: newType = TypeChroma; m_rhythmWeighting = 0.0f; break;
367 case 3: newType = TypeChroma; m_rhythmWeighting = 0.5f; break;
368 case 4: newType = TypeMFCC; m_rhythmWeighting = 1.f; break;
369 }
370
371 if (newType != m_type) m_blockSize = 0;
372
373 m_type = newType;
344 return; 374 return;
345 } else if (param == "rhythmWeighting") { 375
346 m_rhythmWeighting = value / 100; 376 // } else if (param == "rhythmWeighting") {
347 return; 377 // m_rhythmWeighting = value / 100;
378 // return;
348 } 379 }
349 380
350 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \"" 381 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
351 << param << "\"" << std::endl; 382 << param << "\"" << std::endl;
352 } 383 }
627 658
628 m[i] = mean; 659 m[i] = mean;
629 v[i] = variance; 660 v[i] = variance;
630 } 661 }
631 662
632 // "Despite the fact that MFCCs extracted from music are clearly
633 // not Gaussian, [14] showed, somewhat surprisingly, that a
634 // similarity function comparing single Gaussians modelling MFCCs
635 // for each track can perform as well as mixture models. A great
636 // advantage of using single Gaussians is that a simple closed
637 // form exists for the KL divergence." -- Mark Levy, "Lightweight
638 // measures for timbral similarity of musical audio"
639 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
640
641 KLDivergence kld;
642 FeatureMatrix distances(m_channels); 663 FeatureMatrix distances(m_channels);
643 664
644 for (int i = 0; i < m_channels; ++i) { 665 if (m_type == TypeMFCC) {
645 for (int j = 0; j < m_channels; ++j) { 666
646 double d = kld.distance(m[i], v[i], m[j], v[j]); 667 // "Despite the fact that MFCCs extracted from music are
647 distances[i].push_back(d); 668 // clearly not Gaussian, [14] showed, somewhat surprisingly,
669 // that a similarity function comparing single Gaussians
670 // modelling MFCCs for each track can perform as well as
671 // mixture models. A great advantage of using single
672 // Gaussians is that a simple closed form exists for the KL
673 // divergence." -- Mark Levy, "Lightweight measures for
674 // timbral similarity of musical audio"
675 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
676
677 KLDivergence kld;
678
679 for (int i = 0; i < m_channels; ++i) {
680 for (int j = 0; j < m_channels; ++j) {
681 double d = kld.distanceGaussian(m[i], v[i], m[j], v[j]);
682 distances[i].push_back(d);
683 }
684 }
685
686 } else {
687
688 // Chroma are histograms already
689
690 KLDivergence kld;
691
692 for (int i = 0; i < m_channels; ++i) {
693 for (int j = 0; j < m_channels; ++j) {
694 double d = kld.distanceDistribution(m[i], m[j], true);
695 distances[i].push_back(d);
696 }
648 } 697 }
649 } 698 }
650 699
651 Feature feature; 700 Feature feature;
652 feature.hasTimestamp = true; 701 feature.hasTimestamp = true;