comparison plugins/SimilarityPlugin.cpp @ 47:f8c5f11e60a6

* Add rhythmic similarity to similarity plugin. Needs testing, and the current weighting parameter (not properly used) needs revision.
author Chris Cannam <c.cannam@qmul.ac.uk>
date Fri, 18 Jan 2008 18:11:01 +0000
parents 26a2e341d358
children 3b4572153ce3
comparison
equal deleted inserted replaced
46:26a2e341d358 47:f8c5f11e60a6
13 #include "SimilarityPlugin.h" 13 #include "SimilarityPlugin.h"
14 #include "base/Pitch.h" 14 #include "base/Pitch.h"
15 #include "dsp/mfcc/MFCC.h" 15 #include "dsp/mfcc/MFCC.h"
16 #include "dsp/chromagram/Chromagram.h" 16 #include "dsp/chromagram/Chromagram.h"
17 #include "dsp/rateconversion/Decimator.h" 17 #include "dsp/rateconversion/Decimator.h"
18 #include "maths/KLDivergence.cpp" 18 #include "dsp/rhythm/BeatSpectrum.h"
19 #include "maths/KLDivergence.h"
20 #include "maths/CosineDistance.h"
19 21
20 using std::string; 22 using std::string;
21 using std::vector; 23 using std::vector;
22 using std::cerr; 24 using std::cerr;
23 using std::endl; 25 using std::endl;
24 using std::ostringstream; 26 using std::ostringstream;
25 27
28 const float
29 SimilarityPlugin::m_noRhythm = 0.009;
30
31 const float
32 SimilarityPlugin::m_allRhythm = 0.991;
33
26 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) : 34 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) :
27 Plugin(inputSampleRate), 35 Plugin(inputSampleRate),
28 m_type(TypeMFCC), 36 m_type(TypeMFCC),
29 m_mfcc(0), 37 m_mfcc(0),
38 m_rhythmfcc(0),
30 m_chromagram(0), 39 m_chromagram(0),
31 m_decimator(0), 40 m_decimator(0),
32 m_featureColumnSize(20), 41 m_featureColumnSize(20),
42 m_rhythmWeighting(0.f),
43 m_rhythmClipDuration(4.f), // seconds
44 m_rhythmClipOrigin(40.f), // seconds
45 m_rhythmClipFrameSize(0),
46 m_rhythmClipFrames(0),
47 m_rhythmColumnSize(20),
33 m_blockSize(0), 48 m_blockSize(0),
34 m_channels(0) 49 m_channels(0),
35 { 50 m_processRate(0),
36 51 m_frameNo(0),
52 m_done(false)
53 {
54 int rate = lrintf(m_inputSampleRate);
55 int internalRate = 22050;
56 int decimationFactor = rate / internalRate;
57 if (decimationFactor < 1) decimationFactor = 1;
58
59 // must be a power of two
60 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
61
62 m_processRate = rate / decimationFactor; // may be 22050, 24000 etc
37 } 63 }
38 64
39 SimilarityPlugin::~SimilarityPlugin() 65 SimilarityPlugin::~SimilarityPlugin()
40 { 66 {
41 delete m_mfcc; 67 delete m_mfcc;
68 delete m_rhythmfcc;
42 delete m_chromagram; 69 delete m_chromagram;
43 delete m_decimator; 70 delete m_decimator;
44 } 71 }
45 72
46 string 73 string
62 } 89 }
63 90
64 string 91 string
65 SimilarityPlugin::getMaker() const 92 SimilarityPlugin::getMaker() const
66 { 93 {
67 return "Mark Levy and Chris Cannam, Queen Mary, University of London"; 94 return "Mark Levy, Kurt Jacobson and Chris Cannam, Queen Mary, University of London";
68 } 95 }
69 96
70 int 97 int
71 SimilarityPlugin::getPluginVersion() const 98 SimilarityPlugin::getPluginVersion() const
72 { 99 {
127 154
128 if (m_type == TypeMFCC) { 155 if (m_type == TypeMFCC) {
129 156
130 m_featureColumnSize = 20; 157 m_featureColumnSize = 20;
131 158
132 MFCCConfig config(lrintf(m_inputSampleRate) / decimationFactor); 159 MFCCConfig config(m_processRate);
133 config.fftsize = 2048; 160 config.fftsize = 2048;
134 config.nceps = m_featureColumnSize - 1; 161 config.nceps = m_featureColumnSize - 1;
135 config.want_c0 = true; 162 config.want_c0 = true;
136 config.logpower = 1; 163 config.logpower = 1;
137 m_mfcc = new MFCC(config); 164 m_mfcc = new MFCC(config);
138 m_fftSize = m_mfcc->getfftlength(); 165 m_fftSize = m_mfcc->getfftlength();
166 m_rhythmClipFrameSize = m_fftSize / 4;
139 167
140 std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl; 168 std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl;
141 169
142 } else if (m_type == TypeChroma) { 170 } else if (m_type == TypeChroma) {
143 171
144 m_featureColumnSize = 12; 172 m_featureColumnSize = 12;
145 173
146 ChromaConfig config; 174 ChromaConfig config;
147 config.FS = lrintf(m_inputSampleRate) / decimationFactor; 175 config.FS = m_processRate;
148 config.min = Pitch::getFrequencyForPitch(24, 0, 440); 176 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
149 config.max = Pitch::getFrequencyForPitch(96, 0, 440); 177 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
150 config.BPO = 12; 178 config.BPO = 12;
151 config.CQThresh = 0.0054; 179 config.CQThresh = 0.0054;
152 config.isNormalised = true; 180 config.isNormalised = true;
153 m_chromagram = new Chromagram(config); 181 m_chromagram = new Chromagram(config);
154 m_fftSize = m_chromagram->getFrameSize(); 182 m_fftSize = m_chromagram->getFrameSize();
155 183
184 std::cerr << "fftsize = " << m_fftSize << std::endl;
185
186 m_rhythmClipFrameSize = m_fftSize / 16;
187 while (m_rhythmClipFrameSize < 512) m_rhythmClipFrameSize *= 2;
188 std::cerr << "m_rhythmClipFrameSize = " << m_rhythmClipFrameSize << std::endl;
189
156 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl; 190 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl;
157 191
158 } else { 192 } else {
159 193
160 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl; 194 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl;
161 return false; 195 return false;
162 } 196 }
163 197
198 if (needRhythm()) {
199 m_rhythmClipFrames =
200 int(ceil((m_rhythmClipDuration * m_processRate)
201 / m_rhythmClipFrameSize));
202 std::cerr << "SimilarityPlugin::initialise: rhythm clip is "
203 << m_rhythmClipFrames << " frames of size "
204 << m_rhythmClipFrameSize << " at process rate "
205 << m_processRate << " ( = "
206 << (float(m_rhythmClipFrames * m_rhythmClipFrameSize) / m_processRate) << " sec )"
207 << std::endl;
208
209 MFCCConfig config(m_processRate);
210 config.fftsize = m_rhythmClipFrameSize;
211 config.nceps = m_featureColumnSize - 1;
212 config.want_c0 = true;
213 config.logpower = 1;
214 config.window = RectangularWindow; // because no overlap
215 m_rhythmfcc = new MFCC(config);
216 }
217
164 for (int i = 0; i < m_channels; ++i) { 218 for (int i = 0; i < m_channels; ++i) {
219
165 m_values.push_back(FeatureMatrix()); 220 m_values.push_back(FeatureMatrix());
166 } 221
222 if (needRhythm()) {
223 m_rhythmValues.push_back(FeatureColumnQueue());
224 }
225 }
226
227 m_done = false;
167 228
168 return true; 229 return true;
169 } 230 }
170 231
171 void 232 void
172 SimilarityPlugin::reset() 233 SimilarityPlugin::reset()
173 { 234 {
174 //!!! 235 //!!!
236 m_done = false;
175 } 237 }
176 238
177 int 239 int
178 SimilarityPlugin::getDecimationFactor() const 240 SimilarityPlugin::getDecimationFactor() const
179 { 241 {
180 int rate = lrintf(m_inputSampleRate); 242 int rate = lrintf(m_inputSampleRate);
181 int internalRate = 22050; 243 return rate / m_processRate;
182 int decimationFactor = rate / internalRate;
183 if (decimationFactor < 1) decimationFactor = 1;
184
185 // must be a power of two
186 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
187
188 return decimationFactor;
189 } 244 }
190 245
191 size_t 246 size_t
192 SimilarityPlugin::getPreferredStepSize() const 247 SimilarityPlugin::getPreferredStepSize() const
193 { 248 {
194 if (m_blockSize == 0) calculateBlockSize(); 249 if (m_blockSize == 0) calculateBlockSize();
250
251 // there is also an assumption to this effect in process()
252 // (referring to m_fftSize/2 instead of a literal post-decimation
253 // step size):
195 return m_blockSize/2; 254 return m_blockSize/2;
196 } 255 }
197 256
198 size_t 257 size_t
199 SimilarityPlugin::getPreferredBlockSize() const 258 SimilarityPlugin::getPreferredBlockSize() const
207 { 266 {
208 if (m_blockSize != 0) return; 267 if (m_blockSize != 0) return;
209 int decimationFactor = getDecimationFactor(); 268 int decimationFactor = getDecimationFactor();
210 if (m_type == TypeChroma) { 269 if (m_type == TypeChroma) {
211 ChromaConfig config; 270 ChromaConfig config;
212 config.FS = lrintf(m_inputSampleRate) / decimationFactor; 271 config.FS = m_processRate;
213 config.min = Pitch::getFrequencyForPitch(24, 0, 440); 272 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
214 config.max = Pitch::getFrequencyForPitch(96, 0, 440); 273 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
215 config.BPO = 12; 274 config.BPO = 12;
216 config.CQThresh = 0.0054; 275 config.CQThresh = 0.0054;
217 config.isNormalised = false; 276 config.isNormalised = false;
240 desc.quantizeStep = 1; 299 desc.quantizeStep = 1;
241 desc.valueNames.push_back("Timbral (MFCC)"); 300 desc.valueNames.push_back("Timbral (MFCC)");
242 desc.valueNames.push_back("Chromatic (Chroma)"); 301 desc.valueNames.push_back("Chromatic (Chroma)");
243 list.push_back(desc); 302 list.push_back(desc);
244 303
304 desc.identifier = "rhythmWeighting";
305 desc.name = "Influence of Rhythm";
306 desc.description = "Proportion of similarity measure made up from rhythmic similarity component, from 0 (entirely timbral or chromatic) to 100 (entirely rhythmic).";
307 desc.unit = "%";
308 desc.minValue = 0;
309 desc.maxValue = 100;
310 desc.defaultValue = 0;
311 desc.isQuantized = true;
312 desc.quantizeStep = 1;
313 desc.valueNames.clear();
314 list.push_back(desc);
315
245 return list; 316 return list;
246 } 317 }
247 318
248 float 319 float
249 SimilarityPlugin::getParameter(std::string param) const 320 SimilarityPlugin::getParameter(std::string param) const
250 { 321 {
251 if (param == "featureType") { 322 if (param == "featureType") {
252 if (m_type == TypeMFCC) return 0; 323 if (m_type == TypeMFCC) return 0;
253 else if (m_type == TypeChroma) return 1; 324 else if (m_type == TypeChroma) return 1;
254 else return 0; 325 else return 0;
326 } else if (param == "rhythmWeighting") {
327 return nearbyint(m_rhythmWeighting * 100.0);
255 } 328 }
256 329
257 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \"" 330 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
258 << param << "\"" << std::endl; 331 << param << "\"" << std::endl;
259 return 0.0; 332 return 0.0;
266 int v = int(value + 0.1); 339 int v = int(value + 0.1);
267 Type prevType = m_type; 340 Type prevType = m_type;
268 if (v == 0) m_type = TypeMFCC; 341 if (v == 0) m_type = TypeMFCC;
269 else if (v == 1) m_type = TypeChroma; 342 else if (v == 1) m_type = TypeChroma;
270 if (m_type != prevType) m_blockSize = 0; 343 if (m_type != prevType) m_blockSize = 0;
344 return;
345 } else if (param == "rhythmWeighting") {
346 m_rhythmWeighting = value / 100;
271 return; 347 return;
272 } 348 }
273 349
274 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \"" 350 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
275 << param << "\"" << std::endl; 351 << param << "\"" << std::endl;
353 variances.sampleRate = 1; 429 variances.sampleRate = 1;
354 430
355 m_variancesOutput = list.size(); 431 m_variancesOutput = list.size();
356 list.push_back(variances); 432 list.push_back(variances);
357 433
434 OutputDescriptor beatspectrum;
435 beatspectrum.identifier = "beatspectrum";
436 beatspectrum.name = "Beat Spectra";
437 beatspectrum.description = "Rhythmic self-similarity vectors (beat spectra) for the input channels. Feature time (sec) corresponds to input channel. Not returned if rhythm weighting is zero.";
438 beatspectrum.unit = "";
439 if (m_rhythmClipFrames > 0) {
440 beatspectrum.hasFixedBinCount = true;
441 beatspectrum.binCount = m_rhythmClipFrames / 2;
442 } else {
443 beatspectrum.hasFixedBinCount = false;
444 }
445 beatspectrum.hasKnownExtents = false;
446 beatspectrum.isQuantized = false;
447 beatspectrum.sampleType = OutputDescriptor::FixedSampleRate;
448 beatspectrum.sampleRate = 1;
449
450 m_beatSpectraOutput = list.size();
451 list.push_back(beatspectrum);
452
358 return list; 453 return list;
359 } 454 }
360 455
361 SimilarityPlugin::FeatureSet 456 SimilarityPlugin::FeatureSet
362 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */) 457 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */)
363 { 458 {
459 if (m_done) {
460 return FeatureSet();
461 }
462
364 double *dblbuf = new double[m_blockSize]; 463 double *dblbuf = new double[m_blockSize];
365 double *decbuf = dblbuf; 464 double *decbuf = dblbuf;
366 if (m_decimator) decbuf = new double[m_fftSize]; 465 if (m_decimator) decbuf = new double[m_fftSize];
367 466
368 double *raw = 0; 467 double *raw = new double[std::max(m_featureColumnSize,
369 bool ownRaw = false; 468 m_rhythmColumnSize)];
370
371 if (m_type == TypeMFCC) {
372 raw = new double[m_featureColumnSize];
373 ownRaw = true;
374 }
375 469
376 float threshold = 1e-10; 470 float threshold = 1e-10;
471
472 bool someRhythmFrameNeeded = false;
377 473
378 for (size_t c = 0; c < m_channels; ++c) { 474 for (size_t c = 0; c < m_channels; ++c) {
379 475
380 bool empty = true; 476 bool empty = true;
381 477
383 float val = inputBuffers[c][i]; 479 float val = inputBuffers[c][i];
384 if (fabs(val) > threshold) empty = false; 480 if (fabs(val) > threshold) empty = false;
385 dblbuf[i] = val; 481 dblbuf[i] = val;
386 } 482 }
387 483
388 if (empty) continue; 484 if (empty) {
485 if (needRhythm() && ((m_frameNo % 2) == 0)) {
486 for (int i = 0; i < m_fftSize / m_rhythmClipFrameSize; ++i) {
487 if (m_rhythmValues[c].size() < m_rhythmClipFrames) {
488 FeatureColumn mf(m_rhythmColumnSize);
489 for (int i = 0; i < m_rhythmColumnSize; ++i) {
490 mf[i] = 0.0;
491 }
492 m_rhythmValues[c].push_back(mf);
493 }
494 }
495 }
496 continue;
497 }
498
389 m_lastNonEmptyFrame[c] = m_frameNo; 499 m_lastNonEmptyFrame[c] = m_frameNo;
390 500
391 if (m_decimator) { 501 if (m_decimator) {
392 m_decimator->process(dblbuf, decbuf); 502 m_decimator->process(dblbuf, decbuf);
393 } 503 }
394 504
395 if (m_type == TypeMFCC) { 505 if (needTimbre()) {
396 m_mfcc->process(decbuf, raw); 506
397 } else if (m_type == TypeChroma) { 507 if (m_type == TypeMFCC) {
398 raw = m_chromagram->process(decbuf); 508 m_mfcc->process(decbuf, raw);
399 } 509 } else if (m_type == TypeChroma) {
510 raw = m_chromagram->process(decbuf);
511 }
400 512
401 FeatureColumn mf(m_featureColumnSize); 513 FeatureColumn mf(m_featureColumnSize);
402 // std::cout << m_frameNo << ":" << c << ": "; 514 for (int i = 0; i < m_featureColumnSize; ++i) {
403 for (int i = 0; i < m_featureColumnSize; ++i) { 515 mf[i] = raw[i];
404 mf[i] = raw[i]; 516 }
405 // std::cout << raw[i] << " "; 517
406 } 518 m_values[c].push_back(mf);
407 // std::cout << std::endl; 519 }
408 520
409 m_values[c].push_back(mf); 521 // std::cerr << "needRhythm = " << needRhythm() << ", frame = " << m_frameNo << std::endl;
522
523 if (needRhythm() && ((m_frameNo % 2) == 0)) {
524
525 // The incoming frames are overlapping; we only use every
526 // other one, because we don't want the overlap (it would
527 // screw up the rhythm)
528
529 int frameOffset = 0;
530
531 while (frameOffset + m_rhythmClipFrameSize <= m_fftSize) {
532
533 bool needRhythmFrame = true;
534
535 if (m_rhythmValues[c].size() >= m_rhythmClipFrames) {
536
537 needRhythmFrame = false;
538
539 // assumes hopsize = framesize/2
540 float current = m_frameNo * (m_fftSize/2) + frameOffset;
541 current = current / m_processRate;
542 if (current - m_rhythmClipDuration < m_rhythmClipOrigin) {
543 needRhythmFrame = true;
544 m_rhythmValues[c].pop_front();
545 }
546
547 if (needRhythmFrame) {
548 std::cerr << "at current = " <<current << " (frame = " << m_frameNo << "), have " << m_rhythmValues[c].size() << ", need rhythm = " << needRhythmFrame << std::endl;
549 }
550
551 }
552
553 if (needRhythmFrame) {
554
555 someRhythmFrameNeeded = true;
556
557 m_rhythmfcc->process(decbuf + frameOffset, raw);
558
559 FeatureColumn mf(m_rhythmColumnSize);
560 for (int i = 0; i < m_rhythmColumnSize; ++i) {
561 mf[i] = raw[i];
562 }
563
564 m_rhythmValues[c].push_back(mf);
565 }
566
567 frameOffset += m_rhythmClipFrameSize;
568 }
569 }
570 }
571
572 if (!needTimbre() && !someRhythmFrameNeeded && ((m_frameNo % 2) == 0)) {
573 std::cerr << "done!" << std::endl;
574 m_done = true;
410 } 575 }
411 576
412 if (m_decimator) delete[] decbuf; 577 if (m_decimator) delete[] decbuf;
413 delete[] dblbuf; 578 delete[] dblbuf;
414 579 delete[] raw;
415 if (ownRaw) delete[] raw;
416 580
417 ++m_frameNo; 581 ++m_frameNo;
418 582
419 return FeatureSet(); 583 return FeatureSet();
420 } 584 }
421 585
422 SimilarityPlugin::FeatureSet 586 SimilarityPlugin::FeatureMatrix
423 SimilarityPlugin::getRemainingFeatures() 587 SimilarityPlugin::calculateTimbral(FeatureSet &returnFeatures)
424 { 588 {
425 std::vector<FeatureColumn> m(m_channels); 589 FeatureMatrix m(m_channels); // means
426 std::vector<FeatureColumn> v(m_channels); 590 FeatureMatrix v(m_channels); // variances
427 591
428 for (int i = 0; i < m_channels; ++i) { 592 for (int i = 0; i < m_channels; ++i) {
429 593
430 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize); 594 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize);
431 595
439 // last non-empty frame (which may be partial) 603 // last non-empty frame (which may be partial)
440 604
441 int sz = m_lastNonEmptyFrame[i]; 605 int sz = m_lastNonEmptyFrame[i];
442 if (sz < 0) sz = 0; 606 if (sz < 0) sz = 0;
443 607
444 // std::cout << "\nBin " << j << ":" << std::endl;
445
446 count = 0; 608 count = 0;
447 for (int k = 0; k < sz; ++k) { 609 for (int k = 0; k < sz; ++k) {
448 double val = m_values[i][k][j]; 610 double val = m_values[i][k][j];
449 // std::cout << val << " ";
450 if (isnan(val) || isinf(val)) continue; 611 if (isnan(val) || isinf(val)) continue;
451 mean[j] += val; 612 mean[j] += val;
452 ++count; 613 ++count;
453 } 614 }
454 if (count > 0) mean[j] /= count; 615 if (count > 0) mean[j] /= count;
455 // std::cout << "\n" << count << " non-NaN non-inf values, so mean = " << mean[j] << std::endl;
456 616
457 count = 0; 617 count = 0;
458 for (int k = 0; k < sz; ++k) { 618 for (int k = 0; k < sz; ++k) {
459 double val = ((m_values[i][k][j] - mean[j]) * 619 double val = ((m_values[i][k][j] - mean[j]) *
460 (m_values[i][k][j] - mean[j])); 620 (m_values[i][k][j] - mean[j]));
461 if (isnan(val) || isinf(val)) continue; 621 if (isnan(val) || isinf(val)) continue;
462 variance[j] += val; 622 variance[j] += val;
463 ++count; 623 ++count;
464 } 624 }
465 if (count > 0) variance[j] /= count; 625 if (count > 0) variance[j] /= count;
466 // std::cout << "... and variance = " << variance[j] << std::endl;
467 } 626 }
468 627
469 m[i] = mean; 628 m[i] = mean;
470 v[i] = variance; 629 v[i] = variance;
471 } 630 }
472
473 // we want to return a matrix of the distances between channels,
474 // but Vamp doesn't have a matrix return type so we actually
475 // return a series of vectors
476
477 std::vector<std::vector<double> > distances;
478 631
479 // "Despite the fact that MFCCs extracted from music are clearly 632 // "Despite the fact that MFCCs extracted from music are clearly
480 // not Gaussian, [14] showed, somewhat surprisingly, that a 633 // not Gaussian, [14] showed, somewhat surprisingly, that a
481 // similarity function comparing single Gaussians modelling MFCCs 634 // similarity function comparing single Gaussians modelling MFCCs
482 // for each track can perform as well as mixture models. A great 635 // for each track can perform as well as mixture models. A great
484 // form exists for the KL divergence." -- Mark Levy, "Lightweight 637 // form exists for the KL divergence." -- Mark Levy, "Lightweight
485 // measures for timbral similarity of musical audio" 638 // measures for timbral similarity of musical audio"
486 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf) 639 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
487 640
488 KLDivergence kld; 641 KLDivergence kld;
642 FeatureMatrix distances(m_channels);
489 643
490 for (int i = 0; i < m_channels; ++i) { 644 for (int i = 0; i < m_channels; ++i) {
491 distances.push_back(std::vector<double>());
492 for (int j = 0; j < m_channels; ++j) { 645 for (int j = 0; j < m_channels; ++j) {
493 double d = kld.distance(m[i], v[i], m[j], v[j]); 646 double d = kld.distance(m[i], v[i], m[j], v[j]);
494 distances[i].push_back(d); 647 distances[i].push_back(d);
495 } 648 }
496 } 649 }
497 650
651 Feature feature;
652 feature.hasTimestamp = true;
653
654 char labelBuffer[100];
655
656 for (int i = 0; i < m_channels; ++i) {
657
658 feature.timestamp = Vamp::RealTime(i, 0);
659
660 sprintf(labelBuffer, "Means for channel %d", i+1);
661 feature.label = labelBuffer;
662
663 feature.values.clear();
664 for (int k = 0; k < m_featureColumnSize; ++k) {
665 feature.values.push_back(m[i][k]);
666 }
667
668 returnFeatures[m_meansOutput].push_back(feature);
669
670 sprintf(labelBuffer, "Variances for channel %d", i+1);
671 feature.label = labelBuffer;
672
673 feature.values.clear();
674 for (int k = 0; k < m_featureColumnSize; ++k) {
675 feature.values.push_back(v[i][k]);
676 }
677
678 returnFeatures[m_variancesOutput].push_back(feature);
679 }
680
681 return distances;
682 }
683
684 SimilarityPlugin::FeatureMatrix
685 SimilarityPlugin::calculateRhythmic(FeatureSet &returnFeatures)
686 {
687 if (!needRhythm()) return FeatureMatrix();
688
689 BeatSpectrum bscalc;
690 CosineDistance cd;
691
692 // Our rhythm feature matrix is a deque of vectors for practical
693 // reasons, but BeatSpectrum::process wants a vector of vectors
694 // (which is what FeatureMatrix happens to be).
695
696 FeatureMatrixSet bsinput(m_channels);
697 for (int i = 0; i < m_channels; ++i) {
698 for (int j = 0; j < m_rhythmValues[i].size(); ++j) {
699 bsinput[i].push_back(m_rhythmValues[i][j]);
700 }
701 }
702
703 FeatureMatrix bs(m_channels);
704 for (int i = 0; i < m_channels; ++i) {
705 bs[i] = bscalc.process(bsinput[i]);
706 }
707
708 FeatureMatrix distances(m_channels);
709 for (int i = 0; i < m_channels; ++i) {
710 for (int j = 0; j < m_channels; ++j) {
711 double d = cd.distance(bs[i], bs[j]);
712 distances[i].push_back(d);
713 }
714 }
715
716 Feature feature;
717 feature.hasTimestamp = true;
718
719 char labelBuffer[100];
720
721 for (int i = 0; i < m_channels; ++i) {
722
723 feature.timestamp = Vamp::RealTime(i, 0);
724
725 sprintf(labelBuffer, "Beat spectrum for channel %d", i+1);
726 feature.label = labelBuffer;
727
728 feature.values.clear();
729 for (int j = 0; j < bs[i].size(); ++j) {
730 feature.values.push_back(bs[i][j]);
731 }
732
733 returnFeatures[m_beatSpectraOutput].push_back(feature);
734 }
735
736 return distances;
737 }
738
739 double
740 SimilarityPlugin::getDistance(const FeatureMatrix &timbral,
741 const FeatureMatrix &rhythmic,
742 int i, int j)
743 {
744 double distance = 1.0;
745 if (needTimbre()) distance *= timbral[i][j];
746 if (needRhythm()) distance *= rhythmic[i][j];
747 return distance;
748 }
749
750 SimilarityPlugin::FeatureSet
751 SimilarityPlugin::getRemainingFeatures()
752 {
753 FeatureSet returnFeatures;
754
755 // We want to return a matrix of the distances between channels,
756 // but Vamp doesn't have a matrix return type so we will actually
757 // return a series of vectors
758
759 FeatureMatrix timbralDistances, rhythmicDistances;
760
761 if (needTimbre()) {
762 timbralDistances = calculateTimbral(returnFeatures);
763 }
764
765 if (needRhythm()) {
766 rhythmicDistances = calculateRhythmic(returnFeatures);
767 }
768
498 // We give all features a timestamp, otherwise hosts will tend to 769 // We give all features a timestamp, otherwise hosts will tend to
499 // stamp them at the end of the file, which is annoying 770 // stamp them at the end of the file, which is annoying
500
501 FeatureSet returnFeatures;
502 771
503 Feature feature; 772 Feature feature;
504 feature.hasTimestamp = true; 773 feature.hasTimestamp = true;
505 774
506 Feature distanceVectorFeature; 775 Feature distanceVectorFeature;
514 783
515 for (int i = 0; i < m_channels; ++i) { 784 for (int i = 0; i < m_channels; ++i) {
516 785
517 feature.timestamp = Vamp::RealTime(i, 0); 786 feature.timestamp = Vamp::RealTime(i, 0);
518 787
519 sprintf(labelBuffer, "Means for channel %d", i+1);
520 feature.label = labelBuffer;
521
522 feature.values.clear();
523 for (int k = 0; k < m_featureColumnSize; ++k) {
524 feature.values.push_back(m[i][k]);
525 }
526
527 returnFeatures[m_meansOutput].push_back(feature);
528
529 sprintf(labelBuffer, "Variances for channel %d", i+1);
530 feature.label = labelBuffer;
531
532 feature.values.clear();
533 for (int k = 0; k < m_featureColumnSize; ++k) {
534 feature.values.push_back(v[i][k]);
535 }
536
537 returnFeatures[m_variancesOutput].push_back(feature);
538
539 feature.values.clear(); 788 feature.values.clear();
540 for (int j = 0; j < m_channels; ++j) { 789 for (int j = 0; j < m_channels; ++j) {
541 feature.values.push_back(distances[i][j]); 790 double dist = getDistance(timbralDistances, rhythmicDistances, i, j);
791 feature.values.push_back(dist);
542 } 792 }
543 793
544 sprintf(labelBuffer, "Distances from channel %d", i+1); 794 sprintf(labelBuffer, "Distances from channel %d", i+1);
545 feature.label = labelBuffer; 795 feature.label = labelBuffer;
546 796
547 returnFeatures[m_distanceMatrixOutput].push_back(feature); 797 returnFeatures[m_distanceMatrixOutput].push_back(feature);
548 798
549 distanceVectorFeature.values.push_back(distances[0][i]); 799 double fromFirst =
550 800 getDistance(timbralDistances, rhythmicDistances, 0, i);
551 sorted[distances[0][i]] = i; 801
802 distanceVectorFeature.values.push_back(fromFirst);
803 sorted[fromFirst] = i;
552 } 804 }
553 805
554 returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature); 806 returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature);
555 807
556 feature.label = "Order of channels by similarity to first channel"; 808 feature.label = "Order of channels by similarity to first channel";