annotate plugins/SimilarityPlugin.cpp @ 42:0f85778f1b53

* some more fixes, + add chroma option to similarity plugin
author Chris Cannam <c.cannam@qmul.ac.uk>
date Mon, 14 Jan 2008 18:14:55 +0000
parents b9fb6dee85f7
children 1389f05cb688
rev   line source
c@41 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
c@41 2
c@41 3 /*
c@41 4 * SimilarityPlugin.cpp
c@41 5 *
c@41 6 * Copyright 2008 Centre for Digital Music, Queen Mary, University of London.
c@41 7 * All rights reserved.
c@41 8 */
c@41 9
#include <cmath>
#include <iostream>
#include <sstream>

#include "SimilarityPlugin.h"
#include "base/Pitch.h"
#include "dsp/mfcc/MFCC.h"
#include "dsp/chromagram/Chromagram.h"
#include "dsp/rateconversion/Decimator.h"
c@41 18
c@41 19 using std::string;
c@41 20 using std::vector;
c@41 21 using std::cerr;
c@41 22 using std::endl;
c@41 23 using std::ostringstream;
c@41 24
c@41 25 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) :
c@41 26 Plugin(inputSampleRate),
c@42 27 m_type(TypeMFCC),
c@41 28 m_mfcc(0),
c@42 29 m_chromagram(0),
c@41 30 m_decimator(0),
c@42 31 m_featureColumnSize(20),
c@41 32 m_blockSize(0),
c@41 33 m_channels(0)
c@41 34 {
c@41 35
c@41 36 }
c@41 37
c@41 38 SimilarityPlugin::~SimilarityPlugin()
c@41 39 {
c@41 40 delete m_mfcc;
c@42 41 delete m_chromagram;
c@41 42 delete m_decimator;
c@41 43 }
c@41 44
c@41 45 string
c@41 46 SimilarityPlugin::getIdentifier() const
c@41 47 {
c@41 48 return "qm-similarity";
c@41 49 }
c@41 50
c@41 51 string
c@41 52 SimilarityPlugin::getName() const
c@41 53 {
c@41 54 return "Similarity";
c@41 55 }
c@41 56
c@41 57 string
c@41 58 SimilarityPlugin::getDescription() const
c@41 59 {
c@42 60 return "Return a distance matrix for similarity between the input audio channels";
c@41 61 }
c@41 62
c@41 63 string
c@41 64 SimilarityPlugin::getMaker() const
c@41 65 {
c@41 66 return "Chris Cannam, Queen Mary, University of London";
c@41 67 }
c@41 68
c@41 69 int
c@41 70 SimilarityPlugin::getPluginVersion() const
c@41 71 {
c@41 72 return 1;
c@41 73 }
c@41 74
c@41 75 string
c@41 76 SimilarityPlugin::getCopyright() const
c@41 77 {
c@41 78 return "Copyright (c) 2008 - All Rights Reserved";
c@41 79 }
c@41 80
c@41 81 size_t
c@41 82 SimilarityPlugin::getMinChannelCount() const
c@41 83 {
c@41 84 return 2;
c@41 85 }
c@41 86
c@41 87 size_t
c@41 88 SimilarityPlugin::getMaxChannelCount() const
c@41 89 {
c@41 90 return 1024;
c@41 91 }
c@41 92
c@41 93 bool
c@41 94 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
c@41 95 {
c@41 96 if (channels < getMinChannelCount() ||
c@41 97 channels > getMaxChannelCount()) return false;
c@41 98
c@41 99 if (stepSize != getPreferredStepSize()) {
c@41 100 std::cerr << "SimilarityPlugin::initialise: supplied step size "
c@41 101 << stepSize << " differs from required step size "
c@41 102 << getPreferredStepSize() << std::endl;
c@41 103 return false;
c@41 104 }
c@41 105
c@41 106 if (blockSize != getPreferredBlockSize()) {
c@41 107 std::cerr << "SimilarityPlugin::initialise: supplied block size "
c@41 108 << blockSize << " differs from required block size "
c@41 109 << getPreferredBlockSize() << std::endl;
c@41 110 return false;
c@41 111 }
c@41 112
c@41 113 m_blockSize = blockSize;
c@41 114 m_channels = channels;
c@41 115
c@41 116 int decimationFactor = getDecimationFactor();
c@41 117 if (decimationFactor > 1) {
c@42 118 m_decimator = new Decimator(m_blockSize, decimationFactor);
c@41 119 }
c@41 120
c@42 121 if (m_type == TypeMFCC) {
c@42 122
c@42 123 m_featureColumnSize = 20;
c@42 124
c@42 125 MFCCConfig config;
c@42 126 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42 127 config.fftsize = 2048;
c@42 128 config.nceps = m_featureColumnSize - 1;
c@42 129 config.want_c0 = true;
c@42 130 m_mfcc = new MFCC(config);
c@42 131 m_fftSize = m_mfcc->getfftlength();
c@42 132
c@42 133 } else if (m_type == TypeChroma) {
c@42 134
c@42 135 m_featureColumnSize = 12;
c@42 136
c@42 137 ChromaConfig config;
c@42 138 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42 139 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
c@42 140 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
c@42 141 config.BPO = 12;
c@42 142 config.CQThresh = 0.0054;
c@42 143 config.isNormalised = true;
c@42 144 m_chromagram = new Chromagram(config);
c@42 145 m_fftSize = m_chromagram->getFrameSize();
c@42 146
c@42 147 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl;
c@42 148
c@42 149 } else {
c@42 150
c@42 151 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl;
c@42 152 return false;
c@42 153 }
c@41 154
c@41 155 for (int i = 0; i < m_channels; ++i) {
c@42 156 m_values.push_back(FeatureMatrix());
c@41 157 }
c@41 158
c@41 159 return true;
c@41 160 }
c@41 161
c@41 162 void
c@41 163 SimilarityPlugin::reset()
c@41 164 {
c@41 165 //!!!
c@41 166 }
c@41 167
c@41 168 int
c@41 169 SimilarityPlugin::getDecimationFactor() const
c@41 170 {
c@41 171 int rate = lrintf(m_inputSampleRate);
c@41 172 int internalRate = 22050;
c@41 173 int decimationFactor = rate / internalRate;
c@41 174 if (decimationFactor < 1) decimationFactor = 1;
c@41 175
c@41 176 // must be a power of two
c@41 177 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
c@41 178
c@41 179 return decimationFactor;
c@41 180 }
c@41 181
c@41 182 size_t
c@41 183 SimilarityPlugin::getPreferredStepSize() const
c@41 184 {
c@42 185 if (m_blockSize == 0) calculateBlockSize();
c@42 186 return m_blockSize/2;
c@41 187 }
c@41 188
c@41 189 size_t
c@41 190 SimilarityPlugin::getPreferredBlockSize() const
c@41 191 {
c@42 192 if (m_blockSize == 0) calculateBlockSize();
c@42 193 return m_blockSize;
c@42 194 }
c@42 195
c@42 196 void
c@42 197 SimilarityPlugin::calculateBlockSize() const
c@42 198 {
c@42 199 if (m_blockSize != 0) return;
c@42 200 int decimationFactor = getDecimationFactor();
c@42 201 if (m_type == TypeChroma) {
c@42 202 ChromaConfig config;
c@42 203 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42 204 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
c@42 205 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
c@42 206 config.BPO = 12;
c@42 207 config.CQThresh = 0.0054;
c@42 208 config.isNormalised = false;
c@42 209 Chromagram *c = new Chromagram(config);
c@42 210 size_t sz = c->getFrameSize();
c@42 211 delete c;
c@42 212 m_blockSize = sz * decimationFactor;
c@42 213 } else {
c@42 214 m_blockSize = 2048 * decimationFactor;
c@42 215 }
c@41 216 }
c@41 217
c@41 218 SimilarityPlugin::ParameterList SimilarityPlugin::getParameterDescriptors() const
c@41 219 {
c@41 220 ParameterList list;
c@42 221
c@42 222 ParameterDescriptor desc;
c@42 223 desc.identifier = "featureType";
c@42 224 desc.name = "Feature Type";
c@42 225 desc.description = "";//!!!
c@42 226 desc.unit = "";
c@42 227 desc.minValue = 0;
c@42 228 desc.maxValue = 1;
c@42 229 desc.defaultValue = 0;
c@42 230 desc.isQuantized = true;
c@42 231 desc.quantizeStep = 1;
c@42 232 desc.valueNames.push_back("Timbral (MFCC)");
c@42 233 desc.valueNames.push_back("Chromatic (Chroma)");
c@42 234 list.push_back(desc);
c@42 235
c@41 236 return list;
c@41 237 }
c@41 238
c@41 239 float
c@41 240 SimilarityPlugin::getParameter(std::string param) const
c@41 241 {
c@42 242 if (param == "featureType") {
c@42 243 if (m_type == TypeMFCC) return 0;
c@42 244 else if (m_type == TypeChroma) return 1;
c@42 245 else return 0;
c@42 246 }
c@42 247
c@41 248 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
c@41 249 << param << "\"" << std::endl;
c@41 250 return 0.0;
c@41 251 }
c@41 252
c@41 253 void
c@41 254 SimilarityPlugin::setParameter(std::string param, float value)
c@41 255 {
c@42 256 if (param == "featureType") {
c@42 257 int v = int(value + 0.1);
c@42 258 Type prevType = m_type;
c@42 259 if (v == 0) m_type = TypeMFCC;
c@42 260 else if (v == 1) m_type = TypeChroma;
c@42 261 if (m_type != prevType) m_blockSize = 0;
c@42 262 return;
c@42 263 }
c@42 264
c@41 265 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
c@41 266 << param << "\"" << std::endl;
c@41 267 }
c@41 268
c@41 269 SimilarityPlugin::OutputList
c@41 270 SimilarityPlugin::getOutputDescriptors() const
c@41 271 {
c@41 272 OutputList list;
c@41 273
c@41 274 OutputDescriptor similarity;
c@41 275 similarity.identifier = "distance";
c@41 276 similarity.name = "Distance";
c@42 277 similarity.description = "Distance Metric for Similarity (smaller = more similar)";
c@41 278 similarity.unit = "";
c@41 279 similarity.hasFixedBinCount = true;
c@41 280 similarity.binCount = m_channels;
c@41 281 similarity.hasKnownExtents = false;
c@41 282 similarity.isQuantized = false;
c@41 283 similarity.sampleType = OutputDescriptor::FixedSampleRate;
c@41 284 similarity.sampleRate = 1;
c@41 285
c@41 286 list.push_back(similarity);
c@41 287
c@41 288 OutputDescriptor means;
c@41 289 means.identifier = "means";
c@42 290 means.name = "Feature Means";
c@41 291 means.description = "";
c@41 292 means.unit = "";
c@41 293 means.hasFixedBinCount = true;
c@41 294 means.binCount = m_channels;
c@41 295 means.hasKnownExtents = false;
c@41 296 means.isQuantized = false;
c@41 297 means.sampleType = OutputDescriptor::VariableSampleRate;
c@41 298 means.sampleRate = m_inputSampleRate / getPreferredStepSize();
c@41 299
c@41 300 list.push_back(means);
c@41 301
c@41 302 OutputDescriptor variances;
c@41 303 variances.identifier = "variances";
c@42 304 variances.name = "Feature Variances";
c@41 305 variances.description = "";
c@41 306 variances.unit = "";
c@41 307 variances.hasFixedBinCount = true;
c@41 308 variances.binCount = m_channels;
c@41 309 variances.hasKnownExtents = false;
c@41 310 variances.isQuantized = false;
c@41 311 variances.sampleType = OutputDescriptor::VariableSampleRate;
c@41 312 variances.sampleRate = m_inputSampleRate / getPreferredStepSize();
c@41 313
c@41 314 list.push_back(variances);
c@41 315
c@41 316 return list;
c@41 317 }
c@41 318
c@41 319 SimilarityPlugin::FeatureSet
c@41 320 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */)
c@41 321 {
c@41 322 double *dblbuf = new double[m_blockSize];
c@41 323 double *decbuf = dblbuf;
c@42 324 if (m_decimator) decbuf = new double[m_fftSize];
c@42 325
c@42 326 double *raw = 0;
c@42 327 bool ownRaw = false;
c@42 328
c@42 329 if (m_type == TypeMFCC) {
c@42 330 raw = new double[m_featureColumnSize];
c@42 331 ownRaw = true;
c@42 332 }
c@41 333
c@41 334 for (size_t c = 0; c < m_channels; ++c) {
c@41 335
c@41 336 for (int i = 0; i < m_blockSize; ++i) {
c@41 337 dblbuf[i] = inputBuffers[c][i];
c@41 338 }
c@41 339
c@41 340 if (m_decimator) {
c@41 341 m_decimator->process(dblbuf, decbuf);
c@41 342 }
c@42 343
c@42 344 if (m_type == TypeMFCC) {
c@42 345 m_mfcc->process(m_fftSize, decbuf, raw);
c@42 346 } else if (m_type == TypeChroma) {
c@42 347 raw = m_chromagram->process(decbuf);
c@42 348 }
c@41 349
c@42 350 FeatureColumn mf(m_featureColumnSize);
c@42 351 for (int i = 0; i < m_featureColumnSize; ++i) mf[i] = raw[i];
c@41 352
c@42 353 m_values[c].push_back(mf);
c@41 354 }
c@41 355
c@41 356 if (m_decimator) delete[] decbuf;
c@41 357 delete[] dblbuf;
c@42 358
c@42 359 if (ownRaw) delete[] raw;
c@41 360
c@41 361 return FeatureSet();
c@41 362 }
c@41 363
c@41 364 SimilarityPlugin::FeatureSet
c@41 365 SimilarityPlugin::getRemainingFeatures()
c@41 366 {
c@42 367 std::vector<FeatureColumn> m(m_channels);
c@42 368 std::vector<FeatureColumn> v(m_channels);
c@41 369
c@41 370 for (int i = 0; i < m_channels; ++i) {
c@41 371
c@42 372 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize);
c@41 373
c@42 374 for (int j = 0; j < m_featureColumnSize; ++j) {
c@41 375
c@41 376 mean[j] = variance[j] = 0.0;
c@41 377 int count;
c@41 378
c@42 379 // std::cout << i << "," << j << ":" << std::endl;
c@42 380
c@41 381 count = 0;
c@42 382 for (int k = 0; k < m_values[i].size(); ++k) {
c@42 383 double val = m_values[i][k][j];
c@42 384 // std::cout << val << " ";
c@41 385 if (isnan(val) || isinf(val)) continue;
c@41 386 mean[j] += val;
c@41 387 ++count;
c@41 388 }
c@41 389 if (count > 0) mean[j] /= count;
c@42 390
c@42 391 // std::cout << std::endl;
c@41 392
c@41 393 count = 0;
c@42 394 for (int k = 0; k < m_values[i].size(); ++k) {
c@42 395 double val = ((m_values[i][k][j] - mean[j]) *
c@42 396 (m_values[i][k][j] - mean[j]));
c@41 397 if (isnan(val) || isinf(val)) continue;
c@41 398 variance[j] += val;
c@41 399 ++count;
c@41 400 }
c@41 401 if (count > 0) variance[j] /= count;
c@41 402 }
c@41 403
c@41 404 m[i] = mean;
c@41 405 v[i] = variance;
c@41 406 }
c@41 407
c@42 408 // we want to return a matrix of the distances between channels,
c@41 409 // but Vamp doesn't have a matrix return type so we actually
c@41 410 // return a series of vectors
c@41 411
c@41 412 std::vector<std::vector<double> > distances;
c@41 413
c@42 414 // "Despite the fact that MFCCs extracted from music are clearly
c@42 415 // not Gaussian, [14] showed, somewhat surprisingly, that a
c@42 416 // similarity function comparing single Gaussians modelling MFCCs
c@42 417 // for each track can perform as well as mixture models. A great
c@42 418 // advantage of using single Gaussians is that a simple closed
c@42 419 // form exists for the KL divergence." -- Mark Levy, "Lightweight
c@42 420 // measures for timbral similarity of musical audio"
c@42 421 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
c@42 422 //
c@42 423 // This code calculates a symmetrised distance metric based on the
c@42 424 // KL divergence of Gaussian models of the MFCC values.
c@42 425
c@41 426 for (int i = 0; i < m_channels; ++i) {
c@41 427 distances.push_back(std::vector<double>());
c@41 428 for (int j = 0; j < m_channels; ++j) {
c@42 429 double d = -2.0 * m_featureColumnSize;
c@42 430 for (int k = 0; k < m_featureColumnSize; ++k) {
c@42 431 // m[i][k] is the mean of feature bin k for channel i
c@42 432 // v[i][k] is the variance of feature bin k for channel i
c@41 433 d += v[i][k] / v[j][k] + v[j][k] / v[i][k];
c@41 434 d += (m[i][k] - m[j][k])
c@41 435 * (1.0 / v[i][k] + 1.0 / v[j][k])
c@41 436 * (m[i][k] - m[j][k]);
c@41 437 }
c@41 438 d /= 2.0;
c@41 439 distances[i].push_back(d);
c@41 440 }
c@41 441 }
c@41 442
c@41 443 FeatureSet returnFeatures;
c@41 444
c@41 445 for (int i = 0; i < m_channels; ++i) {
c@41 446
c@41 447 Feature feature;
c@41 448 feature.hasTimestamp = true; // otherwise hosts will tend to stamp them at the end of the file, which is annoying
c@41 449 feature.timestamp = Vamp::RealTime(i, 0);
c@41 450
c@41 451 feature.values.clear();
c@42 452 for (int k = 0; k < m_featureColumnSize; ++k) {
c@41 453 feature.values.push_back(m[i][k]);
c@41 454 }
c@41 455
c@41 456 returnFeatures[1].push_back(feature);
c@41 457
c@41 458 feature.values.clear();
c@42 459 for (int k = 0; k < m_featureColumnSize; ++k) {
c@41 460 feature.values.push_back(v[i][k]);
c@41 461 }
c@41 462
c@41 463 returnFeatures[2].push_back(feature);
c@41 464
c@41 465 feature.values.clear();
c@41 466 for (int j = 0; j < m_channels; ++j) {
c@41 467 feature.values.push_back(distances[i][j]);
c@41 468 }
c@41 469 ostringstream oss;
c@41 470 oss << "Distance from " << (i + 1);
c@41 471 feature.label = oss.str();
c@41 472
c@41 473 returnFeatures[0].push_back(feature);
c@41 474 }
c@41 475
c@41 476 return returnFeatures;
c@41 477 }