annotate plugins/SimilarityPlugin.cpp @ 41:b9fb6dee85f7

* Add similarity plugin
author Chris Cannam <c.cannam@qmul.ac.uk>
date Fri, 11 Jan 2008 18:18:45 +0000
parents
children 0f85778f1b53
rev   line source
c@41 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
c@41 2
c@41 3 /*
c@41 4 * SimilarityPlugin.cpp
c@41 5 *
c@41 6 * Copyright 2008 Centre for Digital Music, Queen Mary, University of London.
c@41 7 * All rights reserved.
c@41 8 */
c@41 9
c@41 10 #include <iostream>
c@41 11 #include <sstream>
c@41 12
c@41 13 #include "SimilarityPlugin.h"
c@41 14 #include "dsp/mfcc/MFCC.h"
c@41 15 #include "dsp/rateconversion/Decimator.h"
c@41 16
c@41 17 using std::string;
c@41 18 using std::vector;
c@41 19 using std::cerr;
c@41 20 using std::endl;
c@41 21 using std::ostringstream;
c@41 22
c@41 23 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) :
c@41 24 Plugin(inputSampleRate),
c@41 25 m_mfcc(0),
c@41 26 m_decimator(0),
c@41 27 m_K(20),
c@41 28 m_blockSize(0),
c@41 29 m_channels(0)
c@41 30 {
c@41 31
c@41 32 }
c@41 33
c@41 34 SimilarityPlugin::~SimilarityPlugin()
c@41 35 {
c@41 36 delete m_mfcc;
c@41 37 delete m_decimator;
c@41 38 }
c@41 39
c@41 40 string
c@41 41 SimilarityPlugin::getIdentifier() const
c@41 42 {
c@41 43 return "qm-similarity";
c@41 44 }
c@41 45
c@41 46 string
c@41 47 SimilarityPlugin::getName() const
c@41 48 {
c@41 49 return "Similarity";
c@41 50 }
c@41 51
c@41 52 string
c@41 53 SimilarityPlugin::getDescription() const
c@41 54 {
c@41 55 return "Return a distance metric for overall timbral similarity between the input audio channels";
c@41 56 }
c@41 57
c@41 58 string
c@41 59 SimilarityPlugin::getMaker() const
c@41 60 {
c@41 61 return "Chris Cannam, Queen Mary, University of London";
c@41 62 }
c@41 63
c@41 64 int
c@41 65 SimilarityPlugin::getPluginVersion() const
c@41 66 {
c@41 67 return 1;
c@41 68 }
c@41 69
c@41 70 string
c@41 71 SimilarityPlugin::getCopyright() const
c@41 72 {
c@41 73 return "Copyright (c) 2008 - All Rights Reserved";
c@41 74 }
c@41 75
c@41 76 size_t
c@41 77 SimilarityPlugin::getMinChannelCount() const
c@41 78 {
c@41 79 return 2;
c@41 80 }
c@41 81
c@41 82 size_t
c@41 83 SimilarityPlugin::getMaxChannelCount() const
c@41 84 {
c@41 85 return 1024;
c@41 86 }
c@41 87
c@41 88 bool
c@41 89 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
c@41 90 {
c@41 91 if (channels < getMinChannelCount() ||
c@41 92 channels > getMaxChannelCount()) return false;
c@41 93
c@41 94 if (stepSize != getPreferredStepSize()) {
c@41 95 std::cerr << "SimilarityPlugin::initialise: supplied step size "
c@41 96 << stepSize << " differs from required step size "
c@41 97 << getPreferredStepSize() << std::endl;
c@41 98 return false;
c@41 99 }
c@41 100
c@41 101 if (blockSize != getPreferredBlockSize()) {
c@41 102 std::cerr << "SimilarityPlugin::initialise: supplied block size "
c@41 103 << blockSize << " differs from required block size "
c@41 104 << getPreferredBlockSize() << std::endl;
c@41 105 return false;
c@41 106 }
c@41 107
c@41 108 m_blockSize = blockSize;
c@41 109 m_channels = channels;
c@41 110
c@41 111 int decimationFactor = getDecimationFactor();
c@41 112 if (decimationFactor > 1) {
c@41 113 m_decimator = new Decimator(getPreferredBlockSize(), decimationFactor);
c@41 114 }
c@41 115
c@41 116 MFCCConfig config;
c@41 117 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@41 118 config.fftsize = 2048;
c@41 119 config.nceps = m_K - 1;
c@41 120 config.want_c0 = true;
c@41 121 m_mfcc = new MFCC(config);
c@41 122
c@41 123 for (int i = 0; i < m_channels; ++i) {
c@41 124 m_mfeatures.push_back(MFCCFeatureVector());
c@41 125 }
c@41 126
c@41 127 return true;
c@41 128 }
c@41 129
c@41 130 void
c@41 131 SimilarityPlugin::reset()
c@41 132 {
c@41 133 //!!!
c@41 134 }
c@41 135
c@41 136 int
c@41 137 SimilarityPlugin::getDecimationFactor() const
c@41 138 {
c@41 139 int rate = lrintf(m_inputSampleRate);
c@41 140 int internalRate = 22050;
c@41 141 int decimationFactor = rate / internalRate;
c@41 142 if (decimationFactor < 1) decimationFactor = 1;
c@41 143
c@41 144 // must be a power of two
c@41 145 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
c@41 146
c@41 147 return decimationFactor;
c@41 148 }
c@41 149
c@41 150 size_t
c@41 151 SimilarityPlugin::getPreferredStepSize() const
c@41 152 {
c@41 153 return 1024 * getDecimationFactor();
c@41 154 }
c@41 155
c@41 156 size_t
c@41 157 SimilarityPlugin::getPreferredBlockSize() const
c@41 158 {
c@41 159 return 2048 * getDecimationFactor();
c@41 160 }
c@41 161
c@41 162 SimilarityPlugin::ParameterList SimilarityPlugin::getParameterDescriptors() const
c@41 163 {
c@41 164 ParameterList list;
c@41 165 return list;
c@41 166 }
c@41 167
c@41 168 float
c@41 169 SimilarityPlugin::getParameter(std::string param) const
c@41 170 {
c@41 171 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
c@41 172 << param << "\"" << std::endl;
c@41 173 return 0.0;
c@41 174 }
c@41 175
c@41 176 void
c@41 177 SimilarityPlugin::setParameter(std::string param, float value)
c@41 178 {
c@41 179 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
c@41 180 << param << "\"" << std::endl;
c@41 181 }
c@41 182
c@41 183 SimilarityPlugin::OutputList
c@41 184 SimilarityPlugin::getOutputDescriptors() const
c@41 185 {
c@41 186 OutputList list;
c@41 187
c@41 188 OutputDescriptor similarity;
c@41 189 similarity.identifier = "distance";
c@41 190 similarity.name = "Distance";
c@41 191 similarity.description = "Distance Metric for Timbral Similarity (smaller = more similar)";
c@41 192 similarity.unit = "";
c@41 193 similarity.hasFixedBinCount = true;
c@41 194 similarity.binCount = m_channels;
c@41 195 similarity.hasKnownExtents = false;
c@41 196 similarity.isQuantized = false;
c@41 197 similarity.sampleType = OutputDescriptor::FixedSampleRate;
c@41 198 similarity.sampleRate = 1;
c@41 199
c@41 200 list.push_back(similarity);
c@41 201
c@41 202 OutputDescriptor means;
c@41 203 means.identifier = "means";
c@41 204 means.name = "MFCC Means";
c@41 205 means.description = "";
c@41 206 means.unit = "";
c@41 207 means.hasFixedBinCount = true;
c@41 208 means.binCount = m_channels;
c@41 209 means.hasKnownExtents = false;
c@41 210 means.isQuantized = false;
c@41 211 means.sampleType = OutputDescriptor::VariableSampleRate;
c@41 212 means.sampleRate = m_inputSampleRate / getPreferredStepSize();
c@41 213
c@41 214 list.push_back(means);
c@41 215
c@41 216 OutputDescriptor variances;
c@41 217 variances.identifier = "variances";
c@41 218 variances.name = "MFCC Variances";
c@41 219 variances.description = "";
c@41 220 variances.unit = "";
c@41 221 variances.hasFixedBinCount = true;
c@41 222 variances.binCount = m_channels;
c@41 223 variances.hasKnownExtents = false;
c@41 224 variances.isQuantized = false;
c@41 225 variances.sampleType = OutputDescriptor::VariableSampleRate;
c@41 226 variances.sampleRate = m_inputSampleRate / getPreferredStepSize();
c@41 227
c@41 228 list.push_back(variances);
c@41 229
c@41 230 return list;
c@41 231 }
c@41 232
c@41 233 SimilarityPlugin::FeatureSet
c@41 234 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */)
c@41 235 {
c@41 236 double *dblbuf = new double[m_blockSize];
c@41 237 double *decbuf = dblbuf;
c@41 238 if (m_decimator) decbuf = new double[m_mfcc->getfftlength()];
c@41 239 double *ceps = new double[m_K];
c@41 240
c@41 241 for (size_t c = 0; c < m_channels; ++c) {
c@41 242
c@41 243 for (int i = 0; i < m_blockSize; ++i) {
c@41 244 dblbuf[i] = inputBuffers[c][i];
c@41 245 }
c@41 246
c@41 247 if (m_decimator) {
c@41 248 m_decimator->process(dblbuf, decbuf);
c@41 249 }
c@41 250
c@41 251 m_mfcc->process(m_mfcc->getfftlength(), decbuf, ceps);
c@41 252
c@41 253 MFCCFeature mf(m_K);
c@41 254 for (int i = 0; i < m_K; ++i) mf[i] = ceps[i];
c@41 255
c@41 256 m_mfeatures[c].push_back(mf);
c@41 257 }
c@41 258
c@41 259 if (m_decimator) delete[] decbuf;
c@41 260 delete[] dblbuf;
c@41 261 delete[] ceps;
c@41 262
c@41 263 return FeatureSet();
c@41 264 }
c@41 265
c@41 266 SimilarityPlugin::FeatureSet
c@41 267 SimilarityPlugin::getRemainingFeatures()
c@41 268 {
c@41 269 std::vector<MFCCFeature> m(m_channels);
c@41 270 std::vector<MFCCFeature> v(m_channels);
c@41 271
c@41 272 //!!! bail if m_mfeatures vectors are empty
c@41 273
c@41 274 for (int i = 0; i < m_channels; ++i) {
c@41 275
c@41 276 MFCCFeature mean(m_K), variance(m_K);
c@41 277
c@41 278 for (int j = 0; j < m_K; ++j) {
c@41 279
c@41 280 mean[j] = variance[j] = 0.0;
c@41 281 int count;
c@41 282
c@41 283 count = 0;
c@41 284 for (int k = 0; k < m_mfeatures[i].size(); ++k) {
c@41 285 double val = m_mfeatures[i][k][j];
c@41 286 // std::cout << "val = " << val << std::endl;
c@41 287 if (isnan(val) || isinf(val)) continue;
c@41 288 mean[j] += val;
c@41 289 // std::cout << "mean now = " << mean[j] << std::endl;
c@41 290 ++count;
c@41 291 }
c@41 292 if (count > 0) mean[j] /= count;
c@41 293 // std::cout << "divided by " << count << ", mean now " << mean[j] << std::endl;
c@41 294
c@41 295 count = 0;
c@41 296 for (int k = 0; k < m_mfeatures[i].size(); ++k) {
c@41 297 double val = ((m_mfeatures[i][k][j] - mean[j]) *
c@41 298 (m_mfeatures[i][k][j] - mean[j]));
c@41 299 if (isnan(val) || isinf(val)) continue;
c@41 300 variance[j] += val;
c@41 301 ++count;
c@41 302 }
c@41 303 if (count > 0) variance[j] /= count;
c@41 304 }
c@41 305
c@41 306 m[i] = mean;
c@41 307 v[i] = variance;
c@41 308 }
c@41 309
c@41 310 // std::cout << "m[0][0] = " << m[0][0] << std::endl;
c@41 311
c@41 312 // so we sorta return a matrix of the distances between channels,
c@41 313 // but Vamp doesn't have a matrix return type so we actually
c@41 314 // return a series of vectors
c@41 315
c@41 316 std::vector<std::vector<double> > distances;
c@41 317
c@41 318 for (int i = 0; i < m_channels; ++i) {
c@41 319 distances.push_back(std::vector<double>());
c@41 320 for (int j = 0; j < m_channels; ++j) {
c@41 321 double d = -2.0 * m_K;
c@41 322 for (int k = 0; k < m_K; ++k) {
c@41 323 // m[i][k] is the mean of mfcc k for channel i
c@41 324 // v[i][k] is the variance of mfcc k for channel i
c@41 325 d += v[i][k] / v[j][k] + v[j][k] / v[i][k];
c@41 326 d += (m[i][k] - m[j][k])
c@41 327 * (1.0 / v[i][k] + 1.0 / v[j][k])
c@41 328 * (m[i][k] - m[j][k]);
c@41 329 }
c@41 330 d /= 2.0;
c@41 331 distances[i].push_back(d);
c@41 332 }
c@41 333 }
c@41 334
c@41 335 FeatureSet returnFeatures;
c@41 336
c@41 337 for (int i = 0; i < m_channels; ++i) {
c@41 338
c@41 339 Feature feature;
c@41 340 feature.hasTimestamp = true; // otherwise hosts will tend to stamp them at the end of the file, which is annoying
c@41 341 feature.timestamp = Vamp::RealTime(i, 0);
c@41 342
c@41 343 feature.values.clear();
c@41 344 for (int k = 0; k < m_K; ++k) {
c@41 345 feature.values.push_back(m[i][k]);
c@41 346 }
c@41 347
c@41 348 returnFeatures[1].push_back(feature);
c@41 349
c@41 350 feature.values.clear();
c@41 351 for (int k = 0; k < m_K; ++k) {
c@41 352 feature.values.push_back(v[i][k]);
c@41 353 }
c@41 354
c@41 355 returnFeatures[2].push_back(feature);
c@41 356
c@41 357 feature.values.clear();
c@41 358 for (int j = 0; j < m_channels; ++j) {
c@41 359 feature.values.push_back(distances[i][j]);
c@41 360 }
c@41 361 ostringstream oss;
c@41 362 oss << "Distance from " << (i + 1);
c@41 363 feature.label = oss.str();
c@41 364
c@41 365 returnFeatures[0].push_back(feature);
c@41 366 }
c@41 367
c@41 368 return returnFeatures;
c@41 369 }