qm-vamp-plugins: plugins/SimilarityPlugin.cpp annotate

annotate plugins/SimilarityPlugin.cpp @ 43:1389f05cb688

* Various fixes

author	Chris Cannam <c.cannam@qmul.ac.uk>
date	Wed, 16 Jan 2008 18:03:25 +0000
parents	0f85778f1b53
children	1dc00e4dbae6

rev	line source
c@41	1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
c@41	2
c@41	3 /*
c@41	4 * SimilarityPlugin.cpp
c@41	5 *
c@41	6 * Copyright 2008 Centre for Digital Music, Queen Mary, University of London.
c@41	7 * All rights reserved.
c@41	8 */
c@41	9
#include <cmath>
#include <iostream>
#include <sstream>

#include "SimilarityPlugin.h"
#include "base/Pitch.h"
#include "dsp/mfcc/MFCC.h"
#include "dsp/chromagram/Chromagram.h"
#include "dsp/rateconversion/Decimator.h"
c@41	18
c@41	19 using std::string;
c@41	20 using std::vector;
c@41	21 using std::cerr;
c@41	22 using std::endl;
c@41	23 using std::ostringstream;
c@41	24
c@41	25 SimilarityPlugin::SimilarityPlugin(float inputSampleRate) :
c@41	26 Plugin(inputSampleRate),
c@42	27 m_type(TypeMFCC),
c@41	28 m_mfcc(0),
c@42	29 m_chromagram(0),
c@41	30 m_decimator(0),
c@42	31 m_featureColumnSize(20),
c@41	32 m_blockSize(0),
c@41	33 m_channels(0)
c@41	34 {
c@41	35
c@41	36 }
c@41	37
c@41	38 SimilarityPlugin::~SimilarityPlugin()
c@41	39 {
c@41	40 delete m_mfcc;
c@42	41 delete m_chromagram;
c@41	42 delete m_decimator;
c@41	43 }
c@41	44
c@41	45 string
c@41	46 SimilarityPlugin::getIdentifier() const
c@41	47 {
c@41	48 return "qm-similarity";
c@41	49 }
c@41	50
c@41	51 string
c@41	52 SimilarityPlugin::getName() const
c@41	53 {
c@41	54 return "Similarity";
c@41	55 }
c@41	56
c@41	57 string
c@41	58 SimilarityPlugin::getDescription() const
c@41	59 {
c@42	60 return "Return a distance matrix for similarity between the input audio channels";
c@41	61 }
c@41	62
c@41	63 string
c@41	64 SimilarityPlugin::getMaker() const
c@41	65 {
c@41	66 return "Chris Cannam, Queen Mary, University of London";
c@41	67 }
c@41	68
c@41	69 int
c@41	70 SimilarityPlugin::getPluginVersion() const
c@41	71 {
c@41	72 return 1;
c@41	73 }
c@41	74
c@41	75 string
c@41	76 SimilarityPlugin::getCopyright() const
c@41	77 {
c@41	78 return "Copyright (c) 2008 - All Rights Reserved";
c@41	79 }
c@41	80
c@41	81 size_t
c@41	82 SimilarityPlugin::getMinChannelCount() const
c@41	83 {
c@43	84 return 1;
c@41	85 }
c@41	86
c@41	87 size_t
c@41	88 SimilarityPlugin::getMaxChannelCount() const
c@41	89 {
c@41	90 return 1024;
c@41	91 }
c@41	92
c@41	93 bool
c@41	94 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
c@41	95 {
c@41	96 if (channels < getMinChannelCount() \|\|
c@41	97 channels > getMaxChannelCount()) return false;
c@41	98
c@41	99 if (stepSize != getPreferredStepSize()) {
c@43	100 //!!! actually this perhaps shouldn't be an error... similarly
c@43	101 //using more than getMaxChannelCount channels
c@41	102 std::cerr << "SimilarityPlugin::initialise: supplied step size "
c@41	103 << stepSize << " differs from required step size "
c@41	104 << getPreferredStepSize() << std::endl;
c@41	105 return false;
c@41	106 }
c@41	107
c@41	108 if (blockSize != getPreferredBlockSize()) {
c@41	109 std::cerr << "SimilarityPlugin::initialise: supplied block size "
c@41	110 << blockSize << " differs from required block size "
c@41	111 << getPreferredBlockSize() << std::endl;
c@41	112 return false;
c@41	113 }
c@41	114
c@41	115 m_blockSize = blockSize;
c@41	116 m_channels = channels;
c@41	117
c@41	118 int decimationFactor = getDecimationFactor();
c@41	119 if (decimationFactor > 1) {
c@42	120 m_decimator = new Decimator(m_blockSize, decimationFactor);
c@41	121 }
c@41	122
c@42	123 if (m_type == TypeMFCC) {
c@42	124
c@42	125 m_featureColumnSize = 20;
c@42	126
c@42	127 MFCCConfig config;
c@42	128 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42	129 config.fftsize = 2048;
c@42	130 config.nceps = m_featureColumnSize - 1;
c@42	131 config.want_c0 = true;
c@42	132 m_mfcc = new MFCC(config);
c@42	133 m_fftSize = m_mfcc->getfftlength();
c@42	134
c@43	135 std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl;
c@43	136
c@42	137 } else if (m_type == TypeChroma) {
c@42	138
c@42	139 m_featureColumnSize = 12;
c@42	140
c@42	141 ChromaConfig config;
c@42	142 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42	143 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
c@42	144 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
c@42	145 config.BPO = 12;
c@42	146 config.CQThresh = 0.0054;
c@42	147 config.isNormalised = true;
c@42	148 m_chromagram = new Chromagram(config);
c@42	149 m_fftSize = m_chromagram->getFrameSize();
c@42	150
c@42	151 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl;
c@42	152
c@42	153 } else {
c@42	154
c@42	155 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl;
c@42	156 return false;
c@42	157 }
c@41	158
c@41	159 for (int i = 0; i < m_channels; ++i) {
c@42	160 m_values.push_back(FeatureMatrix());
c@41	161 }
c@41	162
c@41	163 return true;
c@41	164 }
c@41	165
c@41	166 void
c@41	167 SimilarityPlugin::reset()
c@41	168 {
c@41	169 //!!!
c@41	170 }
c@41	171
c@41	172 int
c@41	173 SimilarityPlugin::getDecimationFactor() const
c@41	174 {
c@41	175 int rate = lrintf(m_inputSampleRate);
c@41	176 int internalRate = 22050;
c@41	177 int decimationFactor = rate / internalRate;
c@41	178 if (decimationFactor < 1) decimationFactor = 1;
c@41	179
c@41	180 // must be a power of two
c@41	181 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
c@41	182
c@41	183 return decimationFactor;
c@41	184 }
c@41	185
c@41	186 size_t
c@41	187 SimilarityPlugin::getPreferredStepSize() const
c@41	188 {
c@42	189 if (m_blockSize == 0) calculateBlockSize();
c@43	190 if (m_type == TypeChroma) {
c@43	191 return m_blockSize/2;
c@43	192 } else {
c@43	193 // for compatibility with old-skool Soundbite, which doesn't
c@43	194 // overlap blocks on input
c@43	195 return m_blockSize;
c@43	196 }
c@41	197 }
c@41	198
c@41	199 size_t
c@41	200 SimilarityPlugin::getPreferredBlockSize() const
c@41	201 {
c@42	202 if (m_blockSize == 0) calculateBlockSize();
c@42	203 return m_blockSize;
c@42	204 }
c@42	205
c@42	206 void
c@42	207 SimilarityPlugin::calculateBlockSize() const
c@42	208 {
c@42	209 if (m_blockSize != 0) return;
c@42	210 int decimationFactor = getDecimationFactor();
c@42	211 if (m_type == TypeChroma) {
c@42	212 ChromaConfig config;
c@42	213 config.FS = lrintf(m_inputSampleRate) / decimationFactor;
c@42	214 config.min = Pitch::getFrequencyForPitch(24, 0, 440);
c@42	215 config.max = Pitch::getFrequencyForPitch(96, 0, 440);
c@42	216 config.BPO = 12;
c@42	217 config.CQThresh = 0.0054;
c@42	218 config.isNormalised = false;
c@42	219 Chromagram *c = new Chromagram(config);
c@42	220 size_t sz = c->getFrameSize();
c@42	221 delete c;
c@42	222 m_blockSize = sz * decimationFactor;
c@42	223 } else {
c@42	224 m_blockSize = 2048 * decimationFactor;
c@42	225 }
c@41	226 }
c@41	227
c@41	228 SimilarityPlugin::ParameterList SimilarityPlugin::getParameterDescriptors() const
c@41	229 {
c@41	230 ParameterList list;
c@42	231
c@42	232 ParameterDescriptor desc;
c@42	233 desc.identifier = "featureType";
c@42	234 desc.name = "Feature Type";
c@42	235 desc.description = "";//!!!
c@42	236 desc.unit = "";
c@42	237 desc.minValue = 0;
c@42	238 desc.maxValue = 1;
c@42	239 desc.defaultValue = 0;
c@42	240 desc.isQuantized = true;
c@42	241 desc.quantizeStep = 1;
c@42	242 desc.valueNames.push_back("Timbral (MFCC)");
c@42	243 desc.valueNames.push_back("Chromatic (Chroma)");
c@42	244 list.push_back(desc);
c@42	245
c@41	246 return list;
c@41	247 }
c@41	248
c@41	249 float
c@41	250 SimilarityPlugin::getParameter(std::string param) const
c@41	251 {
c@42	252 if (param == "featureType") {
c@42	253 if (m_type == TypeMFCC) return 0;
c@42	254 else if (m_type == TypeChroma) return 1;
c@42	255 else return 0;
c@42	256 }
c@42	257
c@41	258 std::cerr << "WARNING: SimilarityPlugin::getParameter: unknown parameter \""
c@41	259 << param << "\"" << std::endl;
c@41	260 return 0.0;
c@41	261 }
c@41	262
c@41	263 void
c@41	264 SimilarityPlugin::setParameter(std::string param, float value)
c@41	265 {
c@42	266 if (param == "featureType") {
c@42	267 int v = int(value + 0.1);
c@42	268 Type prevType = m_type;
c@42	269 if (v == 0) m_type = TypeMFCC;
c@42	270 else if (v == 1) m_type = TypeChroma;
c@42	271 if (m_type != prevType) m_blockSize = 0;
c@42	272 return;
c@42	273 }
c@42	274
c@41	275 std::cerr << "WARNING: SimilarityPlugin::setParameter: unknown parameter \""
c@41	276 << param << "\"" << std::endl;
c@41	277 }
c@41	278
c@41	279 SimilarityPlugin::OutputList
c@41	280 SimilarityPlugin::getOutputDescriptors() const
c@41	281 {
c@41	282 OutputList list;
c@41	283
c@41	284 OutputDescriptor similarity;
c@43	285 similarity.identifier = "distancematrix";
c@43	286 similarity.name = "Distance Matrix";
c@43	287 similarity.description = "Distance matrix for similarity metric. Smaller = more similar. Should be symmetrical.";
c@41	288 similarity.unit = "";
c@41	289 similarity.hasFixedBinCount = true;
c@41	290 similarity.binCount = m_channels;
c@41	291 similarity.hasKnownExtents = false;
c@41	292 similarity.isQuantized = false;
c@41	293 similarity.sampleType = OutputDescriptor::FixedSampleRate;
c@41	294 similarity.sampleRate = 1;
c@41	295
c@43	296 m_distanceMatrixOutput = list.size();
c@41	297 list.push_back(similarity);
c@41	298
c@43	299 OutputDescriptor simvec;
c@43	300 simvec.identifier = "distancevector";
c@43	301 simvec.name = "Distance from First Channel";
c@43	302 simvec.description = "Distance vector for similarity of each channel to the first channel. Smaller = more similar.";
c@43	303 simvec.unit = "";
c@43	304 simvec.hasFixedBinCount = true;
c@43	305 simvec.binCount = m_channels;
c@43	306 simvec.hasKnownExtents = false;
c@43	307 simvec.isQuantized = false;
c@43	308 simvec.sampleType = OutputDescriptor::FixedSampleRate;
c@43	309 simvec.sampleRate = 1;
c@43	310
c@43	311 m_distanceVectorOutput = list.size();
c@43	312 list.push_back(simvec);
c@43	313
c@41	314 OutputDescriptor means;
c@41	315 means.identifier = "means";
c@42	316 means.name = "Feature Means";
c@43	317 means.description = "Means of the feature bins. Feature time (sec) corresponds to input channel. Number of bins depends on selected feature type.";
c@41	318 means.unit = "";
c@41	319 means.hasFixedBinCount = true;
c@43	320 means.binCount = m_featureColumnSize;
c@41	321 means.hasKnownExtents = false;
c@41	322 means.isQuantized = false;
c@43	323 means.sampleType = OutputDescriptor::FixedSampleRate;
c@43	324 means.sampleRate = 1;
c@41	325
c@43	326 m_meansOutput = list.size();
c@41	327 list.push_back(means);
c@41	328
c@41	329 OutputDescriptor variances;
c@41	330 variances.identifier = "variances";
c@42	331 variances.name = "Feature Variances";
c@43	332 variances.description = "Variances of the feature bins. Feature time (sec) corresponds to input channel. Number of bins depends on selected feature type.";
c@41	333 variances.unit = "";
c@41	334 variances.hasFixedBinCount = true;
c@43	335 variances.binCount = m_featureColumnSize;
c@41	336 variances.hasKnownExtents = false;
c@41	337 variances.isQuantized = false;
c@43	338 variances.sampleType = OutputDescriptor::FixedSampleRate;
c@43	339 variances.sampleRate = 1;
c@41	340
c@43	341 m_variancesOutput = list.size();
c@41	342 list.push_back(variances);
c@41	343
c@41	344 return list;
c@41	345 }
c@41	346
c@41	347 SimilarityPlugin::FeatureSet
c@41	348 SimilarityPlugin::process(const float const inputBuffers, Vamp::RealTime /* timestamp */)
c@41	349 {
c@41	350 double *dblbuf = new double[m_blockSize];
c@41	351 double *decbuf = dblbuf;
c@42	352 if (m_decimator) decbuf = new double[m_fftSize];
c@42	353
c@42	354 double *raw = 0;
c@42	355 bool ownRaw = false;
c@42	356
c@42	357 if (m_type == TypeMFCC) {
c@42	358 raw = new double[m_featureColumnSize];
c@42	359 ownRaw = true;
c@42	360 }
c@41	361
c@43	362 float threshold = 1e-10;
c@43	363
c@41	364 for (size_t c = 0; c < m_channels; ++c) {
c@41	365
c@43	366 bool empty = true;
c@43	367
c@41	368 for (int i = 0; i < m_blockSize; ++i) {
c@43	369 float val = inputBuffers[c][i];
c@43	370 if (fabs(val) > threshold) empty = false;
c@43	371 dblbuf[i] = val;
c@41	372 }
c@41	373
c@43	374 if (empty) continue;
c@43	375
c@41	376 if (m_decimator) {
c@41	377 m_decimator->process(dblbuf, decbuf);
c@41	378 }
c@42	379
c@42	380 if (m_type == TypeMFCC) {
c@42	381 m_mfcc->process(m_fftSize, decbuf, raw);
c@42	382 } else if (m_type == TypeChroma) {
c@42	383 raw = m_chromagram->process(decbuf);
c@42	384 }
c@41	385
c@42	386 FeatureColumn mf(m_featureColumnSize);
c@42	387 for (int i = 0; i < m_featureColumnSize; ++i) mf[i] = raw[i];
c@41	388
c@42	389 m_values[c].push_back(mf);
c@41	390 }
c@41	391
c@41	392 if (m_decimator) delete[] decbuf;
c@41	393 delete[] dblbuf;
c@42	394
c@42	395 if (ownRaw) delete[] raw;
c@41	396
c@41	397 return FeatureSet();
c@41	398 }
c@41	399
c@41	400 SimilarityPlugin::FeatureSet
c@41	401 SimilarityPlugin::getRemainingFeatures()
c@41	402 {
c@42	403 std::vector<FeatureColumn> m(m_channels);
c@42	404 std::vector<FeatureColumn> v(m_channels);
c@41	405
c@41	406 for (int i = 0; i < m_channels; ++i) {
c@41	407
c@42	408 FeatureColumn mean(m_featureColumnSize), variance(m_featureColumnSize);
c@41	409
c@42	410 for (int j = 0; j < m_featureColumnSize; ++j) {
c@41	411
c@43	412 mean[j] = 0.0;
c@43	413 variance[j] = 0.0;
c@41	414 int count;
c@41	415
c@43	416 // we need to use at least one value, but we want to
c@43	417 // disregard the final value because it may have come from
c@43	418 // incomplete data
c@43	419
c@43	420 int sz = m_values[i].size();
c@43	421 if (sz > 1) --sz;
c@43	422
c@43	423 // std::cout << "\nBin " << j << ":" << std::endl;
c@42	424
c@41	425 count = 0;
c@43	426 for (int k = 0; k < sz; ++k) {
c@42	427 double val = m_values[i][k][j];
c@42	428 // std::cout << val << " ";
c@41	429 if (isnan(val) \|\| isinf(val)) continue;
c@41	430 mean[j] += val;
c@41	431 ++count;
c@41	432 }
c@41	433 if (count > 0) mean[j] /= count;
c@43	434 // std::cout << "\n" << count << " non-NaN non-inf values, so mean = " << mean[j] << std::endl;
c@41	435
c@41	436 count = 0;
c@43	437 for (int k = 0; k < sz; ++k) {
c@42	438 double val = ((m_values[i][k][j] - mean[j]) *
c@42	439 (m_values[i][k][j] - mean[j]));
c@41	440 if (isnan(val) \|\| isinf(val)) continue;
c@41	441 variance[j] += val;
c@41	442 ++count;
c@41	443 }
c@41	444 if (count > 0) variance[j] /= count;
c@43	445 // std::cout << "... and variance = " << variance[j] << std::endl;
c@41	446 }
c@41	447
c@41	448 m[i] = mean;
c@41	449 v[i] = variance;
c@41	450 }
c@41	451
c@42	452 // we want to return a matrix of the distances between channels,
c@41	453 // but Vamp doesn't have a matrix return type so we actually
c@41	454 // return a series of vectors
c@41	455
c@41	456 std::vector<std::vector<double> > distances;
c@41	457
c@42	458 // "Despite the fact that MFCCs extracted from music are clearly
c@42	459 // not Gaussian, [14] showed, somewhat surprisingly, that a
c@42	460 // similarity function comparing single Gaussians modelling MFCCs
c@42	461 // for each track can perform as well as mixture models. A great
c@42	462 // advantage of using single Gaussians is that a simple closed
c@42	463 // form exists for the KL divergence." -- Mark Levy, "Lightweight
c@42	464 // measures for timbral similarity of musical audio"
c@42	465 // (http://www.elec.qmul.ac.uk/easaier/papers/mlevytimbralsimilarity.pdf)
c@42	466 //
c@42	467 // This code calculates a symmetrised distance metric based on the
c@42	468 // KL divergence of Gaussian models of the MFCC values.
c@42	469
c@41	470 for (int i = 0; i < m_channels; ++i) {
c@41	471 distances.push_back(std::vector<double>());
c@41	472 for (int j = 0; j < m_channels; ++j) {
c@42	473 double d = -2.0 * m_featureColumnSize;
c@42	474 for (int k = 0; k < m_featureColumnSize; ++k) {
c@42	475 // m[i][k] is the mean of feature bin k for channel i
c@42	476 // v[i][k] is the variance of feature bin k for channel i
c@41	477 d += v[i][k] / v[j][k] + v[j][k] / v[i][k];
c@41	478 d += (m[i][k] - m[j][k])
c@41	479 * (1.0 / v[i][k] + 1.0 / v[j][k])
c@41	480 * (m[i][k] - m[j][k]);
c@41	481 }
c@41	482 d /= 2.0;
c@41	483 distances[i].push_back(d);
c@41	484 }
c@41	485 }
c@41	486
c@41	487 FeatureSet returnFeatures;
c@41	488
c@43	489 Feature distanceVectorFeature;
c@43	490 distanceVectorFeature.label = "Distance from first channel";
c@43	491
c@41	492 for (int i = 0; i < m_channels; ++i) {
c@41	493
c@41	494 Feature feature;
c@41	495 feature.hasTimestamp = true; // otherwise hosts will tend to stamp them at the end of the file, which is annoying
c@41	496 feature.timestamp = Vamp::RealTime(i, 0);
c@41	497
c@41	498 feature.values.clear();
c@42	499 for (int k = 0; k < m_featureColumnSize; ++k) {
c@41	500 feature.values.push_back(m[i][k]);
c@41	501 }
c@41	502
c@43	503 returnFeatures[m_meansOutput].push_back(feature);
c@41	504
c@41	505 feature.values.clear();
c@42	506 for (int k = 0; k < m_featureColumnSize; ++k) {
c@41	507 feature.values.push_back(v[i][k]);
c@41	508 }
c@41	509
c@43	510 returnFeatures[m_variancesOutput].push_back(feature);
c@41	511
c@41	512 feature.values.clear();
c@41	513 for (int j = 0; j < m_channels; ++j) {
c@41	514 feature.values.push_back(distances[i][j]);
c@41	515 }
c@43	516
c@41	517 ostringstream oss;
c@41	518 oss << "Distance from " << (i + 1);
c@41	519 feature.label = oss.str();
c@41	520
c@43	521 returnFeatures[m_distanceMatrixOutput].push_back(feature);
c@43	522
c@43	523 distanceVectorFeature.values.push_back(distances[0][i]);
c@41	524 }
c@41	525
c@43	526 returnFeatures[m_distanceVectorOutput].push_back(distanceVectorFeature);
c@43	527
c@41	528 return returnFeatures;
c@41	529 }

Mercurial > hg > qm-vamp-plugins

annotate plugins/SimilarityPlugin.cpp @ 43:1389f05cb688