annotate dsp/segmentation/ClusterMeltSegmenter.cpp @ 298:255e431ae3d4

* Key detector: when returning key strengths, use the peak value of the three underlying chromagram correlations (from 36-bin chromagram) corresponding to each key, instead of the mean. Rationale: This is the same method as used when returning the key value, and it's nice to have the same results in both returned value and plot. The peak performed better than the sum with a simple test set of triads, so it seems reasonable to change the plot to match the key output rather than the other way around. * FFT: kiss_fftr returns only the non-conjugate bins, synthesise the rest rather than leaving them (perhaps dangerously) undefined. Fixes an uninitialised data error in chromagram that could cause garbage results from key detector. * Constant Q: remove precalculated values again, I reckon they're not proving such a good tradeoff.
author Chris Cannam <c.cannam@qmul.ac.uk>
date Fri, 05 Jun 2009 15:12:39 +0000
parents befe5aa6b450
children e5907ae6de17
rev   line source
c@249 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
c@249 2
c@243 3 /*
c@249 4 * ClusterMeltSegmenter.cpp
c@243 5 *
c@249 6 * Created by Mark Levy on 23/03/2006.
c@249 7 * Copyright 2006 Centre for Digital Music, Queen Mary, University of London.
c@249 8 * All rights reserved.
c@243 9 */
c@243 10
c@243 11 #include <cfloat>
c@243 12 #include <cmath>
c@243 13
c@243 14 #include "ClusterMeltSegmenter.h"
c@243 15 #include "cluster_segmenter.h"
c@243 16 #include "segment.h"
c@243 17
c@245 18 #include "dsp/transforms/FFT.h"
c@249 19 #include "dsp/chromagram/ConstantQ.h"
c@249 20 #include "dsp/rateconversion/Decimator.h"
c@251 21 #include "dsp/mfcc/MFCC.h"
c@245 22
c@249 23 ClusterMeltSegmenter::ClusterMeltSegmenter(ClusterMeltSegmenterParams params) :
c@249 24 window(NULL),
c@289 25 fft(NULL),
c@249 26 constq(NULL),
c@251 27 mfcc(NULL),
c@249 28 featureType(params.featureType),
c@249 29 hopSize(params.hopSize),
c@249 30 windowSize(params.windowSize),
c@249 31 fmin(params.fmin),
c@249 32 fmax(params.fmax),
c@249 33 nbins(params.nbins),
c@249 34 ncomponents(params.ncomponents), // NB currently not passed - no. of PCA components is set in cluser_segmenter.c
c@249 35 nHMMStates(params.nHMMStates),
c@249 36 nclusters(params.nclusters),
c@249 37 histogramLength(params.histogramLength),
c@249 38 neighbourhoodLimit(params.neighbourhoodLimit),
c@251 39 decimator(NULL)
c@243 40 {
c@243 41 }
c@243 42
c@243 43 void ClusterMeltSegmenter::initialise(int fs)
c@243 44 {
c@249 45 samplerate = fs;
c@249 46
c@251 47 if (featureType == FEATURE_TYPE_CONSTQ ||
c@251 48 featureType == FEATURE_TYPE_CHROMA) {
c@251 49
c@251 50 // run internal processing at 11025 or thereabouts
c@249 51 int internalRate = 11025;
c@249 52 int decimationFactor = samplerate / internalRate;
c@249 53 if (decimationFactor < 1) decimationFactor = 1;
c@249 54
c@249 55 // must be a power of two
c@249 56 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
c@249 57
c@249 58 if (decimationFactor > Decimator::getHighestSupportedFactor()) {
c@249 59 decimationFactor = Decimator::getHighestSupportedFactor();
c@249 60 }
c@249 61
c@249 62 if (decimationFactor > 1) {
c@249 63 decimator = new Decimator(getWindowsize(), decimationFactor);
c@249 64 }
c@249 65
c@249 66 CQConfig config;
c@249 67 config.FS = samplerate / decimationFactor;
c@249 68 config.min = fmin;
c@249 69 config.max = fmax;
c@249 70 config.BPO = nbins;
c@249 71 config.CQThresh = 0.0054;
c@249 72
c@249 73 constq = new ConstantQ(config);
c@249 74 constq->sparsekernel();
c@251 75
c@251 76 ncoeff = constq->getK();
c@289 77
c@289 78 fft = new FFTReal(constq->getfftlength());
c@251 79
c@251 80 } else if (featureType == FEATURE_TYPE_MFCC) {
c@249 81
c@252 82 // run internal processing at 22050 or thereabouts
c@252 83 int internalRate = 22050;
c@252 84 int decimationFactor = samplerate / internalRate;
c@252 85 if (decimationFactor < 1) decimationFactor = 1;
c@252 86
c@252 87 // must be a power of two
c@252 88 while (decimationFactor & (decimationFactor - 1)) ++decimationFactor;
c@252 89
c@252 90 if (decimationFactor > Decimator::getHighestSupportedFactor()) {
c@252 91 decimationFactor = Decimator::getHighestSupportedFactor();
c@252 92 }
c@252 93
c@252 94 if (decimationFactor > 1) {
c@252 95 decimator = new Decimator(getWindowsize(), decimationFactor);
c@252 96 }
c@252 97
c@255 98 MFCCConfig config(samplerate / decimationFactor);
c@252 99 config.fftsize = 2048;
c@252 100 config.nceps = 19;
c@252 101 config.want_c0 = true;
c@251 102
c@251 103 mfcc = new MFCC(config);
c@252 104 ncoeff = config.nceps + 1;
c@249 105 }
c@243 106 }
c@243 107
c@243 108 ClusterMeltSegmenter::~ClusterMeltSegmenter()
c@243 109 {
c@249 110 delete window;
c@249 111 delete constq;
c@249 112 delete decimator;
c@289 113 delete fft;
c@245 114 }
c@245 115
c@245 116 int
c@245 117 ClusterMeltSegmenter::getWindowsize()
c@245 118 {
c@269 119 return static_cast<int>(windowSize * samplerate + 0.001);
c@245 120 }
c@245 121
c@245 122 int
c@245 123 ClusterMeltSegmenter::getHopsize()
c@245 124 {
c@269 125 return static_cast<int>(hopSize * samplerate + 0.001);
c@243 126 }
c@243 127
c@249 128 void ClusterMeltSegmenter::extractFeatures(const double* samples, int nsamples)
c@243 129 {
c@251 130 if (featureType == FEATURE_TYPE_CONSTQ ||
c@251 131 featureType == FEATURE_TYPE_CHROMA) {
c@251 132 extractFeaturesConstQ(samples, nsamples);
c@251 133 } else if (featureType == FEATURE_TYPE_MFCC) {
c@251 134 extractFeaturesMFCC(samples, nsamples);
c@251 135 }
c@251 136 }
c@251 137
c@251 138 void ClusterMeltSegmenter::extractFeaturesConstQ(const double* samples, int nsamples)
c@251 139 {
c@249 140 if (!constq) {
c@251 141 std::cerr << "ERROR: ClusterMeltSegmenter::extractFeaturesConstQ: "
c@251 142 << "No const-q: initialise not called?"
c@249 143 << std::endl;
c@249 144 return;
c@249 145 }
c@245 146
c@249 147 if (nsamples < getWindowsize()) {
c@249 148 std::cerr << "ERROR: ClusterMeltSegmenter::extractFeatures: nsamples < windowsize (" << nsamples << " < " << getWindowsize() << ")" << std::endl;
c@249 149 return;
c@249 150 }
c@249 151
c@249 152 int fftsize = constq->getfftlength();
c@249 153
c@249 154 if (!window || window->getSize() != fftsize) {
c@249 155 delete window;
c@249 156 window = new Window<double>(HammingWindow, fftsize);
c@249 157 }
c@249 158
c@249 159 vector<double> cq(ncoeff);
c@249 160
c@249 161 for (int i = 0; i < ncoeff; ++i) cq[i] = 0.0;
c@249 162
c@249 163 const double *psource = samples;
c@249 164 int pcount = nsamples;
c@249 165
c@249 166 if (decimator) {
c@249 167 pcount = nsamples / decimator->getFactor();
c@249 168 double *decout = new double[pcount];
c@249 169 decimator->process(samples, decout);
c@249 170 psource = decout;
c@249 171 }
c@249 172
c@249 173 int origin = 0;
c@249 174
c@249 175 // std::cerr << "nsamples = " << nsamples << ", pcount = " << pcount << std::endl;
c@249 176
c@249 177 int frames = 0;
c@249 178
c@249 179 double *frame = new double[fftsize];
c@249 180 double *real = new double[fftsize];
c@249 181 double *imag = new double[fftsize];
c@249 182 double *cqre = new double[ncoeff];
c@249 183 double *cqim = new double[ncoeff];
c@249 184
c@249 185 while (origin <= pcount) {
c@249 186
c@249 187 // always need at least one fft window per block, but after
c@249 188 // that we want to avoid having any incomplete ones
c@249 189 if (origin > 0 && origin + fftsize >= pcount) break;
c@249 190
c@249 191 for (int i = 0; i < fftsize; ++i) {
c@249 192 if (origin + i < pcount) {
c@249 193 frame[i] = psource[origin + i];
c@249 194 } else {
c@249 195 frame[i] = 0.0;
c@249 196 }
c@249 197 }
c@249 198
c@249 199 for (int i = 0; i < fftsize/2; ++i) {
c@249 200 double value = frame[i];
c@249 201 frame[i] = frame[i + fftsize/2];
c@249 202 frame[i + fftsize/2] = value;
c@249 203 }
c@249 204
c@249 205 window->cut(frame);
c@249 206
c@289 207 fft->process(false, frame, real, imag);
c@249 208
c@249 209 constq->process(real, imag, cqre, cqim);
c@243 210
c@249 211 for (int i = 0; i < ncoeff; ++i) {
c@249 212 cq[i] += sqrt(cqre[i] * cqre[i] + cqim[i] * cqim[i]);
c@249 213 }
c@249 214 ++frames;
c@245 215
c@249 216 origin += fftsize/2;
c@249 217 }
c@245 218
c@249 219 delete [] cqre;
c@249 220 delete [] cqim;
c@249 221 delete [] real;
c@249 222 delete [] imag;
c@249 223 delete [] frame;
c@245 224
c@249 225 for (int i = 0; i < ncoeff; ++i) {
c@249 226 cq[i] /= frames;
c@249 227 }
c@245 228
c@249 229 if (decimator) delete[] psource;
c@245 230
c@249 231 features.push_back(cq);
c@243 232 }
c@243 233
c@251 234 void ClusterMeltSegmenter::extractFeaturesMFCC(const double* samples, int nsamples)
c@251 235 {
c@251 236 if (!mfcc) {
c@251 237 std::cerr << "ERROR: ClusterMeltSegmenter::extractFeaturesMFCC: "
c@251 238 << "No mfcc: initialise not called?"
c@251 239 << std::endl;
c@251 240 return;
c@251 241 }
c@251 242
c@251 243 if (nsamples < getWindowsize()) {
c@251 244 std::cerr << "ERROR: ClusterMeltSegmenter::extractFeatures: nsamples < windowsize (" << nsamples << " < " << getWindowsize() << ")" << std::endl;
c@251 245 return;
c@251 246 }
c@251 247
c@251 248 int fftsize = mfcc->getfftlength();
c@251 249
c@251 250 vector<double> cc(ncoeff);
c@251 251
c@251 252 for (int i = 0; i < ncoeff; ++i) cc[i] = 0.0;
c@251 253
c@251 254 const double *psource = samples;
c@251 255 int pcount = nsamples;
c@251 256
c@252 257 if (decimator) {
c@252 258 pcount = nsamples / decimator->getFactor();
c@252 259 double *decout = new double[pcount];
c@252 260 decimator->process(samples, decout);
c@252 261 psource = decout;
c@252 262 }
c@252 263
c@251 264 int origin = 0;
c@251 265 int frames = 0;
c@251 266
c@251 267 double *frame = new double[fftsize];
c@251 268 double *ccout = new double[ncoeff];
c@251 269
c@251 270 while (origin <= pcount) {
c@251 271
c@251 272 // always need at least one fft window per block, but after
c@251 273 // that we want to avoid having any incomplete ones
c@251 274 if (origin > 0 && origin + fftsize >= pcount) break;
c@251 275
c@251 276 for (int i = 0; i < fftsize; ++i) {
c@251 277 if (origin + i < pcount) {
c@251 278 frame[i] = psource[origin + i];
c@251 279 } else {
c@251 280 frame[i] = 0.0;
c@251 281 }
c@251 282 }
c@251 283
c@255 284 mfcc->process(frame, ccout);
c@251 285
c@251 286 for (int i = 0; i < ncoeff; ++i) {
c@251 287 cc[i] += ccout[i];
c@251 288 }
c@251 289 ++frames;
c@251 290
c@251 291 origin += fftsize/2;
c@251 292 }
c@251 293
c@251 294 delete [] ccout;
c@251 295 delete [] frame;
c@251 296
c@251 297 for (int i = 0; i < ncoeff; ++i) {
c@251 298 cc[i] /= frames;
c@251 299 }
c@251 300
c@252 301 if (decimator) delete[] psource;
c@252 302
c@251 303 features.push_back(cc);
c@251 304 }
c@251 305
c@243 306 void ClusterMeltSegmenter::segment(int m)
c@243 307 {
c@249 308 nclusters = m;
c@249 309 segment();
c@243 310 }
c@243 311
c@243 312 void ClusterMeltSegmenter::setFeatures(const vector<vector<double> >& f)
c@243 313 {
c@249 314 features = f;
c@249 315 featureType = FEATURE_TYPE_UNKNOWN;
c@243 316 }
c@243 317
c@243 318 void ClusterMeltSegmenter::segment()
c@243 319 {
c@251 320 delete constq;
c@251 321 constq = 0;
c@251 322 delete mfcc;
c@251 323 mfcc = 0;
c@251 324 delete decimator;
c@251 325 decimator = 0;
c@283 326
c@283 327 if (features.size() < histogramLength) return;
c@283 328 /*
c@249 329 std::cerr << "ClusterMeltSegmenter::segment: have " << features.size()
c@249 330 << " features with " << features[0].size() << " coefficients (ncoeff = " << ncoeff << ", ncomponents = " << ncomponents << ")" << std::endl;
c@283 331 */
c@249 332 // copy the features to a native array and use the existing C segmenter...
c@249 333 double** arrFeatures = new double*[features.size()];
c@249 334 for (int i = 0; i < features.size(); i++)
c@249 335 {
c@249 336 if (featureType == FEATURE_TYPE_UNKNOWN) {
c@249 337 arrFeatures[i] = new double[features[0].size()];
c@249 338 for (int j = 0; j < features[0].size(); j++)
c@249 339 arrFeatures[i][j] = features[i][j];
c@249 340 } else {
c@249 341 arrFeatures[i] = new double[ncoeff+1]; // allow space for the normalised envelope
c@249 342 for (int j = 0; j < ncoeff; j++)
c@249 343 arrFeatures[i][j] = features[i][j];
c@249 344 }
c@249 345 }
c@243 346
c@249 347 q = new int[features.size()];
c@243 348
c@251 349 if (featureType == FEATURE_TYPE_UNKNOWN ||
c@251 350 featureType == FEATURE_TYPE_MFCC)
c@249 351 cluster_segment(q, arrFeatures, features.size(), features[0].size(), nHMMStates, histogramLength,
c@249 352 nclusters, neighbourhoodLimit);
c@249 353 else
c@249 354 constq_segment(q, arrFeatures, features.size(), nbins, ncoeff, featureType,
c@249 355 nHMMStates, histogramLength, nclusters, neighbourhoodLimit);
c@243 356
c@249 357 // convert the cluster assignment sequence to a segmentation
c@249 358 makeSegmentation(q, features.size());
c@243 359
c@249 360 // de-allocate arrays
c@249 361 delete [] q;
c@249 362 for (int i = 0; i < features.size(); i++)
c@249 363 delete [] arrFeatures[i];
c@249 364 delete [] arrFeatures;
c@243 365
c@249 366 // clear the features
c@249 367 clear();
c@243 368 }
c@243 369
c@243 370 void ClusterMeltSegmenter::makeSegmentation(int* q, int len)
c@243 371 {
c@249 372 segmentation.segments.clear();
c@249 373 segmentation.nsegtypes = nclusters;
c@249 374 segmentation.samplerate = samplerate;
c@243 375
c@249 376 Segment segment;
c@249 377 segment.start = 0;
c@249 378 segment.type = q[0];
c@243 379
c@249 380 for (int i = 1; i < len; i++)
c@249 381 {
c@249 382 if (q[i] != q[i-1])
c@249 383 {
c@249 384 segment.end = i * getHopsize();
c@249 385 segmentation.segments.push_back(segment);
c@249 386 segment.type = q[i];
c@249 387 segment.start = segment.end;
c@249 388 }
c@249 389 }
c@249 390 segment.end = len * getHopsize();
c@249 391 segmentation.segments.push_back(segment);
c@243 392 }
c@243 393