Mercurial > hg > qm-vamp-plugins
comparison plugins/SimilarityPlugin.cpp @ 68:68f181553123
* bit of tidying
author | Chris Cannam <c.cannam@qmul.ac.uk> |
---|---|
date | Wed, 05 Mar 2008 12:26:11 +0000 |
parents | e8e103090d97 |
children | 4a354c18e688 |
comparison legend: equal | deleted | inserted | replaced
67:e8e103090d97 | 68:68f181553123 |
---|---|
115 | 115 |
116 size_t | 116 size_t |
117 SimilarityPlugin::getMaxChannelCount() const | 117 SimilarityPlugin::getMaxChannelCount() const |
118 { | 118 { |
119 return 1024; | 119 return 1024; |
120 } | |
121 | |
122 bool | |
123 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize) | |
124 { | |
125 if (channels < getMinChannelCount()) return false; | |
126 | |
127 // Using more than getMaxChannelCount is not actually a problem | |
128 // for us. Using "incorrect" step and block sizes would be fine | |
129 // for timbral or chroma similarity, but will break rhythmic | |
130 // similarity, so we'd better enforce these. | |
131 | |
132 if (stepSize != getPreferredStepSize()) { | |
133 std::cerr << "SimilarityPlugin::initialise: supplied step size " | |
134 << stepSize << " differs from required step size " | |
135 << getPreferredStepSize() << std::endl; | |
136 return false; | |
137 } | |
138 | |
139 if (blockSize != getPreferredBlockSize()) { | |
140 std::cerr << "SimilarityPlugin::initialise: supplied block size " | |
141 << blockSize << " differs from required block size " | |
142 << getPreferredBlockSize() << std::endl; | |
143 return false; | |
144 } | |
145 | |
146 m_blockSize = blockSize; | |
147 m_channels = channels; | |
148 | |
149 m_lastNonEmptyFrame = std::vector<int>(m_channels); | |
150 for (int i = 0; i < m_channels; ++i) m_lastNonEmptyFrame[i] = -1; | |
151 | |
152 m_emptyFrameCount = std::vector<int>(m_channels); | |
153 for (int i = 0; i < m_channels; ++i) m_emptyFrameCount[i] = 0; | |
154 | |
155 m_frameNo = 0; | |
156 | |
157 int decimationFactor = getDecimationFactor(); | |
158 if (decimationFactor > 1) { | |
159 m_decimator = new Decimator(m_blockSize, decimationFactor); | |
160 } | |
161 | |
162 if (m_type == TypeMFCC) { | |
163 | |
164 m_featureColumnSize = 20; | |
165 | |
166 MFCCConfig config(m_processRate); | |
167 config.fftsize = 2048; | |
168 config.nceps = m_featureColumnSize - 1; | |
169 config.want_c0 = true; | |
170 config.logpower = 1; | |
171 m_mfcc = new MFCC(config); | |
172 m_fftSize = m_mfcc->getfftlength(); | |
173 m_rhythmClipFrameSize = m_fftSize / 4; | |
174 | |
175 // std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl; | |
176 | |
177 } else if (m_type == TypeChroma) { | |
178 | |
179 m_featureColumnSize = 12; | |
180 | |
181 // For simplicity, aim to have the chroma fft size equal to | |
182 // 2048, the same as the mfcc fft size (so the input block | |
183 // size does not depend on the feature type and we can use the | |
184 // same processing parameters for rhythm etc). This is also | |
185 // why getPreferredBlockSize can confidently return 2048 * the | |
186 // decimation factor. | |
187 | |
188 // The fft size for a chromagram is the filterbank Q value | |
189 // times the sample rate, divided by the minimum frequency, | |
190 // rounded up to the nearest power of two. | |
191 | |
192 double q = 1.0 / (pow(2.0, (1.0 / 12.0)) - 1.0); | |
193 double fmin = (q * m_processRate) / 2048.0; | |
194 // std::cerr << "chroma fmin = " << fmin; | |
195 | |
196 // Round fmin up to the nearest MIDI pitch multiple of 12. | |
197 // So long as fmin is greater than 12 to start with, this | |
198 // should not change the resulting fft size. | |
199 | |
200 int pmin = Pitch::getPitchForFrequency(float(fmin)); | |
201 pmin = ((pmin / 12) + 1) * 12; | |
202 fmin = Pitch::getFrequencyForPitch(pmin); | |
203 // std::cerr << " -> " << fmin << " for pitch " << pmin << std::endl; | |
204 | |
205 float fmax = Pitch::getFrequencyForPitch(pmin + 36); | |
206 // std::cerr << "fmax = " << fmax << " for pitch " << (pmin+36) << std::endl; | |
207 | |
208 | |
209 ChromaConfig config; | |
210 config.FS = m_processRate; | |
211 config.min = fmin; | |
212 config.max = fmax; | |
213 // config.min = Pitch::getFrequencyForPitch(24, 0, 440); | |
214 // config.max = Pitch::getFrequencyForPitch(96, 0, 440); | |
215 config.BPO = 12; | |
216 config.CQThresh = 0.0054; | |
217 // We don't normalise the chromagram's columns individually; | |
218 // we normalise the mean at the end instead | |
219 config.normalise = MathUtilities::NormaliseNone; | |
220 m_chromagram = new Chromagram(config); | |
221 m_fftSize = m_chromagram->getFrameSize(); | |
222 | |
223 if (m_fftSize != 2048) { | |
224 std::cerr << "WARNING: SimilarityPlugin::initialise: Internal processing FFT size " << m_fftSize << " != expected size 2048 in chroma mode" << std::endl; | |
225 } | |
226 | |
227 std::cerr << "fftsize = " << m_fftSize << std::endl; | |
228 | |
229 m_rhythmClipFrameSize = m_fftSize / 4; | |
230 | |
231 // m_rhythmClipFrameSize = m_fftSize / 16; | |
232 // while (m_rhythmClipFrameSize < 512) m_rhythmClipFrameSize *= 2; | |
233 | |
234 std::cerr << "m_rhythmClipFrameSize = " << m_rhythmClipFrameSize << std::endl; | |
235 | |
236 std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl; | |
237 | |
238 } else { | |
239 | |
240 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl; | |
241 return false; | |
242 } | |
243 | |
244 if (needRhythm()) { | |
245 m_rhythmClipFrames = | |
246 int(ceil((m_rhythmClipDuration * m_processRate) | |
247 / m_rhythmClipFrameSize)); | |
248 std::cerr << "SimilarityPlugin::initialise: rhythm clip requires " | |
249 << m_rhythmClipFrames << " frames of size " | |
250 << m_rhythmClipFrameSize << " at process rate " | |
251 << m_processRate << " ( = " | |
252 << (float(m_rhythmClipFrames * m_rhythmClipFrameSize) / m_processRate) << " sec )" | |
253 << std::endl; | |
254 | |
255 MFCCConfig config(m_processRate); | |
256 config.fftsize = m_rhythmClipFrameSize; | |
257 config.nceps = m_rhythmColumnSize - 1; | |
258 config.want_c0 = true; | |
259 config.logpower = 1; | |
260 config.window = RectangularWindow; // because no overlap | |
261 m_rhythmfcc = new MFCC(config); | |
262 } | |
263 | |
264 for (int i = 0; i < m_channels; ++i) { | |
265 | |
266 m_values.push_back(FeatureMatrix()); | |
267 | |
268 if (needRhythm()) { | |
269 m_rhythmValues.push_back(FeatureColumnQueue()); | |
270 } | |
271 } | |
272 | |
273 m_done = false; | |
274 | |
275 return true; | |
276 } | |
277 | |
278 void | |
279 SimilarityPlugin::reset() | |
280 { | |
281 for (int i = 0; i < m_values.size(); ++i) { | |
282 m_values[i].clear(); | |
283 } | |
284 | |
285 for (int i = 0; i < m_rhythmValues.size(); ++i) { | |
286 m_rhythmValues[i].clear(); | |
287 } | |
288 | |
289 for (int i = 0; i < m_lastNonEmptyFrame.size(); ++i) { | |
290 m_lastNonEmptyFrame[i] = -1; | |
291 } | |
292 | |
293 for (int i = 0; i < m_emptyFrameCount.size(); ++i) { | |
294 m_emptyFrameCount[i] = 0; | |
295 } | |
296 | |
297 m_done = false; | |
298 } | 120 } |
299 | 121 |
300 int | 122 int |
301 SimilarityPlugin::getDecimationFactor() const | 123 SimilarityPlugin::getDecimationFactor() const |
302 { | 124 { |
529 list.push_back(beatspectrum); | 351 list.push_back(beatspectrum); |
530 | 352 |
531 return list; | 353 return list; |
532 } | 354 } |
533 | 355 |
356 bool | |
357 SimilarityPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize) | |
358 { | |
359 if (channels < getMinChannelCount()) return false; | |
360 | |
361 // Using more than getMaxChannelCount is not actually a problem | |
362 // for us. Using "incorrect" step and block sizes would be fine | |
363 // for timbral or chroma similarity, but will break rhythmic | |
364 // similarity, so we'd better enforce these. | |
365 | |
366 if (stepSize != getPreferredStepSize()) { | |
367 std::cerr << "SimilarityPlugin::initialise: supplied step size " | |
368 << stepSize << " differs from required step size " | |
369 << getPreferredStepSize() << std::endl; | |
370 return false; | |
371 } | |
372 | |
373 if (blockSize != getPreferredBlockSize()) { | |
374 std::cerr << "SimilarityPlugin::initialise: supplied block size " | |
375 << blockSize << " differs from required block size " | |
376 << getPreferredBlockSize() << std::endl; | |
377 return false; | |
378 } | |
379 | |
380 m_blockSize = blockSize; | |
381 m_channels = channels; | |
382 | |
383 m_lastNonEmptyFrame = std::vector<int>(m_channels); | |
384 for (int i = 0; i < m_channels; ++i) m_lastNonEmptyFrame[i] = -1; | |
385 | |
386 m_emptyFrameCount = std::vector<int>(m_channels); | |
387 for (int i = 0; i < m_channels; ++i) m_emptyFrameCount[i] = 0; | |
388 | |
389 m_frameNo = 0; | |
390 | |
391 int decimationFactor = getDecimationFactor(); | |
392 if (decimationFactor > 1) { | |
393 m_decimator = new Decimator(m_blockSize, decimationFactor); | |
394 } | |
395 | |
396 if (m_type == TypeMFCC) { | |
397 | |
398 m_featureColumnSize = 20; | |
399 | |
400 MFCCConfig config(m_processRate); | |
401 config.fftsize = 2048; | |
402 config.nceps = m_featureColumnSize - 1; | |
403 config.want_c0 = true; | |
404 config.logpower = 1; | |
405 m_mfcc = new MFCC(config); | |
406 m_fftSize = m_mfcc->getfftlength(); | |
407 m_rhythmClipFrameSize = m_fftSize / 4; | |
408 | |
409 // std::cerr << "MFCC FS = " << config.FS << ", FFT size = " << m_fftSize<< std::endl; | |
410 | |
411 } else if (m_type == TypeChroma) { | |
412 | |
413 m_featureColumnSize = 12; | |
414 | |
415 // For simplicity, aim to have the chroma fft size equal to | |
416 // 2048, the same as the mfcc fft size (so the input block | |
417 // size does not depend on the feature type and we can use the | |
418 // same processing parameters for rhythm etc). This is also | |
419 // why getPreferredBlockSize can confidently return 2048 * the | |
420 // decimation factor. | |
421 | |
422 // The fft size for a chromagram is the filterbank Q value | |
423 // times the sample rate, divided by the minimum frequency, | |
424 // rounded up to the nearest power of two. | |
425 | |
426 double q = 1.0 / (pow(2.0, (1.0 / 12.0)) - 1.0); | |
427 double fmin = (q * m_processRate) / 2048.0; | |
428 | |
429 // Round fmin up to the nearest MIDI pitch multiple of 12. | |
430 // So long as fmin is greater than 12 to start with, this | |
431 // should not change the resulting fft size. | |
432 | |
433 int pmin = Pitch::getPitchForFrequency(float(fmin)); | |
434 pmin = ((pmin / 12) + 1) * 12; | |
435 fmin = Pitch::getFrequencyForPitch(pmin); | |
436 | |
437 float fmax = Pitch::getFrequencyForPitch(pmin + 36); | |
438 | |
439 ChromaConfig config; | |
440 config.FS = m_processRate; | |
441 config.min = fmin; | |
442 config.max = fmax; | |
443 config.BPO = 12; | |
444 config.CQThresh = 0.0054; | |
445 // We don't normalise the chromagram's columns individually; | |
446 // we normalise the mean at the end instead | |
447 config.normalise = MathUtilities::NormaliseNone; | |
448 m_chromagram = new Chromagram(config); | |
449 m_fftSize = m_chromagram->getFrameSize(); | |
450 | |
451 if (m_fftSize != 2048) { | |
452 std::cerr << "WARNING: SimilarityPlugin::initialise: Internal processing FFT size " << m_fftSize << " != expected size 2048 in chroma mode" << std::endl; | |
453 } | |
454 | |
455 // std::cerr << "fftsize = " << m_fftSize << std::endl; | |
456 | |
457 m_rhythmClipFrameSize = m_fftSize / 4; | |
458 | |
459 // std::cerr << "m_rhythmClipFrameSize = " << m_rhythmClipFrameSize << std::endl; | |
460 // std::cerr << "min = "<< config.min << ", max = " << config.max << std::endl; | |
461 | |
462 } else { | |
463 | |
464 std::cerr << "SimilarityPlugin::initialise: internal error: unknown type " << m_type << std::endl; | |
465 return false; | |
466 } | |
467 | |
468 if (needRhythm()) { | |
469 m_rhythmClipFrames = | |
470 int(ceil((m_rhythmClipDuration * m_processRate) | |
471 / m_rhythmClipFrameSize)); | |
472 // std::cerr << "SimilarityPlugin::initialise: rhythm clip requires " | |
473 // << m_rhythmClipFrames << " frames of size " | |
474 // << m_rhythmClipFrameSize << " at process rate " | |
475 // << m_processRate << " ( = " | |
476 // << (float(m_rhythmClipFrames * m_rhythmClipFrameSize) / m_processRate) << " sec )" | |
477 // << std::endl; | |
478 | |
479 MFCCConfig config(m_processRate); | |
480 config.fftsize = m_rhythmClipFrameSize; | |
481 config.nceps = m_rhythmColumnSize - 1; | |
482 config.want_c0 = true; | |
483 config.logpower = 1; | |
484 config.window = RectangularWindow; // because no overlap | |
485 m_rhythmfcc = new MFCC(config); | |
486 } | |
487 | |
488 for (int i = 0; i < m_channels; ++i) { | |
489 | |
490 m_values.push_back(FeatureMatrix()); | |
491 | |
492 if (needRhythm()) { | |
493 m_rhythmValues.push_back(FeatureColumnQueue()); | |
494 } | |
495 } | |
496 | |
497 m_done = false; | |
498 | |
499 return true; | |
500 } | |
501 | |
502 void | |
503 SimilarityPlugin::reset() | |
504 { | |
505 for (int i = 0; i < m_values.size(); ++i) { | |
506 m_values[i].clear(); | |
507 } | |
508 | |
509 for (int i = 0; i < m_rhythmValues.size(); ++i) { | |
510 m_rhythmValues[i].clear(); | |
511 } | |
512 | |
513 for (int i = 0; i < m_lastNonEmptyFrame.size(); ++i) { | |
514 m_lastNonEmptyFrame[i] = -1; | |
515 } | |
516 | |
517 for (int i = 0; i < m_emptyFrameCount.size(); ++i) { | |
518 m_emptyFrameCount[i] = 0; | |
519 } | |
520 | |
521 m_done = false; | |
522 } | |
523 | |
534 SimilarityPlugin::FeatureSet | 524 SimilarityPlugin::FeatureSet |
535 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */) | 525 SimilarityPlugin::process(const float *const *inputBuffers, Vamp::RealTime /* timestamp */) |
536 { | 526 { |
537 if (m_done) { | 527 if (m_done) { |
538 return FeatureSet(); | 528 return FeatureSet(); |