annotate src/Silvet.cpp @ 311:99af9557dfc1

Pull out applyEM function
author Chris Cannam
date Tue, 28 Apr 2015 09:41:40 +0100
parents 07ee4ebea57c
children 796d403dc83b
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@311 33 using std::pair;
Chris@40 34 using Vamp::RealTime;
Chris@31 35
Chris@31 36 static int processingSampleRate = 44100;
Chris@31 37 static int processingBPO = 60;
Chris@170 38
Chris@272 39 static int minInputSampleRate = 100;
Chris@272 40 static int maxInputSampleRate = 192000;
Chris@272 41
Chris@31 42 Silvet::Silvet(float inputSampleRate) :
Chris@31 43 Plugin(inputSampleRate),
Chris@161 44 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 45 m_resampler(0),
Chris@246 46 m_flattener(0),
Chris@110 47 m_cq(0),
Chris@162 48 m_hqMode(true),
Chris@166 49 m_fineTuning(false),
Chris@178 50 m_instrument(0),
Chris@178 51 m_colsPerSec(50)
Chris@31 52 {
Chris@31 53 }
Chris@31 54
Chris@31 55 Silvet::~Silvet()
Chris@31 56 {
Chris@31 57 delete m_resampler;
Chris@246 58 delete m_flattener;
Chris@31 59 delete m_cq;
Chris@41 60 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 61 delete m_postFilter[i];
Chris@41 62 }
Chris@31 63 }
Chris@31 64
Chris@31 65 string
Chris@31 66 Silvet::getIdentifier() const
Chris@31 67 {
Chris@31 68 return "silvet";
Chris@31 69 }
Chris@31 70
Chris@31 71 string
Chris@31 72 Silvet::getName() const
Chris@31 73 {
Chris@31 74 return "Silvet Note Transcription";
Chris@31 75 }
Chris@31 76
Chris@31 77 string
Chris@31 78 Silvet::getDescription() const
Chris@31 79 {
Chris@191 80 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 81 }
Chris@31 82
Chris@31 83 string
Chris@31 84 Silvet::getMaker() const
Chris@31 85 {
Chris@191 86 return "Queen Mary, University of London";
Chris@31 87 }
Chris@31 88
Chris@31 89 int
Chris@31 90 Silvet::getPluginVersion() const
Chris@31 91 {
Chris@309 92 return 3;
Chris@31 93 }
Chris@31 94
Chris@31 95 string
Chris@31 96 Silvet::getCopyright() const
Chris@31 97 {
Chris@191 98 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 99 }
Chris@31 100
Chris@31 101 Silvet::InputDomain
Chris@31 102 Silvet::getInputDomain() const
Chris@31 103 {
Chris@31 104 return TimeDomain;
Chris@31 105 }
Chris@31 106
Chris@31 107 size_t
Chris@31 108 Silvet::getPreferredBlockSize() const
Chris@31 109 {
Chris@31 110 return 0;
Chris@31 111 }
Chris@31 112
Chris@31 113 size_t
Chris@31 114 Silvet::getPreferredStepSize() const
Chris@31 115 {
Chris@31 116 return 0;
Chris@31 117 }
Chris@31 118
Chris@31 119 size_t
Chris@31 120 Silvet::getMinChannelCount() const
Chris@31 121 {
Chris@31 122 return 1;
Chris@31 123 }
Chris@31 124
Chris@31 125 size_t
Chris@31 126 Silvet::getMaxChannelCount() const
Chris@31 127 {
Chris@31 128 return 1;
Chris@31 129 }
Chris@31 130
Chris@31 131 Silvet::ParameterList
Chris@31 132 Silvet::getParameterDescriptors() const
Chris@31 133 {
Chris@31 134 ParameterList list;
Chris@110 135
Chris@110 136 ParameterDescriptor desc;
Chris@110 137 desc.identifier = "mode";
Chris@110 138 desc.name = "Processing mode";
Chris@110 139 desc.unit = "";
Chris@271 140 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 141 desc.minValue = 0;
Chris@110 142 desc.maxValue = 1;
Chris@113 143 desc.defaultValue = 1;
Chris@110 144 desc.isQuantized = true;
Chris@110 145 desc.quantizeStep = 1;
Chris@166 146 desc.valueNames.push_back("Draft (faster)");
Chris@165 147 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 148 list.push_back(desc);
Chris@161 149
Chris@176 150 desc.identifier = "instrument";
Chris@176 151 desc.name = "Instrument";
Chris@161 152 desc.unit = "";
Chris@271 153 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 154 desc.minValue = 0;
Chris@162 155 desc.maxValue = m_instruments.size()-1;
Chris@162 156 desc.defaultValue = 0;
Chris@161 157 desc.isQuantized = true;
Chris@161 158 desc.quantizeStep = 1;
Chris@161 159 desc.valueNames.clear();
Chris@162 160 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 161 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 162 }
Chris@166 163 list.push_back(desc);
Chris@161 164
Chris@166 165 desc.identifier = "finetune";
Chris@166 166 desc.name = "Return fine pitch estimates";
Chris@166 167 desc.unit = "";
Chris@271 168 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 169 desc.minValue = 0;
Chris@166 170 desc.maxValue = 1;
Chris@166 171 desc.defaultValue = 0;
Chris@166 172 desc.isQuantized = true;
Chris@166 173 desc.quantizeStep = 1;
Chris@166 174 desc.valueNames.clear();
Chris@110 175 list.push_back(desc);
Chris@110 176
Chris@31 177 return list;
Chris@31 178 }
Chris@31 179
Chris@31 180 float
Chris@31 181 Silvet::getParameter(string identifier) const
Chris@31 182 {
Chris@110 183 if (identifier == "mode") {
Chris@110 184 return m_hqMode ? 1.f : 0.f;
Chris@166 185 } else if (identifier == "finetune") {
Chris@166 186 return m_fineTuning ? 1.f : 0.f;
Chris@176 187 } else if (identifier == "instrument") {
Chris@162 188 return m_instrument;
Chris@110 189 }
Chris@31 190 return 0;
Chris@31 191 }
Chris@31 192
Chris@31 193 void
Chris@31 194 Silvet::setParameter(string identifier, float value)
Chris@31 195 {
Chris@110 196 if (identifier == "mode") {
Chris@110 197 m_hqMode = (value > 0.5);
Chris@166 198 } else if (identifier == "finetune") {
Chris@166 199 m_fineTuning = (value > 0.5);
Chris@176 200 } else if (identifier == "instrument") {
Chris@162 201 m_instrument = lrintf(value);
Chris@110 202 }
Chris@31 203 }
Chris@31 204
Chris@31 205 Silvet::ProgramList
Chris@31 206 Silvet::getPrograms() const
Chris@31 207 {
Chris@31 208 ProgramList list;
Chris@31 209 return list;
Chris@31 210 }
Chris@31 211
Chris@31 212 string
Chris@31 213 Silvet::getCurrentProgram() const
Chris@31 214 {
Chris@31 215 return "";
Chris@31 216 }
Chris@31 217
Chris@31 218 void
Chris@31 219 Silvet::selectProgram(string name)
Chris@31 220 {
Chris@31 221 }
Chris@31 222
Chris@31 223 Silvet::OutputList
Chris@31 224 Silvet::getOutputDescriptors() const
Chris@31 225 {
Chris@31 226 OutputList list;
Chris@31 227
Chris@31 228 OutputDescriptor d;
Chris@51 229 d.identifier = "notes";
Chris@51 230 d.name = "Note transcription";
Chris@271 231 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 232 d.unit = "Hz";
Chris@31 233 d.hasFixedBinCount = true;
Chris@31 234 d.binCount = 2;
Chris@41 235 d.binNames.push_back("Frequency");
Chris@31 236 d.binNames.push_back("Velocity");
Chris@31 237 d.hasKnownExtents = false;
Chris@31 238 d.isQuantized = false;
Chris@31 239 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 240 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 241 d.hasDuration = true;
Chris@32 242 m_notesOutputNo = list.size();
Chris@32 243 list.push_back(d);
Chris@32 244
Chris@178 245 d.identifier = "timefreq";
Chris@178 246 d.name = "Time-frequency distribution";
Chris@271 247 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 248 d.unit = "";
Chris@178 249 d.hasFixedBinCount = true;
Chris@178 250 d.binCount = m_instruments[0].templateHeight;
Chris@178 251 d.binNames.clear();
Chris@178 252 if (m_cq) {
Chris@294 253 char name[50];
Chris@178 254 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 255 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 256 // lowest-frequency 55 bins have been dropped, for a
Chris@178 257 // 545-bin template. The native CQ bins go high->low
Chris@178 258 // frequency though, so these are still the first 545 bins
Chris@178 259 // as reported by getBinFrequency, though in reverse order
Chris@178 260 float freq = m_cq->getBinFrequency
Chris@178 261 (m_instruments[0].templateHeight - i - 1);
Chris@178 262 sprintf(name, "%.1f Hz", freq);
Chris@178 263 d.binNames.push_back(name);
Chris@178 264 }
Chris@178 265 }
Chris@178 266 d.hasKnownExtents = false;
Chris@178 267 d.isQuantized = false;
Chris@178 268 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 269 d.sampleRate = m_colsPerSec;
Chris@178 270 d.hasDuration = false;
Chris@178 271 m_fcqOutputNo = list.size();
Chris@178 272 list.push_back(d);
Chris@178 273
Chris@294 274 d.identifier = "pitchactivation";
Chris@294 275 d.name = "Pitch activation distribution";
Chris@294 276 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 277 d.unit = "";
Chris@294 278 d.hasFixedBinCount = true;
Chris@294 279 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 280 d.binNames.clear();
Chris@294 281 if (m_cq) {
Chris@294 282 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 283 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 284 }
Chris@294 285 }
Chris@294 286 d.hasKnownExtents = false;
Chris@294 287 d.isQuantized = false;
Chris@294 288 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 289 d.sampleRate = m_colsPerSec;
Chris@294 290 d.hasDuration = false;
Chris@294 291 m_pitchOutputNo = list.size();
Chris@294 292 list.push_back(d);
Chris@294 293
Chris@309 294 d.identifier = "chroma";
Chris@309 295 d.name = "Pitch chroma distribution";
Chris@309 296 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
Chris@309 297 d.unit = "";
Chris@309 298 d.hasFixedBinCount = true;
Chris@309 299 d.binCount = 12;
Chris@309 300 d.binNames.clear();
Chris@309 301 if (m_cq) {
Chris@309 302 for (int i = 0; i < 12; ++i) {
Chris@309 303 d.binNames.push_back(chromaName(i));
Chris@309 304 }
Chris@309 305 }
Chris@309 306 d.hasKnownExtents = false;
Chris@309 307 d.isQuantized = false;
Chris@309 308 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@309 309 d.sampleRate = m_colsPerSec;
Chris@309 310 d.hasDuration = false;
Chris@309 311 m_chromaOutputNo = list.size();
Chris@309 312 list.push_back(d);
Chris@309 313
Chris@31 314 return list;
Chris@31 315 }
Chris@31 316
Chris@38 317 std::string
Chris@309 318 Silvet::chromaName(int pitch) const
Chris@38 319 {
Chris@38 320 static const char *names[] = {
Chris@38 321 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 322 };
Chris@38 323
Chris@309 324 return names[pitch];
Chris@309 325 }
Chris@309 326
Chris@309 327 std::string
Chris@309 328 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@309 329 {
Chris@309 330 string n = chromaName(note % 12);
Chris@38 331
Chris@175 332 int oct = (note + 9) / 12;
Chris@38 333
Chris@175 334 char buf[30];
Chris@175 335
Chris@175 336 float pshift = 0.f;
Chris@175 337 if (shiftCount > 1) {
Chris@175 338 // see noteFrequency below
Chris@175 339 pshift =
Chris@175 340 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 341 }
Chris@175 342
Chris@175 343 if (pshift > 0.f) {
Chris@309 344 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
Chris@175 345 } else if (pshift < 0.f) {
Chris@309 346 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
Chris@175 347 } else {
Chris@309 348 sprintf(buf, "%s%d", n.c_str(), oct);
Chris@175 349 }
Chris@38 350
Chris@38 351 return buf;
Chris@38 352 }
Chris@38 353
Chris@41 354 float
Chris@168 355 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 356 {
Chris@169 357 // Convert shift number to a pitch shift. The given shift number
Chris@169 358 // is an offset into the template array, which starts with some
Chris@169 359 // zeros, followed by the template, then some trailing zeros.
Chris@169 360 //
Chris@169 361 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 362 // == 5, then the number will be in the range 0-4 and the template
Chris@169 363 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 364 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 365 // represent moving the template *up* in pitch (by introducing
Chris@169 366 // zeros at the start, which is the low-frequency end), for a
Chris@169 367 // positive pitch shift; and higher values represent moving it
Chris@169 368 // down in pitch, for a negative pitch shift.
Chris@169 369
Chris@175 370 float pshift = 0.f;
Chris@175 371 if (shiftCount > 1) {
Chris@175 372 pshift =
Chris@175 373 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 374 }
Chris@169 375
Chris@169 376 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 377 }
Chris@41 378
Chris@31 379 bool
Chris@31 380 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 381 {
Chris@272 382 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 383 m_inputSampleRate > maxInputSampleRate) {
Chris@272 384 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 385 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 386 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 387 return false;
Chris@272 388 }
Chris@272 389
Chris@31 390 if (channels < getMinChannelCount() ||
Chris@272 391 channels > getMaxChannelCount()) {
Chris@272 392 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 393 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 394 << getMaxChannelCount() << ")" << endl;
Chris@272 395 return false;
Chris@272 396 }
Chris@31 397
Chris@31 398 if (stepSize != blockSize) {
Chris@31 399 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 400 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 401 return false;
Chris@31 402 }
Chris@31 403
Chris@31 404 m_blockSize = blockSize;
Chris@31 405
Chris@31 406 reset();
Chris@31 407
Chris@31 408 return true;
Chris@31 409 }
Chris@31 410
Chris@31 411 void
Chris@31 412 Silvet::reset()
Chris@31 413 {
Chris@31 414 delete m_resampler;
Chris@246 415 delete m_flattener;
Chris@31 416 delete m_cq;
Chris@31 417
Chris@31 418 if (m_inputSampleRate != processingSampleRate) {
Chris@31 419 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 420 } else {
Chris@31 421 m_resampler = 0;
Chris@31 422 }
Chris@31 423
Chris@246 424 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 425 m_flattener->reset();
Chris@246 426
Chris@173 427 double minFreq = 27.5;
Chris@173 428
Chris@173 429 if (!m_hqMode) {
Chris@173 430 // We don't actually return any notes from the bottom octave,
Chris@173 431 // so we can just pad with zeros
Chris@173 432 minFreq *= 2;
Chris@173 433 }
Chris@173 434
Chris@154 435 CQParameters params(processingSampleRate,
Chris@173 436 minFreq,
Chris@154 437 processingSampleRate / 3,
Chris@154 438 processingBPO);
Chris@154 439
Chris@155 440 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 441 // drops the FFT size to 512 from 1024 and alters
Chris@155 442 // some other processing parameters, making
Chris@155 443 // everything much, much slower. Could be a flaw
Chris@155 444 // in the CQ parameter calculations, must check
Chris@154 445 params.atomHopFactor = 0.3;
Chris@154 446 params.threshold = 0.0005;
Chris@172 447 params.window = CQParameters::Hann;
Chris@154 448
Chris@154 449 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 450
Chris@165 451 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 452
Chris@41 453 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 454 delete m_postFilter[i];
Chris@41 455 }
Chris@41 456 m_postFilter.clear();
Chris@176 457 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 458 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 459 }
Chris@41 460 m_pianoRoll.clear();
Chris@246 461 m_inputGains.clear();
Chris@32 462 m_columnCount = 0;
Chris@272 463 m_resampledCount = 0;
Chris@40 464 m_startTime = RealTime::zeroTime;
Chris@31 465 }
Chris@31 466
Chris@31 467 Silvet::FeatureSet
Chris@31 468 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 469 {
Chris@40 470 if (m_columnCount == 0) {
Chris@40 471 m_startTime = timestamp;
Chris@40 472 }
Chris@246 473
Chris@246 474 vector<float> flattened(m_blockSize);
Chris@246 475 float gain = 1.f;
Chris@246 476 m_flattener->connectInputPort
Chris@246 477 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 478 m_flattener->connectOutputPort
Chris@246 479 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 480 m_flattener->connectOutputPort
Chris@246 481 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 482 m_flattener->process(m_blockSize);
Chris@246 483
Chris@252 484 m_inputGains[timestamp] = gain;
Chris@40 485
Chris@31 486 vector<double> data;
Chris@40 487 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 488 double d = flattened[i];
Chris@235 489 data.push_back(d);
Chris@40 490 }
Chris@31 491
Chris@31 492 if (m_resampler) {
Chris@272 493
Chris@31 494 data = m_resampler->process(data.data(), data.size());
Chris@272 495
Chris@272 496 int hadCount = m_resampledCount;
Chris@272 497 m_resampledCount += data.size();
Chris@272 498
Chris@272 499 int resamplerLatency = m_resampler->getLatency();
Chris@272 500
Chris@272 501 if (hadCount < resamplerLatency) {
Chris@272 502 int stillToDrop = resamplerLatency - hadCount;
Chris@272 503 if (stillToDrop >= int(data.size())) {
Chris@272 504 return FeatureSet();
Chris@272 505 } else {
Chris@272 506 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 507 }
Chris@272 508 }
Chris@31 509 }
Chris@272 510
Chris@32 511 Grid cqout = m_cq->process(data);
Chris@51 512 FeatureSet fs = transcribe(cqout);
Chris@51 513 return fs;
Chris@34 514 }
Chris@34 515
Chris@34 516 Silvet::FeatureSet
Chris@34 517 Silvet::getRemainingFeatures()
Chris@34 518 {
Chris@145 519 Grid cqout = m_cq->getRemainingOutput();
Chris@51 520 FeatureSet fs = transcribe(cqout);
Chris@51 521 return fs;
Chris@34 522 }
Chris@34 523
Chris@34 524 Silvet::FeatureSet
Chris@34 525 Silvet::transcribe(const Grid &cqout)
Chris@34 526 {
Chris@32 527 Grid filtered = preProcess(cqout);
Chris@31 528
Chris@32 529 FeatureSet fs;
Chris@32 530
Chris@104 531 if (filtered.empty()) return fs;
Chris@170 532
Chris@170 533 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 534
Chris@178 535 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 536 Feature f;
Chris@178 537 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 538 f.values.push_back(float(filtered[i][j]));
Chris@178 539 }
Chris@178 540 fs[m_fcqOutputNo].push_back(f);
Chris@178 541 }
Chris@178 542
Chris@34 543 int width = filtered.size();
Chris@34 544
Chris@311 545 Grid localPitches(width);
Chris@170 546
Chris@170 547 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 548 int shiftCount = 1;
Chris@170 549 if (wantShifts) {
Chris@170 550 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 551 }
Chris@170 552
Chris@170 553 vector<vector<int> > localBestShifts;
Chris@170 554 if (wantShifts) {
Chris@311 555 localBestShifts = vector<vector<int> >(width);
Chris@170 556 }
Chris@170 557
Chris@123 558 for (int i = 0; i < width; ++i) {
Chris@311 559 auto out = applyEM(pack, filtered.at(i), wantShifts);
Chris@311 560 localPitches[i] = out.first;
Chris@311 561 if (wantShifts) localBestShifts[i] = out.second;
Chris@123 562 }
Chris@166 563
Chris@166 564 for (int i = 0; i < width; ++i) {
Chris@37 565
Chris@309 566 // This returns a filtered column, and pushes the
Chris@309 567 // up-to-max-polyphony activation column to m_pianoRoll
Chris@294 568 vector<double> filtered = postProcess
Chris@294 569 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 570
Chris@309 571 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
Chris@309 572 float inputGain = getInputGainAt(timestamp);
Chris@309 573
Chris@294 574 Feature f;
Chris@294 575 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 576 float v = filtered[j];
Chris@294 577 if (v < pack.levelThreshold) v = 0.f;
Chris@309 578 f.values.push_back(v / inputGain);
Chris@294 579 }
Chris@294 580 fs[m_pitchOutputNo].push_back(f);
Chris@309 581
Chris@309 582 f.values.clear();
Chris@309 583 f.values.resize(12);
Chris@309 584 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 585 f.values[j % 12] += filtered[j] / inputGain;
Chris@309 586 }
Chris@309 587 fs[m_chromaOutputNo].push_back(f);
Chris@166 588
Chris@168 589 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 590
Chris@123 591 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 592 fi != noteFeatures.end(); ++fi) {
Chris@123 593 fs[m_notesOutputNo].push_back(*fi);
Chris@40 594 }
Chris@34 595 }
Chris@34 596
Chris@32 597 return fs;
Chris@31 598 }
Chris@31 599
Chris@311 600 pair<vector<double>, vector<int> >
Chris@311 601 Silvet::applyEM(const InstrumentPack &pack,
Chris@311 602 const vector<double> &column,
Chris@311 603 bool wantShifts)
Chris@311 604 {
Chris@311 605 double columnThreshold = 1e-5;
Chris@311 606
Chris@311 607 vector<double> pitches(pack.templateNoteCount, 0.0);
Chris@311 608 vector<int> bestShifts;
Chris@311 609
Chris@311 610 double sum = 0.0;
Chris@311 611 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@311 612 sum += column.at(j);
Chris@311 613 }
Chris@311 614 if (sum < columnThreshold) return { pitches, bestShifts };
Chris@311 615
Chris@311 616 EM em(&pack, m_hqMode);
Chris@311 617
Chris@311 618 em.setPitchSparsity(pack.pitchSparsity);
Chris@311 619 em.setSourceSparsity(pack.sourceSparsity);
Chris@311 620
Chris@311 621 int iterations = m_hqMode ? 20 : 10;
Chris@311 622
Chris@311 623 for (int j = 0; j < iterations; ++j) {
Chris@311 624 em.iterate(column.data());
Chris@311 625 }
Chris@311 626
Chris@311 627 const float *pitchDist = em.getPitchDistribution();
Chris@311 628 const float *const *shiftDist = em.getShifts();
Chris@311 629
Chris@311 630 int shiftCount = 1;
Chris@311 631 if (wantShifts) {
Chris@311 632 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@311 633 }
Chris@311 634
Chris@311 635 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@311 636
Chris@311 637 pitches[j] = pitchDist[j] * sum;
Chris@311 638
Chris@311 639 int bestShift = 0;
Chris@311 640 float bestShiftValue = 0.0;
Chris@311 641 if (wantShifts) {
Chris@311 642 for (int k = 0; k < shiftCount; ++k) {
Chris@311 643 float value = shiftDist[k][j];
Chris@311 644 if (k == 0 || value > bestShiftValue) {
Chris@311 645 bestShiftValue = value;
Chris@311 646 bestShift = k;
Chris@311 647 }
Chris@311 648 }
Chris@311 649 bestShifts.push_back(bestShift);
Chris@311 650 }
Chris@311 651 }
Chris@311 652
Chris@311 653 return { pitches, bestShifts };
Chris@311 654 }
Chris@311 655
Chris@32 656 Silvet::Grid
Chris@32 657 Silvet::preProcess(const Grid &in)
Chris@32 658 {
Chris@32 659 int width = in.size();
Chris@32 660
Chris@165 661 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 662
Chris@165 663 // need to be careful that col spacing is an integer number of samples!
Chris@165 664 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 665
Chris@32 666 Grid out;
Chris@32 667
Chris@58 668 // We count the CQ latency in terms of processing hops, but
Chris@58 669 // actually it probably isn't an exact number of hops so this
Chris@58 670 // isn't quite accurate. But the small constant offset is
Chris@165 671 // practically irrelevant compared to the jitter from the frame
Chris@165 672 // size we reduce to in a moment
Chris@33 673 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 674
Chris@176 675 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 676
Chris@32 677 for (int i = 0; i < width; ++i) {
Chris@32 678
Chris@33 679 if (m_columnCount < latentColumns) {
Chris@33 680 ++m_columnCount;
Chris@33 681 continue;
Chris@33 682 }
Chris@33 683
Chris@32 684 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 685 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 686
Chris@32 687 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 688
Chris@32 689 if (select) {
Chris@32 690 vector<double> inCol = in[i];
Chris@176 691 vector<double> outCol(pack.templateHeight);
Chris@32 692
Chris@178 693 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 694 // lowest 55 of them.
Chris@178 695 //
Chris@178 696 // In draft mode the CQ is an octave shorter, returning
Chris@178 697 // 540 bins, so we instead pad them with an additional 5
Chris@178 698 // zeros.
Chris@178 699 //
Chris@178 700 // We also need to reverse the column as we go, since the
Chris@178 701 // raw CQ has the high frequencies first and we need it
Chris@178 702 // the other way around.
Chris@32 703
Chris@178 704 if (m_hqMode) {
Chris@178 705 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 706 int ix = inCol.size() - j - 55;
Chris@178 707 outCol[j] = inCol[ix];
Chris@178 708 }
Chris@178 709 } else {
Chris@178 710 for (int j = 0; j < 5; ++j) {
Chris@178 711 outCol[j] = 0.0;
Chris@178 712 }
Chris@178 713 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 714 int ix = inCol.size() - j + 4;
Chris@178 715 outCol[j] = inCol[ix];
Chris@178 716 }
Chris@46 717 }
Chris@32 718
Chris@46 719 vector<double> noiseLevel1 =
Chris@46 720 MedianFilter<double>::filter(40, outCol);
Chris@176 721 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 722 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 723 }
Chris@32 724
Chris@46 725 vector<double> noiseLevel2 =
Chris@46 726 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 727 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 728 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 729 }
Chris@32 730
Chris@165 731 out.push_back(outCol);
Chris@32 732 }
Chris@32 733
Chris@32 734 ++m_columnCount;
Chris@32 735 }
Chris@32 736
Chris@32 737 return out;
Chris@32 738 }
Chris@32 739
Chris@294 740 vector<double>
Chris@170 741 Silvet::postProcess(const vector<double> &pitches,
Chris@170 742 const vector<int> &bestShifts,
Chris@170 743 bool wantShifts)
Chris@166 744 {
Chris@176 745 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 746
Chris@41 747 vector<double> filtered;
Chris@41 748
Chris@176 749 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 750 m_postFilter[j]->push(pitches[j]);
Chris@41 751 filtered.push_back(m_postFilter[j]->get());
Chris@41 752 }
Chris@41 753
Chris@41 754 // Threshold for level and reduce number of candidate pitches
Chris@41 755
Chris@41 756 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 757
Chris@41 758 ValueIndexMap strengths;
Chris@166 759
Chris@176 760 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 761 double strength = filtered[j];
Chris@183 762 if (strength < pack.levelThreshold) continue;
Chris@168 763 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 764 }
Chris@166 765
Chris@168 766 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 767
Chris@168 768 map<int, double> active;
Chris@168 769 map<int, int> activeShifts;
Chris@168 770
Chris@183 771 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 772
Chris@168 773 --si;
Chris@168 774
Chris@168 775 double strength = si->first;
Chris@168 776 int j = si->second;
Chris@168 777
Chris@168 778 active[j] = strength;
Chris@168 779
Chris@170 780 if (wantShifts) {
Chris@170 781 activeShifts[j] = bestShifts[j];
Chris@167 782 }
Chris@41 783 }
Chris@41 784
Chris@168 785 m_pianoRoll.push_back(active);
Chris@170 786
Chris@170 787 if (wantShifts) {
Chris@168 788 m_pianoRollShifts.push_back(activeShifts);
Chris@41 789 }
Chris@294 790
Chris@294 791 return filtered;
Chris@166 792 }
Chris@166 793
Chris@166 794 Vamp::Plugin::FeatureList
Chris@168 795 Silvet::noteTrack(int shiftCount)
Chris@166 796 {
Chris@41 797 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 798 // report notes that have just ended (i.e. that are absent in the
Chris@168 799 // latest active set but present in the prior set in the piano
Chris@41 800 // roll) -- any notes that ended earlier will have been reported
Chris@41 801 // already, and if they haven't ended, we don't know their
Chris@41 802 // duration.
Chris@41 803
Chris@168 804 int width = m_pianoRoll.size() - 1;
Chris@168 805
Chris@168 806 const map<int, double> &active = m_pianoRoll[width];
Chris@41 807
Chris@165 808 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 809
Chris@165 810 // only keep notes >= 100ms or thereabouts
Chris@165 811 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 812 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 813
Chris@41 814 FeatureList noteFeatures;
Chris@41 815
Chris@41 816 if (width < durationThreshold + 1) {
Chris@41 817 return noteFeatures;
Chris@41 818 }
Chris@41 819
Chris@150 820 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 821
Chris@55 822 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 823 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 824
Chris@55 825 int note = ni->first;
Chris@41 826
Chris@41 827 if (active.find(note) != active.end()) {
Chris@41 828 // the note is still playing
Chris@41 829 continue;
Chris@41 830 }
Chris@41 831
Chris@41 832 // the note was playing but just ended
Chris@41 833 int end = width;
Chris@41 834 int start = end-1;
Chris@41 835
Chris@41 836 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 837 --start;
Chris@41 838 }
Chris@41 839 ++start;
Chris@41 840
Chris@169 841 if ((end - start) < durationThreshold) {
Chris@41 842 continue;
Chris@41 843 }
Chris@41 844
Chris@169 845 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 846 }
Chris@41 847
Chris@62 848 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 849
Chris@41 850 return noteFeatures;
Chris@41 851 }
Chris@41 852
Chris@169 853 void
Chris@169 854 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 855 FeatureList &noteFeatures)
Chris@169 856 {
Chris@169 857 int partStart = start;
Chris@169 858 int partShift = 0;
Chris@169 859 int partVelocity = 0;
Chris@169 860
Chris@252 861 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 862
Chris@169 863 for (int i = start; i != end; ++i) {
Chris@169 864
Chris@169 865 double strength = m_pianoRoll[i][note];
Chris@169 866
Chris@169 867 int shift = 0;
Chris@169 868
Chris@169 869 if (shiftCount > 1) {
Chris@169 870
Chris@169 871 shift = m_pianoRollShifts[i][note];
Chris@169 872
Chris@169 873 if (i == partStart) {
Chris@169 874 partShift = shift;
Chris@169 875 }
Chris@169 876
Chris@169 877 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 878
Chris@169 879 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 880
Chris@169 881 // pitch has changed, emit an intermediate note
Chris@252 882 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 883 i,
Chris@252 884 note,
Chris@252 885 partShift,
Chris@252 886 shiftCount,
Chris@252 887 partVelocity));
Chris@169 888 partStart = i;
Chris@169 889 partShift = shift;
Chris@169 890 partVelocity = 0;
Chris@169 891 }
Chris@169 892 }
Chris@169 893
Chris@246 894 int v = round(strength * 2);
Chris@169 895 if (v > partVelocity) {
Chris@169 896 partVelocity = v;
Chris@169 897 }
Chris@169 898 }
Chris@169 899
Chris@169 900 if (end >= partStart + partThreshold) {
Chris@252 901 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 902 end,
Chris@252 903 note,
Chris@252 904 partShift,
Chris@252 905 shiftCount,
Chris@252 906 partVelocity));
Chris@169 907 }
Chris@169 908 }
Chris@252 909
Chris@309 910 RealTime
Chris@309 911 Silvet::getColumnTimestamp(int column)
Chris@309 912 {
Chris@309 913 double columnDuration = 1.0 / m_colsPerSec;
Chris@309 914 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@309 915
Chris@309 916 return m_startTime + RealTime::fromSeconds
Chris@309 917 (columnDuration * (column - postFilterLatency) + 0.02);
Chris@309 918 }
Chris@309 919
Chris@252 920 Silvet::Feature
Chris@252 921 Silvet::makeNoteFeature(int start,
Chris@252 922 int end,
Chris@252 923 int note,
Chris@252 924 int shift,
Chris@252 925 int shiftCount,
Chris@252 926 int velocity)
Chris@252 927 {
Chris@252 928 Feature f;
Chris@252 929
Chris@252 930 f.hasTimestamp = true;
Chris@309 931 f.timestamp = getColumnTimestamp(start);
Chris@252 932
Chris@252 933 f.hasDuration = true;
Chris@309 934 f.duration = getColumnTimestamp(end) - f.timestamp;
Chris@252 935
Chris@252 936 f.values.clear();
Chris@252 937
Chris@252 938 f.values.push_back
Chris@252 939 (noteFrequency(note, shift, shiftCount));
Chris@252 940
Chris@252 941 float inputGain = getInputGainAt(f.timestamp);
Chris@252 942 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 943 velocity = round(velocity / inputGain);
Chris@252 944 if (velocity > 127) velocity = 127;
Chris@252 945 if (velocity < 1) velocity = 1;
Chris@252 946 f.values.push_back(velocity);
Chris@252 947
Chris@252 948 f.label = noteName(note, shift, shiftCount);
Chris@252 949
Chris@252 950 return f;
Chris@252 951 }
Chris@252 952
Chris@252 953 float
Chris@252 954 Silvet::getInputGainAt(RealTime t)
Chris@252 955 {
Chris@252 956 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 957
Chris@252 958 if (i == m_inputGains.end()) {
Chris@252 959 if (i != m_inputGains.begin()) {
Chris@252 960 --i;
Chris@252 961 } else {
Chris@252 962 return 1.f; // no data
Chris@252 963 }
Chris@252 964 }
Chris@252 965
Chris@252 966 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 967
Chris@252 968 return i->second;
Chris@252 969 }
Chris@252 970