annotate src/Silvet.cpp @ 312:796d403dc83b

Replace OpenMP usage with (hopefully more portable) C++11 async. Define MAX_EM_THREADS=1 if you don't want to use it.
author Chris Cannam
date Tue, 28 Apr 2015 10:02:36 +0100
parents 99af9557dfc1
children fa2ffbb786df
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@312 26 #include <future>
Chris@31 27
Chris@32 28 #include <cstdio>
Chris@32 29
Chris@31 30 using std::vector;
Chris@48 31 using std::cout;
Chris@31 32 using std::cerr;
Chris@31 33 using std::endl;
Chris@311 34 using std::pair;
Chris@312 35 using std::future;
Chris@312 36 using std::async;
Chris@40 37 using Vamp::RealTime;
Chris@31 38
Chris@31 39 static int processingSampleRate = 44100;
Chris@31 40 static int processingBPO = 60;
Chris@170 41
Chris@272 42 static int minInputSampleRate = 100;
Chris@272 43 static int maxInputSampleRate = 192000;
Chris@272 44
Chris@31 45 Silvet::Silvet(float inputSampleRate) :
Chris@31 46 Plugin(inputSampleRate),
Chris@161 47 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 48 m_resampler(0),
Chris@246 49 m_flattener(0),
Chris@110 50 m_cq(0),
Chris@162 51 m_hqMode(true),
Chris@166 52 m_fineTuning(false),
Chris@178 53 m_instrument(0),
Chris@178 54 m_colsPerSec(50)
Chris@31 55 {
Chris@31 56 }
Chris@31 57
Chris@31 58 Silvet::~Silvet()
Chris@31 59 {
Chris@31 60 delete m_resampler;
Chris@246 61 delete m_flattener;
Chris@31 62 delete m_cq;
Chris@41 63 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 64 delete m_postFilter[i];
Chris@41 65 }
Chris@31 66 }
Chris@31 67
Chris@31 68 string
Chris@31 69 Silvet::getIdentifier() const
Chris@31 70 {
Chris@31 71 return "silvet";
Chris@31 72 }
Chris@31 73
Chris@31 74 string
Chris@31 75 Silvet::getName() const
Chris@31 76 {
Chris@31 77 return "Silvet Note Transcription";
Chris@31 78 }
Chris@31 79
Chris@31 80 string
Chris@31 81 Silvet::getDescription() const
Chris@31 82 {
Chris@191 83 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 84 }
Chris@31 85
Chris@31 86 string
Chris@31 87 Silvet::getMaker() const
Chris@31 88 {
Chris@191 89 return "Queen Mary, University of London";
Chris@31 90 }
Chris@31 91
Chris@31 92 int
Chris@31 93 Silvet::getPluginVersion() const
Chris@31 94 {
Chris@309 95 return 3;
Chris@31 96 }
Chris@31 97
Chris@31 98 string
Chris@31 99 Silvet::getCopyright() const
Chris@31 100 {
Chris@191 101 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 102 }
Chris@31 103
Chris@31 104 Silvet::InputDomain
Chris@31 105 Silvet::getInputDomain() const
Chris@31 106 {
Chris@31 107 return TimeDomain;
Chris@31 108 }
Chris@31 109
Chris@31 110 size_t
Chris@31 111 Silvet::getPreferredBlockSize() const
Chris@31 112 {
Chris@31 113 return 0;
Chris@31 114 }
Chris@31 115
Chris@31 116 size_t
Chris@31 117 Silvet::getPreferredStepSize() const
Chris@31 118 {
Chris@31 119 return 0;
Chris@31 120 }
Chris@31 121
Chris@31 122 size_t
Chris@31 123 Silvet::getMinChannelCount() const
Chris@31 124 {
Chris@31 125 return 1;
Chris@31 126 }
Chris@31 127
Chris@31 128 size_t
Chris@31 129 Silvet::getMaxChannelCount() const
Chris@31 130 {
Chris@31 131 return 1;
Chris@31 132 }
Chris@31 133
Chris@31 134 Silvet::ParameterList
Chris@31 135 Silvet::getParameterDescriptors() const
Chris@31 136 {
Chris@31 137 ParameterList list;
Chris@110 138
Chris@110 139 ParameterDescriptor desc;
Chris@110 140 desc.identifier = "mode";
Chris@110 141 desc.name = "Processing mode";
Chris@110 142 desc.unit = "";
Chris@271 143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 144 desc.minValue = 0;
Chris@110 145 desc.maxValue = 1;
Chris@113 146 desc.defaultValue = 1;
Chris@110 147 desc.isQuantized = true;
Chris@110 148 desc.quantizeStep = 1;
Chris@166 149 desc.valueNames.push_back("Draft (faster)");
Chris@165 150 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 151 list.push_back(desc);
Chris@161 152
Chris@176 153 desc.identifier = "instrument";
Chris@176 154 desc.name = "Instrument";
Chris@161 155 desc.unit = "";
Chris@271 156 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 157 desc.minValue = 0;
Chris@162 158 desc.maxValue = m_instruments.size()-1;
Chris@162 159 desc.defaultValue = 0;
Chris@161 160 desc.isQuantized = true;
Chris@161 161 desc.quantizeStep = 1;
Chris@161 162 desc.valueNames.clear();
Chris@162 163 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 164 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 165 }
Chris@166 166 list.push_back(desc);
Chris@161 167
Chris@166 168 desc.identifier = "finetune";
Chris@166 169 desc.name = "Return fine pitch estimates";
Chris@166 170 desc.unit = "";
Chris@271 171 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 172 desc.minValue = 0;
Chris@166 173 desc.maxValue = 1;
Chris@166 174 desc.defaultValue = 0;
Chris@166 175 desc.isQuantized = true;
Chris@166 176 desc.quantizeStep = 1;
Chris@166 177 desc.valueNames.clear();
Chris@110 178 list.push_back(desc);
Chris@110 179
Chris@31 180 return list;
Chris@31 181 }
Chris@31 182
Chris@31 183 float
Chris@31 184 Silvet::getParameter(string identifier) const
Chris@31 185 {
Chris@110 186 if (identifier == "mode") {
Chris@110 187 return m_hqMode ? 1.f : 0.f;
Chris@166 188 } else if (identifier == "finetune") {
Chris@166 189 return m_fineTuning ? 1.f : 0.f;
Chris@176 190 } else if (identifier == "instrument") {
Chris@162 191 return m_instrument;
Chris@110 192 }
Chris@31 193 return 0;
Chris@31 194 }
Chris@31 195
Chris@31 196 void
Chris@31 197 Silvet::setParameter(string identifier, float value)
Chris@31 198 {
Chris@110 199 if (identifier == "mode") {
Chris@110 200 m_hqMode = (value > 0.5);
Chris@166 201 } else if (identifier == "finetune") {
Chris@166 202 m_fineTuning = (value > 0.5);
Chris@176 203 } else if (identifier == "instrument") {
Chris@162 204 m_instrument = lrintf(value);
Chris@110 205 }
Chris@31 206 }
Chris@31 207
Chris@31 208 Silvet::ProgramList
Chris@31 209 Silvet::getPrograms() const
Chris@31 210 {
Chris@31 211 ProgramList list;
Chris@31 212 return list;
Chris@31 213 }
Chris@31 214
Chris@31 215 string
Chris@31 216 Silvet::getCurrentProgram() const
Chris@31 217 {
Chris@31 218 return "";
Chris@31 219 }
Chris@31 220
Chris@31 221 void
Chris@31 222 Silvet::selectProgram(string name)
Chris@31 223 {
Chris@31 224 }
Chris@31 225
Chris@31 226 Silvet::OutputList
Chris@31 227 Silvet::getOutputDescriptors() const
Chris@31 228 {
Chris@31 229 OutputList list;
Chris@31 230
Chris@31 231 OutputDescriptor d;
Chris@51 232 d.identifier = "notes";
Chris@51 233 d.name = "Note transcription";
Chris@271 234 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 235 d.unit = "Hz";
Chris@31 236 d.hasFixedBinCount = true;
Chris@31 237 d.binCount = 2;
Chris@41 238 d.binNames.push_back("Frequency");
Chris@31 239 d.binNames.push_back("Velocity");
Chris@31 240 d.hasKnownExtents = false;
Chris@31 241 d.isQuantized = false;
Chris@31 242 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 243 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 244 d.hasDuration = true;
Chris@32 245 m_notesOutputNo = list.size();
Chris@32 246 list.push_back(d);
Chris@32 247
Chris@178 248 d.identifier = "timefreq";
Chris@178 249 d.name = "Time-frequency distribution";
Chris@271 250 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 251 d.unit = "";
Chris@178 252 d.hasFixedBinCount = true;
Chris@178 253 d.binCount = m_instruments[0].templateHeight;
Chris@178 254 d.binNames.clear();
Chris@178 255 if (m_cq) {
Chris@294 256 char name[50];
Chris@178 257 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 258 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 259 // lowest-frequency 55 bins have been dropped, for a
Chris@178 260 // 545-bin template. The native CQ bins go high->low
Chris@178 261 // frequency though, so these are still the first 545 bins
Chris@178 262 // as reported by getBinFrequency, though in reverse order
Chris@178 263 float freq = m_cq->getBinFrequency
Chris@178 264 (m_instruments[0].templateHeight - i - 1);
Chris@178 265 sprintf(name, "%.1f Hz", freq);
Chris@178 266 d.binNames.push_back(name);
Chris@178 267 }
Chris@178 268 }
Chris@178 269 d.hasKnownExtents = false;
Chris@178 270 d.isQuantized = false;
Chris@178 271 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 272 d.sampleRate = m_colsPerSec;
Chris@178 273 d.hasDuration = false;
Chris@178 274 m_fcqOutputNo = list.size();
Chris@178 275 list.push_back(d);
Chris@178 276
Chris@294 277 d.identifier = "pitchactivation";
Chris@294 278 d.name = "Pitch activation distribution";
Chris@294 279 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 280 d.unit = "";
Chris@294 281 d.hasFixedBinCount = true;
Chris@294 282 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 283 d.binNames.clear();
Chris@294 284 if (m_cq) {
Chris@294 285 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 286 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 287 }
Chris@294 288 }
Chris@294 289 d.hasKnownExtents = false;
Chris@294 290 d.isQuantized = false;
Chris@294 291 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 292 d.sampleRate = m_colsPerSec;
Chris@294 293 d.hasDuration = false;
Chris@294 294 m_pitchOutputNo = list.size();
Chris@294 295 list.push_back(d);
Chris@294 296
Chris@309 297 d.identifier = "chroma";
Chris@309 298 d.name = "Pitch chroma distribution";
Chris@309 299 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
Chris@309 300 d.unit = "";
Chris@309 301 d.hasFixedBinCount = true;
Chris@309 302 d.binCount = 12;
Chris@309 303 d.binNames.clear();
Chris@309 304 if (m_cq) {
Chris@309 305 for (int i = 0; i < 12; ++i) {
Chris@309 306 d.binNames.push_back(chromaName(i));
Chris@309 307 }
Chris@309 308 }
Chris@309 309 d.hasKnownExtents = false;
Chris@309 310 d.isQuantized = false;
Chris@309 311 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@309 312 d.sampleRate = m_colsPerSec;
Chris@309 313 d.hasDuration = false;
Chris@309 314 m_chromaOutputNo = list.size();
Chris@309 315 list.push_back(d);
Chris@309 316
Chris@31 317 return list;
Chris@31 318 }
Chris@31 319
Chris@38 320 std::string
Chris@309 321 Silvet::chromaName(int pitch) const
Chris@38 322 {
Chris@38 323 static const char *names[] = {
Chris@38 324 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 325 };
Chris@38 326
Chris@309 327 return names[pitch];
Chris@309 328 }
Chris@309 329
Chris@309 330 std::string
Chris@309 331 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@309 332 {
Chris@309 333 string n = chromaName(note % 12);
Chris@38 334
Chris@175 335 int oct = (note + 9) / 12;
Chris@38 336
Chris@175 337 char buf[30];
Chris@175 338
Chris@175 339 float pshift = 0.f;
Chris@175 340 if (shiftCount > 1) {
Chris@175 341 // see noteFrequency below
Chris@175 342 pshift =
Chris@175 343 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 344 }
Chris@175 345
Chris@175 346 if (pshift > 0.f) {
Chris@309 347 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
Chris@175 348 } else if (pshift < 0.f) {
Chris@309 349 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
Chris@175 350 } else {
Chris@309 351 sprintf(buf, "%s%d", n.c_str(), oct);
Chris@175 352 }
Chris@38 353
Chris@38 354 return buf;
Chris@38 355 }
Chris@38 356
Chris@41 357 float
Chris@168 358 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 359 {
Chris@169 360 // Convert shift number to a pitch shift. The given shift number
Chris@169 361 // is an offset into the template array, which starts with some
Chris@169 362 // zeros, followed by the template, then some trailing zeros.
Chris@169 363 //
Chris@169 364 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 365 // == 5, then the number will be in the range 0-4 and the template
Chris@169 366 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 367 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 368 // represent moving the template *up* in pitch (by introducing
Chris@169 369 // zeros at the start, which is the low-frequency end), for a
Chris@169 370 // positive pitch shift; and higher values represent moving it
Chris@169 371 // down in pitch, for a negative pitch shift.
Chris@169 372
Chris@175 373 float pshift = 0.f;
Chris@175 374 if (shiftCount > 1) {
Chris@175 375 pshift =
Chris@175 376 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 377 }
Chris@169 378
Chris@169 379 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 380 }
Chris@41 381
Chris@31 382 bool
Chris@31 383 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 384 {
Chris@272 385 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 386 m_inputSampleRate > maxInputSampleRate) {
Chris@272 387 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 388 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 389 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 390 return false;
Chris@272 391 }
Chris@272 392
Chris@31 393 if (channels < getMinChannelCount() ||
Chris@272 394 channels > getMaxChannelCount()) {
Chris@272 395 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 396 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 397 << getMaxChannelCount() << ")" << endl;
Chris@272 398 return false;
Chris@272 399 }
Chris@31 400
Chris@31 401 if (stepSize != blockSize) {
Chris@31 402 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 403 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 404 return false;
Chris@31 405 }
Chris@31 406
Chris@31 407 m_blockSize = blockSize;
Chris@31 408
Chris@31 409 reset();
Chris@31 410
Chris@31 411 return true;
Chris@31 412 }
Chris@31 413
Chris@31 414 void
Chris@31 415 Silvet::reset()
Chris@31 416 {
Chris@31 417 delete m_resampler;
Chris@246 418 delete m_flattener;
Chris@31 419 delete m_cq;
Chris@31 420
Chris@31 421 if (m_inputSampleRate != processingSampleRate) {
Chris@31 422 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 423 } else {
Chris@31 424 m_resampler = 0;
Chris@31 425 }
Chris@31 426
Chris@246 427 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 428 m_flattener->reset();
Chris@246 429
Chris@173 430 double minFreq = 27.5;
Chris@173 431
Chris@173 432 if (!m_hqMode) {
Chris@173 433 // We don't actually return any notes from the bottom octave,
Chris@173 434 // so we can just pad with zeros
Chris@173 435 minFreq *= 2;
Chris@173 436 }
Chris@173 437
Chris@154 438 CQParameters params(processingSampleRate,
Chris@173 439 minFreq,
Chris@154 440 processingSampleRate / 3,
Chris@154 441 processingBPO);
Chris@154 442
Chris@155 443 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 444 // drops the FFT size to 512 from 1024 and alters
Chris@155 445 // some other processing parameters, making
Chris@155 446 // everything much, much slower. Could be a flaw
Chris@155 447 // in the CQ parameter calculations, must check
Chris@154 448 params.atomHopFactor = 0.3;
Chris@154 449 params.threshold = 0.0005;
Chris@172 450 params.window = CQParameters::Hann;
Chris@154 451
Chris@154 452 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 453
Chris@165 454 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 455
Chris@41 456 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 457 delete m_postFilter[i];
Chris@41 458 }
Chris@41 459 m_postFilter.clear();
Chris@176 460 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 461 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 462 }
Chris@41 463 m_pianoRoll.clear();
Chris@246 464 m_inputGains.clear();
Chris@32 465 m_columnCount = 0;
Chris@272 466 m_resampledCount = 0;
Chris@40 467 m_startTime = RealTime::zeroTime;
Chris@31 468 }
Chris@31 469
Chris@31 470 Silvet::FeatureSet
Chris@31 471 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 472 {
Chris@40 473 if (m_columnCount == 0) {
Chris@40 474 m_startTime = timestamp;
Chris@40 475 }
Chris@246 476
Chris@246 477 vector<float> flattened(m_blockSize);
Chris@246 478 float gain = 1.f;
Chris@246 479 m_flattener->connectInputPort
Chris@246 480 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 481 m_flattener->connectOutputPort
Chris@246 482 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 483 m_flattener->connectOutputPort
Chris@246 484 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 485 m_flattener->process(m_blockSize);
Chris@246 486
Chris@252 487 m_inputGains[timestamp] = gain;
Chris@40 488
Chris@31 489 vector<double> data;
Chris@40 490 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 491 double d = flattened[i];
Chris@235 492 data.push_back(d);
Chris@40 493 }
Chris@31 494
Chris@31 495 if (m_resampler) {
Chris@272 496
Chris@31 497 data = m_resampler->process(data.data(), data.size());
Chris@272 498
Chris@272 499 int hadCount = m_resampledCount;
Chris@272 500 m_resampledCount += data.size();
Chris@272 501
Chris@272 502 int resamplerLatency = m_resampler->getLatency();
Chris@272 503
Chris@272 504 if (hadCount < resamplerLatency) {
Chris@272 505 int stillToDrop = resamplerLatency - hadCount;
Chris@272 506 if (stillToDrop >= int(data.size())) {
Chris@272 507 return FeatureSet();
Chris@272 508 } else {
Chris@272 509 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 510 }
Chris@272 511 }
Chris@31 512 }
Chris@272 513
Chris@32 514 Grid cqout = m_cq->process(data);
Chris@51 515 FeatureSet fs = transcribe(cqout);
Chris@51 516 return fs;
Chris@34 517 }
Chris@34 518
Chris@34 519 Silvet::FeatureSet
Chris@34 520 Silvet::getRemainingFeatures()
Chris@34 521 {
Chris@145 522 Grid cqout = m_cq->getRemainingOutput();
Chris@51 523 FeatureSet fs = transcribe(cqout);
Chris@51 524 return fs;
Chris@34 525 }
Chris@34 526
Chris@34 527 Silvet::FeatureSet
Chris@34 528 Silvet::transcribe(const Grid &cqout)
Chris@34 529 {
Chris@32 530 Grid filtered = preProcess(cqout);
Chris@31 531
Chris@32 532 FeatureSet fs;
Chris@32 533
Chris@104 534 if (filtered.empty()) return fs;
Chris@170 535
Chris@170 536 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 537
Chris@178 538 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 539 Feature f;
Chris@178 540 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 541 f.values.push_back(float(filtered[i][j]));
Chris@178 542 }
Chris@178 543 fs[m_fcqOutputNo].push_back(f);
Chris@178 544 }
Chris@178 545
Chris@34 546 int width = filtered.size();
Chris@34 547
Chris@311 548 Grid localPitches(width);
Chris@170 549
Chris@170 550 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 551 int shiftCount = 1;
Chris@170 552 if (wantShifts) {
Chris@170 553 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 554 }
Chris@170 555
Chris@170 556 vector<vector<int> > localBestShifts;
Chris@170 557 if (wantShifts) {
Chris@311 558 localBestShifts = vector<vector<int> >(width);
Chris@170 559 }
Chris@170 560
Chris@312 561 #ifndef MAX_EM_THREADS
Chris@312 562 #define MAX_EM_THREADS 8
Chris@312 563 #endif
Chris@312 564
Chris@312 565 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
Chris@312 566 for (int i = 0; i < width; ) {
Chris@312 567 typedef future<pair<vector<double>, vector<int>>> EMFuture;
Chris@312 568 vector<EMFuture> results;
Chris@312 569 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
Chris@312 570 results.push_back
Chris@312 571 (async(std::launch::async,
Chris@312 572 [&](int index) {
Chris@312 573 return applyEM(pack, filtered.at(index), wantShifts);
Chris@312 574 }, i + j));
Chris@312 575 }
Chris@312 576 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
Chris@312 577 auto out = results[j].get();
Chris@312 578 localPitches[i+j] = out.first;
Chris@312 579 if (wantShifts) localBestShifts[i+j] = out.second;
Chris@312 580 }
Chris@312 581 i += MAX_EM_THREADS;
Chris@312 582 }
Chris@312 583 #else
Chris@123 584 for (int i = 0; i < width; ++i) {
Chris@311 585 auto out = applyEM(pack, filtered.at(i), wantShifts);
Chris@311 586 localPitches[i] = out.first;
Chris@311 587 if (wantShifts) localBestShifts[i] = out.second;
Chris@123 588 }
Chris@312 589 #endif
Chris@312 590
Chris@166 591 for (int i = 0; i < width; ++i) {
Chris@37 592
Chris@309 593 // This returns a filtered column, and pushes the
Chris@309 594 // up-to-max-polyphony activation column to m_pianoRoll
Chris@294 595 vector<double> filtered = postProcess
Chris@294 596 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 597
Chris@309 598 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
Chris@309 599 float inputGain = getInputGainAt(timestamp);
Chris@309 600
Chris@294 601 Feature f;
Chris@294 602 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 603 float v = filtered[j];
Chris@294 604 if (v < pack.levelThreshold) v = 0.f;
Chris@309 605 f.values.push_back(v / inputGain);
Chris@294 606 }
Chris@294 607 fs[m_pitchOutputNo].push_back(f);
Chris@309 608
Chris@309 609 f.values.clear();
Chris@309 610 f.values.resize(12);
Chris@309 611 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 612 f.values[j % 12] += filtered[j] / inputGain;
Chris@309 613 }
Chris@309 614 fs[m_chromaOutputNo].push_back(f);
Chris@166 615
Chris@168 616 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 617
Chris@123 618 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 619 fi != noteFeatures.end(); ++fi) {
Chris@123 620 fs[m_notesOutputNo].push_back(*fi);
Chris@40 621 }
Chris@34 622 }
Chris@34 623
Chris@32 624 return fs;
Chris@31 625 }
Chris@31 626
Chris@311 627 pair<vector<double>, vector<int> >
Chris@311 628 Silvet::applyEM(const InstrumentPack &pack,
Chris@311 629 const vector<double> &column,
Chris@311 630 bool wantShifts)
Chris@311 631 {
Chris@311 632 double columnThreshold = 1e-5;
Chris@311 633
Chris@311 634 vector<double> pitches(pack.templateNoteCount, 0.0);
Chris@311 635 vector<int> bestShifts;
Chris@311 636
Chris@311 637 double sum = 0.0;
Chris@311 638 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@311 639 sum += column.at(j);
Chris@311 640 }
Chris@311 641 if (sum < columnThreshold) return { pitches, bestShifts };
Chris@311 642
Chris@311 643 EM em(&pack, m_hqMode);
Chris@311 644
Chris@311 645 em.setPitchSparsity(pack.pitchSparsity);
Chris@311 646 em.setSourceSparsity(pack.sourceSparsity);
Chris@311 647
Chris@311 648 int iterations = m_hqMode ? 20 : 10;
Chris@311 649
Chris@311 650 for (int j = 0; j < iterations; ++j) {
Chris@311 651 em.iterate(column.data());
Chris@311 652 }
Chris@311 653
Chris@311 654 const float *pitchDist = em.getPitchDistribution();
Chris@311 655 const float *const *shiftDist = em.getShifts();
Chris@311 656
Chris@311 657 int shiftCount = 1;
Chris@311 658 if (wantShifts) {
Chris@311 659 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@311 660 }
Chris@311 661
Chris@311 662 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@311 663
Chris@311 664 pitches[j] = pitchDist[j] * sum;
Chris@311 665
Chris@311 666 int bestShift = 0;
Chris@311 667 float bestShiftValue = 0.0;
Chris@311 668 if (wantShifts) {
Chris@311 669 for (int k = 0; k < shiftCount; ++k) {
Chris@311 670 float value = shiftDist[k][j];
Chris@311 671 if (k == 0 || value > bestShiftValue) {
Chris@311 672 bestShiftValue = value;
Chris@311 673 bestShift = k;
Chris@311 674 }
Chris@311 675 }
Chris@311 676 bestShifts.push_back(bestShift);
Chris@311 677 }
Chris@311 678 }
Chris@311 679
Chris@311 680 return { pitches, bestShifts };
Chris@311 681 }
Chris@311 682
Chris@32 683 Silvet::Grid
Chris@32 684 Silvet::preProcess(const Grid &in)
Chris@32 685 {
Chris@32 686 int width = in.size();
Chris@32 687
Chris@165 688 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 689
Chris@165 690 // need to be careful that col spacing is an integer number of samples!
Chris@165 691 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 692
Chris@32 693 Grid out;
Chris@32 694
Chris@58 695 // We count the CQ latency in terms of processing hops, but
Chris@58 696 // actually it probably isn't an exact number of hops so this
Chris@58 697 // isn't quite accurate. But the small constant offset is
Chris@165 698 // practically irrelevant compared to the jitter from the frame
Chris@165 699 // size we reduce to in a moment
Chris@33 700 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 701
Chris@176 702 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 703
Chris@32 704 for (int i = 0; i < width; ++i) {
Chris@32 705
Chris@33 706 if (m_columnCount < latentColumns) {
Chris@33 707 ++m_columnCount;
Chris@33 708 continue;
Chris@33 709 }
Chris@33 710
Chris@32 711 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 712 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 713
Chris@32 714 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 715
Chris@32 716 if (select) {
Chris@32 717 vector<double> inCol = in[i];
Chris@176 718 vector<double> outCol(pack.templateHeight);
Chris@32 719
Chris@178 720 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 721 // lowest 55 of them.
Chris@178 722 //
Chris@178 723 // In draft mode the CQ is an octave shorter, returning
Chris@178 724 // 540 bins, so we instead pad them with an additional 5
Chris@178 725 // zeros.
Chris@178 726 //
Chris@178 727 // We also need to reverse the column as we go, since the
Chris@178 728 // raw CQ has the high frequencies first and we need it
Chris@178 729 // the other way around.
Chris@32 730
Chris@178 731 if (m_hqMode) {
Chris@178 732 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 733 int ix = inCol.size() - j - 55;
Chris@178 734 outCol[j] = inCol[ix];
Chris@178 735 }
Chris@178 736 } else {
Chris@178 737 for (int j = 0; j < 5; ++j) {
Chris@178 738 outCol[j] = 0.0;
Chris@178 739 }
Chris@178 740 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 741 int ix = inCol.size() - j + 4;
Chris@178 742 outCol[j] = inCol[ix];
Chris@178 743 }
Chris@46 744 }
Chris@32 745
Chris@46 746 vector<double> noiseLevel1 =
Chris@46 747 MedianFilter<double>::filter(40, outCol);
Chris@176 748 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 749 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 750 }
Chris@32 751
Chris@46 752 vector<double> noiseLevel2 =
Chris@46 753 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 754 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 755 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 756 }
Chris@32 757
Chris@165 758 out.push_back(outCol);
Chris@32 759 }
Chris@32 760
Chris@32 761 ++m_columnCount;
Chris@32 762 }
Chris@32 763
Chris@32 764 return out;
Chris@32 765 }
Chris@32 766
Chris@294 767 vector<double>
Chris@170 768 Silvet::postProcess(const vector<double> &pitches,
Chris@170 769 const vector<int> &bestShifts,
Chris@170 770 bool wantShifts)
Chris@166 771 {
Chris@176 772 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 773
Chris@41 774 vector<double> filtered;
Chris@41 775
Chris@176 776 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 777 m_postFilter[j]->push(pitches[j]);
Chris@41 778 filtered.push_back(m_postFilter[j]->get());
Chris@41 779 }
Chris@41 780
Chris@41 781 // Threshold for level and reduce number of candidate pitches
Chris@41 782
Chris@41 783 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 784
Chris@41 785 ValueIndexMap strengths;
Chris@166 786
Chris@176 787 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 788 double strength = filtered[j];
Chris@183 789 if (strength < pack.levelThreshold) continue;
Chris@168 790 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 791 }
Chris@166 792
Chris@168 793 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 794
Chris@168 795 map<int, double> active;
Chris@168 796 map<int, int> activeShifts;
Chris@168 797
Chris@183 798 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 799
Chris@168 800 --si;
Chris@168 801
Chris@168 802 double strength = si->first;
Chris@168 803 int j = si->second;
Chris@168 804
Chris@168 805 active[j] = strength;
Chris@168 806
Chris@170 807 if (wantShifts) {
Chris@170 808 activeShifts[j] = bestShifts[j];
Chris@167 809 }
Chris@41 810 }
Chris@41 811
Chris@168 812 m_pianoRoll.push_back(active);
Chris@170 813
Chris@170 814 if (wantShifts) {
Chris@168 815 m_pianoRollShifts.push_back(activeShifts);
Chris@41 816 }
Chris@294 817
Chris@294 818 return filtered;
Chris@166 819 }
Chris@166 820
Chris@166 821 Vamp::Plugin::FeatureList
Chris@168 822 Silvet::noteTrack(int shiftCount)
Chris@166 823 {
Chris@41 824 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 825 // report notes that have just ended (i.e. that are absent in the
Chris@168 826 // latest active set but present in the prior set in the piano
Chris@41 827 // roll) -- any notes that ended earlier will have been reported
Chris@41 828 // already, and if they haven't ended, we don't know their
Chris@41 829 // duration.
Chris@41 830
Chris@168 831 int width = m_pianoRoll.size() - 1;
Chris@168 832
Chris@168 833 const map<int, double> &active = m_pianoRoll[width];
Chris@41 834
Chris@165 835 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 836
Chris@165 837 // only keep notes >= 100ms or thereabouts
Chris@165 838 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 839 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 840
Chris@41 841 FeatureList noteFeatures;
Chris@41 842
Chris@41 843 if (width < durationThreshold + 1) {
Chris@41 844 return noteFeatures;
Chris@41 845 }
Chris@41 846
Chris@150 847 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 848
Chris@55 849 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 850 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 851
Chris@55 852 int note = ni->first;
Chris@41 853
Chris@41 854 if (active.find(note) != active.end()) {
Chris@41 855 // the note is still playing
Chris@41 856 continue;
Chris@41 857 }
Chris@41 858
Chris@41 859 // the note was playing but just ended
Chris@41 860 int end = width;
Chris@41 861 int start = end-1;
Chris@41 862
Chris@41 863 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 864 --start;
Chris@41 865 }
Chris@41 866 ++start;
Chris@41 867
Chris@169 868 if ((end - start) < durationThreshold) {
Chris@41 869 continue;
Chris@41 870 }
Chris@41 871
Chris@169 872 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 873 }
Chris@41 874
Chris@62 875 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 876
Chris@41 877 return noteFeatures;
Chris@41 878 }
Chris@41 879
Chris@169 880 void
Chris@169 881 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 882 FeatureList &noteFeatures)
Chris@169 883 {
Chris@169 884 int partStart = start;
Chris@169 885 int partShift = 0;
Chris@169 886 int partVelocity = 0;
Chris@169 887
Chris@252 888 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 889
Chris@169 890 for (int i = start; i != end; ++i) {
Chris@169 891
Chris@169 892 double strength = m_pianoRoll[i][note];
Chris@169 893
Chris@169 894 int shift = 0;
Chris@169 895
Chris@169 896 if (shiftCount > 1) {
Chris@169 897
Chris@169 898 shift = m_pianoRollShifts[i][note];
Chris@169 899
Chris@169 900 if (i == partStart) {
Chris@169 901 partShift = shift;
Chris@169 902 }
Chris@169 903
Chris@169 904 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 905
Chris@169 906 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 907
Chris@169 908 // pitch has changed, emit an intermediate note
Chris@252 909 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 910 i,
Chris@252 911 note,
Chris@252 912 partShift,
Chris@252 913 shiftCount,
Chris@252 914 partVelocity));
Chris@169 915 partStart = i;
Chris@169 916 partShift = shift;
Chris@169 917 partVelocity = 0;
Chris@169 918 }
Chris@169 919 }
Chris@169 920
Chris@246 921 int v = round(strength * 2);
Chris@169 922 if (v > partVelocity) {
Chris@169 923 partVelocity = v;
Chris@169 924 }
Chris@169 925 }
Chris@169 926
Chris@169 927 if (end >= partStart + partThreshold) {
Chris@252 928 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 929 end,
Chris@252 930 note,
Chris@252 931 partShift,
Chris@252 932 shiftCount,
Chris@252 933 partVelocity));
Chris@169 934 }
Chris@169 935 }
Chris@252 936
Chris@309 937 RealTime
Chris@309 938 Silvet::getColumnTimestamp(int column)
Chris@309 939 {
Chris@309 940 double columnDuration = 1.0 / m_colsPerSec;
Chris@309 941 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@309 942
Chris@309 943 return m_startTime + RealTime::fromSeconds
Chris@309 944 (columnDuration * (column - postFilterLatency) + 0.02);
Chris@309 945 }
Chris@309 946
Chris@252 947 Silvet::Feature
Chris@252 948 Silvet::makeNoteFeature(int start,
Chris@252 949 int end,
Chris@252 950 int note,
Chris@252 951 int shift,
Chris@252 952 int shiftCount,
Chris@252 953 int velocity)
Chris@252 954 {
Chris@252 955 Feature f;
Chris@252 956
Chris@252 957 f.hasTimestamp = true;
Chris@309 958 f.timestamp = getColumnTimestamp(start);
Chris@252 959
Chris@252 960 f.hasDuration = true;
Chris@309 961 f.duration = getColumnTimestamp(end) - f.timestamp;
Chris@252 962
Chris@252 963 f.values.clear();
Chris@252 964
Chris@252 965 f.values.push_back
Chris@252 966 (noteFrequency(note, shift, shiftCount));
Chris@252 967
Chris@252 968 float inputGain = getInputGainAt(f.timestamp);
Chris@252 969 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 970 velocity = round(velocity / inputGain);
Chris@252 971 if (velocity > 127) velocity = 127;
Chris@252 972 if (velocity < 1) velocity = 1;
Chris@252 973 f.values.push_back(velocity);
Chris@252 974
Chris@252 975 f.label = noteName(note, shift, shiftCount);
Chris@252 976
Chris@252 977 return f;
Chris@252 978 }
Chris@252 979
Chris@252 980 float
Chris@252 981 Silvet::getInputGainAt(RealTime t)
Chris@252 982 {
Chris@252 983 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 984
Chris@252 985 if (i == m_inputGains.end()) {
Chris@252 986 if (i != m_inputGains.begin()) {
Chris@252 987 --i;
Chris@252 988 } else {
Chris@252 989 return 1.f; // no data
Chris@252 990 }
Chris@252 991 }
Chris@252 992
Chris@252 993 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 994
Chris@252 995 return i->second;
Chris@252 996 }
Chris@252 997