annotate src/Silvet.cpp @ 313:fa2ffbb786df

Don't rely on column count to determine whether we've set the start time or not -- it could theoretically be 0 even after some input has appeared
author Chris Cannam
date Tue, 28 Apr 2015 11:09:31 +0100
parents 796d403dc83b
children f98ba4f47e49
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@312 26 #include <future>
Chris@31 27
Chris@32 28 #include <cstdio>
Chris@32 29
Chris@31 30 using std::vector;
Chris@48 31 using std::cout;
Chris@31 32 using std::cerr;
Chris@31 33 using std::endl;
Chris@311 34 using std::pair;
Chris@312 35 using std::future;
Chris@312 36 using std::async;
Chris@40 37 using Vamp::RealTime;
Chris@31 38
Chris@31 39 static int processingSampleRate = 44100;
Chris@31 40 static int processingBPO = 60;
Chris@170 41
Chris@272 42 static int minInputSampleRate = 100;
Chris@272 43 static int maxInputSampleRate = 192000;
Chris@272 44
Chris@31 45 Silvet::Silvet(float inputSampleRate) :
Chris@31 46 Plugin(inputSampleRate),
Chris@161 47 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 48 m_resampler(0),
Chris@246 49 m_flattener(0),
Chris@110 50 m_cq(0),
Chris@162 51 m_hqMode(true),
Chris@166 52 m_fineTuning(false),
Chris@178 53 m_instrument(0),
Chris@313 54 m_colsPerSec(50),
Chris@313 55 m_haveStartTime(false)
Chris@31 56 {
Chris@31 57 }
Chris@31 58
Chris@31 59 Silvet::~Silvet()
Chris@31 60 {
Chris@31 61 delete m_resampler;
Chris@246 62 delete m_flattener;
Chris@31 63 delete m_cq;
Chris@41 64 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 65 delete m_postFilter[i];
Chris@41 66 }
Chris@31 67 }
Chris@31 68
Chris@31 69 string
Chris@31 70 Silvet::getIdentifier() const
Chris@31 71 {
Chris@31 72 return "silvet";
Chris@31 73 }
Chris@31 74
Chris@31 75 string
Chris@31 76 Silvet::getName() const
Chris@31 77 {
Chris@31 78 return "Silvet Note Transcription";
Chris@31 79 }
Chris@31 80
Chris@31 81 string
Chris@31 82 Silvet::getDescription() const
Chris@31 83 {
Chris@191 84 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 85 }
Chris@31 86
Chris@31 87 string
Chris@31 88 Silvet::getMaker() const
Chris@31 89 {
Chris@191 90 return "Queen Mary, University of London";
Chris@31 91 }
Chris@31 92
Chris@31 93 int
Chris@31 94 Silvet::getPluginVersion() const
Chris@31 95 {
Chris@309 96 return 3;
Chris@31 97 }
Chris@31 98
Chris@31 99 string
Chris@31 100 Silvet::getCopyright() const
Chris@31 101 {
Chris@191 102 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 103 }
Chris@31 104
Chris@31 105 Silvet::InputDomain
Chris@31 106 Silvet::getInputDomain() const
Chris@31 107 {
Chris@31 108 return TimeDomain;
Chris@31 109 }
Chris@31 110
Chris@31 111 size_t
Chris@31 112 Silvet::getPreferredBlockSize() const
Chris@31 113 {
Chris@31 114 return 0;
Chris@31 115 }
Chris@31 116
Chris@31 117 size_t
Chris@31 118 Silvet::getPreferredStepSize() const
Chris@31 119 {
Chris@31 120 return 0;
Chris@31 121 }
Chris@31 122
Chris@31 123 size_t
Chris@31 124 Silvet::getMinChannelCount() const
Chris@31 125 {
Chris@31 126 return 1;
Chris@31 127 }
Chris@31 128
Chris@31 129 size_t
Chris@31 130 Silvet::getMaxChannelCount() const
Chris@31 131 {
Chris@31 132 return 1;
Chris@31 133 }
Chris@31 134
Chris@31 135 Silvet::ParameterList
Chris@31 136 Silvet::getParameterDescriptors() const
Chris@31 137 {
Chris@31 138 ParameterList list;
Chris@110 139
Chris@110 140 ParameterDescriptor desc;
Chris@110 141 desc.identifier = "mode";
Chris@110 142 desc.name = "Processing mode";
Chris@110 143 desc.unit = "";
Chris@271 144 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 145 desc.minValue = 0;
Chris@110 146 desc.maxValue = 1;
Chris@113 147 desc.defaultValue = 1;
Chris@110 148 desc.isQuantized = true;
Chris@110 149 desc.quantizeStep = 1;
Chris@166 150 desc.valueNames.push_back("Draft (faster)");
Chris@165 151 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 152 list.push_back(desc);
Chris@161 153
Chris@176 154 desc.identifier = "instrument";
Chris@176 155 desc.name = "Instrument";
Chris@161 156 desc.unit = "";
Chris@271 157 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 158 desc.minValue = 0;
Chris@162 159 desc.maxValue = m_instruments.size()-1;
Chris@162 160 desc.defaultValue = 0;
Chris@161 161 desc.isQuantized = true;
Chris@161 162 desc.quantizeStep = 1;
Chris@161 163 desc.valueNames.clear();
Chris@162 164 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 165 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 166 }
Chris@166 167 list.push_back(desc);
Chris@161 168
Chris@166 169 desc.identifier = "finetune";
Chris@166 170 desc.name = "Return fine pitch estimates";
Chris@166 171 desc.unit = "";
Chris@271 172 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 173 desc.minValue = 0;
Chris@166 174 desc.maxValue = 1;
Chris@166 175 desc.defaultValue = 0;
Chris@166 176 desc.isQuantized = true;
Chris@166 177 desc.quantizeStep = 1;
Chris@166 178 desc.valueNames.clear();
Chris@110 179 list.push_back(desc);
Chris@110 180
Chris@31 181 return list;
Chris@31 182 }
Chris@31 183
Chris@31 184 float
Chris@31 185 Silvet::getParameter(string identifier) const
Chris@31 186 {
Chris@110 187 if (identifier == "mode") {
Chris@110 188 return m_hqMode ? 1.f : 0.f;
Chris@166 189 } else if (identifier == "finetune") {
Chris@166 190 return m_fineTuning ? 1.f : 0.f;
Chris@176 191 } else if (identifier == "instrument") {
Chris@162 192 return m_instrument;
Chris@110 193 }
Chris@31 194 return 0;
Chris@31 195 }
Chris@31 196
Chris@31 197 void
Chris@31 198 Silvet::setParameter(string identifier, float value)
Chris@31 199 {
Chris@110 200 if (identifier == "mode") {
Chris@110 201 m_hqMode = (value > 0.5);
Chris@166 202 } else if (identifier == "finetune") {
Chris@166 203 m_fineTuning = (value > 0.5);
Chris@176 204 } else if (identifier == "instrument") {
Chris@162 205 m_instrument = lrintf(value);
Chris@110 206 }
Chris@31 207 }
Chris@31 208
Chris@31 209 Silvet::ProgramList
Chris@31 210 Silvet::getPrograms() const
Chris@31 211 {
Chris@31 212 ProgramList list;
Chris@31 213 return list;
Chris@31 214 }
Chris@31 215
Chris@31 216 string
Chris@31 217 Silvet::getCurrentProgram() const
Chris@31 218 {
Chris@31 219 return "";
Chris@31 220 }
Chris@31 221
Chris@31 222 void
Chris@31 223 Silvet::selectProgram(string name)
Chris@31 224 {
Chris@31 225 }
Chris@31 226
Chris@31 227 Silvet::OutputList
Chris@31 228 Silvet::getOutputDescriptors() const
Chris@31 229 {
Chris@31 230 OutputList list;
Chris@31 231
Chris@31 232 OutputDescriptor d;
Chris@51 233 d.identifier = "notes";
Chris@51 234 d.name = "Note transcription";
Chris@271 235 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 236 d.unit = "Hz";
Chris@31 237 d.hasFixedBinCount = true;
Chris@31 238 d.binCount = 2;
Chris@41 239 d.binNames.push_back("Frequency");
Chris@31 240 d.binNames.push_back("Velocity");
Chris@31 241 d.hasKnownExtents = false;
Chris@31 242 d.isQuantized = false;
Chris@31 243 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 244 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 245 d.hasDuration = true;
Chris@32 246 m_notesOutputNo = list.size();
Chris@32 247 list.push_back(d);
Chris@32 248
Chris@178 249 d.identifier = "timefreq";
Chris@178 250 d.name = "Time-frequency distribution";
Chris@271 251 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 252 d.unit = "";
Chris@178 253 d.hasFixedBinCount = true;
Chris@178 254 d.binCount = m_instruments[0].templateHeight;
Chris@178 255 d.binNames.clear();
Chris@178 256 if (m_cq) {
Chris@294 257 char name[50];
Chris@178 258 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 259 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 260 // lowest-frequency 55 bins have been dropped, for a
Chris@178 261 // 545-bin template. The native CQ bins go high->low
Chris@178 262 // frequency though, so these are still the first 545 bins
Chris@178 263 // as reported by getBinFrequency, though in reverse order
Chris@178 264 float freq = m_cq->getBinFrequency
Chris@178 265 (m_instruments[0].templateHeight - i - 1);
Chris@178 266 sprintf(name, "%.1f Hz", freq);
Chris@178 267 d.binNames.push_back(name);
Chris@178 268 }
Chris@178 269 }
Chris@178 270 d.hasKnownExtents = false;
Chris@178 271 d.isQuantized = false;
Chris@178 272 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 273 d.sampleRate = m_colsPerSec;
Chris@178 274 d.hasDuration = false;
Chris@178 275 m_fcqOutputNo = list.size();
Chris@178 276 list.push_back(d);
Chris@178 277
Chris@294 278 d.identifier = "pitchactivation";
Chris@294 279 d.name = "Pitch activation distribution";
Chris@294 280 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 281 d.unit = "";
Chris@294 282 d.hasFixedBinCount = true;
Chris@294 283 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 284 d.binNames.clear();
Chris@294 285 if (m_cq) {
Chris@294 286 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 287 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 288 }
Chris@294 289 }
Chris@294 290 d.hasKnownExtents = false;
Chris@294 291 d.isQuantized = false;
Chris@294 292 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 293 d.sampleRate = m_colsPerSec;
Chris@294 294 d.hasDuration = false;
Chris@294 295 m_pitchOutputNo = list.size();
Chris@294 296 list.push_back(d);
Chris@294 297
Chris@309 298 d.identifier = "chroma";
Chris@309 299 d.name = "Pitch chroma distribution";
Chris@309 300 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
Chris@309 301 d.unit = "";
Chris@309 302 d.hasFixedBinCount = true;
Chris@309 303 d.binCount = 12;
Chris@309 304 d.binNames.clear();
Chris@309 305 if (m_cq) {
Chris@309 306 for (int i = 0; i < 12; ++i) {
Chris@309 307 d.binNames.push_back(chromaName(i));
Chris@309 308 }
Chris@309 309 }
Chris@309 310 d.hasKnownExtents = false;
Chris@309 311 d.isQuantized = false;
Chris@309 312 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@309 313 d.sampleRate = m_colsPerSec;
Chris@309 314 d.hasDuration = false;
Chris@309 315 m_chromaOutputNo = list.size();
Chris@309 316 list.push_back(d);
Chris@309 317
Chris@31 318 return list;
Chris@31 319 }
Chris@31 320
Chris@38 321 std::string
Chris@309 322 Silvet::chromaName(int pitch) const
Chris@38 323 {
Chris@38 324 static const char *names[] = {
Chris@38 325 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 326 };
Chris@38 327
Chris@309 328 return names[pitch];
Chris@309 329 }
Chris@309 330
Chris@309 331 std::string
Chris@309 332 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@309 333 {
Chris@309 334 string n = chromaName(note % 12);
Chris@38 335
Chris@175 336 int oct = (note + 9) / 12;
Chris@38 337
Chris@175 338 char buf[30];
Chris@175 339
Chris@175 340 float pshift = 0.f;
Chris@175 341 if (shiftCount > 1) {
Chris@175 342 // see noteFrequency below
Chris@175 343 pshift =
Chris@175 344 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 345 }
Chris@175 346
Chris@175 347 if (pshift > 0.f) {
Chris@309 348 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
Chris@175 349 } else if (pshift < 0.f) {
Chris@309 350 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
Chris@175 351 } else {
Chris@309 352 sprintf(buf, "%s%d", n.c_str(), oct);
Chris@175 353 }
Chris@38 354
Chris@38 355 return buf;
Chris@38 356 }
Chris@38 357
Chris@41 358 float
Chris@168 359 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 360 {
Chris@169 361 // Convert shift number to a pitch shift. The given shift number
Chris@169 362 // is an offset into the template array, which starts with some
Chris@169 363 // zeros, followed by the template, then some trailing zeros.
Chris@169 364 //
Chris@169 365 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 366 // == 5, then the number will be in the range 0-4 and the template
Chris@169 367 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 368 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 369 // represent moving the template *up* in pitch (by introducing
Chris@169 370 // zeros at the start, which is the low-frequency end), for a
Chris@169 371 // positive pitch shift; and higher values represent moving it
Chris@169 372 // down in pitch, for a negative pitch shift.
Chris@169 373
Chris@175 374 float pshift = 0.f;
Chris@175 375 if (shiftCount > 1) {
Chris@175 376 pshift =
Chris@175 377 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 378 }
Chris@169 379
Chris@169 380 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 381 }
Chris@41 382
Chris@31 383 bool
Chris@31 384 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 385 {
Chris@272 386 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 387 m_inputSampleRate > maxInputSampleRate) {
Chris@272 388 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 389 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 390 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 391 return false;
Chris@272 392 }
Chris@272 393
Chris@31 394 if (channels < getMinChannelCount() ||
Chris@272 395 channels > getMaxChannelCount()) {
Chris@272 396 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 397 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 398 << getMaxChannelCount() << ")" << endl;
Chris@272 399 return false;
Chris@272 400 }
Chris@31 401
Chris@31 402 if (stepSize != blockSize) {
Chris@31 403 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 404 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 405 return false;
Chris@31 406 }
Chris@31 407
Chris@31 408 m_blockSize = blockSize;
Chris@31 409
Chris@31 410 reset();
Chris@31 411
Chris@31 412 return true;
Chris@31 413 }
Chris@31 414
Chris@31 415 void
Chris@31 416 Silvet::reset()
Chris@31 417 {
Chris@31 418 delete m_resampler;
Chris@246 419 delete m_flattener;
Chris@31 420 delete m_cq;
Chris@31 421
Chris@31 422 if (m_inputSampleRate != processingSampleRate) {
Chris@31 423 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 424 } else {
Chris@31 425 m_resampler = 0;
Chris@31 426 }
Chris@31 427
Chris@246 428 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 429 m_flattener->reset();
Chris@246 430
Chris@173 431 double minFreq = 27.5;
Chris@173 432
Chris@173 433 if (!m_hqMode) {
Chris@173 434 // We don't actually return any notes from the bottom octave,
Chris@173 435 // so we can just pad with zeros
Chris@173 436 minFreq *= 2;
Chris@173 437 }
Chris@173 438
Chris@154 439 CQParameters params(processingSampleRate,
Chris@173 440 minFreq,
Chris@154 441 processingSampleRate / 3,
Chris@154 442 processingBPO);
Chris@154 443
Chris@155 444 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 445 // drops the FFT size to 512 from 1024 and alters
Chris@155 446 // some other processing parameters, making
Chris@155 447 // everything much, much slower. Could be a flaw
Chris@155 448 // in the CQ parameter calculations, must check
Chris@154 449 params.atomHopFactor = 0.3;
Chris@154 450 params.threshold = 0.0005;
Chris@172 451 params.window = CQParameters::Hann;
Chris@154 452
Chris@154 453 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 454
Chris@165 455 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 456
Chris@41 457 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 458 delete m_postFilter[i];
Chris@41 459 }
Chris@41 460 m_postFilter.clear();
Chris@176 461 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 462 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 463 }
Chris@41 464 m_pianoRoll.clear();
Chris@246 465 m_inputGains.clear();
Chris@32 466 m_columnCount = 0;
Chris@272 467 m_resampledCount = 0;
Chris@40 468 m_startTime = RealTime::zeroTime;
Chris@313 469 m_haveStartTime = false;
Chris@31 470 }
Chris@31 471
Chris@31 472 Silvet::FeatureSet
Chris@31 473 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 474 {
Chris@313 475 if (!m_haveStartTime) {
Chris@40 476 m_startTime = timestamp;
Chris@313 477 m_haveStartTime = true;
Chris@40 478 }
Chris@246 479
Chris@246 480 vector<float> flattened(m_blockSize);
Chris@246 481 float gain = 1.f;
Chris@246 482 m_flattener->connectInputPort
Chris@246 483 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 484 m_flattener->connectOutputPort
Chris@246 485 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 486 m_flattener->connectOutputPort
Chris@246 487 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 488 m_flattener->process(m_blockSize);
Chris@246 489
Chris@252 490 m_inputGains[timestamp] = gain;
Chris@40 491
Chris@31 492 vector<double> data;
Chris@40 493 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 494 double d = flattened[i];
Chris@235 495 data.push_back(d);
Chris@40 496 }
Chris@31 497
Chris@31 498 if (m_resampler) {
Chris@272 499
Chris@31 500 data = m_resampler->process(data.data(), data.size());
Chris@272 501
Chris@272 502 int hadCount = m_resampledCount;
Chris@272 503 m_resampledCount += data.size();
Chris@272 504
Chris@272 505 int resamplerLatency = m_resampler->getLatency();
Chris@272 506
Chris@272 507 if (hadCount < resamplerLatency) {
Chris@272 508 int stillToDrop = resamplerLatency - hadCount;
Chris@272 509 if (stillToDrop >= int(data.size())) {
Chris@272 510 return FeatureSet();
Chris@272 511 } else {
Chris@272 512 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 513 }
Chris@272 514 }
Chris@31 515 }
Chris@272 516
Chris@32 517 Grid cqout = m_cq->process(data);
Chris@51 518 FeatureSet fs = transcribe(cqout);
Chris@51 519 return fs;
Chris@34 520 }
Chris@34 521
Chris@34 522 Silvet::FeatureSet
Chris@34 523 Silvet::getRemainingFeatures()
Chris@34 524 {
Chris@145 525 Grid cqout = m_cq->getRemainingOutput();
Chris@51 526 FeatureSet fs = transcribe(cqout);
Chris@51 527 return fs;
Chris@34 528 }
Chris@34 529
Chris@34 530 Silvet::FeatureSet
Chris@34 531 Silvet::transcribe(const Grid &cqout)
Chris@34 532 {
Chris@32 533 Grid filtered = preProcess(cqout);
Chris@31 534
Chris@32 535 FeatureSet fs;
Chris@32 536
Chris@104 537 if (filtered.empty()) return fs;
Chris@170 538
Chris@170 539 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 540
Chris@178 541 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 542 Feature f;
Chris@178 543 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 544 f.values.push_back(float(filtered[i][j]));
Chris@178 545 }
Chris@178 546 fs[m_fcqOutputNo].push_back(f);
Chris@178 547 }
Chris@178 548
Chris@34 549 int width = filtered.size();
Chris@34 550
Chris@311 551 Grid localPitches(width);
Chris@170 552
Chris@170 553 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 554 int shiftCount = 1;
Chris@170 555 if (wantShifts) {
Chris@170 556 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 557 }
Chris@170 558
Chris@170 559 vector<vector<int> > localBestShifts;
Chris@170 560 if (wantShifts) {
Chris@311 561 localBestShifts = vector<vector<int> >(width);
Chris@170 562 }
Chris@170 563
Chris@312 564 #ifndef MAX_EM_THREADS
Chris@312 565 #define MAX_EM_THREADS 8
Chris@312 566 #endif
Chris@312 567
Chris@312 568 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
Chris@312 569 for (int i = 0; i < width; ) {
Chris@312 570 typedef future<pair<vector<double>, vector<int>>> EMFuture;
Chris@312 571 vector<EMFuture> results;
Chris@312 572 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
Chris@312 573 results.push_back
Chris@312 574 (async(std::launch::async,
Chris@312 575 [&](int index) {
Chris@312 576 return applyEM(pack, filtered.at(index), wantShifts);
Chris@312 577 }, i + j));
Chris@312 578 }
Chris@312 579 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
Chris@312 580 auto out = results[j].get();
Chris@312 581 localPitches[i+j] = out.first;
Chris@312 582 if (wantShifts) localBestShifts[i+j] = out.second;
Chris@312 583 }
Chris@312 584 i += MAX_EM_THREADS;
Chris@312 585 }
Chris@312 586 #else
Chris@123 587 for (int i = 0; i < width; ++i) {
Chris@311 588 auto out = applyEM(pack, filtered.at(i), wantShifts);
Chris@311 589 localPitches[i] = out.first;
Chris@311 590 if (wantShifts) localBestShifts[i] = out.second;
Chris@123 591 }
Chris@312 592 #endif
Chris@312 593
Chris@166 594 for (int i = 0; i < width; ++i) {
Chris@37 595
Chris@309 596 // This returns a filtered column, and pushes the
Chris@309 597 // up-to-max-polyphony activation column to m_pianoRoll
Chris@294 598 vector<double> filtered = postProcess
Chris@294 599 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 600
Chris@309 601 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
Chris@309 602 float inputGain = getInputGainAt(timestamp);
Chris@309 603
Chris@294 604 Feature f;
Chris@294 605 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 606 float v = filtered[j];
Chris@294 607 if (v < pack.levelThreshold) v = 0.f;
Chris@309 608 f.values.push_back(v / inputGain);
Chris@294 609 }
Chris@294 610 fs[m_pitchOutputNo].push_back(f);
Chris@309 611
Chris@309 612 f.values.clear();
Chris@309 613 f.values.resize(12);
Chris@309 614 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 615 f.values[j % 12] += filtered[j] / inputGain;
Chris@309 616 }
Chris@309 617 fs[m_chromaOutputNo].push_back(f);
Chris@166 618
Chris@168 619 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 620
Chris@123 621 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 622 fi != noteFeatures.end(); ++fi) {
Chris@123 623 fs[m_notesOutputNo].push_back(*fi);
Chris@40 624 }
Chris@34 625 }
Chris@34 626
Chris@32 627 return fs;
Chris@31 628 }
Chris@31 629
Chris@311 630 pair<vector<double>, vector<int> >
Chris@311 631 Silvet::applyEM(const InstrumentPack &pack,
Chris@311 632 const vector<double> &column,
Chris@311 633 bool wantShifts)
Chris@311 634 {
Chris@311 635 double columnThreshold = 1e-5;
Chris@311 636
Chris@311 637 vector<double> pitches(pack.templateNoteCount, 0.0);
Chris@311 638 vector<int> bestShifts;
Chris@311 639
Chris@311 640 double sum = 0.0;
Chris@311 641 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@311 642 sum += column.at(j);
Chris@311 643 }
Chris@311 644 if (sum < columnThreshold) return { pitches, bestShifts };
Chris@311 645
Chris@311 646 EM em(&pack, m_hqMode);
Chris@311 647
Chris@311 648 em.setPitchSparsity(pack.pitchSparsity);
Chris@311 649 em.setSourceSparsity(pack.sourceSparsity);
Chris@311 650
Chris@311 651 int iterations = m_hqMode ? 20 : 10;
Chris@311 652
Chris@311 653 for (int j = 0; j < iterations; ++j) {
Chris@311 654 em.iterate(column.data());
Chris@311 655 }
Chris@311 656
Chris@311 657 const float *pitchDist = em.getPitchDistribution();
Chris@311 658 const float *const *shiftDist = em.getShifts();
Chris@311 659
Chris@311 660 int shiftCount = 1;
Chris@311 661 if (wantShifts) {
Chris@311 662 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@311 663 }
Chris@311 664
Chris@311 665 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@311 666
Chris@311 667 pitches[j] = pitchDist[j] * sum;
Chris@311 668
Chris@311 669 int bestShift = 0;
Chris@311 670 float bestShiftValue = 0.0;
Chris@311 671 if (wantShifts) {
Chris@311 672 for (int k = 0; k < shiftCount; ++k) {
Chris@311 673 float value = shiftDist[k][j];
Chris@311 674 if (k == 0 || value > bestShiftValue) {
Chris@311 675 bestShiftValue = value;
Chris@311 676 bestShift = k;
Chris@311 677 }
Chris@311 678 }
Chris@311 679 bestShifts.push_back(bestShift);
Chris@311 680 }
Chris@311 681 }
Chris@311 682
Chris@311 683 return { pitches, bestShifts };
Chris@311 684 }
Chris@311 685
Chris@32 686 Silvet::Grid
Chris@32 687 Silvet::preProcess(const Grid &in)
Chris@32 688 {
Chris@32 689 int width = in.size();
Chris@32 690
Chris@165 691 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 692
Chris@165 693 // need to be careful that col spacing is an integer number of samples!
Chris@165 694 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 695
Chris@32 696 Grid out;
Chris@32 697
Chris@58 698 // We count the CQ latency in terms of processing hops, but
Chris@58 699 // actually it probably isn't an exact number of hops so this
Chris@58 700 // isn't quite accurate. But the small constant offset is
Chris@165 701 // practically irrelevant compared to the jitter from the frame
Chris@165 702 // size we reduce to in a moment
Chris@33 703 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 704
Chris@176 705 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 706
Chris@32 707 for (int i = 0; i < width; ++i) {
Chris@32 708
Chris@33 709 if (m_columnCount < latentColumns) {
Chris@33 710 ++m_columnCount;
Chris@33 711 continue;
Chris@33 712 }
Chris@33 713
Chris@32 714 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 715 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 716
Chris@32 717 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 718
Chris@32 719 if (select) {
Chris@32 720 vector<double> inCol = in[i];
Chris@176 721 vector<double> outCol(pack.templateHeight);
Chris@32 722
Chris@178 723 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 724 // lowest 55 of them.
Chris@178 725 //
Chris@178 726 // In draft mode the CQ is an octave shorter, returning
Chris@178 727 // 540 bins, so we instead pad them with an additional 5
Chris@178 728 // zeros.
Chris@178 729 //
Chris@178 730 // We also need to reverse the column as we go, since the
Chris@178 731 // raw CQ has the high frequencies first and we need it
Chris@178 732 // the other way around.
Chris@32 733
Chris@178 734 if (m_hqMode) {
Chris@178 735 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 736 int ix = inCol.size() - j - 55;
Chris@178 737 outCol[j] = inCol[ix];
Chris@178 738 }
Chris@178 739 } else {
Chris@178 740 for (int j = 0; j < 5; ++j) {
Chris@178 741 outCol[j] = 0.0;
Chris@178 742 }
Chris@178 743 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 744 int ix = inCol.size() - j + 4;
Chris@178 745 outCol[j] = inCol[ix];
Chris@178 746 }
Chris@46 747 }
Chris@32 748
Chris@46 749 vector<double> noiseLevel1 =
Chris@46 750 MedianFilter<double>::filter(40, outCol);
Chris@176 751 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 752 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 753 }
Chris@32 754
Chris@46 755 vector<double> noiseLevel2 =
Chris@46 756 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 757 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 758 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 759 }
Chris@32 760
Chris@165 761 out.push_back(outCol);
Chris@32 762 }
Chris@32 763
Chris@32 764 ++m_columnCount;
Chris@32 765 }
Chris@32 766
Chris@32 767 return out;
Chris@32 768 }
Chris@32 769
Chris@294 770 vector<double>
Chris@170 771 Silvet::postProcess(const vector<double> &pitches,
Chris@170 772 const vector<int> &bestShifts,
Chris@170 773 bool wantShifts)
Chris@166 774 {
Chris@176 775 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 776
Chris@41 777 vector<double> filtered;
Chris@41 778
Chris@176 779 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 780 m_postFilter[j]->push(pitches[j]);
Chris@41 781 filtered.push_back(m_postFilter[j]->get());
Chris@41 782 }
Chris@41 783
Chris@41 784 // Threshold for level and reduce number of candidate pitches
Chris@41 785
Chris@41 786 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 787
Chris@41 788 ValueIndexMap strengths;
Chris@166 789
Chris@176 790 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 791 double strength = filtered[j];
Chris@183 792 if (strength < pack.levelThreshold) continue;
Chris@168 793 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 794 }
Chris@166 795
Chris@168 796 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 797
Chris@168 798 map<int, double> active;
Chris@168 799 map<int, int> activeShifts;
Chris@168 800
Chris@183 801 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 802
Chris@168 803 --si;
Chris@168 804
Chris@168 805 double strength = si->first;
Chris@168 806 int j = si->second;
Chris@168 807
Chris@168 808 active[j] = strength;
Chris@168 809
Chris@170 810 if (wantShifts) {
Chris@170 811 activeShifts[j] = bestShifts[j];
Chris@167 812 }
Chris@41 813 }
Chris@41 814
Chris@168 815 m_pianoRoll.push_back(active);
Chris@170 816
Chris@170 817 if (wantShifts) {
Chris@168 818 m_pianoRollShifts.push_back(activeShifts);
Chris@41 819 }
Chris@294 820
Chris@294 821 return filtered;
Chris@166 822 }
Chris@166 823
Chris@166 824 Vamp::Plugin::FeatureList
Chris@168 825 Silvet::noteTrack(int shiftCount)
Chris@166 826 {
Chris@41 827 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 828 // report notes that have just ended (i.e. that are absent in the
Chris@168 829 // latest active set but present in the prior set in the piano
Chris@41 830 // roll) -- any notes that ended earlier will have been reported
Chris@41 831 // already, and if they haven't ended, we don't know their
Chris@41 832 // duration.
Chris@41 833
Chris@168 834 int width = m_pianoRoll.size() - 1;
Chris@168 835
Chris@168 836 const map<int, double> &active = m_pianoRoll[width];
Chris@41 837
Chris@165 838 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 839
Chris@165 840 // only keep notes >= 100ms or thereabouts
Chris@165 841 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 842 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 843
Chris@41 844 FeatureList noteFeatures;
Chris@41 845
Chris@41 846 if (width < durationThreshold + 1) {
Chris@41 847 return noteFeatures;
Chris@41 848 }
Chris@41 849
Chris@150 850 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 851
Chris@55 852 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 853 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 854
Chris@55 855 int note = ni->first;
Chris@41 856
Chris@41 857 if (active.find(note) != active.end()) {
Chris@41 858 // the note is still playing
Chris@41 859 continue;
Chris@41 860 }
Chris@41 861
Chris@41 862 // the note was playing but just ended
Chris@41 863 int end = width;
Chris@41 864 int start = end-1;
Chris@41 865
Chris@41 866 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 867 --start;
Chris@41 868 }
Chris@41 869 ++start;
Chris@41 870
Chris@169 871 if ((end - start) < durationThreshold) {
Chris@41 872 continue;
Chris@41 873 }
Chris@41 874
Chris@169 875 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 876 }
Chris@41 877
Chris@62 878 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 879
Chris@41 880 return noteFeatures;
Chris@41 881 }
Chris@41 882
Chris@169 883 void
Chris@169 884 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 885 FeatureList &noteFeatures)
Chris@169 886 {
Chris@169 887 int partStart = start;
Chris@169 888 int partShift = 0;
Chris@169 889 int partVelocity = 0;
Chris@169 890
Chris@252 891 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 892
Chris@169 893 for (int i = start; i != end; ++i) {
Chris@169 894
Chris@169 895 double strength = m_pianoRoll[i][note];
Chris@169 896
Chris@169 897 int shift = 0;
Chris@169 898
Chris@169 899 if (shiftCount > 1) {
Chris@169 900
Chris@169 901 shift = m_pianoRollShifts[i][note];
Chris@169 902
Chris@169 903 if (i == partStart) {
Chris@169 904 partShift = shift;
Chris@169 905 }
Chris@169 906
Chris@169 907 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 908
Chris@169 909 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 910
Chris@169 911 // pitch has changed, emit an intermediate note
Chris@252 912 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 913 i,
Chris@252 914 note,
Chris@252 915 partShift,
Chris@252 916 shiftCount,
Chris@252 917 partVelocity));
Chris@169 918 partStart = i;
Chris@169 919 partShift = shift;
Chris@169 920 partVelocity = 0;
Chris@169 921 }
Chris@169 922 }
Chris@169 923
Chris@246 924 int v = round(strength * 2);
Chris@169 925 if (v > partVelocity) {
Chris@169 926 partVelocity = v;
Chris@169 927 }
Chris@169 928 }
Chris@169 929
Chris@169 930 if (end >= partStart + partThreshold) {
Chris@252 931 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 932 end,
Chris@252 933 note,
Chris@252 934 partShift,
Chris@252 935 shiftCount,
Chris@252 936 partVelocity));
Chris@169 937 }
Chris@169 938 }
Chris@252 939
Chris@309 940 RealTime
Chris@309 941 Silvet::getColumnTimestamp(int column)
Chris@309 942 {
Chris@309 943 double columnDuration = 1.0 / m_colsPerSec;
Chris@309 944 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@309 945
Chris@309 946 return m_startTime + RealTime::fromSeconds
Chris@309 947 (columnDuration * (column - postFilterLatency) + 0.02);
Chris@309 948 }
Chris@309 949
Chris@252 950 Silvet::Feature
Chris@252 951 Silvet::makeNoteFeature(int start,
Chris@252 952 int end,
Chris@252 953 int note,
Chris@252 954 int shift,
Chris@252 955 int shiftCount,
Chris@252 956 int velocity)
Chris@252 957 {
Chris@252 958 Feature f;
Chris@252 959
Chris@252 960 f.hasTimestamp = true;
Chris@309 961 f.timestamp = getColumnTimestamp(start);
Chris@252 962
Chris@252 963 f.hasDuration = true;
Chris@309 964 f.duration = getColumnTimestamp(end) - f.timestamp;
Chris@252 965
Chris@252 966 f.values.clear();
Chris@252 967
Chris@252 968 f.values.push_back
Chris@252 969 (noteFrequency(note, shift, shiftCount));
Chris@252 970
Chris@252 971 float inputGain = getInputGainAt(f.timestamp);
Chris@252 972 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 973 velocity = round(velocity / inputGain);
Chris@252 974 if (velocity > 127) velocity = 127;
Chris@252 975 if (velocity < 1) velocity = 1;
Chris@252 976 f.values.push_back(velocity);
Chris@252 977
Chris@252 978 f.label = noteName(note, shift, shiftCount);
Chris@252 979
Chris@252 980 return f;
Chris@252 981 }
Chris@252 982
Chris@252 983 float
Chris@252 984 Silvet::getInputGainAt(RealTime t)
Chris@252 985 {
Chris@252 986 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 987
Chris@252 988 if (i == m_inputGains.end()) {
Chris@252 989 if (i != m_inputGains.begin()) {
Chris@252 990 --i;
Chris@252 991 } else {
Chris@252 992 return 1.f; // no data
Chris@252 993 }
Chris@252 994 }
Chris@252 995
Chris@252 996 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 997
Chris@252 998 return i->second;
Chris@252 999 }
Chris@252 1000