annotate src/Silvet.cpp @ 297:d6ab1b4918bd livemode

Thinking about a "live" mode...
author Chris Cannam
date Fri, 28 Nov 2014 09:42:56 +0000
parents aa7be9d8112e
children ebe5e0942bb8
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@40 33 using Vamp::RealTime;
Chris@31 34
// Sample rate (Hz) at which the constant-Q transform and the transcription
// run; input at any other rate is resampled to this rate first.
static const int processingSampleRate = 44100;
// Constant-Q resolution, in bins per octave.
static const int processingBPO = 60;

// Inclusive range of input sample rates (Hz) accepted by initialise().
static const int minInputSampleRate = 100;
static const int maxInputSampleRate = 192000;
Chris@272 40
Chris@31 41 Silvet::Silvet(float inputSampleRate) :
Chris@31 42 Plugin(inputSampleRate),
Chris@161 43 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 44 m_resampler(0),
Chris@246 45 m_flattener(0),
Chris@110 46 m_cq(0),
Chris@297 47 m_mode(HighQualityMode),
Chris@166 48 m_fineTuning(false),
Chris@178 49 m_instrument(0),
Chris@178 50 m_colsPerSec(50)
Chris@31 51 {
Chris@31 52 }
Chris@31 53
Chris@31 54 Silvet::~Silvet()
Chris@31 55 {
Chris@31 56 delete m_resampler;
Chris@246 57 delete m_flattener;
Chris@31 58 delete m_cq;
Chris@41 59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 60 delete m_postFilter[i];
Chris@41 61 }
Chris@31 62 }
Chris@31 63
Chris@31 64 string
Chris@31 65 Silvet::getIdentifier() const
Chris@31 66 {
Chris@31 67 return "silvet";
Chris@31 68 }
Chris@31 69
Chris@31 70 string
Chris@31 71 Silvet::getName() const
Chris@31 72 {
Chris@31 73 return "Silvet Note Transcription";
Chris@31 74 }
Chris@31 75
Chris@31 76 string
Chris@31 77 Silvet::getDescription() const
Chris@31 78 {
Chris@191 79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 80 }
Chris@31 81
Chris@31 82 string
Chris@31 83 Silvet::getMaker() const
Chris@31 84 {
Chris@191 85 return "Queen Mary, University of London";
Chris@31 86 }
Chris@31 87
Chris@31 88 int
Chris@31 89 Silvet::getPluginVersion() const
Chris@31 90 {
Chris@295 91 return 2;
Chris@31 92 }
Chris@31 93
Chris@31 94 string
Chris@31 95 Silvet::getCopyright() const
Chris@31 96 {
Chris@191 97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 98 }
Chris@31 99
Chris@31 100 Silvet::InputDomain
Chris@31 101 Silvet::getInputDomain() const
Chris@31 102 {
Chris@31 103 return TimeDomain;
Chris@31 104 }
Chris@31 105
Chris@31 106 size_t
Chris@31 107 Silvet::getPreferredBlockSize() const
Chris@31 108 {
Chris@31 109 return 0;
Chris@31 110 }
Chris@31 111
Chris@31 112 size_t
Chris@31 113 Silvet::getPreferredStepSize() const
Chris@31 114 {
Chris@31 115 return 0;
Chris@31 116 }
Chris@31 117
Chris@31 118 size_t
Chris@31 119 Silvet::getMinChannelCount() const
Chris@31 120 {
Chris@31 121 return 1;
Chris@31 122 }
Chris@31 123
Chris@31 124 size_t
Chris@31 125 Silvet::getMaxChannelCount() const
Chris@31 126 {
Chris@31 127 return 1;
Chris@31 128 }
Chris@31 129
Chris@31 130 Silvet::ParameterList
Chris@31 131 Silvet::getParameterDescriptors() const
Chris@31 132 {
Chris@31 133 ParameterList list;
Chris@110 134
Chris@110 135 ParameterDescriptor desc;
Chris@110 136 desc.identifier = "mode";
Chris@110 137 desc.name = "Processing mode";
Chris@110 138 desc.unit = "";
Chris@297 139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
Chris@110 140 desc.minValue = 0;
Chris@297 141 desc.maxValue = 2;
Chris@113 142 desc.defaultValue = 1;
Chris@110 143 desc.isQuantized = true;
Chris@110 144 desc.quantizeStep = 1;
Chris@166 145 desc.valueNames.push_back("Draft (faster)");
Chris@165 146 desc.valueNames.push_back("Intensive (higher quality)");
Chris@297 147 desc.valueNames.push_back("Live (lower latency)");
Chris@161 148 list.push_back(desc);
Chris@161 149
Chris@176 150 desc.identifier = "instrument";
Chris@176 151 desc.name = "Instrument";
Chris@161 152 desc.unit = "";
Chris@271 153 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 154 desc.minValue = 0;
Chris@162 155 desc.maxValue = m_instruments.size()-1;
Chris@162 156 desc.defaultValue = 0;
Chris@161 157 desc.isQuantized = true;
Chris@161 158 desc.quantizeStep = 1;
Chris@161 159 desc.valueNames.clear();
Chris@162 160 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 161 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 162 }
Chris@166 163 list.push_back(desc);
Chris@161 164
Chris@166 165 desc.identifier = "finetune";
Chris@166 166 desc.name = "Return fine pitch estimates";
Chris@166 167 desc.unit = "";
Chris@271 168 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 169 desc.minValue = 0;
Chris@166 170 desc.maxValue = 1;
Chris@166 171 desc.defaultValue = 0;
Chris@166 172 desc.isQuantized = true;
Chris@166 173 desc.quantizeStep = 1;
Chris@166 174 desc.valueNames.clear();
Chris@110 175 list.push_back(desc);
Chris@110 176
Chris@31 177 return list;
Chris@31 178 }
Chris@31 179
Chris@31 180 float
Chris@31 181 Silvet::getParameter(string identifier) const
Chris@31 182 {
Chris@110 183 if (identifier == "mode") {
Chris@297 184 return (float)(int)m_mode;
Chris@166 185 } else if (identifier == "finetune") {
Chris@166 186 return m_fineTuning ? 1.f : 0.f;
Chris@176 187 } else if (identifier == "instrument") {
Chris@162 188 return m_instrument;
Chris@110 189 }
Chris@31 190 return 0;
Chris@31 191 }
Chris@31 192
Chris@31 193 void
Chris@31 194 Silvet::setParameter(string identifier, float value)
Chris@31 195 {
Chris@110 196 if (identifier == "mode") {
Chris@297 197 m_mode = (ProcessingMode)(int)(value + 0.5);
Chris@166 198 } else if (identifier == "finetune") {
Chris@166 199 m_fineTuning = (value > 0.5);
Chris@176 200 } else if (identifier == "instrument") {
Chris@162 201 m_instrument = lrintf(value);
Chris@110 202 }
Chris@31 203 }
Chris@31 204
Chris@31 205 Silvet::ProgramList
Chris@31 206 Silvet::getPrograms() const
Chris@31 207 {
Chris@31 208 ProgramList list;
Chris@31 209 return list;
Chris@31 210 }
Chris@31 211
Chris@31 212 string
Chris@31 213 Silvet::getCurrentProgram() const
Chris@31 214 {
Chris@31 215 return "";
Chris@31 216 }
Chris@31 217
Chris@31 218 void
Chris@31 219 Silvet::selectProgram(string name)
Chris@31 220 {
Chris@31 221 }
Chris@31 222
Chris@31 223 Silvet::OutputList
Chris@31 224 Silvet::getOutputDescriptors() const
Chris@31 225 {
Chris@31 226 OutputList list;
Chris@31 227
Chris@31 228 OutputDescriptor d;
Chris@51 229 d.identifier = "notes";
Chris@51 230 d.name = "Note transcription";
Chris@271 231 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 232 d.unit = "Hz";
Chris@31 233 d.hasFixedBinCount = true;
Chris@31 234 d.binCount = 2;
Chris@41 235 d.binNames.push_back("Frequency");
Chris@31 236 d.binNames.push_back("Velocity");
Chris@31 237 d.hasKnownExtents = false;
Chris@31 238 d.isQuantized = false;
Chris@31 239 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 240 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 241 d.hasDuration = true;
Chris@32 242 m_notesOutputNo = list.size();
Chris@32 243 list.push_back(d);
Chris@32 244
Chris@178 245 d.identifier = "timefreq";
Chris@178 246 d.name = "Time-frequency distribution";
Chris@271 247 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 248 d.unit = "";
Chris@178 249 d.hasFixedBinCount = true;
Chris@178 250 d.binCount = m_instruments[0].templateHeight;
Chris@178 251 d.binNames.clear();
Chris@178 252 if (m_cq) {
Chris@294 253 char name[50];
Chris@178 254 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 255 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 256 // lowest-frequency 55 bins have been dropped, for a
Chris@178 257 // 545-bin template. The native CQ bins go high->low
Chris@178 258 // frequency though, so these are still the first 545 bins
Chris@178 259 // as reported by getBinFrequency, though in reverse order
Chris@178 260 float freq = m_cq->getBinFrequency
Chris@178 261 (m_instruments[0].templateHeight - i - 1);
Chris@178 262 sprintf(name, "%.1f Hz", freq);
Chris@178 263 d.binNames.push_back(name);
Chris@178 264 }
Chris@178 265 }
Chris@178 266 d.hasKnownExtents = false;
Chris@178 267 d.isQuantized = false;
Chris@178 268 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 269 d.sampleRate = m_colsPerSec;
Chris@178 270 d.hasDuration = false;
Chris@178 271 m_fcqOutputNo = list.size();
Chris@178 272 list.push_back(d);
Chris@178 273
Chris@294 274 d.identifier = "pitchactivation";
Chris@294 275 d.name = "Pitch activation distribution";
Chris@294 276 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 277 d.unit = "";
Chris@294 278 d.hasFixedBinCount = true;
Chris@294 279 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 280 d.binNames.clear();
Chris@294 281 if (m_cq) {
Chris@294 282 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 283 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 284 }
Chris@294 285 }
Chris@294 286 d.hasKnownExtents = false;
Chris@294 287 d.isQuantized = false;
Chris@294 288 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 289 d.sampleRate = m_colsPerSec;
Chris@294 290 d.hasDuration = false;
Chris@294 291 m_pitchOutputNo = list.size();
Chris@294 292 list.push_back(d);
Chris@294 293
Chris@31 294 return list;
Chris@31 295 }
Chris@31 296
Chris@38 297 std::string
Chris@175 298 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 299 {
Chris@38 300 static const char *names[] = {
Chris@38 301 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 302 };
Chris@38 303
Chris@175 304 const char *n = names[note % 12];
Chris@38 305
Chris@175 306 int oct = (note + 9) / 12;
Chris@38 307
Chris@175 308 char buf[30];
Chris@175 309
Chris@175 310 float pshift = 0.f;
Chris@175 311 if (shiftCount > 1) {
Chris@175 312 // see noteFrequency below
Chris@175 313 pshift =
Chris@175 314 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 315 }
Chris@175 316
Chris@175 317 if (pshift > 0.f) {
Chris@175 318 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 319 } else if (pshift < 0.f) {
Chris@175 320 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 321 } else {
Chris@175 322 sprintf(buf, "%s%d", n, oct);
Chris@175 323 }
Chris@38 324
Chris@38 325 return buf;
Chris@38 326 }
Chris@38 327
Chris@41 328 float
Chris@168 329 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 330 {
Chris@169 331 // Convert shift number to a pitch shift. The given shift number
Chris@169 332 // is an offset into the template array, which starts with some
Chris@169 333 // zeros, followed by the template, then some trailing zeros.
Chris@169 334 //
Chris@169 335 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 336 // == 5, then the number will be in the range 0-4 and the template
Chris@169 337 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 338 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 339 // represent moving the template *up* in pitch (by introducing
Chris@169 340 // zeros at the start, which is the low-frequency end), for a
Chris@169 341 // positive pitch shift; and higher values represent moving it
Chris@169 342 // down in pitch, for a negative pitch shift.
Chris@169 343
Chris@175 344 float pshift = 0.f;
Chris@175 345 if (shiftCount > 1) {
Chris@175 346 pshift =
Chris@175 347 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 348 }
Chris@169 349
Chris@169 350 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 351 }
Chris@41 352
Chris@31 353 bool
Chris@31 354 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 355 {
Chris@272 356 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 357 m_inputSampleRate > maxInputSampleRate) {
Chris@272 358 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 359 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 360 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 361 return false;
Chris@272 362 }
Chris@272 363
Chris@31 364 if (channels < getMinChannelCount() ||
Chris@272 365 channels > getMaxChannelCount()) {
Chris@272 366 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 367 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 368 << getMaxChannelCount() << ")" << endl;
Chris@272 369 return false;
Chris@272 370 }
Chris@31 371
Chris@31 372 if (stepSize != blockSize) {
Chris@31 373 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 374 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 375 return false;
Chris@31 376 }
Chris@31 377
Chris@31 378 m_blockSize = blockSize;
Chris@31 379
Chris@31 380 reset();
Chris@31 381
Chris@31 382 return true;
Chris@31 383 }
Chris@31 384
Chris@31 385 void
Chris@31 386 Silvet::reset()
Chris@31 387 {
Chris@31 388 delete m_resampler;
Chris@246 389 delete m_flattener;
Chris@31 390 delete m_cq;
Chris@31 391
Chris@31 392 if (m_inputSampleRate != processingSampleRate) {
Chris@31 393 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 394 } else {
Chris@31 395 m_resampler = 0;
Chris@31 396 }
Chris@31 397
Chris@246 398 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 399 m_flattener->reset();
Chris@246 400
Chris@173 401 double minFreq = 27.5;
Chris@173 402
Chris@297 403 if (m_mode != HighQualityMode) {
Chris@173 404 // We don't actually return any notes from the bottom octave,
Chris@173 405 // so we can just pad with zeros
Chris@173 406 minFreq *= 2;
Chris@173 407 }
Chris@173 408
Chris@154 409 CQParameters params(processingSampleRate,
Chris@173 410 minFreq,
Chris@154 411 processingSampleRate / 3,
Chris@154 412 processingBPO);
Chris@154 413
Chris@155 414 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 415 // drops the FFT size to 512 from 1024 and alters
Chris@155 416 // some other processing parameters, making
Chris@155 417 // everything much, much slower. Could be a flaw
Chris@155 418 // in the CQ parameter calculations, must check
Chris@154 419 params.atomHopFactor = 0.3;
Chris@154 420 params.threshold = 0.0005;
Chris@172 421 params.window = CQParameters::Hann;
Chris@154 422
Chris@154 423 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 424
Chris@297 425 cerr << "cq latency = " << m_cq->getLatency() << endl;
Chris@297 426
Chris@297 427 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
Chris@165 428
Chris@41 429 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 430 delete m_postFilter[i];
Chris@41 431 }
Chris@41 432 m_postFilter.clear();
Chris@176 433 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 434 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 435 }
Chris@41 436 m_pianoRoll.clear();
Chris@246 437 m_inputGains.clear();
Chris@32 438 m_columnCount = 0;
Chris@272 439 m_resampledCount = 0;
Chris@40 440 m_startTime = RealTime::zeroTime;
Chris@31 441 }
Chris@31 442
Chris@31 443 Silvet::FeatureSet
Chris@31 444 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 445 {
Chris@40 446 if (m_columnCount == 0) {
Chris@40 447 m_startTime = timestamp;
Chris@40 448 }
Chris@246 449
Chris@246 450 vector<float> flattened(m_blockSize);
Chris@246 451 float gain = 1.f;
Chris@246 452 m_flattener->connectInputPort
Chris@246 453 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 454 m_flattener->connectOutputPort
Chris@246 455 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 456 m_flattener->connectOutputPort
Chris@246 457 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 458 m_flattener->process(m_blockSize);
Chris@246 459
Chris@252 460 m_inputGains[timestamp] = gain;
Chris@40 461
Chris@31 462 vector<double> data;
Chris@40 463 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 464 double d = flattened[i];
Chris@235 465 data.push_back(d);
Chris@40 466 }
Chris@31 467
Chris@31 468 if (m_resampler) {
Chris@272 469
Chris@31 470 data = m_resampler->process(data.data(), data.size());
Chris@272 471
Chris@272 472 int hadCount = m_resampledCount;
Chris@272 473 m_resampledCount += data.size();
Chris@272 474
Chris@272 475 int resamplerLatency = m_resampler->getLatency();
Chris@272 476
Chris@272 477 if (hadCount < resamplerLatency) {
Chris@272 478 int stillToDrop = resamplerLatency - hadCount;
Chris@272 479 if (stillToDrop >= int(data.size())) {
Chris@272 480 return FeatureSet();
Chris@272 481 } else {
Chris@272 482 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 483 }
Chris@272 484 }
Chris@31 485 }
Chris@272 486
Chris@32 487 Grid cqout = m_cq->process(data);
Chris@51 488 FeatureSet fs = transcribe(cqout);
Chris@51 489 return fs;
Chris@34 490 }
Chris@34 491
Chris@34 492 Silvet::FeatureSet
Chris@34 493 Silvet::getRemainingFeatures()
Chris@34 494 {
Chris@145 495 Grid cqout = m_cq->getRemainingOutput();
Chris@51 496 FeatureSet fs = transcribe(cqout);
Chris@51 497 return fs;
Chris@34 498 }
Chris@34 499
Chris@34 500 Silvet::FeatureSet
Chris@34 501 Silvet::transcribe(const Grid &cqout)
Chris@34 502 {
Chris@32 503 Grid filtered = preProcess(cqout);
Chris@31 504
Chris@32 505 FeatureSet fs;
Chris@32 506
Chris@104 507 if (filtered.empty()) return fs;
Chris@170 508
Chris@170 509 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 510
Chris@178 511 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 512 Feature f;
Chris@178 513 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 514 f.values.push_back(float(filtered[i][j]));
Chris@178 515 }
Chris@178 516 fs[m_fcqOutputNo].push_back(f);
Chris@178 517 }
Chris@178 518
Chris@34 519 int width = filtered.size();
Chris@34 520
Chris@297 521 int iterations = (m_mode == HighQualityMode ? 20 : 10);
Chris@34 522
Chris@176 523 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 524
Chris@297 525 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
Chris@170 526 int shiftCount = 1;
Chris@170 527 if (wantShifts) {
Chris@170 528 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 529 }
Chris@170 530
Chris@170 531 vector<vector<int> > localBestShifts;
Chris@170 532 if (wantShifts) {
Chris@170 533 localBestShifts =
Chris@176 534 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 535 }
Chris@170 536
Chris@170 537 vector<bool> present(width, false);
Chris@37 538
Chris@123 539 #pragma omp parallel for
Chris@123 540 for (int i = 0; i < width; ++i) {
Chris@104 541
Chris@170 542 double sum = 0.0;
Chris@176 543 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 544 sum += filtered.at(i).at(j);
Chris@170 545 }
Chris@170 546 if (sum < 1e-5) continue;
Chris@170 547
Chris@170 548 present[i] = true;
Chris@170 549
Chris@297 550 EM em(&pack, m_mode == HighQualityMode);
Chris@170 551
Chris@183 552 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 553 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 554
Chris@170 555 for (int j = 0; j < iterations; ++j) {
Chris@170 556 em.iterate(filtered.at(i).data());
Chris@37 557 }
Chris@37 558
Chris@170 559 const float *pitchDist = em.getPitchDistribution();
Chris@170 560 const float *const *shiftDist = em.getShifts();
Chris@37 561
Chris@176 562 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 563
Chris@170 564 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 565
Chris@170 566 int bestShift = 0;
Chris@179 567 float bestShiftValue = 0.0;
Chris@170 568 if (wantShifts) {
Chris@170 569 for (int k = 0; k < shiftCount; ++k) {
Chris@179 570 float value = shiftDist[k][j];
Chris@179 571 if (k == 0 || value > bestShiftValue) {
Chris@179 572 bestShiftValue = value;
Chris@170 573 bestShift = k;
Chris@170 574 }
Chris@170 575 }
Chris@170 576 localBestShifts[i][j] = bestShift;
Chris@170 577 }
Chris@123 578 }
Chris@123 579 }
Chris@166 580
Chris@166 581 for (int i = 0; i < width; ++i) {
Chris@37 582
Chris@170 583 if (!present[i]) {
Chris@170 584 // silent column
Chris@176 585 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 586 m_postFilter[j]->push(0.0);
Chris@170 587 }
Chris@168 588 m_pianoRoll.push_back(map<int, double>());
Chris@170 589 if (wantShifts) {
Chris@168 590 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 591 }
Chris@166 592 continue;
Chris@166 593 }
Chris@166 594
Chris@294 595 vector<double> filtered = postProcess
Chris@294 596 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 597
Chris@294 598 Feature f;
Chris@294 599 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 600 float v(filtered[j]);
Chris@294 601 if (v < pack.levelThreshold) v = 0.f;
Chris@294 602 f.values.push_back(v);
Chris@294 603 }
Chris@294 604 fs[m_pitchOutputNo].push_back(f);
Chris@166 605
Chris@168 606 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 607
Chris@123 608 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 609 fi != noteFeatures.end(); ++fi) {
Chris@123 610 fs[m_notesOutputNo].push_back(*fi);
Chris@40 611 }
Chris@34 612 }
Chris@34 613
Chris@32 614 return fs;
Chris@31 615 }
Chris@31 616
Chris@32 617 Silvet::Grid
Chris@32 618 Silvet::preProcess(const Grid &in)
Chris@32 619 {
Chris@32 620 int width = in.size();
Chris@32 621
Chris@165 622 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 623
Chris@165 624 // need to be careful that col spacing is an integer number of samples!
Chris@165 625 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 626
Chris@32 627 Grid out;
Chris@32 628
Chris@58 629 // We count the CQ latency in terms of processing hops, but
Chris@58 630 // actually it probably isn't an exact number of hops so this
Chris@58 631 // isn't quite accurate. But the small constant offset is
Chris@165 632 // practically irrelevant compared to the jitter from the frame
Chris@165 633 // size we reduce to in a moment
Chris@33 634 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 635
Chris@176 636 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 637
Chris@32 638 for (int i = 0; i < width; ++i) {
Chris@32 639
Chris@33 640 if (m_columnCount < latentColumns) {
Chris@33 641 ++m_columnCount;
Chris@33 642 continue;
Chris@33 643 }
Chris@33 644
Chris@32 645 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 646 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 647
Chris@32 648 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 649
Chris@32 650 if (select) {
Chris@32 651 vector<double> inCol = in[i];
Chris@176 652 vector<double> outCol(pack.templateHeight);
Chris@32 653
Chris@178 654 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 655 // lowest 55 of them.
Chris@178 656 //
Chris@297 657 // In draft and live mode the CQ is an octave shorter,
Chris@297 658 // returning 540 bins, so we instead pad them with an
Chris@297 659 // additional 5 zeros.
Chris@178 660 //
Chris@178 661 // We also need to reverse the column as we go, since the
Chris@178 662 // raw CQ has the high frequencies first and we need it
Chris@178 663 // the other way around.
Chris@32 664
Chris@297 665 if (m_mode == HighQualityMode) {
Chris@178 666 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 667 int ix = inCol.size() - j - 55;
Chris@178 668 outCol[j] = inCol[ix];
Chris@178 669 }
Chris@178 670 } else {
Chris@178 671 for (int j = 0; j < 5; ++j) {
Chris@178 672 outCol[j] = 0.0;
Chris@178 673 }
Chris@178 674 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 675 int ix = inCol.size() - j + 4;
Chris@178 676 outCol[j] = inCol[ix];
Chris@178 677 }
Chris@46 678 }
Chris@32 679
Chris@46 680 vector<double> noiseLevel1 =
Chris@46 681 MedianFilter<double>::filter(40, outCol);
Chris@176 682 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 683 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 684 }
Chris@32 685
Chris@46 686 vector<double> noiseLevel2 =
Chris@46 687 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 688 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 689 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 690 }
Chris@32 691
Chris@165 692 out.push_back(outCol);
Chris@32 693 }
Chris@32 694
Chris@32 695 ++m_columnCount;
Chris@32 696 }
Chris@32 697
Chris@32 698 return out;
Chris@32 699 }
Chris@32 700
Chris@294 701 vector<double>
Chris@170 702 Silvet::postProcess(const vector<double> &pitches,
Chris@170 703 const vector<int> &bestShifts,
Chris@170 704 bool wantShifts)
Chris@166 705 {
Chris@176 706 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 707
Chris@41 708 vector<double> filtered;
Chris@41 709
Chris@176 710 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 711 m_postFilter[j]->push(pitches[j]);
Chris@41 712 filtered.push_back(m_postFilter[j]->get());
Chris@41 713 }
Chris@41 714
Chris@41 715 // Threshold for level and reduce number of candidate pitches
Chris@41 716
Chris@41 717 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 718
Chris@41 719 ValueIndexMap strengths;
Chris@166 720
Chris@176 721 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 722 double strength = filtered[j];
Chris@183 723 if (strength < pack.levelThreshold) continue;
Chris@168 724 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 725 }
Chris@166 726
Chris@168 727 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 728
Chris@168 729 map<int, double> active;
Chris@168 730 map<int, int> activeShifts;
Chris@168 731
Chris@183 732 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 733
Chris@168 734 --si;
Chris@168 735
Chris@168 736 double strength = si->first;
Chris@168 737 int j = si->second;
Chris@168 738
Chris@168 739 active[j] = strength;
Chris@168 740
Chris@170 741 if (wantShifts) {
Chris@170 742 activeShifts[j] = bestShifts[j];
Chris@167 743 }
Chris@41 744 }
Chris@41 745
Chris@168 746 m_pianoRoll.push_back(active);
Chris@170 747
Chris@170 748 if (wantShifts) {
Chris@168 749 m_pianoRollShifts.push_back(activeShifts);
Chris@41 750 }
Chris@294 751
Chris@294 752 return filtered;
Chris@166 753 }
Chris@166 754
Chris@166 755 Vamp::Plugin::FeatureList
Chris@168 756 Silvet::noteTrack(int shiftCount)
Chris@166 757 {
Chris@41 758 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 759 // report notes that have just ended (i.e. that are absent in the
Chris@168 760 // latest active set but present in the prior set in the piano
Chris@41 761 // roll) -- any notes that ended earlier will have been reported
Chris@41 762 // already, and if they haven't ended, we don't know their
Chris@41 763 // duration.
Chris@41 764
Chris@168 765 int width = m_pianoRoll.size() - 1;
Chris@168 766
Chris@168 767 const map<int, double> &active = m_pianoRoll[width];
Chris@41 768
Chris@165 769 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 770
Chris@165 771 // only keep notes >= 100ms or thereabouts
Chris@165 772 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 773 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 774
Chris@41 775 FeatureList noteFeatures;
Chris@41 776
Chris@41 777 if (width < durationThreshold + 1) {
Chris@41 778 return noteFeatures;
Chris@41 779 }
Chris@41 780
Chris@150 781 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 782
Chris@55 783 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 784 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 785
Chris@55 786 int note = ni->first;
Chris@41 787
Chris@41 788 if (active.find(note) != active.end()) {
Chris@41 789 // the note is still playing
Chris@41 790 continue;
Chris@41 791 }
Chris@41 792
Chris@41 793 // the note was playing but just ended
Chris@41 794 int end = width;
Chris@41 795 int start = end-1;
Chris@41 796
Chris@41 797 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 798 --start;
Chris@41 799 }
Chris@41 800 ++start;
Chris@41 801
Chris@169 802 if ((end - start) < durationThreshold) {
Chris@41 803 continue;
Chris@41 804 }
Chris@41 805
Chris@169 806 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 807 }
Chris@41 808
Chris@62 809 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 810
Chris@41 811 return noteFeatures;
Chris@41 812 }
Chris@41 813
Chris@169 814 void
Chris@169 815 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 816 FeatureList &noteFeatures)
Chris@169 817 {
Chris@169 818 int partStart = start;
Chris@169 819 int partShift = 0;
Chris@169 820 int partVelocity = 0;
Chris@169 821
Chris@252 822 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 823
Chris@169 824 for (int i = start; i != end; ++i) {
Chris@169 825
Chris@169 826 double strength = m_pianoRoll[i][note];
Chris@169 827
Chris@169 828 int shift = 0;
Chris@169 829
Chris@169 830 if (shiftCount > 1) {
Chris@169 831
Chris@169 832 shift = m_pianoRollShifts[i][note];
Chris@169 833
Chris@169 834 if (i == partStart) {
Chris@169 835 partShift = shift;
Chris@169 836 }
Chris@169 837
Chris@169 838 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 839
Chris@169 840 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 841
Chris@169 842 // pitch has changed, emit an intermediate note
Chris@252 843 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 844 i,
Chris@252 845 note,
Chris@252 846 partShift,
Chris@252 847 shiftCount,
Chris@252 848 partVelocity));
Chris@169 849 partStart = i;
Chris@169 850 partShift = shift;
Chris@169 851 partVelocity = 0;
Chris@169 852 }
Chris@169 853 }
Chris@169 854
Chris@246 855 int v = round(strength * 2);
Chris@169 856 if (v > partVelocity) {
Chris@169 857 partVelocity = v;
Chris@169 858 }
Chris@169 859 }
Chris@169 860
Chris@169 861 if (end >= partStart + partThreshold) {
Chris@252 862 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 863 end,
Chris@252 864 note,
Chris@252 865 partShift,
Chris@252 866 shiftCount,
Chris@252 867 partVelocity));
Chris@169 868 }
Chris@169 869 }
Chris@252 870
Chris@252 871 Silvet::Feature
Chris@252 872 Silvet::makeNoteFeature(int start,
Chris@252 873 int end,
Chris@252 874 int note,
Chris@252 875 int shift,
Chris@252 876 int shiftCount,
Chris@252 877 int velocity)
Chris@252 878 {
Chris@252 879 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 880 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 881
Chris@252 882 Feature f;
Chris@252 883
Chris@252 884 f.hasTimestamp = true;
Chris@285 885 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 886 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 887
Chris@252 888 f.hasDuration = true;
Chris@252 889 f.duration = RealTime::fromSeconds
Chris@252 890 (columnDuration * (end - start));
Chris@252 891
Chris@252 892 f.values.clear();
Chris@252 893
Chris@252 894 f.values.push_back
Chris@252 895 (noteFrequency(note, shift, shiftCount));
Chris@252 896
Chris@252 897 float inputGain = getInputGainAt(f.timestamp);
Chris@252 898 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 899 velocity = round(velocity / inputGain);
Chris@252 900 if (velocity > 127) velocity = 127;
Chris@252 901 if (velocity < 1) velocity = 1;
Chris@252 902 f.values.push_back(velocity);
Chris@252 903
Chris@252 904 f.label = noteName(note, shift, shiftCount);
Chris@252 905
Chris@252 906 return f;
Chris@252 907 }
Chris@252 908
Chris@252 909 float
Chris@252 910 Silvet::getInputGainAt(RealTime t)
Chris@252 911 {
Chris@252 912 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 913
Chris@252 914 if (i == m_inputGains.end()) {
Chris@252 915 if (i != m_inputGains.begin()) {
Chris@252 916 --i;
Chris@252 917 } else {
Chris@252 918 return 1.f; // no data
Chris@252 919 }
Chris@252 920 }
Chris@252 921
Chris@252 922 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 923
Chris@252 924 return i->second;
Chris@252 925 }
Chris@252 926