annotate src/Silvet.cpp @ 305:04a3c152e590

Now that we are returning the pitch activation matrix, we don't want to skip silent columns in the postprocess loop
author Chris Cannam
date Fri, 05 Dec 2014 16:46:57 +0000
parents aa7be9d8112e
children af19bee9e53b 07ee4ebea57c
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@40 33 using Vamp::RealTime;
Chris@31 34
Chris@31 35 static int processingSampleRate = 44100;
Chris@31 36 static int processingBPO = 60;
Chris@170 37
Chris@272 38 static int minInputSampleRate = 100;
Chris@272 39 static int maxInputSampleRate = 192000;
Chris@272 40
Chris@31 41 Silvet::Silvet(float inputSampleRate) :
Chris@31 42 Plugin(inputSampleRate),
Chris@161 43 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 44 m_resampler(0),
Chris@246 45 m_flattener(0),
Chris@110 46 m_cq(0),
Chris@162 47 m_hqMode(true),
Chris@166 48 m_fineTuning(false),
Chris@178 49 m_instrument(0),
Chris@178 50 m_colsPerSec(50)
Chris@31 51 {
Chris@31 52 }
Chris@31 53
Chris@31 54 Silvet::~Silvet()
Chris@31 55 {
Chris@31 56 delete m_resampler;
Chris@246 57 delete m_flattener;
Chris@31 58 delete m_cq;
Chris@41 59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 60 delete m_postFilter[i];
Chris@41 61 }
Chris@31 62 }
Chris@31 63
Chris@31 64 string
Chris@31 65 Silvet::getIdentifier() const
Chris@31 66 {
Chris@31 67 return "silvet";
Chris@31 68 }
Chris@31 69
Chris@31 70 string
Chris@31 71 Silvet::getName() const
Chris@31 72 {
Chris@31 73 return "Silvet Note Transcription";
Chris@31 74 }
Chris@31 75
Chris@31 76 string
Chris@31 77 Silvet::getDescription() const
Chris@31 78 {
Chris@191 79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 80 }
Chris@31 81
Chris@31 82 string
Chris@31 83 Silvet::getMaker() const
Chris@31 84 {
Chris@191 85 return "Queen Mary, University of London";
Chris@31 86 }
Chris@31 87
Chris@31 88 int
Chris@31 89 Silvet::getPluginVersion() const
Chris@31 90 {
Chris@295 91 return 2;
Chris@31 92 }
Chris@31 93
Chris@31 94 string
Chris@31 95 Silvet::getCopyright() const
Chris@31 96 {
Chris@191 97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 98 }
Chris@31 99
Chris@31 100 Silvet::InputDomain
Chris@31 101 Silvet::getInputDomain() const
Chris@31 102 {
Chris@31 103 return TimeDomain;
Chris@31 104 }
Chris@31 105
Chris@31 106 size_t
Chris@31 107 Silvet::getPreferredBlockSize() const
Chris@31 108 {
Chris@31 109 return 0;
Chris@31 110 }
Chris@31 111
Chris@31 112 size_t
Chris@31 113 Silvet::getPreferredStepSize() const
Chris@31 114 {
Chris@31 115 return 0;
Chris@31 116 }
Chris@31 117
Chris@31 118 size_t
Chris@31 119 Silvet::getMinChannelCount() const
Chris@31 120 {
Chris@31 121 return 1;
Chris@31 122 }
Chris@31 123
Chris@31 124 size_t
Chris@31 125 Silvet::getMaxChannelCount() const
Chris@31 126 {
Chris@31 127 return 1;
Chris@31 128 }
Chris@31 129
Chris@31 130 Silvet::ParameterList
Chris@31 131 Silvet::getParameterDescriptors() const
Chris@31 132 {
Chris@31 133 ParameterList list;
Chris@110 134
Chris@110 135 ParameterDescriptor desc;
Chris@110 136 desc.identifier = "mode";
Chris@110 137 desc.name = "Processing mode";
Chris@110 138 desc.unit = "";
Chris@271 139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 140 desc.minValue = 0;
Chris@110 141 desc.maxValue = 1;
Chris@113 142 desc.defaultValue = 1;
Chris@110 143 desc.isQuantized = true;
Chris@110 144 desc.quantizeStep = 1;
Chris@166 145 desc.valueNames.push_back("Draft (faster)");
Chris@165 146 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 147 list.push_back(desc);
Chris@161 148
Chris@176 149 desc.identifier = "instrument";
Chris@176 150 desc.name = "Instrument";
Chris@161 151 desc.unit = "";
Chris@271 152 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 153 desc.minValue = 0;
Chris@162 154 desc.maxValue = m_instruments.size()-1;
Chris@162 155 desc.defaultValue = 0;
Chris@161 156 desc.isQuantized = true;
Chris@161 157 desc.quantizeStep = 1;
Chris@161 158 desc.valueNames.clear();
Chris@162 159 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 160 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 161 }
Chris@166 162 list.push_back(desc);
Chris@161 163
Chris@166 164 desc.identifier = "finetune";
Chris@166 165 desc.name = "Return fine pitch estimates";
Chris@166 166 desc.unit = "";
Chris@271 167 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 168 desc.minValue = 0;
Chris@166 169 desc.maxValue = 1;
Chris@166 170 desc.defaultValue = 0;
Chris@166 171 desc.isQuantized = true;
Chris@166 172 desc.quantizeStep = 1;
Chris@166 173 desc.valueNames.clear();
Chris@110 174 list.push_back(desc);
Chris@110 175
Chris@31 176 return list;
Chris@31 177 }
Chris@31 178
Chris@31 179 float
Chris@31 180 Silvet::getParameter(string identifier) const
Chris@31 181 {
Chris@110 182 if (identifier == "mode") {
Chris@110 183 return m_hqMode ? 1.f : 0.f;
Chris@166 184 } else if (identifier == "finetune") {
Chris@166 185 return m_fineTuning ? 1.f : 0.f;
Chris@176 186 } else if (identifier == "instrument") {
Chris@162 187 return m_instrument;
Chris@110 188 }
Chris@31 189 return 0;
Chris@31 190 }
Chris@31 191
Chris@31 192 void
Chris@31 193 Silvet::setParameter(string identifier, float value)
Chris@31 194 {
Chris@110 195 if (identifier == "mode") {
Chris@110 196 m_hqMode = (value > 0.5);
Chris@166 197 } else if (identifier == "finetune") {
Chris@166 198 m_fineTuning = (value > 0.5);
Chris@176 199 } else if (identifier == "instrument") {
Chris@162 200 m_instrument = lrintf(value);
Chris@110 201 }
Chris@31 202 }
Chris@31 203
Chris@31 204 Silvet::ProgramList
Chris@31 205 Silvet::getPrograms() const
Chris@31 206 {
Chris@31 207 ProgramList list;
Chris@31 208 return list;
Chris@31 209 }
Chris@31 210
Chris@31 211 string
Chris@31 212 Silvet::getCurrentProgram() const
Chris@31 213 {
Chris@31 214 return "";
Chris@31 215 }
Chris@31 216
Chris@31 217 void
Chris@31 218 Silvet::selectProgram(string name)
Chris@31 219 {
Chris@31 220 }
Chris@31 221
Chris@31 222 Silvet::OutputList
Chris@31 223 Silvet::getOutputDescriptors() const
Chris@31 224 {
Chris@31 225 OutputList list;
Chris@31 226
Chris@31 227 OutputDescriptor d;
Chris@51 228 d.identifier = "notes";
Chris@51 229 d.name = "Note transcription";
Chris@271 230 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 231 d.unit = "Hz";
Chris@31 232 d.hasFixedBinCount = true;
Chris@31 233 d.binCount = 2;
Chris@41 234 d.binNames.push_back("Frequency");
Chris@31 235 d.binNames.push_back("Velocity");
Chris@31 236 d.hasKnownExtents = false;
Chris@31 237 d.isQuantized = false;
Chris@31 238 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 239 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 240 d.hasDuration = true;
Chris@32 241 m_notesOutputNo = list.size();
Chris@32 242 list.push_back(d);
Chris@32 243
Chris@178 244 d.identifier = "timefreq";
Chris@178 245 d.name = "Time-frequency distribution";
Chris@271 246 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 247 d.unit = "";
Chris@178 248 d.hasFixedBinCount = true;
Chris@178 249 d.binCount = m_instruments[0].templateHeight;
Chris@178 250 d.binNames.clear();
Chris@178 251 if (m_cq) {
Chris@294 252 char name[50];
Chris@178 253 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 254 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 255 // lowest-frequency 55 bins have been dropped, for a
Chris@178 256 // 545-bin template. The native CQ bins go high->low
Chris@178 257 // frequency though, so these are still the first 545 bins
Chris@178 258 // as reported by getBinFrequency, though in reverse order
Chris@178 259 float freq = m_cq->getBinFrequency
Chris@178 260 (m_instruments[0].templateHeight - i - 1);
Chris@178 261 sprintf(name, "%.1f Hz", freq);
Chris@178 262 d.binNames.push_back(name);
Chris@178 263 }
Chris@178 264 }
Chris@178 265 d.hasKnownExtents = false;
Chris@178 266 d.isQuantized = false;
Chris@178 267 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 268 d.sampleRate = m_colsPerSec;
Chris@178 269 d.hasDuration = false;
Chris@178 270 m_fcqOutputNo = list.size();
Chris@178 271 list.push_back(d);
Chris@178 272
Chris@294 273 d.identifier = "pitchactivation";
Chris@294 274 d.name = "Pitch activation distribution";
Chris@294 275 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 276 d.unit = "";
Chris@294 277 d.hasFixedBinCount = true;
Chris@294 278 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 279 d.binNames.clear();
Chris@294 280 if (m_cq) {
Chris@294 281 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 282 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 283 }
Chris@294 284 }
Chris@294 285 d.hasKnownExtents = false;
Chris@294 286 d.isQuantized = false;
Chris@294 287 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 288 d.sampleRate = m_colsPerSec;
Chris@294 289 d.hasDuration = false;
Chris@294 290 m_pitchOutputNo = list.size();
Chris@294 291 list.push_back(d);
Chris@294 292
Chris@31 293 return list;
Chris@31 294 }
Chris@31 295
Chris@38 296 std::string
Chris@175 297 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 298 {
Chris@38 299 static const char *names[] = {
Chris@38 300 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 301 };
Chris@38 302
Chris@175 303 const char *n = names[note % 12];
Chris@38 304
Chris@175 305 int oct = (note + 9) / 12;
Chris@38 306
Chris@175 307 char buf[30];
Chris@175 308
Chris@175 309 float pshift = 0.f;
Chris@175 310 if (shiftCount > 1) {
Chris@175 311 // see noteFrequency below
Chris@175 312 pshift =
Chris@175 313 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 314 }
Chris@175 315
Chris@175 316 if (pshift > 0.f) {
Chris@175 317 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 318 } else if (pshift < 0.f) {
Chris@175 319 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 320 } else {
Chris@175 321 sprintf(buf, "%s%d", n, oct);
Chris@175 322 }
Chris@38 323
Chris@38 324 return buf;
Chris@38 325 }
Chris@38 326
Chris@41 327 float
Chris@168 328 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 329 {
Chris@169 330 // Convert shift number to a pitch shift. The given shift number
Chris@169 331 // is an offset into the template array, which starts with some
Chris@169 332 // zeros, followed by the template, then some trailing zeros.
Chris@169 333 //
Chris@169 334 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 335 // == 5, then the number will be in the range 0-4 and the template
Chris@169 336 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 337 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 338 // represent moving the template *up* in pitch (by introducing
Chris@169 339 // zeros at the start, which is the low-frequency end), for a
Chris@169 340 // positive pitch shift; and higher values represent moving it
Chris@169 341 // down in pitch, for a negative pitch shift.
Chris@169 342
Chris@175 343 float pshift = 0.f;
Chris@175 344 if (shiftCount > 1) {
Chris@175 345 pshift =
Chris@175 346 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 347 }
Chris@169 348
Chris@169 349 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 350 }
Chris@41 351
Chris@31 352 bool
Chris@31 353 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 354 {
Chris@272 355 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 356 m_inputSampleRate > maxInputSampleRate) {
Chris@272 357 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 358 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 359 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 360 return false;
Chris@272 361 }
Chris@272 362
Chris@31 363 if (channels < getMinChannelCount() ||
Chris@272 364 channels > getMaxChannelCount()) {
Chris@272 365 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 366 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 367 << getMaxChannelCount() << ")" << endl;
Chris@272 368 return false;
Chris@272 369 }
Chris@31 370
Chris@31 371 if (stepSize != blockSize) {
Chris@31 372 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 373 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 374 return false;
Chris@31 375 }
Chris@31 376
Chris@31 377 m_blockSize = blockSize;
Chris@31 378
Chris@31 379 reset();
Chris@31 380
Chris@31 381 return true;
Chris@31 382 }
Chris@31 383
Chris@31 384 void
Chris@31 385 Silvet::reset()
Chris@31 386 {
Chris@31 387 delete m_resampler;
Chris@246 388 delete m_flattener;
Chris@31 389 delete m_cq;
Chris@31 390
Chris@31 391 if (m_inputSampleRate != processingSampleRate) {
Chris@31 392 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 393 } else {
Chris@31 394 m_resampler = 0;
Chris@31 395 }
Chris@31 396
Chris@246 397 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 398 m_flattener->reset();
Chris@246 399
Chris@173 400 double minFreq = 27.5;
Chris@173 401
Chris@173 402 if (!m_hqMode) {
Chris@173 403 // We don't actually return any notes from the bottom octave,
Chris@173 404 // so we can just pad with zeros
Chris@173 405 minFreq *= 2;
Chris@173 406 }
Chris@173 407
Chris@154 408 CQParameters params(processingSampleRate,
Chris@173 409 minFreq,
Chris@154 410 processingSampleRate / 3,
Chris@154 411 processingBPO);
Chris@154 412
Chris@155 413 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 414 // drops the FFT size to 512 from 1024 and alters
Chris@155 415 // some other processing parameters, making
Chris@155 416 // everything much, much slower. Could be a flaw
Chris@155 417 // in the CQ parameter calculations, must check
Chris@154 418 params.atomHopFactor = 0.3;
Chris@154 419 params.threshold = 0.0005;
Chris@172 420 params.window = CQParameters::Hann;
Chris@154 421
Chris@154 422 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 423
Chris@165 424 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 425
Chris@41 426 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 427 delete m_postFilter[i];
Chris@41 428 }
Chris@41 429 m_postFilter.clear();
Chris@176 430 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 431 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 432 }
Chris@41 433 m_pianoRoll.clear();
Chris@246 434 m_inputGains.clear();
Chris@32 435 m_columnCount = 0;
Chris@272 436 m_resampledCount = 0;
Chris@40 437 m_startTime = RealTime::zeroTime;
Chris@31 438 }
Chris@31 439
Chris@31 440 Silvet::FeatureSet
Chris@31 441 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 442 {
Chris@40 443 if (m_columnCount == 0) {
Chris@40 444 m_startTime = timestamp;
Chris@40 445 }
Chris@246 446
Chris@246 447 vector<float> flattened(m_blockSize);
Chris@246 448 float gain = 1.f;
Chris@246 449 m_flattener->connectInputPort
Chris@246 450 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 451 m_flattener->connectOutputPort
Chris@246 452 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 453 m_flattener->connectOutputPort
Chris@246 454 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 455 m_flattener->process(m_blockSize);
Chris@246 456
Chris@252 457 m_inputGains[timestamp] = gain;
Chris@40 458
Chris@31 459 vector<double> data;
Chris@40 460 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 461 double d = flattened[i];
Chris@235 462 data.push_back(d);
Chris@40 463 }
Chris@31 464
Chris@31 465 if (m_resampler) {
Chris@272 466
Chris@31 467 data = m_resampler->process(data.data(), data.size());
Chris@272 468
Chris@272 469 int hadCount = m_resampledCount;
Chris@272 470 m_resampledCount += data.size();
Chris@272 471
Chris@272 472 int resamplerLatency = m_resampler->getLatency();
Chris@272 473
Chris@272 474 if (hadCount < resamplerLatency) {
Chris@272 475 int stillToDrop = resamplerLatency - hadCount;
Chris@272 476 if (stillToDrop >= int(data.size())) {
Chris@272 477 return FeatureSet();
Chris@272 478 } else {
Chris@272 479 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 480 }
Chris@272 481 }
Chris@31 482 }
Chris@272 483
Chris@32 484 Grid cqout = m_cq->process(data);
Chris@51 485 FeatureSet fs = transcribe(cqout);
Chris@51 486 return fs;
Chris@34 487 }
Chris@34 488
Chris@34 489 Silvet::FeatureSet
Chris@34 490 Silvet::getRemainingFeatures()
Chris@34 491 {
Chris@145 492 Grid cqout = m_cq->getRemainingOutput();
Chris@51 493 FeatureSet fs = transcribe(cqout);
Chris@51 494 return fs;
Chris@34 495 }
Chris@34 496
Chris@34 497 Silvet::FeatureSet
Chris@34 498 Silvet::transcribe(const Grid &cqout)
Chris@34 499 {
Chris@32 500 Grid filtered = preProcess(cqout);
Chris@31 501
Chris@32 502 FeatureSet fs;
Chris@32 503
Chris@104 504 if (filtered.empty()) return fs;
Chris@170 505
Chris@170 506 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 507
Chris@178 508 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 509 Feature f;
Chris@178 510 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 511 f.values.push_back(float(filtered[i][j]));
Chris@178 512 }
Chris@178 513 fs[m_fcqOutputNo].push_back(f);
Chris@178 514 }
Chris@178 515
Chris@34 516 int width = filtered.size();
Chris@34 517
Chris@164 518 int iterations = m_hqMode ? 20 : 10;
Chris@34 519
Chris@176 520 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 521
Chris@170 522 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 523 int shiftCount = 1;
Chris@170 524 if (wantShifts) {
Chris@170 525 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 526 }
Chris@170 527
Chris@170 528 vector<vector<int> > localBestShifts;
Chris@170 529 if (wantShifts) {
Chris@170 530 localBestShifts =
Chris@176 531 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 532 }
Chris@170 533
Chris@305 534 double columnThreshold = 1e-5;
Chris@305 535
Chris@123 536 #pragma omp parallel for
Chris@123 537 for (int i = 0; i < width; ++i) {
Chris@104 538
Chris@170 539 double sum = 0.0;
Chris@176 540 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 541 sum += filtered.at(i).at(j);
Chris@170 542 }
Chris@305 543 if (sum < columnThreshold) continue;
Chris@170 544
Chris@170 545 EM em(&pack, m_hqMode);
Chris@170 546
Chris@183 547 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 548 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 549
Chris@170 550 for (int j = 0; j < iterations; ++j) {
Chris@170 551 em.iterate(filtered.at(i).data());
Chris@37 552 }
Chris@37 553
Chris@170 554 const float *pitchDist = em.getPitchDistribution();
Chris@170 555 const float *const *shiftDist = em.getShifts();
Chris@37 556
Chris@176 557 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 558
Chris@170 559 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 560
Chris@170 561 int bestShift = 0;
Chris@179 562 float bestShiftValue = 0.0;
Chris@170 563 if (wantShifts) {
Chris@170 564 for (int k = 0; k < shiftCount; ++k) {
Chris@179 565 float value = shiftDist[k][j];
Chris@179 566 if (k == 0 || value > bestShiftValue) {
Chris@179 567 bestShiftValue = value;
Chris@170 568 bestShift = k;
Chris@170 569 }
Chris@170 570 }
Chris@170 571 localBestShifts[i][j] = bestShift;
Chris@170 572 }
Chris@123 573 }
Chris@123 574 }
Chris@166 575
Chris@166 576 for (int i = 0; i < width; ++i) {
Chris@37 577
Chris@294 578 vector<double> filtered = postProcess
Chris@294 579 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 580
Chris@294 581 Feature f;
Chris@294 582 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 583 float v(filtered[j]);
Chris@294 584 if (v < pack.levelThreshold) v = 0.f;
Chris@294 585 f.values.push_back(v);
Chris@294 586 }
Chris@294 587 fs[m_pitchOutputNo].push_back(f);
Chris@166 588
Chris@168 589 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 590
Chris@123 591 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 592 fi != noteFeatures.end(); ++fi) {
Chris@123 593 fs[m_notesOutputNo].push_back(*fi);
Chris@40 594 }
Chris@34 595 }
Chris@34 596
Chris@32 597 return fs;
Chris@31 598 }
Chris@31 599
Chris@32 600 Silvet::Grid
Chris@32 601 Silvet::preProcess(const Grid &in)
Chris@32 602 {
Chris@32 603 int width = in.size();
Chris@32 604
Chris@165 605 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 606
Chris@165 607 // need to be careful that col spacing is an integer number of samples!
Chris@165 608 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 609
Chris@32 610 Grid out;
Chris@32 611
Chris@58 612 // We count the CQ latency in terms of processing hops, but
Chris@58 613 // actually it probably isn't an exact number of hops so this
Chris@58 614 // isn't quite accurate. But the small constant offset is
Chris@165 615 // practically irrelevant compared to the jitter from the frame
Chris@165 616 // size we reduce to in a moment
Chris@33 617 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 618
Chris@176 619 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 620
Chris@32 621 for (int i = 0; i < width; ++i) {
Chris@32 622
Chris@33 623 if (m_columnCount < latentColumns) {
Chris@33 624 ++m_columnCount;
Chris@33 625 continue;
Chris@33 626 }
Chris@33 627
Chris@32 628 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 629 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 630
Chris@32 631 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 632
Chris@32 633 if (select) {
Chris@32 634 vector<double> inCol = in[i];
Chris@176 635 vector<double> outCol(pack.templateHeight);
Chris@32 636
Chris@178 637 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 638 // lowest 55 of them.
Chris@178 639 //
Chris@178 640 // In draft mode the CQ is an octave shorter, returning
Chris@178 641 // 540 bins, so we instead pad them with an additional 5
Chris@178 642 // zeros.
Chris@178 643 //
Chris@178 644 // We also need to reverse the column as we go, since the
Chris@178 645 // raw CQ has the high frequencies first and we need it
Chris@178 646 // the other way around.
Chris@32 647
Chris@178 648 if (m_hqMode) {
Chris@178 649 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 650 int ix = inCol.size() - j - 55;
Chris@178 651 outCol[j] = inCol[ix];
Chris@178 652 }
Chris@178 653 } else {
Chris@178 654 for (int j = 0; j < 5; ++j) {
Chris@178 655 outCol[j] = 0.0;
Chris@178 656 }
Chris@178 657 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 658 int ix = inCol.size() - j + 4;
Chris@178 659 outCol[j] = inCol[ix];
Chris@178 660 }
Chris@46 661 }
Chris@32 662
Chris@46 663 vector<double> noiseLevel1 =
Chris@46 664 MedianFilter<double>::filter(40, outCol);
Chris@176 665 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 666 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 667 }
Chris@32 668
Chris@46 669 vector<double> noiseLevel2 =
Chris@46 670 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 671 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 672 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 673 }
Chris@32 674
Chris@165 675 out.push_back(outCol);
Chris@32 676 }
Chris@32 677
Chris@32 678 ++m_columnCount;
Chris@32 679 }
Chris@32 680
Chris@32 681 return out;
Chris@32 682 }
Chris@32 683
Chris@294 684 vector<double>
Chris@170 685 Silvet::postProcess(const vector<double> &pitches,
Chris@170 686 const vector<int> &bestShifts,
Chris@170 687 bool wantShifts)
Chris@166 688 {
Chris@176 689 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 690
Chris@41 691 vector<double> filtered;
Chris@41 692
Chris@176 693 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 694 m_postFilter[j]->push(pitches[j]);
Chris@41 695 filtered.push_back(m_postFilter[j]->get());
Chris@41 696 }
Chris@41 697
Chris@41 698 // Threshold for level and reduce number of candidate pitches
Chris@41 699
Chris@41 700 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 701
Chris@41 702 ValueIndexMap strengths;
Chris@166 703
Chris@176 704 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 705 double strength = filtered[j];
Chris@183 706 if (strength < pack.levelThreshold) continue;
Chris@168 707 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 708 }
Chris@166 709
Chris@168 710 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 711
Chris@168 712 map<int, double> active;
Chris@168 713 map<int, int> activeShifts;
Chris@168 714
Chris@183 715 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 716
Chris@168 717 --si;
Chris@168 718
Chris@168 719 double strength = si->first;
Chris@168 720 int j = si->second;
Chris@168 721
Chris@168 722 active[j] = strength;
Chris@168 723
Chris@170 724 if (wantShifts) {
Chris@170 725 activeShifts[j] = bestShifts[j];
Chris@167 726 }
Chris@41 727 }
Chris@41 728
Chris@168 729 m_pianoRoll.push_back(active);
Chris@170 730
Chris@170 731 if (wantShifts) {
Chris@168 732 m_pianoRollShifts.push_back(activeShifts);
Chris@41 733 }
Chris@294 734
Chris@294 735 return filtered;
Chris@166 736 }
Chris@166 737
Chris@166 738 Vamp::Plugin::FeatureList
Chris@168 739 Silvet::noteTrack(int shiftCount)
Chris@166 740 {
Chris@41 741 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 742 // report notes that have just ended (i.e. that are absent in the
Chris@168 743 // latest active set but present in the prior set in the piano
Chris@41 744 // roll) -- any notes that ended earlier will have been reported
Chris@41 745 // already, and if they haven't ended, we don't know their
Chris@41 746 // duration.
Chris@41 747
Chris@168 748 int width = m_pianoRoll.size() - 1;
Chris@168 749
Chris@168 750 const map<int, double> &active = m_pianoRoll[width];
Chris@41 751
Chris@165 752 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 753
Chris@165 754 // only keep notes >= 100ms or thereabouts
Chris@165 755 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 756 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 757
Chris@41 758 FeatureList noteFeatures;
Chris@41 759
Chris@41 760 if (width < durationThreshold + 1) {
Chris@41 761 return noteFeatures;
Chris@41 762 }
Chris@41 763
Chris@150 764 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 765
Chris@55 766 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 767 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 768
Chris@55 769 int note = ni->first;
Chris@41 770
Chris@41 771 if (active.find(note) != active.end()) {
Chris@41 772 // the note is still playing
Chris@41 773 continue;
Chris@41 774 }
Chris@41 775
Chris@41 776 // the note was playing but just ended
Chris@41 777 int end = width;
Chris@41 778 int start = end-1;
Chris@41 779
Chris@41 780 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 781 --start;
Chris@41 782 }
Chris@41 783 ++start;
Chris@41 784
Chris@169 785 if ((end - start) < durationThreshold) {
Chris@41 786 continue;
Chris@41 787 }
Chris@41 788
Chris@169 789 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 790 }
Chris@41 791
Chris@62 792 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 793
Chris@41 794 return noteFeatures;
Chris@41 795 }
Chris@41 796
Chris@169 797 void
Chris@169 798 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 799 FeatureList &noteFeatures)
Chris@169 800 {
Chris@169 801 int partStart = start;
Chris@169 802 int partShift = 0;
Chris@169 803 int partVelocity = 0;
Chris@169 804
Chris@252 805 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 806
Chris@169 807 for (int i = start; i != end; ++i) {
Chris@169 808
Chris@169 809 double strength = m_pianoRoll[i][note];
Chris@169 810
Chris@169 811 int shift = 0;
Chris@169 812
Chris@169 813 if (shiftCount > 1) {
Chris@169 814
Chris@169 815 shift = m_pianoRollShifts[i][note];
Chris@169 816
Chris@169 817 if (i == partStart) {
Chris@169 818 partShift = shift;
Chris@169 819 }
Chris@169 820
Chris@169 821 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 822
Chris@169 823 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 824
Chris@169 825 // pitch has changed, emit an intermediate note
Chris@252 826 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 827 i,
Chris@252 828 note,
Chris@252 829 partShift,
Chris@252 830 shiftCount,
Chris@252 831 partVelocity));
Chris@169 832 partStart = i;
Chris@169 833 partShift = shift;
Chris@169 834 partVelocity = 0;
Chris@169 835 }
Chris@169 836 }
Chris@169 837
Chris@246 838 int v = round(strength * 2);
Chris@169 839 if (v > partVelocity) {
Chris@169 840 partVelocity = v;
Chris@169 841 }
Chris@169 842 }
Chris@169 843
Chris@169 844 if (end >= partStart + partThreshold) {
Chris@252 845 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 846 end,
Chris@252 847 note,
Chris@252 848 partShift,
Chris@252 849 shiftCount,
Chris@252 850 partVelocity));
Chris@169 851 }
Chris@169 852 }
Chris@252 853
Chris@252 854 Silvet::Feature
Chris@252 855 Silvet::makeNoteFeature(int start,
Chris@252 856 int end,
Chris@252 857 int note,
Chris@252 858 int shift,
Chris@252 859 int shiftCount,
Chris@252 860 int velocity)
Chris@252 861 {
Chris@252 862 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 863 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 864
Chris@252 865 Feature f;
Chris@252 866
Chris@252 867 f.hasTimestamp = true;
Chris@285 868 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 869 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 870
Chris@252 871 f.hasDuration = true;
Chris@252 872 f.duration = RealTime::fromSeconds
Chris@252 873 (columnDuration * (end - start));
Chris@252 874
Chris@252 875 f.values.clear();
Chris@252 876
Chris@252 877 f.values.push_back
Chris@252 878 (noteFrequency(note, shift, shiftCount));
Chris@252 879
Chris@252 880 float inputGain = getInputGainAt(f.timestamp);
Chris@252 881 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 882 velocity = round(velocity / inputGain);
Chris@252 883 if (velocity > 127) velocity = 127;
Chris@252 884 if (velocity < 1) velocity = 1;
Chris@252 885 f.values.push_back(velocity);
Chris@252 886
Chris@252 887 f.label = noteName(note, shift, shiftCount);
Chris@252 888
Chris@252 889 return f;
Chris@252 890 }
Chris@252 891
Chris@252 892 float
Chris@252 893 Silvet::getInputGainAt(RealTime t)
Chris@252 894 {
Chris@252 895 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 896
Chris@252 897 if (i == m_inputGains.end()) {
Chris@252 898 if (i != m_inputGains.begin()) {
Chris@252 899 --i;
Chris@252 900 } else {
Chris@252 901 return 1.f; // no data
Chris@252 902 }
Chris@252 903 }
Chris@252 904
Chris@252 905 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 906
Chris@252 907 return i->second;
Chris@252 908 }
Chris@252 909