annotate src/Silvet.cpp @ 294:19fd6cb033c7

Add pitch activation matrix output
author Chris Cannam
date Wed, 15 Oct 2014 17:40:20 +0100
parents 8aff275f16b5
children aa7be9d8112e
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@40 33 using Vamp::RealTime;
Chris@31 34
Chris@31 35 static int processingSampleRate = 44100;
Chris@31 36 static int processingBPO = 60;
Chris@170 37
Chris@272 38 static int minInputSampleRate = 100;
Chris@272 39 static int maxInputSampleRate = 192000;
Chris@272 40
Chris@31 41 Silvet::Silvet(float inputSampleRate) :
Chris@31 42 Plugin(inputSampleRate),
Chris@161 43 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 44 m_resampler(0),
Chris@246 45 m_flattener(0),
Chris@110 46 m_cq(0),
Chris@162 47 m_hqMode(true),
Chris@166 48 m_fineTuning(false),
Chris@178 49 m_instrument(0),
Chris@178 50 m_colsPerSec(50)
Chris@31 51 {
Chris@31 52 }
Chris@31 53
Chris@31 54 Silvet::~Silvet()
Chris@31 55 {
Chris@31 56 delete m_resampler;
Chris@246 57 delete m_flattener;
Chris@31 58 delete m_cq;
Chris@41 59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 60 delete m_postFilter[i];
Chris@41 61 }
Chris@31 62 }
Chris@31 63
Chris@31 64 string
Chris@31 65 Silvet::getIdentifier() const
Chris@31 66 {
Chris@31 67 return "silvet";
Chris@31 68 }
Chris@31 69
Chris@31 70 string
Chris@31 71 Silvet::getName() const
Chris@31 72 {
Chris@31 73 return "Silvet Note Transcription";
Chris@31 74 }
Chris@31 75
Chris@31 76 string
Chris@31 77 Silvet::getDescription() const
Chris@31 78 {
Chris@191 79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 80 }
Chris@31 81
Chris@31 82 string
Chris@31 83 Silvet::getMaker() const
Chris@31 84 {
Chris@191 85 return "Queen Mary, University of London";
Chris@31 86 }
Chris@31 87
Chris@31 88 int
Chris@31 89 Silvet::getPluginVersion() const
Chris@31 90 {
Chris@31 91 return 1;
Chris@31 92 }
Chris@31 93
Chris@31 94 string
Chris@31 95 Silvet::getCopyright() const
Chris@31 96 {
Chris@191 97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 98 }
Chris@31 99
Chris@31 100 Silvet::InputDomain
Chris@31 101 Silvet::getInputDomain() const
Chris@31 102 {
Chris@31 103 return TimeDomain;
Chris@31 104 }
Chris@31 105
Chris@31 106 size_t
Chris@31 107 Silvet::getPreferredBlockSize() const
Chris@31 108 {
Chris@31 109 return 0;
Chris@31 110 }
Chris@31 111
Chris@31 112 size_t
Chris@31 113 Silvet::getPreferredStepSize() const
Chris@31 114 {
Chris@31 115 return 0;
Chris@31 116 }
Chris@31 117
Chris@31 118 size_t
Chris@31 119 Silvet::getMinChannelCount() const
Chris@31 120 {
Chris@31 121 return 1;
Chris@31 122 }
Chris@31 123
Chris@31 124 size_t
Chris@31 125 Silvet::getMaxChannelCount() const
Chris@31 126 {
Chris@31 127 return 1;
Chris@31 128 }
Chris@31 129
Chris@31 130 Silvet::ParameterList
Chris@31 131 Silvet::getParameterDescriptors() const
Chris@31 132 {
Chris@31 133 ParameterList list;
Chris@110 134
Chris@110 135 ParameterDescriptor desc;
Chris@110 136 desc.identifier = "mode";
Chris@110 137 desc.name = "Processing mode";
Chris@110 138 desc.unit = "";
Chris@271 139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 140 desc.minValue = 0;
Chris@110 141 desc.maxValue = 1;
Chris@113 142 desc.defaultValue = 1;
Chris@110 143 desc.isQuantized = true;
Chris@110 144 desc.quantizeStep = 1;
Chris@166 145 desc.valueNames.push_back("Draft (faster)");
Chris@165 146 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 147 list.push_back(desc);
Chris@161 148
Chris@176 149 desc.identifier = "instrument";
Chris@176 150 desc.name = "Instrument";
Chris@161 151 desc.unit = "";
Chris@271 152 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 153 desc.minValue = 0;
Chris@162 154 desc.maxValue = m_instruments.size()-1;
Chris@162 155 desc.defaultValue = 0;
Chris@161 156 desc.isQuantized = true;
Chris@161 157 desc.quantizeStep = 1;
Chris@161 158 desc.valueNames.clear();
Chris@162 159 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 160 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 161 }
Chris@166 162 list.push_back(desc);
Chris@161 163
Chris@166 164 desc.identifier = "finetune";
Chris@166 165 desc.name = "Return fine pitch estimates";
Chris@166 166 desc.unit = "";
Chris@271 167 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 168 desc.minValue = 0;
Chris@166 169 desc.maxValue = 1;
Chris@166 170 desc.defaultValue = 0;
Chris@166 171 desc.isQuantized = true;
Chris@166 172 desc.quantizeStep = 1;
Chris@166 173 desc.valueNames.clear();
Chris@110 174 list.push_back(desc);
Chris@110 175
Chris@31 176 return list;
Chris@31 177 }
Chris@31 178
Chris@31 179 float
Chris@31 180 Silvet::getParameter(string identifier) const
Chris@31 181 {
Chris@110 182 if (identifier == "mode") {
Chris@110 183 return m_hqMode ? 1.f : 0.f;
Chris@166 184 } else if (identifier == "finetune") {
Chris@166 185 return m_fineTuning ? 1.f : 0.f;
Chris@176 186 } else if (identifier == "instrument") {
Chris@162 187 return m_instrument;
Chris@110 188 }
Chris@31 189 return 0;
Chris@31 190 }
Chris@31 191
Chris@31 192 void
Chris@31 193 Silvet::setParameter(string identifier, float value)
Chris@31 194 {
Chris@110 195 if (identifier == "mode") {
Chris@110 196 m_hqMode = (value > 0.5);
Chris@166 197 } else if (identifier == "finetune") {
Chris@166 198 m_fineTuning = (value > 0.5);
Chris@176 199 } else if (identifier == "instrument") {
Chris@162 200 m_instrument = lrintf(value);
Chris@110 201 }
Chris@31 202 }
Chris@31 203
Chris@31 204 Silvet::ProgramList
Chris@31 205 Silvet::getPrograms() const
Chris@31 206 {
Chris@31 207 ProgramList list;
Chris@31 208 return list;
Chris@31 209 }
Chris@31 210
Chris@31 211 string
Chris@31 212 Silvet::getCurrentProgram() const
Chris@31 213 {
Chris@31 214 return "";
Chris@31 215 }
Chris@31 216
Chris@31 217 void
Chris@31 218 Silvet::selectProgram(string name)
Chris@31 219 {
Chris@31 220 }
Chris@31 221
Chris@31 222 Silvet::OutputList
Chris@31 223 Silvet::getOutputDescriptors() const
Chris@31 224 {
Chris@31 225 OutputList list;
Chris@31 226
Chris@31 227 OutputDescriptor d;
Chris@51 228 d.identifier = "notes";
Chris@51 229 d.name = "Note transcription";
Chris@271 230 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 231 d.unit = "Hz";
Chris@31 232 d.hasFixedBinCount = true;
Chris@31 233 d.binCount = 2;
Chris@41 234 d.binNames.push_back("Frequency");
Chris@31 235 d.binNames.push_back("Velocity");
Chris@31 236 d.hasKnownExtents = false;
Chris@31 237 d.isQuantized = false;
Chris@31 238 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 239 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 240 d.hasDuration = true;
Chris@32 241 m_notesOutputNo = list.size();
Chris@32 242 list.push_back(d);
Chris@32 243
Chris@178 244 d.identifier = "timefreq";
Chris@178 245 d.name = "Time-frequency distribution";
Chris@271 246 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 247 d.unit = "";
Chris@178 248 d.hasFixedBinCount = true;
Chris@178 249 d.binCount = m_instruments[0].templateHeight;
Chris@178 250 d.binNames.clear();
Chris@178 251 if (m_cq) {
Chris@294 252 char name[50];
Chris@178 253 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 254 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 255 // lowest-frequency 55 bins have been dropped, for a
Chris@178 256 // 545-bin template. The native CQ bins go high->low
Chris@178 257 // frequency though, so these are still the first 545 bins
Chris@178 258 // as reported by getBinFrequency, though in reverse order
Chris@178 259 float freq = m_cq->getBinFrequency
Chris@178 260 (m_instruments[0].templateHeight - i - 1);
Chris@178 261 sprintf(name, "%.1f Hz", freq);
Chris@178 262 d.binNames.push_back(name);
Chris@178 263 }
Chris@178 264 }
Chris@178 265 d.hasKnownExtents = false;
Chris@178 266 d.isQuantized = false;
Chris@178 267 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 268 d.sampleRate = m_colsPerSec;
Chris@178 269 d.hasDuration = false;
Chris@178 270 m_fcqOutputNo = list.size();
Chris@178 271 list.push_back(d);
Chris@178 272
Chris@294 273 d.identifier = "pitchactivation";
Chris@294 274 d.name = "Pitch activation distribution";
Chris@294 275 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 276 d.unit = "";
Chris@294 277 d.hasFixedBinCount = true;
Chris@294 278 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 279 d.binNames.clear();
Chris@294 280 if (m_cq) {
Chris@294 281 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 282 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 283 }
Chris@294 284 }
Chris@294 285 d.hasKnownExtents = false;
Chris@294 286 d.isQuantized = false;
Chris@294 287 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 288 d.sampleRate = m_colsPerSec;
Chris@294 289 d.hasDuration = false;
Chris@294 290 m_pitchOutputNo = list.size();
Chris@294 291 list.push_back(d);
Chris@294 292
Chris@31 293 return list;
Chris@31 294 }
Chris@31 295
Chris@38 296 std::string
Chris@175 297 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 298 {
Chris@38 299 static const char *names[] = {
Chris@38 300 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 301 };
Chris@38 302
Chris@175 303 const char *n = names[note % 12];
Chris@38 304
Chris@175 305 int oct = (note + 9) / 12;
Chris@38 306
Chris@175 307 char buf[30];
Chris@175 308
Chris@175 309 float pshift = 0.f;
Chris@175 310 if (shiftCount > 1) {
Chris@175 311 // see noteFrequency below
Chris@175 312 pshift =
Chris@175 313 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 314 }
Chris@175 315
Chris@175 316 if (pshift > 0.f) {
Chris@175 317 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 318 } else if (pshift < 0.f) {
Chris@175 319 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 320 } else {
Chris@175 321 sprintf(buf, "%s%d", n, oct);
Chris@175 322 }
Chris@38 323
Chris@38 324 return buf;
Chris@38 325 }
Chris@38 326
Chris@41 327 float
Chris@168 328 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 329 {
Chris@169 330 // Convert shift number to a pitch shift. The given shift number
Chris@169 331 // is an offset into the template array, which starts with some
Chris@169 332 // zeros, followed by the template, then some trailing zeros.
Chris@169 333 //
Chris@169 334 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 335 // == 5, then the number will be in the range 0-4 and the template
Chris@169 336 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 337 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 338 // represent moving the template *up* in pitch (by introducing
Chris@169 339 // zeros at the start, which is the low-frequency end), for a
Chris@169 340 // positive pitch shift; and higher values represent moving it
Chris@169 341 // down in pitch, for a negative pitch shift.
Chris@169 342
Chris@175 343 float pshift = 0.f;
Chris@175 344 if (shiftCount > 1) {
Chris@175 345 pshift =
Chris@175 346 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 347 }
Chris@169 348
Chris@169 349 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 350 }
Chris@41 351
Chris@31 352 bool
Chris@31 353 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 354 {
Chris@272 355 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 356 m_inputSampleRate > maxInputSampleRate) {
Chris@272 357 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 358 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 359 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 360 return false;
Chris@272 361 }
Chris@272 362
Chris@31 363 if (channels < getMinChannelCount() ||
Chris@272 364 channels > getMaxChannelCount()) {
Chris@272 365 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 366 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 367 << getMaxChannelCount() << ")" << endl;
Chris@272 368 return false;
Chris@272 369 }
Chris@31 370
Chris@31 371 if (stepSize != blockSize) {
Chris@31 372 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 373 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 374 return false;
Chris@31 375 }
Chris@31 376
Chris@31 377 m_blockSize = blockSize;
Chris@31 378
Chris@31 379 reset();
Chris@31 380
Chris@31 381 return true;
Chris@31 382 }
Chris@31 383
Chris@31 384 void
Chris@31 385 Silvet::reset()
Chris@31 386 {
Chris@31 387 delete m_resampler;
Chris@246 388 delete m_flattener;
Chris@31 389 delete m_cq;
Chris@31 390
Chris@31 391 if (m_inputSampleRate != processingSampleRate) {
Chris@31 392 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 393 } else {
Chris@31 394 m_resampler = 0;
Chris@31 395 }
Chris@31 396
Chris@246 397 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 398 m_flattener->reset();
Chris@246 399
Chris@173 400 double minFreq = 27.5;
Chris@173 401
Chris@173 402 if (!m_hqMode) {
Chris@173 403 // We don't actually return any notes from the bottom octave,
Chris@173 404 // so we can just pad with zeros
Chris@173 405 minFreq *= 2;
Chris@173 406 }
Chris@173 407
Chris@154 408 CQParameters params(processingSampleRate,
Chris@173 409 minFreq,
Chris@154 410 processingSampleRate / 3,
Chris@154 411 processingBPO);
Chris@154 412
Chris@155 413 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 414 // drops the FFT size to 512 from 1024 and alters
Chris@155 415 // some other processing parameters, making
Chris@155 416 // everything much, much slower. Could be a flaw
Chris@155 417 // in the CQ parameter calculations, must check
Chris@154 418 params.atomHopFactor = 0.3;
Chris@154 419 params.threshold = 0.0005;
Chris@172 420 params.window = CQParameters::Hann;
Chris@154 421
Chris@154 422 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 423
Chris@165 424 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 425
Chris@41 426 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 427 delete m_postFilter[i];
Chris@41 428 }
Chris@41 429 m_postFilter.clear();
Chris@176 430 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 431 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 432 }
Chris@41 433 m_pianoRoll.clear();
Chris@246 434 m_inputGains.clear();
Chris@32 435 m_columnCount = 0;
Chris@272 436 m_resampledCount = 0;
Chris@40 437 m_startTime = RealTime::zeroTime;
Chris@31 438 }
Chris@31 439
Chris@31 440 Silvet::FeatureSet
Chris@31 441 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 442 {
Chris@40 443 if (m_columnCount == 0) {
Chris@40 444 m_startTime = timestamp;
Chris@40 445 }
Chris@246 446
Chris@246 447 vector<float> flattened(m_blockSize);
Chris@246 448 float gain = 1.f;
Chris@246 449 m_flattener->connectInputPort
Chris@246 450 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 451 m_flattener->connectOutputPort
Chris@246 452 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 453 m_flattener->connectOutputPort
Chris@246 454 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 455 m_flattener->process(m_blockSize);
Chris@246 456
Chris@252 457 m_inputGains[timestamp] = gain;
Chris@40 458
Chris@31 459 vector<double> data;
Chris@40 460 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 461 double d = flattened[i];
Chris@235 462 data.push_back(d);
Chris@40 463 }
Chris@31 464
Chris@31 465 if (m_resampler) {
Chris@272 466
Chris@31 467 data = m_resampler->process(data.data(), data.size());
Chris@272 468
Chris@272 469 int hadCount = m_resampledCount;
Chris@272 470 m_resampledCount += data.size();
Chris@272 471
Chris@272 472 int resamplerLatency = m_resampler->getLatency();
Chris@272 473
Chris@272 474 if (hadCount < resamplerLatency) {
Chris@272 475 int stillToDrop = resamplerLatency - hadCount;
Chris@272 476 if (stillToDrop >= int(data.size())) {
Chris@272 477 return FeatureSet();
Chris@272 478 } else {
Chris@272 479 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 480 }
Chris@272 481 }
Chris@31 482 }
Chris@272 483
Chris@32 484 Grid cqout = m_cq->process(data);
Chris@51 485 FeatureSet fs = transcribe(cqout);
Chris@51 486 return fs;
Chris@34 487 }
Chris@34 488
Chris@34 489 Silvet::FeatureSet
Chris@34 490 Silvet::getRemainingFeatures()
Chris@34 491 {
Chris@145 492 Grid cqout = m_cq->getRemainingOutput();
Chris@51 493 FeatureSet fs = transcribe(cqout);
Chris@51 494 return fs;
Chris@34 495 }
Chris@34 496
Chris@34 497 Silvet::FeatureSet
Chris@34 498 Silvet::transcribe(const Grid &cqout)
Chris@34 499 {
Chris@32 500 Grid filtered = preProcess(cqout);
Chris@31 501
Chris@32 502 FeatureSet fs;
Chris@32 503
Chris@104 504 if (filtered.empty()) return fs;
Chris@170 505
Chris@170 506 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 507
Chris@178 508 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 509 Feature f;
Chris@178 510 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 511 f.values.push_back(float(filtered[i][j]));
Chris@178 512 }
Chris@178 513 fs[m_fcqOutputNo].push_back(f);
Chris@178 514 }
Chris@178 515
Chris@34 516 int width = filtered.size();
Chris@34 517
Chris@164 518 int iterations = m_hqMode ? 20 : 10;
Chris@34 519
Chris@176 520 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 521
Chris@170 522 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 523 int shiftCount = 1;
Chris@170 524 if (wantShifts) {
Chris@170 525 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 526 }
Chris@170 527
Chris@170 528 vector<vector<int> > localBestShifts;
Chris@170 529 if (wantShifts) {
Chris@170 530 localBestShifts =
Chris@176 531 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 532 }
Chris@170 533
Chris@170 534 vector<bool> present(width, false);
Chris@37 535
Chris@123 536 #pragma omp parallel for
Chris@123 537 for (int i = 0; i < width; ++i) {
Chris@104 538
Chris@170 539 double sum = 0.0;
Chris@176 540 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 541 sum += filtered.at(i).at(j);
Chris@170 542 }
Chris@170 543 if (sum < 1e-5) continue;
Chris@170 544
Chris@170 545 present[i] = true;
Chris@170 546
Chris@170 547 EM em(&pack, m_hqMode);
Chris@170 548
Chris@183 549 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 550 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 551
Chris@170 552 for (int j = 0; j < iterations; ++j) {
Chris@170 553 em.iterate(filtered.at(i).data());
Chris@37 554 }
Chris@37 555
Chris@170 556 const float *pitchDist = em.getPitchDistribution();
Chris@170 557 const float *const *shiftDist = em.getShifts();
Chris@37 558
Chris@176 559 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 560
Chris@170 561 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 562
Chris@170 563 int bestShift = 0;
Chris@179 564 float bestShiftValue = 0.0;
Chris@170 565 if (wantShifts) {
Chris@170 566 for (int k = 0; k < shiftCount; ++k) {
Chris@179 567 float value = shiftDist[k][j];
Chris@179 568 if (k == 0 || value > bestShiftValue) {
Chris@179 569 bestShiftValue = value;
Chris@170 570 bestShift = k;
Chris@170 571 }
Chris@170 572 }
Chris@170 573 localBestShifts[i][j] = bestShift;
Chris@170 574 }
Chris@123 575 }
Chris@123 576 }
Chris@166 577
Chris@166 578 for (int i = 0; i < width; ++i) {
Chris@37 579
Chris@170 580 if (!present[i]) {
Chris@170 581 // silent column
Chris@176 582 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 583 m_postFilter[j]->push(0.0);
Chris@170 584 }
Chris@168 585 m_pianoRoll.push_back(map<int, double>());
Chris@170 586 if (wantShifts) {
Chris@168 587 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 588 }
Chris@166 589 continue;
Chris@166 590 }
Chris@166 591
Chris@294 592 vector<double> filtered = postProcess
Chris@294 593 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 594
Chris@294 595 Feature f;
Chris@294 596 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 597 float v(filtered[j]);
Chris@294 598 if (v < pack.levelThreshold) v = 0.f;
Chris@294 599 f.values.push_back(v);
Chris@294 600 }
Chris@294 601 fs[m_pitchOutputNo].push_back(f);
Chris@166 602
Chris@168 603 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 604
Chris@123 605 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 606 fi != noteFeatures.end(); ++fi) {
Chris@123 607 fs[m_notesOutputNo].push_back(*fi);
Chris@40 608 }
Chris@34 609 }
Chris@34 610
Chris@32 611 return fs;
Chris@31 612 }
Chris@31 613
Chris@32 614 Silvet::Grid
Chris@32 615 Silvet::preProcess(const Grid &in)
Chris@32 616 {
Chris@32 617 int width = in.size();
Chris@32 618
Chris@165 619 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 620
Chris@165 621 // need to be careful that col spacing is an integer number of samples!
Chris@165 622 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 623
Chris@32 624 Grid out;
Chris@32 625
Chris@58 626 // We count the CQ latency in terms of processing hops, but
Chris@58 627 // actually it probably isn't an exact number of hops so this
Chris@58 628 // isn't quite accurate. But the small constant offset is
Chris@165 629 // practically irrelevant compared to the jitter from the frame
Chris@165 630 // size we reduce to in a moment
Chris@33 631 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 632
Chris@176 633 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 634
Chris@32 635 for (int i = 0; i < width; ++i) {
Chris@32 636
Chris@33 637 if (m_columnCount < latentColumns) {
Chris@33 638 ++m_columnCount;
Chris@33 639 continue;
Chris@33 640 }
Chris@33 641
Chris@32 642 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 643 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 644
Chris@32 645 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 646
Chris@32 647 if (select) {
Chris@32 648 vector<double> inCol = in[i];
Chris@176 649 vector<double> outCol(pack.templateHeight);
Chris@32 650
Chris@178 651 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 652 // lowest 55 of them.
Chris@178 653 //
Chris@178 654 // In draft mode the CQ is an octave shorter, returning
Chris@178 655 // 540 bins, so we instead pad them with an additional 5
Chris@178 656 // zeros.
Chris@178 657 //
Chris@178 658 // We also need to reverse the column as we go, since the
Chris@178 659 // raw CQ has the high frequencies first and we need it
Chris@178 660 // the other way around.
Chris@32 661
Chris@178 662 if (m_hqMode) {
Chris@178 663 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 664 int ix = inCol.size() - j - 55;
Chris@178 665 outCol[j] = inCol[ix];
Chris@178 666 }
Chris@178 667 } else {
Chris@178 668 for (int j = 0; j < 5; ++j) {
Chris@178 669 outCol[j] = 0.0;
Chris@178 670 }
Chris@178 671 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 672 int ix = inCol.size() - j + 4;
Chris@178 673 outCol[j] = inCol[ix];
Chris@178 674 }
Chris@46 675 }
Chris@32 676
Chris@46 677 vector<double> noiseLevel1 =
Chris@46 678 MedianFilter<double>::filter(40, outCol);
Chris@176 679 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 680 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 681 }
Chris@32 682
Chris@46 683 vector<double> noiseLevel2 =
Chris@46 684 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 685 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 686 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 687 }
Chris@32 688
Chris@165 689 out.push_back(outCol);
Chris@32 690 }
Chris@32 691
Chris@32 692 ++m_columnCount;
Chris@32 693 }
Chris@32 694
Chris@32 695 return out;
Chris@32 696 }
Chris@32 697
Chris@294 698 vector<double>
Chris@170 699 Silvet::postProcess(const vector<double> &pitches,
Chris@170 700 const vector<int> &bestShifts,
Chris@170 701 bool wantShifts)
Chris@166 702 {
Chris@176 703 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 704
Chris@41 705 vector<double> filtered;
Chris@41 706
Chris@176 707 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 708 m_postFilter[j]->push(pitches[j]);
Chris@41 709 filtered.push_back(m_postFilter[j]->get());
Chris@41 710 }
Chris@41 711
Chris@41 712 // Threshold for level and reduce number of candidate pitches
Chris@41 713
Chris@41 714 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 715
Chris@41 716 ValueIndexMap strengths;
Chris@166 717
Chris@176 718 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 719 double strength = filtered[j];
Chris@183 720 if (strength < pack.levelThreshold) continue;
Chris@168 721 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 722 }
Chris@166 723
Chris@168 724 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 725
Chris@168 726 map<int, double> active;
Chris@168 727 map<int, int> activeShifts;
Chris@168 728
Chris@183 729 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 730
Chris@168 731 --si;
Chris@168 732
Chris@168 733 double strength = si->first;
Chris@168 734 int j = si->second;
Chris@168 735
Chris@168 736 active[j] = strength;
Chris@168 737
Chris@170 738 if (wantShifts) {
Chris@170 739 activeShifts[j] = bestShifts[j];
Chris@167 740 }
Chris@41 741 }
Chris@41 742
Chris@168 743 m_pianoRoll.push_back(active);
Chris@170 744
Chris@170 745 if (wantShifts) {
Chris@168 746 m_pianoRollShifts.push_back(activeShifts);
Chris@41 747 }
Chris@294 748
Chris@294 749 return filtered;
Chris@166 750 }
Chris@166 751
Chris@166 752 Vamp::Plugin::FeatureList
Chris@168 753 Silvet::noteTrack(int shiftCount)
Chris@166 754 {
Chris@41 755 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 756 // report notes that have just ended (i.e. that are absent in the
Chris@168 757 // latest active set but present in the prior set in the piano
Chris@41 758 // roll) -- any notes that ended earlier will have been reported
Chris@41 759 // already, and if they haven't ended, we don't know their
Chris@41 760 // duration.
Chris@41 761
Chris@168 762 int width = m_pianoRoll.size() - 1;
Chris@168 763
Chris@168 764 const map<int, double> &active = m_pianoRoll[width];
Chris@41 765
Chris@165 766 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 767
Chris@165 768 // only keep notes >= 100ms or thereabouts
Chris@165 769 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 770 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 771
Chris@41 772 FeatureList noteFeatures;
Chris@41 773
Chris@41 774 if (width < durationThreshold + 1) {
Chris@41 775 return noteFeatures;
Chris@41 776 }
Chris@41 777
Chris@150 778 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 779
Chris@55 780 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 781 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 782
Chris@55 783 int note = ni->first;
Chris@41 784
Chris@41 785 if (active.find(note) != active.end()) {
Chris@41 786 // the note is still playing
Chris@41 787 continue;
Chris@41 788 }
Chris@41 789
Chris@41 790 // the note was playing but just ended
Chris@41 791 int end = width;
Chris@41 792 int start = end-1;
Chris@41 793
Chris@41 794 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 795 --start;
Chris@41 796 }
Chris@41 797 ++start;
Chris@41 798
Chris@169 799 if ((end - start) < durationThreshold) {
Chris@41 800 continue;
Chris@41 801 }
Chris@41 802
Chris@169 803 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 804 }
Chris@41 805
Chris@62 806 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 807
Chris@41 808 return noteFeatures;
Chris@41 809 }
Chris@41 810
Chris@169 811 void
Chris@169 812 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 813 FeatureList &noteFeatures)
Chris@169 814 {
Chris@169 815 int partStart = start;
Chris@169 816 int partShift = 0;
Chris@169 817 int partVelocity = 0;
Chris@169 818
Chris@252 819 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 820
Chris@169 821 for (int i = start; i != end; ++i) {
Chris@169 822
Chris@169 823 double strength = m_pianoRoll[i][note];
Chris@169 824
Chris@169 825 int shift = 0;
Chris@169 826
Chris@169 827 if (shiftCount > 1) {
Chris@169 828
Chris@169 829 shift = m_pianoRollShifts[i][note];
Chris@169 830
Chris@169 831 if (i == partStart) {
Chris@169 832 partShift = shift;
Chris@169 833 }
Chris@169 834
Chris@169 835 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 836
Chris@169 837 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 838
Chris@169 839 // pitch has changed, emit an intermediate note
Chris@252 840 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 841 i,
Chris@252 842 note,
Chris@252 843 partShift,
Chris@252 844 shiftCount,
Chris@252 845 partVelocity));
Chris@169 846 partStart = i;
Chris@169 847 partShift = shift;
Chris@169 848 partVelocity = 0;
Chris@169 849 }
Chris@169 850 }
Chris@169 851
Chris@246 852 int v = round(strength * 2);
Chris@169 853 if (v > partVelocity) {
Chris@169 854 partVelocity = v;
Chris@169 855 }
Chris@169 856 }
Chris@169 857
Chris@169 858 if (end >= partStart + partThreshold) {
Chris@252 859 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 860 end,
Chris@252 861 note,
Chris@252 862 partShift,
Chris@252 863 shiftCount,
Chris@252 864 partVelocity));
Chris@169 865 }
Chris@169 866 }
Chris@252 867
Chris@252 868 Silvet::Feature
Chris@252 869 Silvet::makeNoteFeature(int start,
Chris@252 870 int end,
Chris@252 871 int note,
Chris@252 872 int shift,
Chris@252 873 int shiftCount,
Chris@252 874 int velocity)
Chris@252 875 {
Chris@252 876 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 877 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 878
Chris@252 879 Feature f;
Chris@252 880
Chris@252 881 f.hasTimestamp = true;
Chris@285 882 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 883 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 884
Chris@252 885 f.hasDuration = true;
Chris@252 886 f.duration = RealTime::fromSeconds
Chris@252 887 (columnDuration * (end - start));
Chris@252 888
Chris@252 889 f.values.clear();
Chris@252 890
Chris@252 891 f.values.push_back
Chris@252 892 (noteFrequency(note, shift, shiftCount));
Chris@252 893
Chris@252 894 float inputGain = getInputGainAt(f.timestamp);
Chris@252 895 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 896 velocity = round(velocity / inputGain);
Chris@252 897 if (velocity > 127) velocity = 127;
Chris@252 898 if (velocity < 1) velocity = 1;
Chris@252 899 f.values.push_back(velocity);
Chris@252 900
Chris@252 901 f.label = noteName(note, shift, shiftCount);
Chris@252 902
Chris@252 903 return f;
Chris@252 904 }
Chris@252 905
Chris@252 906 float
Chris@252 907 Silvet::getInputGainAt(RealTime t)
Chris@252 908 {
Chris@252 909 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 910
Chris@252 911 if (i == m_inputGains.end()) {
Chris@252 912 if (i != m_inputGains.begin()) {
Chris@252 913 --i;
Chris@252 914 } else {
Chris@252 915 return 1.f; // no data
Chris@252 916 }
Chris@252 917 }
Chris@252 918
Chris@252 919 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 920
Chris@252 921 return i->second;
Chris@252 922 }
Chris@252 923