annotate src/Silvet.cpp @ 285:8aff275f16b5

Fix failure to take starting timestamp into account
author Chris Cannam
date Thu, 07 Aug 2014 16:06:02 +0100
parents e5f897b2d5e8
children 19fd6cb033c7
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@40 33 using Vamp::RealTime;
Chris@31 34
// Sample rate (Hz) at which the analysis runs. Input at any other rate
// is resampled to this rate (see reset() and process()).
static const int processingSampleRate = 44100;

// Bins per octave requested from the constant-Q transform.
static const int processingBPO = 60;

// Inclusive range of host sample rates (Hz) accepted by initialise().
static const int minInputSampleRate = 100;
static const int maxInputSampleRate = 192000;
Chris@272 40
Chris@31 41 Silvet::Silvet(float inputSampleRate) :
Chris@31 42 Plugin(inputSampleRate),
Chris@161 43 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 44 m_resampler(0),
Chris@246 45 m_flattener(0),
Chris@110 46 m_cq(0),
Chris@162 47 m_hqMode(true),
Chris@166 48 m_fineTuning(false),
Chris@178 49 m_instrument(0),
Chris@178 50 m_colsPerSec(50)
Chris@31 51 {
Chris@31 52 }
Chris@31 53
Chris@31 54 Silvet::~Silvet()
Chris@31 55 {
Chris@31 56 delete m_resampler;
Chris@246 57 delete m_flattener;
Chris@31 58 delete m_cq;
Chris@41 59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 60 delete m_postFilter[i];
Chris@41 61 }
Chris@31 62 }
Chris@31 63
Chris@31 64 string
Chris@31 65 Silvet::getIdentifier() const
Chris@31 66 {
Chris@31 67 return "silvet";
Chris@31 68 }
Chris@31 69
Chris@31 70 string
Chris@31 71 Silvet::getName() const
Chris@31 72 {
Chris@31 73 return "Silvet Note Transcription";
Chris@31 74 }
Chris@31 75
Chris@31 76 string
Chris@31 77 Silvet::getDescription() const
Chris@31 78 {
Chris@191 79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 80 }
Chris@31 81
Chris@31 82 string
Chris@31 83 Silvet::getMaker() const
Chris@31 84 {
Chris@191 85 return "Queen Mary, University of London";
Chris@31 86 }
Chris@31 87
Chris@31 88 int
Chris@31 89 Silvet::getPluginVersion() const
Chris@31 90 {
Chris@31 91 return 1;
Chris@31 92 }
Chris@31 93
Chris@31 94 string
Chris@31 95 Silvet::getCopyright() const
Chris@31 96 {
Chris@191 97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 98 }
Chris@31 99
Chris@31 100 Silvet::InputDomain
Chris@31 101 Silvet::getInputDomain() const
Chris@31 102 {
Chris@31 103 return TimeDomain;
Chris@31 104 }
Chris@31 105
Chris@31 106 size_t
Chris@31 107 Silvet::getPreferredBlockSize() const
Chris@31 108 {
Chris@31 109 return 0;
Chris@31 110 }
Chris@31 111
Chris@31 112 size_t
Chris@31 113 Silvet::getPreferredStepSize() const
Chris@31 114 {
Chris@31 115 return 0;
Chris@31 116 }
Chris@31 117
Chris@31 118 size_t
Chris@31 119 Silvet::getMinChannelCount() const
Chris@31 120 {
Chris@31 121 return 1;
Chris@31 122 }
Chris@31 123
Chris@31 124 size_t
Chris@31 125 Silvet::getMaxChannelCount() const
Chris@31 126 {
Chris@31 127 return 1;
Chris@31 128 }
Chris@31 129
Chris@31 130 Silvet::ParameterList
Chris@31 131 Silvet::getParameterDescriptors() const
Chris@31 132 {
Chris@31 133 ParameterList list;
Chris@110 134
Chris@110 135 ParameterDescriptor desc;
Chris@110 136 desc.identifier = "mode";
Chris@110 137 desc.name = "Processing mode";
Chris@110 138 desc.unit = "";
Chris@271 139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 140 desc.minValue = 0;
Chris@110 141 desc.maxValue = 1;
Chris@113 142 desc.defaultValue = 1;
Chris@110 143 desc.isQuantized = true;
Chris@110 144 desc.quantizeStep = 1;
Chris@166 145 desc.valueNames.push_back("Draft (faster)");
Chris@165 146 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 147 list.push_back(desc);
Chris@161 148
Chris@176 149 desc.identifier = "instrument";
Chris@176 150 desc.name = "Instrument";
Chris@161 151 desc.unit = "";
Chris@271 152 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 153 desc.minValue = 0;
Chris@162 154 desc.maxValue = m_instruments.size()-1;
Chris@162 155 desc.defaultValue = 0;
Chris@161 156 desc.isQuantized = true;
Chris@161 157 desc.quantizeStep = 1;
Chris@161 158 desc.valueNames.clear();
Chris@162 159 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 160 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 161 }
Chris@166 162 list.push_back(desc);
Chris@161 163
Chris@166 164 desc.identifier = "finetune";
Chris@166 165 desc.name = "Return fine pitch estimates";
Chris@166 166 desc.unit = "";
Chris@271 167 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 168 desc.minValue = 0;
Chris@166 169 desc.maxValue = 1;
Chris@166 170 desc.defaultValue = 0;
Chris@166 171 desc.isQuantized = true;
Chris@166 172 desc.quantizeStep = 1;
Chris@166 173 desc.valueNames.clear();
Chris@110 174 list.push_back(desc);
Chris@110 175
Chris@31 176 return list;
Chris@31 177 }
Chris@31 178
Chris@31 179 float
Chris@31 180 Silvet::getParameter(string identifier) const
Chris@31 181 {
Chris@110 182 if (identifier == "mode") {
Chris@110 183 return m_hqMode ? 1.f : 0.f;
Chris@166 184 } else if (identifier == "finetune") {
Chris@166 185 return m_fineTuning ? 1.f : 0.f;
Chris@176 186 } else if (identifier == "instrument") {
Chris@162 187 return m_instrument;
Chris@110 188 }
Chris@31 189 return 0;
Chris@31 190 }
Chris@31 191
Chris@31 192 void
Chris@31 193 Silvet::setParameter(string identifier, float value)
Chris@31 194 {
Chris@110 195 if (identifier == "mode") {
Chris@110 196 m_hqMode = (value > 0.5);
Chris@166 197 } else if (identifier == "finetune") {
Chris@166 198 m_fineTuning = (value > 0.5);
Chris@176 199 } else if (identifier == "instrument") {
Chris@162 200 m_instrument = lrintf(value);
Chris@110 201 }
Chris@31 202 }
Chris@31 203
Chris@31 204 Silvet::ProgramList
Chris@31 205 Silvet::getPrograms() const
Chris@31 206 {
Chris@31 207 ProgramList list;
Chris@31 208 return list;
Chris@31 209 }
Chris@31 210
Chris@31 211 string
Chris@31 212 Silvet::getCurrentProgram() const
Chris@31 213 {
Chris@31 214 return "";
Chris@31 215 }
Chris@31 216
Chris@31 217 void
Chris@31 218 Silvet::selectProgram(string name)
Chris@31 219 {
Chris@31 220 }
Chris@31 221
Chris@31 222 Silvet::OutputList
Chris@31 223 Silvet::getOutputDescriptors() const
Chris@31 224 {
Chris@31 225 OutputList list;
Chris@31 226
Chris@31 227 OutputDescriptor d;
Chris@51 228 d.identifier = "notes";
Chris@51 229 d.name = "Note transcription";
Chris@271 230 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 231 d.unit = "Hz";
Chris@31 232 d.hasFixedBinCount = true;
Chris@31 233 d.binCount = 2;
Chris@41 234 d.binNames.push_back("Frequency");
Chris@31 235 d.binNames.push_back("Velocity");
Chris@31 236 d.hasKnownExtents = false;
Chris@31 237 d.isQuantized = false;
Chris@31 238 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 239 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 240 d.hasDuration = true;
Chris@32 241 m_notesOutputNo = list.size();
Chris@32 242 list.push_back(d);
Chris@32 243
Chris@178 244 d.identifier = "timefreq";
Chris@178 245 d.name = "Time-frequency distribution";
Chris@271 246 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 247 d.unit = "";
Chris@178 248 d.hasFixedBinCount = true;
Chris@178 249 d.binCount = m_instruments[0].templateHeight;
Chris@178 250 d.binNames.clear();
Chris@178 251 if (m_cq) {
Chris@178 252 char name[20];
Chris@178 253 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 254 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 255 // lowest-frequency 55 bins have been dropped, for a
Chris@178 256 // 545-bin template. The native CQ bins go high->low
Chris@178 257 // frequency though, so these are still the first 545 bins
Chris@178 258 // as reported by getBinFrequency, though in reverse order
Chris@178 259 float freq = m_cq->getBinFrequency
Chris@178 260 (m_instruments[0].templateHeight - i - 1);
Chris@178 261 sprintf(name, "%.1f Hz", freq);
Chris@178 262 d.binNames.push_back(name);
Chris@178 263 }
Chris@178 264 }
Chris@178 265 d.hasKnownExtents = false;
Chris@178 266 d.isQuantized = false;
Chris@178 267 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 268 d.sampleRate = m_colsPerSec;
Chris@178 269 d.hasDuration = false;
Chris@178 270 m_fcqOutputNo = list.size();
Chris@178 271 list.push_back(d);
Chris@178 272
Chris@31 273 return list;
Chris@31 274 }
Chris@31 275
Chris@38 276 std::string
Chris@175 277 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 278 {
Chris@38 279 static const char *names[] = {
Chris@38 280 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 281 };
Chris@38 282
Chris@175 283 const char *n = names[note % 12];
Chris@38 284
Chris@175 285 int oct = (note + 9) / 12;
Chris@38 286
Chris@175 287 char buf[30];
Chris@175 288
Chris@175 289 float pshift = 0.f;
Chris@175 290 if (shiftCount > 1) {
Chris@175 291 // see noteFrequency below
Chris@175 292 pshift =
Chris@175 293 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 294 }
Chris@175 295
Chris@175 296 if (pshift > 0.f) {
Chris@175 297 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 298 } else if (pshift < 0.f) {
Chris@175 299 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 300 } else {
Chris@175 301 sprintf(buf, "%s%d", n, oct);
Chris@175 302 }
Chris@38 303
Chris@38 304 return buf;
Chris@38 305 }
Chris@38 306
Chris@41 307 float
Chris@168 308 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 309 {
Chris@169 310 // Convert shift number to a pitch shift. The given shift number
Chris@169 311 // is an offset into the template array, which starts with some
Chris@169 312 // zeros, followed by the template, then some trailing zeros.
Chris@169 313 //
Chris@169 314 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 315 // == 5, then the number will be in the range 0-4 and the template
Chris@169 316 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 317 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 318 // represent moving the template *up* in pitch (by introducing
Chris@169 319 // zeros at the start, which is the low-frequency end), for a
Chris@169 320 // positive pitch shift; and higher values represent moving it
Chris@169 321 // down in pitch, for a negative pitch shift.
Chris@169 322
Chris@175 323 float pshift = 0.f;
Chris@175 324 if (shiftCount > 1) {
Chris@175 325 pshift =
Chris@175 326 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 327 }
Chris@169 328
Chris@169 329 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 330 }
Chris@41 331
Chris@31 332 bool
Chris@31 333 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 334 {
Chris@272 335 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 336 m_inputSampleRate > maxInputSampleRate) {
Chris@272 337 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 338 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 339 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 340 return false;
Chris@272 341 }
Chris@272 342
Chris@31 343 if (channels < getMinChannelCount() ||
Chris@272 344 channels > getMaxChannelCount()) {
Chris@272 345 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 346 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 347 << getMaxChannelCount() << ")" << endl;
Chris@272 348 return false;
Chris@272 349 }
Chris@31 350
Chris@31 351 if (stepSize != blockSize) {
Chris@31 352 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 353 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 354 return false;
Chris@31 355 }
Chris@31 356
Chris@31 357 m_blockSize = blockSize;
Chris@31 358
Chris@31 359 reset();
Chris@31 360
Chris@31 361 return true;
Chris@31 362 }
Chris@31 363
Chris@31 364 void
Chris@31 365 Silvet::reset()
Chris@31 366 {
Chris@31 367 delete m_resampler;
Chris@246 368 delete m_flattener;
Chris@31 369 delete m_cq;
Chris@31 370
Chris@31 371 if (m_inputSampleRate != processingSampleRate) {
Chris@31 372 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 373 } else {
Chris@31 374 m_resampler = 0;
Chris@31 375 }
Chris@31 376
Chris@246 377 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 378 m_flattener->reset();
Chris@246 379
Chris@173 380 double minFreq = 27.5;
Chris@173 381
Chris@173 382 if (!m_hqMode) {
Chris@173 383 // We don't actually return any notes from the bottom octave,
Chris@173 384 // so we can just pad with zeros
Chris@173 385 minFreq *= 2;
Chris@173 386 }
Chris@173 387
Chris@154 388 CQParameters params(processingSampleRate,
Chris@173 389 minFreq,
Chris@154 390 processingSampleRate / 3,
Chris@154 391 processingBPO);
Chris@154 392
Chris@155 393 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 394 // drops the FFT size to 512 from 1024 and alters
Chris@155 395 // some other processing parameters, making
Chris@155 396 // everything much, much slower. Could be a flaw
Chris@155 397 // in the CQ parameter calculations, must check
Chris@154 398 params.atomHopFactor = 0.3;
Chris@154 399 params.threshold = 0.0005;
Chris@172 400 params.window = CQParameters::Hann;
Chris@154 401
Chris@154 402 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 403
Chris@165 404 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 405
Chris@41 406 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 407 delete m_postFilter[i];
Chris@41 408 }
Chris@41 409 m_postFilter.clear();
Chris@176 410 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 411 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 412 }
Chris@41 413 m_pianoRoll.clear();
Chris@246 414 m_inputGains.clear();
Chris@32 415 m_columnCount = 0;
Chris@272 416 m_resampledCount = 0;
Chris@40 417 m_startTime = RealTime::zeroTime;
Chris@31 418 }
Chris@31 419
Chris@31 420 Silvet::FeatureSet
Chris@31 421 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 422 {
Chris@40 423 if (m_columnCount == 0) {
Chris@40 424 m_startTime = timestamp;
Chris@40 425 }
Chris@246 426
Chris@246 427 vector<float> flattened(m_blockSize);
Chris@246 428 float gain = 1.f;
Chris@246 429 m_flattener->connectInputPort
Chris@246 430 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 431 m_flattener->connectOutputPort
Chris@246 432 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 433 m_flattener->connectOutputPort
Chris@246 434 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 435 m_flattener->process(m_blockSize);
Chris@246 436
Chris@252 437 m_inputGains[timestamp] = gain;
Chris@40 438
Chris@31 439 vector<double> data;
Chris@40 440 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 441 double d = flattened[i];
Chris@235 442 data.push_back(d);
Chris@40 443 }
Chris@31 444
Chris@31 445 if (m_resampler) {
Chris@272 446
Chris@31 447 data = m_resampler->process(data.data(), data.size());
Chris@272 448
Chris@272 449 int hadCount = m_resampledCount;
Chris@272 450 m_resampledCount += data.size();
Chris@272 451
Chris@272 452 int resamplerLatency = m_resampler->getLatency();
Chris@272 453
Chris@272 454 if (hadCount < resamplerLatency) {
Chris@272 455 int stillToDrop = resamplerLatency - hadCount;
Chris@272 456 if (stillToDrop >= int(data.size())) {
Chris@272 457 return FeatureSet();
Chris@272 458 } else {
Chris@272 459 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 460 }
Chris@272 461 }
Chris@31 462 }
Chris@272 463
Chris@32 464 Grid cqout = m_cq->process(data);
Chris@51 465 FeatureSet fs = transcribe(cqout);
Chris@51 466 return fs;
Chris@34 467 }
Chris@34 468
Chris@34 469 Silvet::FeatureSet
Chris@34 470 Silvet::getRemainingFeatures()
Chris@34 471 {
Chris@145 472 Grid cqout = m_cq->getRemainingOutput();
Chris@51 473 FeatureSet fs = transcribe(cqout);
Chris@51 474 return fs;
Chris@34 475 }
Chris@34 476
Chris@34 477 Silvet::FeatureSet
Chris@34 478 Silvet::transcribe(const Grid &cqout)
Chris@34 479 {
Chris@32 480 Grid filtered = preProcess(cqout);
Chris@31 481
Chris@32 482 FeatureSet fs;
Chris@32 483
Chris@104 484 if (filtered.empty()) return fs;
Chris@170 485
Chris@170 486 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 487
Chris@178 488 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 489 Feature f;
Chris@178 490 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 491 f.values.push_back(float(filtered[i][j]));
Chris@178 492 }
Chris@178 493 fs[m_fcqOutputNo].push_back(f);
Chris@178 494 }
Chris@178 495
Chris@34 496 int width = filtered.size();
Chris@34 497
Chris@164 498 int iterations = m_hqMode ? 20 : 10;
Chris@34 499
Chris@176 500 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 501
Chris@170 502 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 503 int shiftCount = 1;
Chris@170 504 if (wantShifts) {
Chris@170 505 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 506 }
Chris@170 507
Chris@170 508 vector<vector<int> > localBestShifts;
Chris@170 509 if (wantShifts) {
Chris@170 510 localBestShifts =
Chris@176 511 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 512 }
Chris@170 513
Chris@170 514 vector<bool> present(width, false);
Chris@37 515
Chris@123 516 #pragma omp parallel for
Chris@123 517 for (int i = 0; i < width; ++i) {
Chris@104 518
Chris@170 519 double sum = 0.0;
Chris@176 520 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 521 sum += filtered.at(i).at(j);
Chris@170 522 }
Chris@170 523 if (sum < 1e-5) continue;
Chris@170 524
Chris@170 525 present[i] = true;
Chris@170 526
Chris@170 527 EM em(&pack, m_hqMode);
Chris@170 528
Chris@183 529 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 530 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 531
Chris@170 532 for (int j = 0; j < iterations; ++j) {
Chris@170 533 em.iterate(filtered.at(i).data());
Chris@37 534 }
Chris@37 535
Chris@170 536 const float *pitchDist = em.getPitchDistribution();
Chris@170 537 const float *const *shiftDist = em.getShifts();
Chris@37 538
Chris@176 539 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 540
Chris@170 541 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 542
Chris@170 543 int bestShift = 0;
Chris@179 544 float bestShiftValue = 0.0;
Chris@170 545 if (wantShifts) {
Chris@170 546 for (int k = 0; k < shiftCount; ++k) {
Chris@179 547 float value = shiftDist[k][j];
Chris@179 548 if (k == 0 || value > bestShiftValue) {
Chris@179 549 bestShiftValue = value;
Chris@170 550 bestShift = k;
Chris@170 551 }
Chris@170 552 }
Chris@170 553 localBestShifts[i][j] = bestShift;
Chris@170 554 }
Chris@123 555 }
Chris@123 556 }
Chris@166 557
Chris@166 558 for (int i = 0; i < width; ++i) {
Chris@37 559
Chris@170 560 if (!present[i]) {
Chris@170 561 // silent column
Chris@176 562 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 563 m_postFilter[j]->push(0.0);
Chris@170 564 }
Chris@168 565 m_pianoRoll.push_back(map<int, double>());
Chris@170 566 if (wantShifts) {
Chris@168 567 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 568 }
Chris@166 569 continue;
Chris@166 570 }
Chris@166 571
Chris@170 572 postProcess(localPitches[i], localBestShifts[i], wantShifts);
Chris@166 573
Chris@168 574 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 575
Chris@123 576 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 577 fi != noteFeatures.end(); ++fi) {
Chris@123 578 fs[m_notesOutputNo].push_back(*fi);
Chris@40 579 }
Chris@34 580 }
Chris@34 581
Chris@32 582 return fs;
Chris@31 583 }
Chris@31 584
Chris@32 585 Silvet::Grid
Chris@32 586 Silvet::preProcess(const Grid &in)
Chris@32 587 {
Chris@32 588 int width = in.size();
Chris@32 589
Chris@165 590 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 591
Chris@165 592 // need to be careful that col spacing is an integer number of samples!
Chris@165 593 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 594
Chris@32 595 Grid out;
Chris@32 596
Chris@58 597 // We count the CQ latency in terms of processing hops, but
Chris@58 598 // actually it probably isn't an exact number of hops so this
Chris@58 599 // isn't quite accurate. But the small constant offset is
Chris@165 600 // practically irrelevant compared to the jitter from the frame
Chris@165 601 // size we reduce to in a moment
Chris@33 602 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 603
Chris@176 604 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 605
Chris@32 606 for (int i = 0; i < width; ++i) {
Chris@32 607
Chris@33 608 if (m_columnCount < latentColumns) {
Chris@33 609 ++m_columnCount;
Chris@33 610 continue;
Chris@33 611 }
Chris@33 612
Chris@32 613 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 614 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 615
Chris@32 616 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 617
Chris@32 618 if (select) {
Chris@32 619 vector<double> inCol = in[i];
Chris@176 620 vector<double> outCol(pack.templateHeight);
Chris@32 621
Chris@178 622 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 623 // lowest 55 of them.
Chris@178 624 //
Chris@178 625 // In draft mode the CQ is an octave shorter, returning
Chris@178 626 // 540 bins, so we instead pad them with an additional 5
Chris@178 627 // zeros.
Chris@178 628 //
Chris@178 629 // We also need to reverse the column as we go, since the
Chris@178 630 // raw CQ has the high frequencies first and we need it
Chris@178 631 // the other way around.
Chris@32 632
Chris@178 633 if (m_hqMode) {
Chris@178 634 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 635 int ix = inCol.size() - j - 55;
Chris@178 636 outCol[j] = inCol[ix];
Chris@178 637 }
Chris@178 638 } else {
Chris@178 639 for (int j = 0; j < 5; ++j) {
Chris@178 640 outCol[j] = 0.0;
Chris@178 641 }
Chris@178 642 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 643 int ix = inCol.size() - j + 4;
Chris@178 644 outCol[j] = inCol[ix];
Chris@178 645 }
Chris@46 646 }
Chris@32 647
Chris@46 648 vector<double> noiseLevel1 =
Chris@46 649 MedianFilter<double>::filter(40, outCol);
Chris@176 650 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 651 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 652 }
Chris@32 653
Chris@46 654 vector<double> noiseLevel2 =
Chris@46 655 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 656 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 657 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 658 }
Chris@32 659
Chris@165 660 out.push_back(outCol);
Chris@32 661 }
Chris@32 662
Chris@32 663 ++m_columnCount;
Chris@32 664 }
Chris@32 665
Chris@32 666 return out;
Chris@32 667 }
Chris@32 668
Chris@168 669 void
Chris@170 670 Silvet::postProcess(const vector<double> &pitches,
Chris@170 671 const vector<int> &bestShifts,
Chris@170 672 bool wantShifts)
Chris@166 673 {
Chris@176 674 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 675
Chris@41 676 vector<double> filtered;
Chris@41 677
Chris@176 678 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 679 m_postFilter[j]->push(pitches[j]);
Chris@41 680 filtered.push_back(m_postFilter[j]->get());
Chris@41 681 }
Chris@41 682
Chris@41 683 // Threshold for level and reduce number of candidate pitches
Chris@41 684
Chris@41 685 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 686
Chris@41 687 ValueIndexMap strengths;
Chris@166 688
Chris@176 689 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 690 double strength = filtered[j];
Chris@183 691 if (strength < pack.levelThreshold) continue;
Chris@168 692 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 693 }
Chris@166 694
Chris@168 695 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 696
Chris@168 697 map<int, double> active;
Chris@168 698 map<int, int> activeShifts;
Chris@168 699
Chris@183 700 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 701
Chris@168 702 --si;
Chris@168 703
Chris@168 704 double strength = si->first;
Chris@168 705 int j = si->second;
Chris@168 706
Chris@168 707 active[j] = strength;
Chris@168 708
Chris@170 709 if (wantShifts) {
Chris@170 710 activeShifts[j] = bestShifts[j];
Chris@167 711 }
Chris@41 712 }
Chris@41 713
Chris@168 714 m_pianoRoll.push_back(active);
Chris@170 715
Chris@170 716 if (wantShifts) {
Chris@168 717 m_pianoRollShifts.push_back(activeShifts);
Chris@41 718 }
Chris@166 719 }
Chris@166 720
Chris@166 721 Vamp::Plugin::FeatureList
Chris@168 722 Silvet::noteTrack(int shiftCount)
Chris@166 723 {
Chris@41 724 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 725 // report notes that have just ended (i.e. that are absent in the
Chris@168 726 // latest active set but present in the prior set in the piano
Chris@41 727 // roll) -- any notes that ended earlier will have been reported
Chris@41 728 // already, and if they haven't ended, we don't know their
Chris@41 729 // duration.
Chris@41 730
Chris@168 731 int width = m_pianoRoll.size() - 1;
Chris@168 732
Chris@168 733 const map<int, double> &active = m_pianoRoll[width];
Chris@41 734
Chris@165 735 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 736
Chris@165 737 // only keep notes >= 100ms or thereabouts
Chris@165 738 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 739 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 740
Chris@41 741 FeatureList noteFeatures;
Chris@41 742
Chris@41 743 if (width < durationThreshold + 1) {
Chris@41 744 return noteFeatures;
Chris@41 745 }
Chris@41 746
Chris@150 747 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 748
Chris@55 749 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 750 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 751
Chris@55 752 int note = ni->first;
Chris@41 753
Chris@41 754 if (active.find(note) != active.end()) {
Chris@41 755 // the note is still playing
Chris@41 756 continue;
Chris@41 757 }
Chris@41 758
Chris@41 759 // the note was playing but just ended
Chris@41 760 int end = width;
Chris@41 761 int start = end-1;
Chris@41 762
Chris@41 763 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 764 --start;
Chris@41 765 }
Chris@41 766 ++start;
Chris@41 767
Chris@169 768 if ((end - start) < durationThreshold) {
Chris@41 769 continue;
Chris@41 770 }
Chris@41 771
Chris@169 772 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 773 }
Chris@41 774
Chris@62 775 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 776
Chris@41 777 return noteFeatures;
Chris@41 778 }
Chris@41 779
Chris@169 780 void
Chris@169 781 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 782 FeatureList &noteFeatures)
Chris@169 783 {
Chris@169 784 int partStart = start;
Chris@169 785 int partShift = 0;
Chris@169 786 int partVelocity = 0;
Chris@169 787
Chris@252 788 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 789
Chris@169 790 for (int i = start; i != end; ++i) {
Chris@169 791
Chris@169 792 double strength = m_pianoRoll[i][note];
Chris@169 793
Chris@169 794 int shift = 0;
Chris@169 795
Chris@169 796 if (shiftCount > 1) {
Chris@169 797
Chris@169 798 shift = m_pianoRollShifts[i][note];
Chris@169 799
Chris@169 800 if (i == partStart) {
Chris@169 801 partShift = shift;
Chris@169 802 }
Chris@169 803
Chris@169 804 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 805
Chris@169 806 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 807
Chris@169 808 // pitch has changed, emit an intermediate note
Chris@252 809 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 810 i,
Chris@252 811 note,
Chris@252 812 partShift,
Chris@252 813 shiftCount,
Chris@252 814 partVelocity));
Chris@169 815 partStart = i;
Chris@169 816 partShift = shift;
Chris@169 817 partVelocity = 0;
Chris@169 818 }
Chris@169 819 }
Chris@169 820
Chris@246 821 int v = round(strength * 2);
Chris@169 822 if (v > partVelocity) {
Chris@169 823 partVelocity = v;
Chris@169 824 }
Chris@169 825 }
Chris@169 826
Chris@169 827 if (end >= partStart + partThreshold) {
Chris@252 828 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 829 end,
Chris@252 830 note,
Chris@252 831 partShift,
Chris@252 832 shiftCount,
Chris@252 833 partVelocity));
Chris@169 834 }
Chris@169 835 }
Chris@252 836
Chris@252 837 Silvet::Feature
Chris@252 838 Silvet::makeNoteFeature(int start,
Chris@252 839 int end,
Chris@252 840 int note,
Chris@252 841 int shift,
Chris@252 842 int shiftCount,
Chris@252 843 int velocity)
Chris@252 844 {
Chris@252 845 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 846 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 847
Chris@252 848 Feature f;
Chris@252 849
Chris@252 850 f.hasTimestamp = true;
Chris@285 851 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 852 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 853
Chris@252 854 f.hasDuration = true;
Chris@252 855 f.duration = RealTime::fromSeconds
Chris@252 856 (columnDuration * (end - start));
Chris@252 857
Chris@252 858 f.values.clear();
Chris@252 859
Chris@252 860 f.values.push_back
Chris@252 861 (noteFrequency(note, shift, shiftCount));
Chris@252 862
Chris@252 863 float inputGain = getInputGainAt(f.timestamp);
Chris@252 864 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 865 velocity = round(velocity / inputGain);
Chris@252 866 if (velocity > 127) velocity = 127;
Chris@252 867 if (velocity < 1) velocity = 1;
Chris@252 868 f.values.push_back(velocity);
Chris@252 869
Chris@252 870 f.label = noteName(note, shift, shiftCount);
Chris@252 871
Chris@252 872 return f;
Chris@252 873 }
Chris@252 874
Chris@252 875 float
Chris@252 876 Silvet::getInputGainAt(RealTime t)
Chris@252 877 {
Chris@252 878 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 879
Chris@252 880 if (i == m_inputGains.end()) {
Chris@252 881 if (i != m_inputGains.begin()) {
Chris@252 882 --i;
Chris@252 883 } else {
Chris@252 884 return 1.f; // no data
Chris@252 885 }
Chris@252 886 }
Chris@252 887
Chris@252 888 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 889
Chris@252 890 return i->second;
Chris@252 891 }
Chris@252 892