annotate src/Silvet.cpp @ 301:00fab71b80ec livemode

More tweaking of frequencies
author Chris Cannam
date Mon, 01 Dec 2014 17:12:19 +0000
parents ba5f3b084466
children cac0be04c43c
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@298 24 #include "LiveInstruments.h"
Chris@31 25
Chris@31 26 #include <vector>
Chris@31 27
Chris@32 28 #include <cstdio>
Chris@32 29
Chris@31 30 using std::vector;
Chris@48 31 using std::cout;
Chris@31 32 using std::cerr;
Chris@31 33 using std::endl;
Chris@40 34 using Vamp::RealTime;
Chris@31 35
// Sample rate, in Hz, at which the constant-Q transform and everything
// after it run; input at any other rate is resampled to this rate.
static const int processingSampleRate = 44100;

// Constant-Q resolution in bins per semitone: one value for Live mode,
// the other for the remaining processing modes.
static const int binsPerSemitoneLive = 1;
static const int binsPerSemitoneNormal = 5;

// Inclusive range of input sample rates accepted by initialise().
static const int minInputSampleRate = 100;
static const int maxInputSampleRate = 192000;
Chris@272 43
Chris@31 44 Silvet::Silvet(float inputSampleRate) :
Chris@31 45 Plugin(inputSampleRate),
Chris@161 46 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@298 47 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
Chris@31 48 m_resampler(0),
Chris@246 49 m_flattener(0),
Chris@110 50 m_cq(0),
Chris@297 51 m_mode(HighQualityMode),
Chris@166 52 m_fineTuning(false),
Chris@178 53 m_instrument(0),
Chris@178 54 m_colsPerSec(50)
Chris@31 55 {
Chris@31 56 }
Chris@31 57
Chris@31 58 Silvet::~Silvet()
Chris@31 59 {
Chris@31 60 delete m_resampler;
Chris@246 61 delete m_flattener;
Chris@31 62 delete m_cq;
Chris@41 63 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 64 delete m_postFilter[i];
Chris@41 65 }
Chris@31 66 }
Chris@31 67
Chris@31 68 string
Chris@31 69 Silvet::getIdentifier() const
Chris@31 70 {
Chris@31 71 return "silvet";
Chris@31 72 }
Chris@31 73
Chris@31 74 string
Chris@31 75 Silvet::getName() const
Chris@31 76 {
Chris@31 77 return "Silvet Note Transcription";
Chris@31 78 }
Chris@31 79
Chris@31 80 string
Chris@31 81 Silvet::getDescription() const
Chris@31 82 {
Chris@191 83 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 84 }
Chris@31 85
Chris@31 86 string
Chris@31 87 Silvet::getMaker() const
Chris@31 88 {
Chris@191 89 return "Queen Mary, University of London";
Chris@31 90 }
Chris@31 91
Chris@31 92 int
Chris@31 93 Silvet::getPluginVersion() const
Chris@31 94 {
Chris@295 95 return 2;
Chris@31 96 }
Chris@31 97
Chris@31 98 string
Chris@31 99 Silvet::getCopyright() const
Chris@31 100 {
Chris@191 101 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 102 }
Chris@31 103
Chris@31 104 Silvet::InputDomain
Chris@31 105 Silvet::getInputDomain() const
Chris@31 106 {
Chris@31 107 return TimeDomain;
Chris@31 108 }
Chris@31 109
Chris@31 110 size_t
Chris@31 111 Silvet::getPreferredBlockSize() const
Chris@31 112 {
Chris@31 113 return 0;
Chris@31 114 }
Chris@31 115
Chris@31 116 size_t
Chris@31 117 Silvet::getPreferredStepSize() const
Chris@31 118 {
Chris@31 119 return 0;
Chris@31 120 }
Chris@31 121
Chris@31 122 size_t
Chris@31 123 Silvet::getMinChannelCount() const
Chris@31 124 {
Chris@31 125 return 1;
Chris@31 126 }
Chris@31 127
Chris@31 128 size_t
Chris@31 129 Silvet::getMaxChannelCount() const
Chris@31 130 {
Chris@31 131 return 1;
Chris@31 132 }
Chris@31 133
Chris@31 134 Silvet::ParameterList
Chris@31 135 Silvet::getParameterDescriptors() const
Chris@31 136 {
Chris@31 137 ParameterList list;
Chris@110 138
Chris@110 139 ParameterDescriptor desc;
Chris@110 140 desc.identifier = "mode";
Chris@110 141 desc.name = "Processing mode";
Chris@110 142 desc.unit = "";
Chris@297 143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
Chris@110 144 desc.minValue = 0;
Chris@297 145 desc.maxValue = 2;
Chris@113 146 desc.defaultValue = 1;
Chris@110 147 desc.isQuantized = true;
Chris@110 148 desc.quantizeStep = 1;
Chris@166 149 desc.valueNames.push_back("Draft (faster)");
Chris@165 150 desc.valueNames.push_back("Intensive (higher quality)");
Chris@297 151 desc.valueNames.push_back("Live (lower latency)");
Chris@161 152 list.push_back(desc);
Chris@161 153
Chris@176 154 desc.identifier = "instrument";
Chris@176 155 desc.name = "Instrument";
Chris@161 156 desc.unit = "";
Chris@271 157 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 158 desc.minValue = 0;
Chris@162 159 desc.maxValue = m_instruments.size()-1;
Chris@162 160 desc.defaultValue = 0;
Chris@161 161 desc.isQuantized = true;
Chris@161 162 desc.quantizeStep = 1;
Chris@161 163 desc.valueNames.clear();
Chris@162 164 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 165 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 166 }
Chris@166 167 list.push_back(desc);
Chris@161 168
Chris@166 169 desc.identifier = "finetune";
Chris@166 170 desc.name = "Return fine pitch estimates";
Chris@166 171 desc.unit = "";
Chris@271 172 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 173 desc.minValue = 0;
Chris@166 174 desc.maxValue = 1;
Chris@166 175 desc.defaultValue = 0;
Chris@166 176 desc.isQuantized = true;
Chris@166 177 desc.quantizeStep = 1;
Chris@166 178 desc.valueNames.clear();
Chris@110 179 list.push_back(desc);
Chris@110 180
Chris@31 181 return list;
Chris@31 182 }
Chris@31 183
Chris@31 184 float
Chris@31 185 Silvet::getParameter(string identifier) const
Chris@31 186 {
Chris@110 187 if (identifier == "mode") {
Chris@297 188 return (float)(int)m_mode;
Chris@166 189 } else if (identifier == "finetune") {
Chris@166 190 return m_fineTuning ? 1.f : 0.f;
Chris@176 191 } else if (identifier == "instrument") {
Chris@162 192 return m_instrument;
Chris@110 193 }
Chris@31 194 return 0;
Chris@31 195 }
Chris@31 196
Chris@31 197 void
Chris@31 198 Silvet::setParameter(string identifier, float value)
Chris@31 199 {
Chris@110 200 if (identifier == "mode") {
Chris@297 201 m_mode = (ProcessingMode)(int)(value + 0.5);
Chris@166 202 } else if (identifier == "finetune") {
Chris@166 203 m_fineTuning = (value > 0.5);
Chris@176 204 } else if (identifier == "instrument") {
Chris@162 205 m_instrument = lrintf(value);
Chris@110 206 }
Chris@31 207 }
Chris@31 208
Chris@31 209 Silvet::ProgramList
Chris@31 210 Silvet::getPrograms() const
Chris@31 211 {
Chris@31 212 ProgramList list;
Chris@31 213 return list;
Chris@31 214 }
Chris@31 215
Chris@31 216 string
Chris@31 217 Silvet::getCurrentProgram() const
Chris@31 218 {
Chris@31 219 return "";
Chris@31 220 }
Chris@31 221
Chris@31 222 void
Chris@31 223 Silvet::selectProgram(string name)
Chris@31 224 {
Chris@31 225 }
Chris@31 226
Chris@31 227 Silvet::OutputList
Chris@31 228 Silvet::getOutputDescriptors() const
Chris@31 229 {
Chris@31 230 OutputList list;
Chris@31 231
Chris@31 232 OutputDescriptor d;
Chris@51 233 d.identifier = "notes";
Chris@51 234 d.name = "Note transcription";
Chris@271 235 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 236 d.unit = "Hz";
Chris@31 237 d.hasFixedBinCount = true;
Chris@31 238 d.binCount = 2;
Chris@41 239 d.binNames.push_back("Frequency");
Chris@31 240 d.binNames.push_back("Velocity");
Chris@31 241 d.hasKnownExtents = false;
Chris@31 242 d.isQuantized = false;
Chris@31 243 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 244 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 245 d.hasDuration = true;
Chris@32 246 m_notesOutputNo = list.size();
Chris@32 247 list.push_back(d);
Chris@32 248
Chris@178 249 d.identifier = "timefreq";
Chris@178 250 d.name = "Time-frequency distribution";
Chris@271 251 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 252 d.unit = "";
Chris@178 253 d.hasFixedBinCount = true;
Chris@298 254 d.binCount = getPack(0).templateHeight;
Chris@178 255 d.binNames.clear();
Chris@178 256 if (m_cq) {
Chris@294 257 char name[50];
Chris@298 258 for (int i = 0; i < getPack(0).templateHeight; ++i) {
Chris@178 259 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 260 // lowest-frequency 55 bins have been dropped, for a
Chris@178 261 // 545-bin template. The native CQ bins go high->low
Chris@178 262 // frequency though, so these are still the first 545 bins
Chris@178 263 // as reported by getBinFrequency, though in reverse order
Chris@178 264 float freq = m_cq->getBinFrequency
Chris@298 265 (getPack(0).templateHeight - i - 1);
Chris@178 266 sprintf(name, "%.1f Hz", freq);
Chris@178 267 d.binNames.push_back(name);
Chris@178 268 }
Chris@178 269 }
Chris@178 270 d.hasKnownExtents = false;
Chris@178 271 d.isQuantized = false;
Chris@178 272 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 273 d.sampleRate = m_colsPerSec;
Chris@178 274 d.hasDuration = false;
Chris@178 275 m_fcqOutputNo = list.size();
Chris@178 276 list.push_back(d);
Chris@178 277
Chris@294 278 d.identifier = "pitchactivation";
Chris@294 279 d.name = "Pitch activation distribution";
Chris@294 280 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 281 d.unit = "";
Chris@294 282 d.hasFixedBinCount = true;
Chris@298 283 d.binCount = getPack(0).templateNoteCount;
Chris@294 284 d.binNames.clear();
Chris@294 285 if (m_cq) {
Chris@298 286 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@294 287 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 288 }
Chris@294 289 }
Chris@294 290 d.hasKnownExtents = false;
Chris@294 291 d.isQuantized = false;
Chris@294 292 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 293 d.sampleRate = m_colsPerSec;
Chris@294 294 d.hasDuration = false;
Chris@294 295 m_pitchOutputNo = list.size();
Chris@294 296 list.push_back(d);
Chris@294 297
Chris@31 298 return list;
Chris@31 299 }
Chris@31 300
Chris@38 301 std::string
Chris@175 302 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 303 {
Chris@38 304 static const char *names[] = {
Chris@38 305 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 306 };
Chris@38 307
Chris@175 308 const char *n = names[note % 12];
Chris@38 309
Chris@175 310 int oct = (note + 9) / 12;
Chris@38 311
Chris@175 312 char buf[30];
Chris@175 313
Chris@175 314 float pshift = 0.f;
Chris@175 315 if (shiftCount > 1) {
Chris@175 316 // see noteFrequency below
Chris@175 317 pshift =
Chris@175 318 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 319 }
Chris@175 320
Chris@175 321 if (pshift > 0.f) {
Chris@175 322 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 323 } else if (pshift < 0.f) {
Chris@175 324 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 325 } else {
Chris@175 326 sprintf(buf, "%s%d", n, oct);
Chris@175 327 }
Chris@38 328
Chris@38 329 return buf;
Chris@38 330 }
Chris@38 331
Chris@41 332 float
Chris@168 333 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 334 {
Chris@169 335 // Convert shift number to a pitch shift. The given shift number
Chris@169 336 // is an offset into the template array, which starts with some
Chris@169 337 // zeros, followed by the template, then some trailing zeros.
Chris@169 338 //
Chris@169 339 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 340 // == 5, then the number will be in the range 0-4 and the template
Chris@169 341 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 342 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 343 // represent moving the template *up* in pitch (by introducing
Chris@169 344 // zeros at the start, which is the low-frequency end), for a
Chris@169 345 // positive pitch shift; and higher values represent moving it
Chris@169 346 // down in pitch, for a negative pitch shift.
Chris@169 347
Chris@175 348 float pshift = 0.f;
Chris@175 349 if (shiftCount > 1) {
Chris@175 350 pshift =
Chris@175 351 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 352 }
Chris@169 353
Chris@301 354 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@301 355
Chris@301 356 cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
Chris@301 357 << shiftCount << ", obtained freq = " << freq << endl;
Chris@301 358
Chris@301 359 return freq;
Chris@41 360 }
Chris@41 361
Chris@31 362 bool
Chris@31 363 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 364 {
Chris@272 365 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 366 m_inputSampleRate > maxInputSampleRate) {
Chris@272 367 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 368 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 369 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 370 return false;
Chris@272 371 }
Chris@272 372
Chris@31 373 if (channels < getMinChannelCount() ||
Chris@272 374 channels > getMaxChannelCount()) {
Chris@272 375 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 376 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 377 << getMaxChannelCount() << ")" << endl;
Chris@272 378 return false;
Chris@272 379 }
Chris@31 380
Chris@31 381 if (stepSize != blockSize) {
Chris@31 382 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 383 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 384 return false;
Chris@31 385 }
Chris@31 386
Chris@31 387 m_blockSize = blockSize;
Chris@31 388
Chris@31 389 reset();
Chris@31 390
Chris@31 391 return true;
Chris@31 392 }
Chris@31 393
Chris@31 394 void
Chris@31 395 Silvet::reset()
Chris@31 396 {
Chris@31 397 delete m_resampler;
Chris@246 398 delete m_flattener;
Chris@31 399 delete m_cq;
Chris@31 400
Chris@31 401 if (m_inputSampleRate != processingSampleRate) {
Chris@31 402 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 403 } else {
Chris@31 404 m_resampler = 0;
Chris@31 405 }
Chris@31 406
Chris@246 407 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 408 m_flattener->reset();
Chris@246 409
Chris@301 410 // this happens to be processingSampleRate / 3, and is the top
Chris@301 411 // freq used for the EM templates:
Chris@301 412 double maxFreq = 14700;
Chris@301 413
Chris@301 414 if (m_mode == LiveMode) {
Chris@301 415 // We only have 12 bpo rather than 60, so we need the top bin
Chris@301 416 // to be the middle one of the top 5, i.e. 2/5 of a semitone
Chris@301 417 // lower than 14700
Chris@301 418 maxFreq *= powf(2.0, -1.0 / 30.0);
Chris@301 419 }
Chris@301 420
Chris@173 421 double minFreq = 27.5;
Chris@173 422
Chris@297 423 if (m_mode != HighQualityMode) {
Chris@173 424 // We don't actually return any notes from the bottom octave,
Chris@173 425 // so we can just pad with zeros
Chris@173 426 minFreq *= 2;
Chris@173 427 }
Chris@173 428
Chris@298 429 int bpo = 12 *
Chris@298 430 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@301 431
Chris@154 432 CQParameters params(processingSampleRate,
Chris@173 433 minFreq,
Chris@154 434 processingSampleRate / 3,
Chris@298 435 bpo);
Chris@154 436
Chris@155 437 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 438 // drops the FFT size to 512 from 1024 and alters
Chris@155 439 // some other processing parameters, making
Chris@155 440 // everything much, much slower. Could be a flaw
Chris@155 441 // in the CQ parameter calculations, must check
Chris@154 442 params.atomHopFactor = 0.3;
Chris@154 443 params.threshold = 0.0005;
Chris@172 444 params.window = CQParameters::Hann;
Chris@154 445
Chris@154 446 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 447
Chris@301 448 cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
Chris@301 449 cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
Chris@297 450
Chris@297 451 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
Chris@165 452
Chris@41 453 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 454 delete m_postFilter[i];
Chris@41 455 }
Chris@41 456 m_postFilter.clear();
Chris@298 457 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@41 458 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 459 }
Chris@41 460 m_pianoRoll.clear();
Chris@246 461 m_inputGains.clear();
Chris@32 462 m_columnCount = 0;
Chris@272 463 m_resampledCount = 0;
Chris@40 464 m_startTime = RealTime::zeroTime;
Chris@31 465 }
Chris@31 466
Chris@31 467 Silvet::FeatureSet
Chris@31 468 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 469 {
Chris@40 470 if (m_columnCount == 0) {
Chris@40 471 m_startTime = timestamp;
Chris@40 472 }
Chris@246 473
Chris@246 474 vector<float> flattened(m_blockSize);
Chris@246 475 float gain = 1.f;
Chris@246 476 m_flattener->connectInputPort
Chris@246 477 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 478 m_flattener->connectOutputPort
Chris@246 479 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 480 m_flattener->connectOutputPort
Chris@246 481 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 482 m_flattener->process(m_blockSize);
Chris@246 483
Chris@252 484 m_inputGains[timestamp] = gain;
Chris@40 485
Chris@31 486 vector<double> data;
Chris@40 487 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 488 double d = flattened[i];
Chris@235 489 data.push_back(d);
Chris@40 490 }
Chris@31 491
Chris@31 492 if (m_resampler) {
Chris@272 493
Chris@31 494 data = m_resampler->process(data.data(), data.size());
Chris@272 495
Chris@272 496 int hadCount = m_resampledCount;
Chris@272 497 m_resampledCount += data.size();
Chris@272 498
Chris@272 499 int resamplerLatency = m_resampler->getLatency();
Chris@272 500
Chris@272 501 if (hadCount < resamplerLatency) {
Chris@272 502 int stillToDrop = resamplerLatency - hadCount;
Chris@272 503 if (stillToDrop >= int(data.size())) {
Chris@272 504 return FeatureSet();
Chris@272 505 } else {
Chris@272 506 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 507 }
Chris@272 508 }
Chris@31 509 }
Chris@272 510
Chris@32 511 Grid cqout = m_cq->process(data);
Chris@51 512 FeatureSet fs = transcribe(cqout);
Chris@51 513 return fs;
Chris@34 514 }
Chris@34 515
Chris@34 516 Silvet::FeatureSet
Chris@34 517 Silvet::getRemainingFeatures()
Chris@34 518 {
Chris@145 519 Grid cqout = m_cq->getRemainingOutput();
Chris@51 520 FeatureSet fs = transcribe(cqout);
Chris@51 521 return fs;
Chris@34 522 }
Chris@34 523
Chris@34 524 Silvet::FeatureSet
Chris@34 525 Silvet::transcribe(const Grid &cqout)
Chris@34 526 {
Chris@32 527 Grid filtered = preProcess(cqout);
Chris@31 528
Chris@32 529 FeatureSet fs;
Chris@32 530
Chris@104 531 if (filtered.empty()) return fs;
Chris@170 532
Chris@298 533 const InstrumentPack &pack(getPack(m_instrument));
Chris@104 534
Chris@178 535 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 536 Feature f;
Chris@178 537 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 538 f.values.push_back(float(filtered[i][j]));
Chris@178 539 }
Chris@178 540 fs[m_fcqOutputNo].push_back(f);
Chris@178 541 }
Chris@178 542
Chris@34 543 int width = filtered.size();
Chris@34 544
Chris@297 545 int iterations = (m_mode == HighQualityMode ? 20 : 10);
Chris@34 546
Chris@176 547 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 548
Chris@297 549 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
Chris@170 550 int shiftCount = 1;
Chris@170 551 if (wantShifts) {
Chris@170 552 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 553 }
Chris@170 554
Chris@170 555 vector<vector<int> > localBestShifts;
Chris@170 556 if (wantShifts) {
Chris@170 557 localBestShifts =
Chris@176 558 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 559 }
Chris@170 560
Chris@170 561 vector<bool> present(width, false);
Chris@37 562
Chris@123 563 #pragma omp parallel for
Chris@123 564 for (int i = 0; i < width; ++i) {
Chris@104 565
Chris@170 566 double sum = 0.0;
Chris@176 567 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 568 sum += filtered.at(i).at(j);
Chris@170 569 }
Chris@170 570 if (sum < 1e-5) continue;
Chris@170 571
Chris@170 572 present[i] = true;
Chris@170 573
Chris@297 574 EM em(&pack, m_mode == HighQualityMode);
Chris@170 575
Chris@183 576 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 577 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 578
Chris@170 579 for (int j = 0; j < iterations; ++j) {
Chris@170 580 em.iterate(filtered.at(i).data());
Chris@37 581 }
Chris@37 582
Chris@170 583 const float *pitchDist = em.getPitchDistribution();
Chris@170 584 const float *const *shiftDist = em.getShifts();
Chris@37 585
Chris@176 586 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 587
Chris@170 588 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 589
Chris@170 590 int bestShift = 0;
Chris@179 591 float bestShiftValue = 0.0;
Chris@170 592 if (wantShifts) {
Chris@170 593 for (int k = 0; k < shiftCount; ++k) {
Chris@179 594 float value = shiftDist[k][j];
Chris@179 595 if (k == 0 || value > bestShiftValue) {
Chris@179 596 bestShiftValue = value;
Chris@170 597 bestShift = k;
Chris@170 598 }
Chris@170 599 }
Chris@170 600 localBestShifts[i][j] = bestShift;
Chris@170 601 }
Chris@123 602 }
Chris@123 603 }
Chris@166 604
Chris@166 605 for (int i = 0; i < width; ++i) {
Chris@37 606
Chris@170 607 if (!present[i]) {
Chris@170 608 // silent column
Chris@176 609 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 610 m_postFilter[j]->push(0.0);
Chris@170 611 }
Chris@168 612 m_pianoRoll.push_back(map<int, double>());
Chris@170 613 if (wantShifts) {
Chris@168 614 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 615 }
Chris@166 616 continue;
Chris@166 617 }
Chris@166 618
Chris@294 619 vector<double> filtered = postProcess
Chris@294 620 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 621
Chris@294 622 Feature f;
Chris@294 623 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 624 float v(filtered[j]);
Chris@294 625 if (v < pack.levelThreshold) v = 0.f;
Chris@294 626 f.values.push_back(v);
Chris@294 627 }
Chris@294 628 fs[m_pitchOutputNo].push_back(f);
Chris@166 629
Chris@168 630 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 631
Chris@123 632 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 633 fi != noteFeatures.end(); ++fi) {
Chris@123 634 fs[m_notesOutputNo].push_back(*fi);
Chris@40 635 }
Chris@34 636 }
Chris@34 637
Chris@32 638 return fs;
Chris@31 639 }
Chris@31 640
Chris@32 641 Silvet::Grid
Chris@32 642 Silvet::preProcess(const Grid &in)
Chris@32 643 {
Chris@32 644 int width = in.size();
Chris@32 645
Chris@165 646 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 647
Chris@165 648 // need to be careful that col spacing is an integer number of samples!
Chris@165 649 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 650
Chris@32 651 Grid out;
Chris@32 652
Chris@58 653 // We count the CQ latency in terms of processing hops, but
Chris@58 654 // actually it probably isn't an exact number of hops so this
Chris@58 655 // isn't quite accurate. But the small constant offset is
Chris@165 656 // practically irrelevant compared to the jitter from the frame
Chris@165 657 // size we reduce to in a moment
Chris@33 658 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 659
Chris@298 660 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 661
Chris@32 662 for (int i = 0; i < width; ++i) {
Chris@32 663
Chris@33 664 if (m_columnCount < latentColumns) {
Chris@33 665 ++m_columnCount;
Chris@33 666 continue;
Chris@33 667 }
Chris@33 668
Chris@32 669 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 670 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 671
Chris@32 672 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 673
Chris@32 674 if (select) {
Chris@32 675 vector<double> inCol = in[i];
Chris@176 676 vector<double> outCol(pack.templateHeight);
Chris@32 677
Chris@178 678 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@298 679 // lowest 55 of them (assuming binsPerSemitone == 5).
Chris@178 680 //
Chris@297 681 // In draft and live mode the CQ is an octave shorter,
Chris@300 682 // returning 540 bins or equivalent, so we instead pad
Chris@300 683 // them with an additional 5 or equivalent zeros.
Chris@178 684 //
Chris@178 685 // We also need to reverse the column as we go, since the
Chris@178 686 // raw CQ has the high frequencies first and we need it
Chris@178 687 // the other way around.
Chris@32 688
Chris@298 689 int bps = (m_mode == LiveMode ?
Chris@298 690 binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@298 691
Chris@297 692 if (m_mode == HighQualityMode) {
Chris@178 693 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@298 694 int ix = inCol.size() - j - (11 * bps);
Chris@178 695 outCol[j] = inCol[ix];
Chris@178 696 }
Chris@178 697 } else {
Chris@298 698 for (int j = 0; j < bps; ++j) {
Chris@178 699 outCol[j] = 0.0;
Chris@178 700 }
Chris@298 701 for (int j = bps; j < pack.templateHeight; ++j) {
Chris@298 702 int ix = inCol.size() - j + (bps-1);
Chris@178 703 outCol[j] = inCol[ix];
Chris@178 704 }
Chris@46 705 }
Chris@32 706
Chris@46 707 vector<double> noiseLevel1 =
Chris@298 708 MedianFilter<double>::filter(8 * bps, outCol);
Chris@176 709 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 710 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 711 }
Chris@32 712
Chris@46 713 vector<double> noiseLevel2 =
Chris@298 714 MedianFilter<double>::filter(8 * bps, noiseLevel1);
Chris@176 715 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 716 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 717 }
Chris@32 718
Chris@165 719 out.push_back(outCol);
Chris@32 720 }
Chris@32 721
Chris@32 722 ++m_columnCount;
Chris@32 723 }
Chris@32 724
Chris@32 725 return out;
Chris@32 726 }
Chris@32 727
Chris@294 728 vector<double>
Chris@170 729 Silvet::postProcess(const vector<double> &pitches,
Chris@170 730 const vector<int> &bestShifts,
Chris@170 731 bool wantShifts)
Chris@166 732 {
Chris@298 733 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 734
Chris@41 735 vector<double> filtered;
Chris@41 736
Chris@176 737 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 738 m_postFilter[j]->push(pitches[j]);
Chris@41 739 filtered.push_back(m_postFilter[j]->get());
Chris@41 740 }
Chris@41 741
Chris@41 742 // Threshold for level and reduce number of candidate pitches
Chris@41 743
Chris@41 744 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 745
Chris@41 746 ValueIndexMap strengths;
Chris@166 747
Chris@176 748 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 749 double strength = filtered[j];
Chris@183 750 if (strength < pack.levelThreshold) continue;
Chris@168 751 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 752 }
Chris@166 753
Chris@168 754 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 755
Chris@168 756 map<int, double> active;
Chris@168 757 map<int, int> activeShifts;
Chris@168 758
Chris@183 759 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 760
Chris@168 761 --si;
Chris@168 762
Chris@168 763 double strength = si->first;
Chris@168 764 int j = si->second;
Chris@168 765
Chris@168 766 active[j] = strength;
Chris@168 767
Chris@170 768 if (wantShifts) {
Chris@170 769 activeShifts[j] = bestShifts[j];
Chris@167 770 }
Chris@41 771 }
Chris@41 772
Chris@168 773 m_pianoRoll.push_back(active);
Chris@170 774
Chris@170 775 if (wantShifts) {
Chris@168 776 m_pianoRollShifts.push_back(activeShifts);
Chris@41 777 }
Chris@294 778
Chris@294 779 return filtered;
Chris@166 780 }
Chris@166 781
Chris@166 782 Vamp::Plugin::FeatureList
Chris@168 783 Silvet::noteTrack(int shiftCount)
Chris@166 784 {
Chris@41 785 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 786 // report notes that have just ended (i.e. that are absent in the
Chris@168 787 // latest active set but present in the prior set in the piano
Chris@41 788 // roll) -- any notes that ended earlier will have been reported
Chris@41 789 // already, and if they haven't ended, we don't know their
Chris@41 790 // duration.
Chris@41 791
Chris@168 792 int width = m_pianoRoll.size() - 1;
Chris@168 793
Chris@168 794 const map<int, double> &active = m_pianoRoll[width];
Chris@41 795
Chris@165 796 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 797
Chris@165 798 // only keep notes >= 100ms or thereabouts
Chris@165 799 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 800 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 801
Chris@41 802 FeatureList noteFeatures;
Chris@41 803
Chris@41 804 if (width < durationThreshold + 1) {
Chris@41 805 return noteFeatures;
Chris@41 806 }
Chris@41 807
Chris@150 808 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 809
Chris@55 810 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 811 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 812
Chris@55 813 int note = ni->first;
Chris@41 814
Chris@41 815 if (active.find(note) != active.end()) {
Chris@41 816 // the note is still playing
Chris@41 817 continue;
Chris@41 818 }
Chris@41 819
Chris@41 820 // the note was playing but just ended
Chris@41 821 int end = width;
Chris@41 822 int start = end-1;
Chris@41 823
Chris@41 824 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 825 --start;
Chris@41 826 }
Chris@41 827 ++start;
Chris@41 828
Chris@169 829 if ((end - start) < durationThreshold) {
Chris@41 830 continue;
Chris@41 831 }
Chris@41 832
Chris@169 833 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 834 }
Chris@41 835
Chris@62 836 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 837
Chris@41 838 return noteFeatures;
Chris@41 839 }
Chris@41 840
Chris@169 841 void
Chris@169 842 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 843 FeatureList &noteFeatures)
Chris@169 844 {
Chris@169 845 int partStart = start;
Chris@169 846 int partShift = 0;
Chris@169 847 int partVelocity = 0;
Chris@169 848
Chris@252 849 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 850
Chris@169 851 for (int i = start; i != end; ++i) {
Chris@169 852
Chris@169 853 double strength = m_pianoRoll[i][note];
Chris@169 854
Chris@169 855 int shift = 0;
Chris@169 856
Chris@169 857 if (shiftCount > 1) {
Chris@169 858
Chris@169 859 shift = m_pianoRollShifts[i][note];
Chris@169 860
Chris@169 861 if (i == partStart) {
Chris@169 862 partShift = shift;
Chris@169 863 }
Chris@169 864
Chris@169 865 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 866
Chris@169 867 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 868
Chris@169 869 // pitch has changed, emit an intermediate note
Chris@252 870 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 871 i,
Chris@252 872 note,
Chris@252 873 partShift,
Chris@252 874 shiftCount,
Chris@252 875 partVelocity));
Chris@169 876 partStart = i;
Chris@169 877 partShift = shift;
Chris@169 878 partVelocity = 0;
Chris@169 879 }
Chris@169 880 }
Chris@169 881
Chris@246 882 int v = round(strength * 2);
Chris@169 883 if (v > partVelocity) {
Chris@169 884 partVelocity = v;
Chris@169 885 }
Chris@169 886 }
Chris@169 887
Chris@169 888 if (end >= partStart + partThreshold) {
Chris@252 889 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 890 end,
Chris@252 891 note,
Chris@252 892 partShift,
Chris@252 893 shiftCount,
Chris@252 894 partVelocity));
Chris@169 895 }
Chris@169 896 }
Chris@252 897
Chris@252 898 Silvet::Feature
Chris@252 899 Silvet::makeNoteFeature(int start,
Chris@252 900 int end,
Chris@252 901 int note,
Chris@252 902 int shift,
Chris@252 903 int shiftCount,
Chris@252 904 int velocity)
Chris@252 905 {
Chris@252 906 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 907 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 908
Chris@252 909 Feature f;
Chris@252 910
Chris@252 911 f.hasTimestamp = true;
Chris@285 912 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 913 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 914
Chris@252 915 f.hasDuration = true;
Chris@252 916 f.duration = RealTime::fromSeconds
Chris@252 917 (columnDuration * (end - start));
Chris@252 918
Chris@252 919 f.values.clear();
Chris@252 920
Chris@252 921 f.values.push_back
Chris@252 922 (noteFrequency(note, shift, shiftCount));
Chris@252 923
Chris@252 924 float inputGain = getInputGainAt(f.timestamp);
Chris@252 925 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 926 velocity = round(velocity / inputGain);
Chris@252 927 if (velocity > 127) velocity = 127;
Chris@252 928 if (velocity < 1) velocity = 1;
Chris@252 929 f.values.push_back(velocity);
Chris@252 930
Chris@252 931 f.label = noteName(note, shift, shiftCount);
Chris@252 932
Chris@252 933 return f;
Chris@252 934 }
Chris@252 935
Chris@252 936 float
Chris@252 937 Silvet::getInputGainAt(RealTime t)
Chris@252 938 {
Chris@252 939 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 940
Chris@252 941 if (i == m_inputGains.end()) {
Chris@252 942 if (i != m_inputGains.begin()) {
Chris@252 943 --i;
Chris@252 944 } else {
Chris@252 945 return 1.f; // no data
Chris@252 946 }
Chris@252 947 }
Chris@252 948
Chris@252 949 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 950
Chris@252 951 return i->second;
Chris@252 952 }
Chris@252 953