annotate src/Silvet.cpp @ 303:d8468176339d livemode

More adjustments for live mode, including actually *using* the max frequency we calculated earlier
author Chris Cannam
date Fri, 05 Dec 2014 16:34:24 +0000
parents cac0be04c43c
children f5f3b50b2b9f
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@298 24 #include "LiveInstruments.h"
Chris@31 25
Chris@31 26 #include <vector>
Chris@31 27
Chris@32 28 #include <cstdio>
Chris@32 29
Chris@31 30 using std::vector;
Chris@48 31 using std::cout;
Chris@31 32 using std::cerr;
Chris@31 33 using std::endl;
Chris@40 34 using Vamp::RealTime;
Chris@31 35
// File-local tuning values. The unnamed namespace gives them internal
// linkage, so they are visible only within this translation unit.
namespace {

// Rate (Hz) that input audio is resampled to before the constant-Q
// transform; see reset().
int processingSampleRate = 44100;

// Constant-Q bins per semitone, for live mode and for the other modes.
int binsPerSemitoneLive = 1;
int binsPerSemitoneNormal = 5;

// Inclusive range of input sample rates accepted by initialise().
int minInputSampleRate = 100;
int maxInputSampleRate = 192000;

}
Chris@272 43
Chris@31 44 Silvet::Silvet(float inputSampleRate) :
Chris@31 45 Plugin(inputSampleRate),
Chris@161 46 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@298 47 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
Chris@31 48 m_resampler(0),
Chris@246 49 m_flattener(0),
Chris@110 50 m_cq(0),
Chris@297 51 m_mode(HighQualityMode),
Chris@166 52 m_fineTuning(false),
Chris@178 53 m_instrument(0),
Chris@178 54 m_colsPerSec(50)
Chris@31 55 {
Chris@31 56 }
Chris@31 57
Chris@31 58 Silvet::~Silvet()
Chris@31 59 {
Chris@31 60 delete m_resampler;
Chris@246 61 delete m_flattener;
Chris@31 62 delete m_cq;
Chris@41 63 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 64 delete m_postFilter[i];
Chris@41 65 }
Chris@31 66 }
Chris@31 67
Chris@31 68 string
Chris@31 69 Silvet::getIdentifier() const
Chris@31 70 {
Chris@31 71 return "silvet";
Chris@31 72 }
Chris@31 73
Chris@31 74 string
Chris@31 75 Silvet::getName() const
Chris@31 76 {
Chris@31 77 return "Silvet Note Transcription";
Chris@31 78 }
Chris@31 79
Chris@31 80 string
Chris@31 81 Silvet::getDescription() const
Chris@31 82 {
Chris@191 83 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 84 }
Chris@31 85
Chris@31 86 string
Chris@31 87 Silvet::getMaker() const
Chris@31 88 {
Chris@191 89 return "Queen Mary, University of London";
Chris@31 90 }
Chris@31 91
Chris@31 92 int
Chris@31 93 Silvet::getPluginVersion() const
Chris@31 94 {
Chris@295 95 return 2;
Chris@31 96 }
Chris@31 97
Chris@31 98 string
Chris@31 99 Silvet::getCopyright() const
Chris@31 100 {
Chris@191 101 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 102 }
Chris@31 103
Chris@31 104 Silvet::InputDomain
Chris@31 105 Silvet::getInputDomain() const
Chris@31 106 {
Chris@31 107 return TimeDomain;
Chris@31 108 }
Chris@31 109
Chris@31 110 size_t
Chris@31 111 Silvet::getPreferredBlockSize() const
Chris@31 112 {
Chris@31 113 return 0;
Chris@31 114 }
Chris@31 115
Chris@31 116 size_t
Chris@31 117 Silvet::getPreferredStepSize() const
Chris@31 118 {
Chris@31 119 return 0;
Chris@31 120 }
Chris@31 121
Chris@31 122 size_t
Chris@31 123 Silvet::getMinChannelCount() const
Chris@31 124 {
Chris@31 125 return 1;
Chris@31 126 }
Chris@31 127
Chris@31 128 size_t
Chris@31 129 Silvet::getMaxChannelCount() const
Chris@31 130 {
Chris@31 131 return 1;
Chris@31 132 }
Chris@31 133
Chris@31 134 Silvet::ParameterList
Chris@31 135 Silvet::getParameterDescriptors() const
Chris@31 136 {
Chris@31 137 ParameterList list;
Chris@110 138
Chris@110 139 ParameterDescriptor desc;
Chris@110 140 desc.identifier = "mode";
Chris@110 141 desc.name = "Processing mode";
Chris@110 142 desc.unit = "";
Chris@297 143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
Chris@110 144 desc.minValue = 0;
Chris@297 145 desc.maxValue = 2;
Chris@113 146 desc.defaultValue = 1;
Chris@110 147 desc.isQuantized = true;
Chris@110 148 desc.quantizeStep = 1;
Chris@166 149 desc.valueNames.push_back("Draft (faster)");
Chris@165 150 desc.valueNames.push_back("Intensive (higher quality)");
Chris@297 151 desc.valueNames.push_back("Live (lower latency)");
Chris@161 152 list.push_back(desc);
Chris@161 153
Chris@176 154 desc.identifier = "instrument";
Chris@176 155 desc.name = "Instrument";
Chris@161 156 desc.unit = "";
Chris@271 157 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 158 desc.minValue = 0;
Chris@162 159 desc.maxValue = m_instruments.size()-1;
Chris@162 160 desc.defaultValue = 0;
Chris@161 161 desc.isQuantized = true;
Chris@161 162 desc.quantizeStep = 1;
Chris@161 163 desc.valueNames.clear();
Chris@162 164 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 165 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 166 }
Chris@166 167 list.push_back(desc);
Chris@161 168
Chris@166 169 desc.identifier = "finetune";
Chris@166 170 desc.name = "Return fine pitch estimates";
Chris@166 171 desc.unit = "";
Chris@271 172 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 173 desc.minValue = 0;
Chris@166 174 desc.maxValue = 1;
Chris@166 175 desc.defaultValue = 0;
Chris@166 176 desc.isQuantized = true;
Chris@166 177 desc.quantizeStep = 1;
Chris@166 178 desc.valueNames.clear();
Chris@110 179 list.push_back(desc);
Chris@110 180
Chris@31 181 return list;
Chris@31 182 }
Chris@31 183
Chris@31 184 float
Chris@31 185 Silvet::getParameter(string identifier) const
Chris@31 186 {
Chris@110 187 if (identifier == "mode") {
Chris@297 188 return (float)(int)m_mode;
Chris@166 189 } else if (identifier == "finetune") {
Chris@166 190 return m_fineTuning ? 1.f : 0.f;
Chris@176 191 } else if (identifier == "instrument") {
Chris@162 192 return m_instrument;
Chris@110 193 }
Chris@31 194 return 0;
Chris@31 195 }
Chris@31 196
Chris@31 197 void
Chris@31 198 Silvet::setParameter(string identifier, float value)
Chris@31 199 {
Chris@110 200 if (identifier == "mode") {
Chris@297 201 m_mode = (ProcessingMode)(int)(value + 0.5);
Chris@166 202 } else if (identifier == "finetune") {
Chris@166 203 m_fineTuning = (value > 0.5);
Chris@176 204 } else if (identifier == "instrument") {
Chris@162 205 m_instrument = lrintf(value);
Chris@110 206 }
Chris@31 207 }
Chris@31 208
Chris@31 209 Silvet::ProgramList
Chris@31 210 Silvet::getPrograms() const
Chris@31 211 {
Chris@31 212 ProgramList list;
Chris@31 213 return list;
Chris@31 214 }
Chris@31 215
Chris@31 216 string
Chris@31 217 Silvet::getCurrentProgram() const
Chris@31 218 {
Chris@31 219 return "";
Chris@31 220 }
Chris@31 221
Chris@31 222 void
Chris@31 223 Silvet::selectProgram(string name)
Chris@31 224 {
Chris@31 225 }
Chris@31 226
Chris@31 227 Silvet::OutputList
Chris@31 228 Silvet::getOutputDescriptors() const
Chris@31 229 {
Chris@31 230 OutputList list;
Chris@31 231
Chris@31 232 OutputDescriptor d;
Chris@51 233 d.identifier = "notes";
Chris@51 234 d.name = "Note transcription";
Chris@271 235 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 236 d.unit = "Hz";
Chris@31 237 d.hasFixedBinCount = true;
Chris@31 238 d.binCount = 2;
Chris@41 239 d.binNames.push_back("Frequency");
Chris@31 240 d.binNames.push_back("Velocity");
Chris@31 241 d.hasKnownExtents = false;
Chris@31 242 d.isQuantized = false;
Chris@31 243 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 244 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 245 d.hasDuration = true;
Chris@32 246 m_notesOutputNo = list.size();
Chris@32 247 list.push_back(d);
Chris@32 248
Chris@178 249 d.identifier = "timefreq";
Chris@178 250 d.name = "Time-frequency distribution";
Chris@271 251 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 252 d.unit = "";
Chris@178 253 d.hasFixedBinCount = true;
Chris@298 254 d.binCount = getPack(0).templateHeight;
Chris@178 255 d.binNames.clear();
Chris@178 256 if (m_cq) {
Chris@294 257 char name[50];
Chris@298 258 for (int i = 0; i < getPack(0).templateHeight; ++i) {
Chris@178 259 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 260 // lowest-frequency 55 bins have been dropped, for a
Chris@178 261 // 545-bin template. The native CQ bins go high->low
Chris@178 262 // frequency though, so these are still the first 545 bins
Chris@178 263 // as reported by getBinFrequency, though in reverse order
Chris@178 264 float freq = m_cq->getBinFrequency
Chris@298 265 (getPack(0).templateHeight - i - 1);
Chris@178 266 sprintf(name, "%.1f Hz", freq);
Chris@178 267 d.binNames.push_back(name);
Chris@178 268 }
Chris@178 269 }
Chris@178 270 d.hasKnownExtents = false;
Chris@178 271 d.isQuantized = false;
Chris@178 272 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 273 d.sampleRate = m_colsPerSec;
Chris@178 274 d.hasDuration = false;
Chris@178 275 m_fcqOutputNo = list.size();
Chris@178 276 list.push_back(d);
Chris@178 277
Chris@294 278 d.identifier = "pitchactivation";
Chris@294 279 d.name = "Pitch activation distribution";
Chris@294 280 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 281 d.unit = "";
Chris@294 282 d.hasFixedBinCount = true;
Chris@298 283 d.binCount = getPack(0).templateNoteCount;
Chris@294 284 d.binNames.clear();
Chris@294 285 if (m_cq) {
Chris@298 286 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@294 287 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 288 }
Chris@294 289 }
Chris@294 290 d.hasKnownExtents = false;
Chris@294 291 d.isQuantized = false;
Chris@294 292 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 293 d.sampleRate = m_colsPerSec;
Chris@294 294 d.hasDuration = false;
Chris@294 295 m_pitchOutputNo = list.size();
Chris@294 296 list.push_back(d);
Chris@294 297
Chris@302 298 d.identifier = "templates";
Chris@302 299 d.name = "Templates";
Chris@302 300 d.description = "Constant-Q spectral templates for the selected instrument pack.";
Chris@302 301 d.unit = "";
Chris@302 302 d.hasFixedBinCount = true;
Chris@302 303 d.binCount = getPack(0).templateHeight;
Chris@302 304 d.binNames.clear();
Chris@302 305 if (m_cq) {
Chris@302 306 char name[50];
Chris@302 307 for (int i = 0; i < getPack(0).templateHeight; ++i) {
Chris@302 308 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@302 309 // lowest-frequency 55 bins have been dropped, for a
Chris@302 310 // 545-bin template. The native CQ bins go high->low
Chris@302 311 // frequency though, so these are still the first 545 bins
Chris@302 312 // as reported by getBinFrequency, though in reverse order
Chris@302 313 float freq = m_cq->getBinFrequency
Chris@302 314 (getPack(0).templateHeight - i - 1);
Chris@302 315 sprintf(name, "%.1f Hz", freq);
Chris@302 316 d.binNames.push_back(name);
Chris@302 317 }
Chris@302 318 }
Chris@302 319 d.hasKnownExtents = false;
Chris@302 320 d.isQuantized = false;
Chris@302 321 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@302 322 d.sampleRate = m_colsPerSec;
Chris@302 323 d.hasDuration = false;
Chris@302 324 m_templateOutputNo = list.size();
Chris@302 325 list.push_back(d);
Chris@302 326
Chris@31 327 return list;
Chris@31 328 }
Chris@31 329
Chris@38 330 std::string
Chris@175 331 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 332 {
Chris@38 333 static const char *names[] = {
Chris@38 334 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 335 };
Chris@38 336
Chris@175 337 const char *n = names[note % 12];
Chris@38 338
Chris@175 339 int oct = (note + 9) / 12;
Chris@38 340
Chris@175 341 char buf[30];
Chris@175 342
Chris@175 343 float pshift = 0.f;
Chris@175 344 if (shiftCount > 1) {
Chris@175 345 // see noteFrequency below
Chris@175 346 pshift =
Chris@175 347 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 348 }
Chris@175 349
Chris@175 350 if (pshift > 0.f) {
Chris@175 351 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 352 } else if (pshift < 0.f) {
Chris@175 353 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 354 } else {
Chris@175 355 sprintf(buf, "%s%d", n, oct);
Chris@175 356 }
Chris@38 357
Chris@38 358 return buf;
Chris@38 359 }
Chris@38 360
Chris@41 361 float
Chris@168 362 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 363 {
Chris@169 364 // Convert shift number to a pitch shift. The given shift number
Chris@169 365 // is an offset into the template array, which starts with some
Chris@169 366 // zeros, followed by the template, then some trailing zeros.
Chris@169 367 //
Chris@169 368 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 369 // == 5, then the number will be in the range 0-4 and the template
Chris@169 370 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 371 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 372 // represent moving the template *up* in pitch (by introducing
Chris@169 373 // zeros at the start, which is the low-frequency end), for a
Chris@169 374 // positive pitch shift; and higher values represent moving it
Chris@169 375 // down in pitch, for a negative pitch shift.
Chris@169 376
Chris@175 377 float pshift = 0.f;
Chris@175 378 if (shiftCount > 1) {
Chris@175 379 pshift =
Chris@175 380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 381 }
Chris@169 382
Chris@301 383 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@301 384
Chris@303 385 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
Chris@303 386 // << shiftCount << ", obtained freq = " << freq << endl;
Chris@301 387
Chris@301 388 return freq;
Chris@41 389 }
Chris@41 390
Chris@31 391 bool
Chris@31 392 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 393 {
Chris@272 394 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 395 m_inputSampleRate > maxInputSampleRate) {
Chris@272 396 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 397 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 398 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 399 return false;
Chris@272 400 }
Chris@272 401
Chris@31 402 if (channels < getMinChannelCount() ||
Chris@272 403 channels > getMaxChannelCount()) {
Chris@272 404 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 405 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 406 << getMaxChannelCount() << ")" << endl;
Chris@272 407 return false;
Chris@272 408 }
Chris@31 409
Chris@31 410 if (stepSize != blockSize) {
Chris@31 411 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 412 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 413 return false;
Chris@31 414 }
Chris@31 415
Chris@31 416 m_blockSize = blockSize;
Chris@31 417
Chris@31 418 reset();
Chris@31 419
Chris@31 420 return true;
Chris@31 421 }
Chris@31 422
Chris@31 423 void
Chris@31 424 Silvet::reset()
Chris@31 425 {
Chris@31 426 delete m_resampler;
Chris@246 427 delete m_flattener;
Chris@31 428 delete m_cq;
Chris@31 429
Chris@31 430 if (m_inputSampleRate != processingSampleRate) {
Chris@31 431 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 432 } else {
Chris@31 433 m_resampler = 0;
Chris@31 434 }
Chris@31 435
Chris@246 436 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 437 m_flattener->reset();
Chris@246 438
Chris@301 439 // this happens to be processingSampleRate / 3, and is the top
Chris@301 440 // freq used for the EM templates:
Chris@301 441 double maxFreq = 14700;
Chris@301 442
Chris@301 443 if (m_mode == LiveMode) {
Chris@301 444 // We only have 12 bpo rather than 60, so we need the top bin
Chris@301 445 // to be the middle one of the top 5, i.e. 2/5 of a semitone
Chris@301 446 // lower than 14700
Chris@301 447 maxFreq *= powf(2.0, -1.0 / 30.0);
Chris@301 448 }
Chris@301 449
Chris@173 450 double minFreq = 27.5;
Chris@173 451
Chris@297 452 if (m_mode != HighQualityMode) {
Chris@173 453 // We don't actually return any notes from the bottom octave,
Chris@173 454 // so we can just pad with zeros
Chris@173 455 minFreq *= 2;
Chris@173 456 }
Chris@173 457
Chris@298 458 int bpo = 12 *
Chris@298 459 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@301 460
Chris@154 461 CQParameters params(processingSampleRate,
Chris@173 462 minFreq,
Chris@303 463 maxFreq,
Chris@298 464 bpo);
Chris@154 465
Chris@155 466 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 467 // drops the FFT size to 512 from 1024 and alters
Chris@155 468 // some other processing parameters, making
Chris@155 469 // everything much, much slower. Could be a flaw
Chris@155 470 // in the CQ parameter calculations, must check
Chris@154 471 params.atomHopFactor = 0.3;
Chris@154 472 params.threshold = 0.0005;
Chris@172 473 params.window = CQParameters::Hann;
Chris@154 474
Chris@154 475 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 476
Chris@303 477 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
Chris@303 478 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
Chris@297 479
Chris@297 480 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
Chris@165 481
Chris@41 482 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 483 delete m_postFilter[i];
Chris@41 484 }
Chris@41 485 m_postFilter.clear();
Chris@303 486 int postFilterLength = 3;
Chris@298 487 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@303 488 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
Chris@41 489 }
Chris@41 490 m_pianoRoll.clear();
Chris@246 491 m_inputGains.clear();
Chris@32 492 m_columnCount = 0;
Chris@272 493 m_resampledCount = 0;
Chris@40 494 m_startTime = RealTime::zeroTime;
Chris@31 495 }
Chris@31 496
Chris@31 497 Silvet::FeatureSet
Chris@31 498 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 499 {
Chris@302 500 FeatureSet fs;
Chris@302 501
Chris@40 502 if (m_columnCount == 0) {
Chris@40 503 m_startTime = timestamp;
Chris@302 504 insertTemplateFeatures(fs);
Chris@40 505 }
Chris@246 506
Chris@246 507 vector<float> flattened(m_blockSize);
Chris@246 508 float gain = 1.f;
Chris@246 509 m_flattener->connectInputPort
Chris@246 510 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 511 m_flattener->connectOutputPort
Chris@246 512 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 513 m_flattener->connectOutputPort
Chris@246 514 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 515 m_flattener->process(m_blockSize);
Chris@246 516
Chris@252 517 m_inputGains[timestamp] = gain;
Chris@40 518
Chris@31 519 vector<double> data;
Chris@40 520 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 521 double d = flattened[i];
Chris@235 522 data.push_back(d);
Chris@40 523 }
Chris@31 524
Chris@31 525 if (m_resampler) {
Chris@272 526
Chris@31 527 data = m_resampler->process(data.data(), data.size());
Chris@272 528
Chris@272 529 int hadCount = m_resampledCount;
Chris@272 530 m_resampledCount += data.size();
Chris@272 531
Chris@272 532 int resamplerLatency = m_resampler->getLatency();
Chris@272 533
Chris@272 534 if (hadCount < resamplerLatency) {
Chris@272 535 int stillToDrop = resamplerLatency - hadCount;
Chris@272 536 if (stillToDrop >= int(data.size())) {
Chris@302 537 return fs;
Chris@272 538 } else {
Chris@272 539 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 540 }
Chris@272 541 }
Chris@31 542 }
Chris@272 543
Chris@32 544 Grid cqout = m_cq->process(data);
Chris@302 545 transcribe(cqout, fs);
Chris@51 546 return fs;
Chris@34 547 }
Chris@34 548
Chris@34 549 Silvet::FeatureSet
Chris@34 550 Silvet::getRemainingFeatures()
Chris@34 551 {
Chris@145 552 Grid cqout = m_cq->getRemainingOutput();
Chris@302 553 FeatureSet fs;
Chris@302 554 if (m_columnCount == 0) {
Chris@302 555 // process() was never called, but we still want these
Chris@302 556 insertTemplateFeatures(fs);
Chris@302 557 } else {
Chris@302 558 transcribe(cqout, fs);
Chris@302 559 }
Chris@51 560 return fs;
Chris@34 561 }
Chris@34 562
Chris@302 563 void
Chris@302 564 Silvet::insertTemplateFeatures(FeatureSet &fs)
Chris@302 565 {
Chris@302 566 const InstrumentPack &pack = getPack(m_instrument);
Chris@302 567 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
Chris@302 568 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
Chris@302 569 Feature f;
Chris@302 570 char buffer[50];
Chris@302 571 sprintf(buffer, "Note %d", i + 1);
Chris@302 572 f.label = buffer;
Chris@302 573 f.hasTimestamp = true;
Chris@302 574 f.timestamp = timestamp;
Chris@302 575 f.values = pack.templates[i / pack.templateNoteCount]
Chris@302 576 .data[i % pack.templateNoteCount];
Chris@302 577 fs[m_templateOutputNo].push_back(f);
Chris@302 578 }
Chris@302 579 }
Chris@302 580
Chris@302 581 void
Chris@302 582 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
Chris@34 583 {
Chris@32 584 Grid filtered = preProcess(cqout);
Chris@31 585
Chris@302 586 if (filtered.empty()) return;
Chris@170 587
Chris@298 588 const InstrumentPack &pack(getPack(m_instrument));
Chris@104 589
Chris@178 590 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 591 Feature f;
Chris@178 592 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 593 f.values.push_back(float(filtered[i][j]));
Chris@178 594 }
Chris@178 595 fs[m_fcqOutputNo].push_back(f);
Chris@178 596 }
Chris@178 597
Chris@34 598 int width = filtered.size();
Chris@34 599
Chris@297 600 int iterations = (m_mode == HighQualityMode ? 20 : 10);
Chris@34 601
Chris@176 602 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 603
Chris@297 604 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
Chris@170 605 int shiftCount = 1;
Chris@170 606 if (wantShifts) {
Chris@170 607 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 608 }
Chris@170 609
Chris@170 610 vector<vector<int> > localBestShifts;
Chris@170 611 if (wantShifts) {
Chris@170 612 localBestShifts =
Chris@176 613 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 614 }
Chris@170 615
Chris@170 616 vector<bool> present(width, false);
Chris@37 617
Chris@123 618 #pragma omp parallel for
Chris@123 619 for (int i = 0; i < width; ++i) {
Chris@104 620
Chris@170 621 double sum = 0.0;
Chris@176 622 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 623 sum += filtered.at(i).at(j);
Chris@170 624 }
Chris@170 625 if (sum < 1e-5) continue;
Chris@170 626
Chris@170 627 present[i] = true;
Chris@170 628
Chris@297 629 EM em(&pack, m_mode == HighQualityMode);
Chris@170 630
Chris@183 631 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 632 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 633
Chris@170 634 for (int j = 0; j < iterations; ++j) {
Chris@170 635 em.iterate(filtered.at(i).data());
Chris@37 636 }
Chris@37 637
Chris@170 638 const float *pitchDist = em.getPitchDistribution();
Chris@170 639 const float *const *shiftDist = em.getShifts();
Chris@37 640
Chris@176 641 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 642
Chris@170 643 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 644
Chris@170 645 int bestShift = 0;
Chris@179 646 float bestShiftValue = 0.0;
Chris@170 647 if (wantShifts) {
Chris@170 648 for (int k = 0; k < shiftCount; ++k) {
Chris@179 649 float value = shiftDist[k][j];
Chris@179 650 if (k == 0 || value > bestShiftValue) {
Chris@179 651 bestShiftValue = value;
Chris@170 652 bestShift = k;
Chris@170 653 }
Chris@170 654 }
Chris@170 655 localBestShifts[i][j] = bestShift;
Chris@170 656 }
Chris@123 657 }
Chris@123 658 }
Chris@166 659
Chris@166 660 for (int i = 0; i < width; ++i) {
Chris@37 661
Chris@170 662 if (!present[i]) {
Chris@170 663 // silent column
Chris@176 664 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 665 m_postFilter[j]->push(0.0);
Chris@170 666 }
Chris@168 667 m_pianoRoll.push_back(map<int, double>());
Chris@170 668 if (wantShifts) {
Chris@168 669 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 670 }
Chris@166 671 continue;
Chris@166 672 }
Chris@166 673
Chris@294 674 vector<double> filtered = postProcess
Chris@294 675 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 676
Chris@294 677 Feature f;
Chris@294 678 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 679 float v(filtered[j]);
Chris@294 680 if (v < pack.levelThreshold) v = 0.f;
Chris@294 681 f.values.push_back(v);
Chris@294 682 }
Chris@294 683 fs[m_pitchOutputNo].push_back(f);
Chris@166 684
Chris@168 685 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 686
Chris@123 687 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 688 fi != noteFeatures.end(); ++fi) {
Chris@123 689 fs[m_notesOutputNo].push_back(*fi);
Chris@40 690 }
Chris@34 691 }
Chris@31 692 }
Chris@31 693
Chris@32 694 Silvet::Grid
Chris@32 695 Silvet::preProcess(const Grid &in)
Chris@32 696 {
Chris@32 697 int width = in.size();
Chris@32 698
Chris@165 699 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 700
Chris@165 701 // need to be careful that col spacing is an integer number of samples!
Chris@165 702 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 703
Chris@32 704 Grid out;
Chris@32 705
Chris@58 706 // We count the CQ latency in terms of processing hops, but
Chris@58 707 // actually it probably isn't an exact number of hops so this
Chris@58 708 // isn't quite accurate. But the small constant offset is
Chris@165 709 // practically irrelevant compared to the jitter from the frame
Chris@165 710 // size we reduce to in a moment
Chris@33 711 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 712
Chris@298 713 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 714
Chris@32 715 for (int i = 0; i < width; ++i) {
Chris@32 716
Chris@33 717 if (m_columnCount < latentColumns) {
Chris@33 718 ++m_columnCount;
Chris@33 719 continue;
Chris@33 720 }
Chris@33 721
Chris@32 722 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 723 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 724
Chris@32 725 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 726
Chris@32 727 if (select) {
Chris@32 728 vector<double> inCol = in[i];
Chris@176 729 vector<double> outCol(pack.templateHeight);
Chris@32 730
Chris@178 731 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@298 732 // lowest 55 of them (assuming binsPerSemitone == 5).
Chris@178 733 //
Chris@297 734 // In draft and live mode the CQ is an octave shorter,
Chris@300 735 // returning 540 bins or equivalent, so we instead pad
Chris@300 736 // them with an additional 5 or equivalent zeros.
Chris@178 737 //
Chris@178 738 // We also need to reverse the column as we go, since the
Chris@178 739 // raw CQ has the high frequencies first and we need it
Chris@178 740 // the other way around.
Chris@32 741
Chris@298 742 int bps = (m_mode == LiveMode ?
Chris@298 743 binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@298 744
Chris@297 745 if (m_mode == HighQualityMode) {
Chris@178 746 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@298 747 int ix = inCol.size() - j - (11 * bps);
Chris@178 748 outCol[j] = inCol[ix];
Chris@178 749 }
Chris@178 750 } else {
Chris@298 751 for (int j = 0; j < bps; ++j) {
Chris@178 752 outCol[j] = 0.0;
Chris@178 753 }
Chris@298 754 for (int j = bps; j < pack.templateHeight; ++j) {
Chris@298 755 int ix = inCol.size() - j + (bps-1);
Chris@178 756 outCol[j] = inCol[ix];
Chris@178 757 }
Chris@46 758 }
Chris@32 759
Chris@46 760 vector<double> noiseLevel1 =
Chris@298 761 MedianFilter<double>::filter(8 * bps, outCol);
Chris@176 762 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 763 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 764 }
Chris@32 765
Chris@46 766 vector<double> noiseLevel2 =
Chris@298 767 MedianFilter<double>::filter(8 * bps, noiseLevel1);
Chris@176 768 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 769 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 770 }
Chris@32 771
Chris@165 772 out.push_back(outCol);
Chris@32 773 }
Chris@32 774
Chris@32 775 ++m_columnCount;
Chris@32 776 }
Chris@32 777
Chris@32 778 return out;
Chris@32 779 }
Chris@32 780
Chris@294 781 vector<double>
Chris@170 782 Silvet::postProcess(const vector<double> &pitches,
Chris@170 783 const vector<int> &bestShifts,
Chris@170 784 bool wantShifts)
Chris@166 785 {
Chris@298 786 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 787
Chris@41 788 vector<double> filtered;
Chris@41 789
Chris@176 790 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 791 m_postFilter[j]->push(pitches[j]);
Chris@41 792 filtered.push_back(m_postFilter[j]->get());
Chris@41 793 }
Chris@41 794
Chris@41 795 // Threshold for level and reduce number of candidate pitches
Chris@41 796
Chris@41 797 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 798
Chris@41 799 ValueIndexMap strengths;
Chris@166 800
Chris@176 801 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 802 double strength = filtered[j];
Chris@183 803 if (strength < pack.levelThreshold) continue;
Chris@168 804 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 805 }
Chris@166 806
Chris@168 807 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 808
Chris@168 809 map<int, double> active;
Chris@168 810 map<int, int> activeShifts;
Chris@168 811
Chris@183 812 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 813
Chris@168 814 --si;
Chris@168 815
Chris@168 816 double strength = si->first;
Chris@168 817 int j = si->second;
Chris@168 818
Chris@168 819 active[j] = strength;
Chris@168 820
Chris@170 821 if (wantShifts) {
Chris@170 822 activeShifts[j] = bestShifts[j];
Chris@167 823 }
Chris@41 824 }
Chris@41 825
Chris@168 826 m_pianoRoll.push_back(active);
Chris@170 827
Chris@170 828 if (wantShifts) {
Chris@168 829 m_pianoRollShifts.push_back(activeShifts);
Chris@41 830 }
Chris@294 831
Chris@294 832 return filtered;
Chris@166 833 }
Chris@166 834
Chris@166 835 Vamp::Plugin::FeatureList
Chris@168 836 Silvet::noteTrack(int shiftCount)
Chris@166 837 {
Chris@41 838 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 839 // report notes that have just ended (i.e. that are absent in the
Chris@168 840 // latest active set but present in the prior set in the piano
Chris@41 841 // roll) -- any notes that ended earlier will have been reported
Chris@41 842 // already, and if they haven't ended, we don't know their
Chris@41 843 // duration.
Chris@41 844
Chris@168 845 int width = m_pianoRoll.size() - 1;
Chris@168 846
Chris@168 847 const map<int, double> &active = m_pianoRoll[width];
Chris@41 848
Chris@165 849 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 850
Chris@165 851 // only keep notes >= 100ms or thereabouts
Chris@165 852 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 853 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 854
Chris@41 855 FeatureList noteFeatures;
Chris@41 856
Chris@41 857 if (width < durationThreshold + 1) {
Chris@41 858 return noteFeatures;
Chris@41 859 }
Chris@41 860
Chris@150 861 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 862
Chris@55 863 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 864 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 865
Chris@55 866 int note = ni->first;
Chris@41 867
Chris@41 868 if (active.find(note) != active.end()) {
Chris@41 869 // the note is still playing
Chris@41 870 continue;
Chris@41 871 }
Chris@41 872
Chris@41 873 // the note was playing but just ended
Chris@41 874 int end = width;
Chris@41 875 int start = end-1;
Chris@41 876
Chris@41 877 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 878 --start;
Chris@41 879 }
Chris@41 880 ++start;
Chris@41 881
Chris@169 882 if ((end - start) < durationThreshold) {
Chris@41 883 continue;
Chris@41 884 }
Chris@41 885
Chris@169 886 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 887 }
Chris@41 888
Chris@62 889 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 890
Chris@41 891 return noteFeatures;
Chris@41 892 }
Chris@41 893
Chris@169 894 void
Chris@169 895 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 896 FeatureList &noteFeatures)
Chris@169 897 {
Chris@169 898 int partStart = start;
Chris@169 899 int partShift = 0;
Chris@169 900 int partVelocity = 0;
Chris@169 901
Chris@252 902 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 903
Chris@169 904 for (int i = start; i != end; ++i) {
Chris@169 905
Chris@169 906 double strength = m_pianoRoll[i][note];
Chris@169 907
Chris@169 908 int shift = 0;
Chris@169 909
Chris@169 910 if (shiftCount > 1) {
Chris@169 911
Chris@169 912 shift = m_pianoRollShifts[i][note];
Chris@169 913
Chris@169 914 if (i == partStart) {
Chris@169 915 partShift = shift;
Chris@169 916 }
Chris@169 917
Chris@169 918 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 919
Chris@169 920 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 921
Chris@169 922 // pitch has changed, emit an intermediate note
Chris@252 923 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 924 i,
Chris@252 925 note,
Chris@252 926 partShift,
Chris@252 927 shiftCount,
Chris@252 928 partVelocity));
Chris@169 929 partStart = i;
Chris@169 930 partShift = shift;
Chris@169 931 partVelocity = 0;
Chris@169 932 }
Chris@169 933 }
Chris@169 934
Chris@303 935 int v;
Chris@303 936 if (m_mode == LiveMode) {
Chris@303 937 v = round(strength * 30);
Chris@303 938 } else {
Chris@303 939 v = round(strength * 2);
Chris@303 940 }
Chris@169 941 if (v > partVelocity) {
Chris@169 942 partVelocity = v;
Chris@169 943 }
Chris@169 944 }
Chris@169 945
Chris@169 946 if (end >= partStart + partThreshold) {
Chris@252 947 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 948 end,
Chris@252 949 note,
Chris@252 950 partShift,
Chris@252 951 shiftCount,
Chris@252 952 partVelocity));
Chris@169 953 }
Chris@169 954 }
Chris@252 955
Chris@252 956 Silvet::Feature
Chris@252 957 Silvet::makeNoteFeature(int start,
Chris@252 958 int end,
Chris@252 959 int note,
Chris@252 960 int shift,
Chris@252 961 int shiftCount,
Chris@252 962 int velocity)
Chris@252 963 {
Chris@252 964 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 965 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 966
Chris@252 967 Feature f;
Chris@252 968
Chris@252 969 f.hasTimestamp = true;
Chris@285 970 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 971 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 972
Chris@252 973 f.hasDuration = true;
Chris@252 974 f.duration = RealTime::fromSeconds
Chris@252 975 (columnDuration * (end - start));
Chris@252 976
Chris@252 977 f.values.clear();
Chris@252 978
Chris@252 979 f.values.push_back
Chris@252 980 (noteFrequency(note, shift, shiftCount));
Chris@252 981
Chris@252 982 float inputGain = getInputGainAt(f.timestamp);
Chris@252 983 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 984 velocity = round(velocity / inputGain);
Chris@252 985 if (velocity > 127) velocity = 127;
Chris@252 986 if (velocity < 1) velocity = 1;
Chris@252 987 f.values.push_back(velocity);
Chris@252 988
Chris@252 989 f.label = noteName(note, shift, shiftCount);
Chris@252 990
Chris@252 991 return f;
Chris@252 992 }
Chris@252 993
Chris@252 994 float
Chris@252 995 Silvet::getInputGainAt(RealTime t)
Chris@252 996 {
Chris@252 997 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 998
Chris@252 999 if (i == m_inputGains.end()) {
Chris@252 1000 if (i != m_inputGains.begin()) {
Chris@252 1001 --i;
Chris@252 1002 } else {
Chris@252 1003 return 1.f; // no data
Chris@252 1004 }
Chris@252 1005 }
Chris@252 1006
Chris@252 1007 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 1008
Chris@252 1009 return i->second;
Chris@252 1010 }
Chris@252 1011