annotate src/Silvet.cpp @ 302:cac0be04c43c livemode

Add output for the templates (probably temporarily)
author Chris Cannam
date Tue, 02 Dec 2014 17:13:10 +0000
parents 00fab71b80ec
children d8468176339d
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@298 24 #include "LiveInstruments.h"
Chris@31 25
Chris@31 26 #include <vector>
Chris@31 27
Chris@32 28 #include <cstdio>
Chris@32 29
Chris@31 30 using std::vector;
Chris@48 31 using std::cout;
Chris@31 32 using std::cerr;
Chris@31 33 using std::endl;
Chris@40 34 using Vamp::RealTime;
Chris@31 35
// Sample rate, in Hz, at which the constant-Q transform and everything
// downstream of it runs. Input at any other rate is resampled to this.
static const int processingSampleRate = 44100;

// Constant-Q resolution in bins per semitone: one value for Live mode
// and one for the other processing modes.
static const int binsPerSemitoneLive = 1;
static const int binsPerSemitoneNormal = 5;

// Inclusive range of input sample rates, in Hz, accepted by initialise().
static const int minInputSampleRate = 100;
static const int maxInputSampleRate = 192000;
Chris@272 43
Chris@31 44 Silvet::Silvet(float inputSampleRate) :
Chris@31 45 Plugin(inputSampleRate),
Chris@161 46 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@298 47 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
Chris@31 48 m_resampler(0),
Chris@246 49 m_flattener(0),
Chris@110 50 m_cq(0),
Chris@297 51 m_mode(HighQualityMode),
Chris@166 52 m_fineTuning(false),
Chris@178 53 m_instrument(0),
Chris@178 54 m_colsPerSec(50)
Chris@31 55 {
Chris@31 56 }
Chris@31 57
Chris@31 58 Silvet::~Silvet()
Chris@31 59 {
Chris@31 60 delete m_resampler;
Chris@246 61 delete m_flattener;
Chris@31 62 delete m_cq;
Chris@41 63 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 64 delete m_postFilter[i];
Chris@41 65 }
Chris@31 66 }
Chris@31 67
Chris@31 68 string
Chris@31 69 Silvet::getIdentifier() const
Chris@31 70 {
Chris@31 71 return "silvet";
Chris@31 72 }
Chris@31 73
Chris@31 74 string
Chris@31 75 Silvet::getName() const
Chris@31 76 {
Chris@31 77 return "Silvet Note Transcription";
Chris@31 78 }
Chris@31 79
Chris@31 80 string
Chris@31 81 Silvet::getDescription() const
Chris@31 82 {
Chris@191 83 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 84 }
Chris@31 85
Chris@31 86 string
Chris@31 87 Silvet::getMaker() const
Chris@31 88 {
Chris@191 89 return "Queen Mary, University of London";
Chris@31 90 }
Chris@31 91
Chris@31 92 int
Chris@31 93 Silvet::getPluginVersion() const
Chris@31 94 {
Chris@295 95 return 2;
Chris@31 96 }
Chris@31 97
Chris@31 98 string
Chris@31 99 Silvet::getCopyright() const
Chris@31 100 {
Chris@191 101 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 102 }
Chris@31 103
Chris@31 104 Silvet::InputDomain
Chris@31 105 Silvet::getInputDomain() const
Chris@31 106 {
Chris@31 107 return TimeDomain;
Chris@31 108 }
Chris@31 109
Chris@31 110 size_t
Chris@31 111 Silvet::getPreferredBlockSize() const
Chris@31 112 {
Chris@31 113 return 0;
Chris@31 114 }
Chris@31 115
Chris@31 116 size_t
Chris@31 117 Silvet::getPreferredStepSize() const
Chris@31 118 {
Chris@31 119 return 0;
Chris@31 120 }
Chris@31 121
Chris@31 122 size_t
Chris@31 123 Silvet::getMinChannelCount() const
Chris@31 124 {
Chris@31 125 return 1;
Chris@31 126 }
Chris@31 127
Chris@31 128 size_t
Chris@31 129 Silvet::getMaxChannelCount() const
Chris@31 130 {
Chris@31 131 return 1;
Chris@31 132 }
Chris@31 133
Chris@31 134 Silvet::ParameterList
Chris@31 135 Silvet::getParameterDescriptors() const
Chris@31 136 {
Chris@31 137 ParameterList list;
Chris@110 138
Chris@110 139 ParameterDescriptor desc;
Chris@110 140 desc.identifier = "mode";
Chris@110 141 desc.name = "Processing mode";
Chris@110 142 desc.unit = "";
Chris@297 143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
Chris@110 144 desc.minValue = 0;
Chris@297 145 desc.maxValue = 2;
Chris@113 146 desc.defaultValue = 1;
Chris@110 147 desc.isQuantized = true;
Chris@110 148 desc.quantizeStep = 1;
Chris@166 149 desc.valueNames.push_back("Draft (faster)");
Chris@165 150 desc.valueNames.push_back("Intensive (higher quality)");
Chris@297 151 desc.valueNames.push_back("Live (lower latency)");
Chris@161 152 list.push_back(desc);
Chris@161 153
Chris@176 154 desc.identifier = "instrument";
Chris@176 155 desc.name = "Instrument";
Chris@161 156 desc.unit = "";
Chris@271 157 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 158 desc.minValue = 0;
Chris@162 159 desc.maxValue = m_instruments.size()-1;
Chris@162 160 desc.defaultValue = 0;
Chris@161 161 desc.isQuantized = true;
Chris@161 162 desc.quantizeStep = 1;
Chris@161 163 desc.valueNames.clear();
Chris@162 164 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 165 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 166 }
Chris@166 167 list.push_back(desc);
Chris@161 168
Chris@166 169 desc.identifier = "finetune";
Chris@166 170 desc.name = "Return fine pitch estimates";
Chris@166 171 desc.unit = "";
Chris@271 172 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 173 desc.minValue = 0;
Chris@166 174 desc.maxValue = 1;
Chris@166 175 desc.defaultValue = 0;
Chris@166 176 desc.isQuantized = true;
Chris@166 177 desc.quantizeStep = 1;
Chris@166 178 desc.valueNames.clear();
Chris@110 179 list.push_back(desc);
Chris@110 180
Chris@31 181 return list;
Chris@31 182 }
Chris@31 183
Chris@31 184 float
Chris@31 185 Silvet::getParameter(string identifier) const
Chris@31 186 {
Chris@110 187 if (identifier == "mode") {
Chris@297 188 return (float)(int)m_mode;
Chris@166 189 } else if (identifier == "finetune") {
Chris@166 190 return m_fineTuning ? 1.f : 0.f;
Chris@176 191 } else if (identifier == "instrument") {
Chris@162 192 return m_instrument;
Chris@110 193 }
Chris@31 194 return 0;
Chris@31 195 }
Chris@31 196
Chris@31 197 void
Chris@31 198 Silvet::setParameter(string identifier, float value)
Chris@31 199 {
Chris@110 200 if (identifier == "mode") {
Chris@297 201 m_mode = (ProcessingMode)(int)(value + 0.5);
Chris@166 202 } else if (identifier == "finetune") {
Chris@166 203 m_fineTuning = (value > 0.5);
Chris@176 204 } else if (identifier == "instrument") {
Chris@162 205 m_instrument = lrintf(value);
Chris@110 206 }
Chris@31 207 }
Chris@31 208
Chris@31 209 Silvet::ProgramList
Chris@31 210 Silvet::getPrograms() const
Chris@31 211 {
Chris@31 212 ProgramList list;
Chris@31 213 return list;
Chris@31 214 }
Chris@31 215
Chris@31 216 string
Chris@31 217 Silvet::getCurrentProgram() const
Chris@31 218 {
Chris@31 219 return "";
Chris@31 220 }
Chris@31 221
Chris@31 222 void
Chris@31 223 Silvet::selectProgram(string name)
Chris@31 224 {
Chris@31 225 }
Chris@31 226
Chris@31 227 Silvet::OutputList
Chris@31 228 Silvet::getOutputDescriptors() const
Chris@31 229 {
Chris@31 230 OutputList list;
Chris@31 231
Chris@31 232 OutputDescriptor d;
Chris@51 233 d.identifier = "notes";
Chris@51 234 d.name = "Note transcription";
Chris@271 235 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 236 d.unit = "Hz";
Chris@31 237 d.hasFixedBinCount = true;
Chris@31 238 d.binCount = 2;
Chris@41 239 d.binNames.push_back("Frequency");
Chris@31 240 d.binNames.push_back("Velocity");
Chris@31 241 d.hasKnownExtents = false;
Chris@31 242 d.isQuantized = false;
Chris@31 243 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 244 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 245 d.hasDuration = true;
Chris@32 246 m_notesOutputNo = list.size();
Chris@32 247 list.push_back(d);
Chris@32 248
Chris@178 249 d.identifier = "timefreq";
Chris@178 250 d.name = "Time-frequency distribution";
Chris@271 251 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 252 d.unit = "";
Chris@178 253 d.hasFixedBinCount = true;
Chris@298 254 d.binCount = getPack(0).templateHeight;
Chris@178 255 d.binNames.clear();
Chris@178 256 if (m_cq) {
Chris@294 257 char name[50];
Chris@298 258 for (int i = 0; i < getPack(0).templateHeight; ++i) {
Chris@178 259 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 260 // lowest-frequency 55 bins have been dropped, for a
Chris@178 261 // 545-bin template. The native CQ bins go high->low
Chris@178 262 // frequency though, so these are still the first 545 bins
Chris@178 263 // as reported by getBinFrequency, though in reverse order
Chris@178 264 float freq = m_cq->getBinFrequency
Chris@298 265 (getPack(0).templateHeight - i - 1);
Chris@178 266 sprintf(name, "%.1f Hz", freq);
Chris@178 267 d.binNames.push_back(name);
Chris@178 268 }
Chris@178 269 }
Chris@178 270 d.hasKnownExtents = false;
Chris@178 271 d.isQuantized = false;
Chris@178 272 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 273 d.sampleRate = m_colsPerSec;
Chris@178 274 d.hasDuration = false;
Chris@178 275 m_fcqOutputNo = list.size();
Chris@178 276 list.push_back(d);
Chris@178 277
Chris@294 278 d.identifier = "pitchactivation";
Chris@294 279 d.name = "Pitch activation distribution";
Chris@294 280 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 281 d.unit = "";
Chris@294 282 d.hasFixedBinCount = true;
Chris@298 283 d.binCount = getPack(0).templateNoteCount;
Chris@294 284 d.binNames.clear();
Chris@294 285 if (m_cq) {
Chris@298 286 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@294 287 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 288 }
Chris@294 289 }
Chris@294 290 d.hasKnownExtents = false;
Chris@294 291 d.isQuantized = false;
Chris@294 292 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 293 d.sampleRate = m_colsPerSec;
Chris@294 294 d.hasDuration = false;
Chris@294 295 m_pitchOutputNo = list.size();
Chris@294 296 list.push_back(d);
Chris@294 297
Chris@302 298 d.identifier = "templates";
Chris@302 299 d.name = "Templates";
Chris@302 300 d.description = "Constant-Q spectral templates for the selected instrument pack.";
Chris@302 301 d.unit = "";
Chris@302 302 d.hasFixedBinCount = true;
Chris@302 303 d.binCount = getPack(0).templateHeight;
Chris@302 304 d.binNames.clear();
Chris@302 305 if (m_cq) {
Chris@302 306 char name[50];
Chris@302 307 for (int i = 0; i < getPack(0).templateHeight; ++i) {
Chris@302 308 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@302 309 // lowest-frequency 55 bins have been dropped, for a
Chris@302 310 // 545-bin template. The native CQ bins go high->low
Chris@302 311 // frequency though, so these are still the first 545 bins
Chris@302 312 // as reported by getBinFrequency, though in reverse order
Chris@302 313 float freq = m_cq->getBinFrequency
Chris@302 314 (getPack(0).templateHeight - i - 1);
Chris@302 315 sprintf(name, "%.1f Hz", freq);
Chris@302 316 d.binNames.push_back(name);
Chris@302 317 }
Chris@302 318 }
Chris@302 319 d.hasKnownExtents = false;
Chris@302 320 d.isQuantized = false;
Chris@302 321 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@302 322 d.sampleRate = m_colsPerSec;
Chris@302 323 d.hasDuration = false;
Chris@302 324 m_templateOutputNo = list.size();
Chris@302 325 list.push_back(d);
Chris@302 326
Chris@31 327 return list;
Chris@31 328 }
Chris@31 329
Chris@38 330 std::string
Chris@175 331 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@38 332 {
Chris@38 333 static const char *names[] = {
Chris@38 334 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 335 };
Chris@38 336
Chris@175 337 const char *n = names[note % 12];
Chris@38 338
Chris@175 339 int oct = (note + 9) / 12;
Chris@38 340
Chris@175 341 char buf[30];
Chris@175 342
Chris@175 343 float pshift = 0.f;
Chris@175 344 if (shiftCount > 1) {
Chris@175 345 // see noteFrequency below
Chris@175 346 pshift =
Chris@175 347 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 348 }
Chris@175 349
Chris@175 350 if (pshift > 0.f) {
Chris@175 351 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
Chris@175 352 } else if (pshift < 0.f) {
Chris@175 353 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
Chris@175 354 } else {
Chris@175 355 sprintf(buf, "%s%d", n, oct);
Chris@175 356 }
Chris@38 357
Chris@38 358 return buf;
Chris@38 359 }
Chris@38 360
Chris@41 361 float
Chris@168 362 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 363 {
Chris@169 364 // Convert shift number to a pitch shift. The given shift number
Chris@169 365 // is an offset into the template array, which starts with some
Chris@169 366 // zeros, followed by the template, then some trailing zeros.
Chris@169 367 //
Chris@169 368 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 369 // == 5, then the number will be in the range 0-4 and the template
Chris@169 370 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 371 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 372 // represent moving the template *up* in pitch (by introducing
Chris@169 373 // zeros at the start, which is the low-frequency end), for a
Chris@169 374 // positive pitch shift; and higher values represent moving it
Chris@169 375 // down in pitch, for a negative pitch shift.
Chris@169 376
Chris@175 377 float pshift = 0.f;
Chris@175 378 if (shiftCount > 1) {
Chris@175 379 pshift =
Chris@175 380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 381 }
Chris@169 382
Chris@301 383 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@301 384
Chris@301 385 cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
Chris@301 386 << shiftCount << ", obtained freq = " << freq << endl;
Chris@301 387
Chris@301 388 return freq;
Chris@41 389 }
Chris@41 390
Chris@31 391 bool
Chris@31 392 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 393 {
Chris@272 394 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 395 m_inputSampleRate > maxInputSampleRate) {
Chris@272 396 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 397 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 398 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 399 return false;
Chris@272 400 }
Chris@272 401
Chris@31 402 if (channels < getMinChannelCount() ||
Chris@272 403 channels > getMaxChannelCount()) {
Chris@272 404 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 405 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 406 << getMaxChannelCount() << ")" << endl;
Chris@272 407 return false;
Chris@272 408 }
Chris@31 409
Chris@31 410 if (stepSize != blockSize) {
Chris@31 411 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 412 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 413 return false;
Chris@31 414 }
Chris@31 415
Chris@31 416 m_blockSize = blockSize;
Chris@31 417
Chris@31 418 reset();
Chris@31 419
Chris@31 420 return true;
Chris@31 421 }
Chris@31 422
Chris@31 423 void
Chris@31 424 Silvet::reset()
Chris@31 425 {
Chris@31 426 delete m_resampler;
Chris@246 427 delete m_flattener;
Chris@31 428 delete m_cq;
Chris@31 429
Chris@31 430 if (m_inputSampleRate != processingSampleRate) {
Chris@31 431 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 432 } else {
Chris@31 433 m_resampler = 0;
Chris@31 434 }
Chris@31 435
Chris@246 436 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 437 m_flattener->reset();
Chris@246 438
Chris@301 439 // this happens to be processingSampleRate / 3, and is the top
Chris@301 440 // freq used for the EM templates:
Chris@301 441 double maxFreq = 14700;
Chris@301 442
Chris@301 443 if (m_mode == LiveMode) {
Chris@301 444 // We only have 12 bpo rather than 60, so we need the top bin
Chris@301 445 // to be the middle one of the top 5, i.e. 2/5 of a semitone
Chris@301 446 // lower than 14700
Chris@301 447 maxFreq *= powf(2.0, -1.0 / 30.0);
Chris@301 448 }
Chris@301 449
Chris@173 450 double minFreq = 27.5;
Chris@173 451
Chris@297 452 if (m_mode != HighQualityMode) {
Chris@173 453 // We don't actually return any notes from the bottom octave,
Chris@173 454 // so we can just pad with zeros
Chris@173 455 minFreq *= 2;
Chris@173 456 }
Chris@173 457
Chris@298 458 int bpo = 12 *
Chris@298 459 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@301 460
Chris@154 461 CQParameters params(processingSampleRate,
Chris@173 462 minFreq,
Chris@154 463 processingSampleRate / 3,
Chris@298 464 bpo);
Chris@154 465
Chris@155 466 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 467 // drops the FFT size to 512 from 1024 and alters
Chris@155 468 // some other processing parameters, making
Chris@155 469 // everything much, much slower. Could be a flaw
Chris@155 470 // in the CQ parameter calculations, must check
Chris@154 471 params.atomHopFactor = 0.3;
Chris@154 472 params.threshold = 0.0005;
Chris@172 473 params.window = CQParameters::Hann;
Chris@154 474
Chris@154 475 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 476
Chris@301 477 cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
Chris@301 478 cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
Chris@297 479
Chris@297 480 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
Chris@165 481
Chris@41 482 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 483 delete m_postFilter[i];
Chris@41 484 }
Chris@41 485 m_postFilter.clear();
Chris@298 486 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
Chris@41 487 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 488 }
Chris@41 489 m_pianoRoll.clear();
Chris@246 490 m_inputGains.clear();
Chris@32 491 m_columnCount = 0;
Chris@272 492 m_resampledCount = 0;
Chris@40 493 m_startTime = RealTime::zeroTime;
Chris@31 494 }
Chris@31 495
Chris@31 496 Silvet::FeatureSet
Chris@31 497 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 498 {
Chris@302 499 FeatureSet fs;
Chris@302 500
Chris@40 501 if (m_columnCount == 0) {
Chris@40 502 m_startTime = timestamp;
Chris@302 503 insertTemplateFeatures(fs);
Chris@40 504 }
Chris@246 505
Chris@246 506 vector<float> flattened(m_blockSize);
Chris@246 507 float gain = 1.f;
Chris@246 508 m_flattener->connectInputPort
Chris@246 509 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 510 m_flattener->connectOutputPort
Chris@246 511 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 512 m_flattener->connectOutputPort
Chris@246 513 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 514 m_flattener->process(m_blockSize);
Chris@246 515
Chris@252 516 m_inputGains[timestamp] = gain;
Chris@40 517
Chris@31 518 vector<double> data;
Chris@40 519 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 520 double d = flattened[i];
Chris@235 521 data.push_back(d);
Chris@40 522 }
Chris@31 523
Chris@31 524 if (m_resampler) {
Chris@272 525
Chris@31 526 data = m_resampler->process(data.data(), data.size());
Chris@272 527
Chris@272 528 int hadCount = m_resampledCount;
Chris@272 529 m_resampledCount += data.size();
Chris@272 530
Chris@272 531 int resamplerLatency = m_resampler->getLatency();
Chris@272 532
Chris@272 533 if (hadCount < resamplerLatency) {
Chris@272 534 int stillToDrop = resamplerLatency - hadCount;
Chris@272 535 if (stillToDrop >= int(data.size())) {
Chris@302 536 return fs;
Chris@272 537 } else {
Chris@272 538 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 539 }
Chris@272 540 }
Chris@31 541 }
Chris@272 542
Chris@32 543 Grid cqout = m_cq->process(data);
Chris@302 544 transcribe(cqout, fs);
Chris@51 545 return fs;
Chris@34 546 }
Chris@34 547
Chris@34 548 Silvet::FeatureSet
Chris@34 549 Silvet::getRemainingFeatures()
Chris@34 550 {
Chris@145 551 Grid cqout = m_cq->getRemainingOutput();
Chris@302 552 FeatureSet fs;
Chris@302 553 if (m_columnCount == 0) {
Chris@302 554 // process() was never called, but we still want these
Chris@302 555 insertTemplateFeatures(fs);
Chris@302 556 } else {
Chris@302 557 transcribe(cqout, fs);
Chris@302 558 }
Chris@51 559 return fs;
Chris@34 560 }
Chris@34 561
Chris@302 562 void
Chris@302 563 Silvet::insertTemplateFeatures(FeatureSet &fs)
Chris@302 564 {
Chris@302 565 const InstrumentPack &pack = getPack(m_instrument);
Chris@302 566 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
Chris@302 567 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
Chris@302 568 Feature f;
Chris@302 569 char buffer[50];
Chris@302 570 sprintf(buffer, "Note %d", i + 1);
Chris@302 571 f.label = buffer;
Chris@302 572 f.hasTimestamp = true;
Chris@302 573 f.timestamp = timestamp;
Chris@302 574 f.values = pack.templates[i / pack.templateNoteCount]
Chris@302 575 .data[i % pack.templateNoteCount];
Chris@302 576 fs[m_templateOutputNo].push_back(f);
Chris@302 577 }
Chris@302 578 }
Chris@302 579
Chris@302 580 void
Chris@302 581 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
Chris@34 582 {
Chris@32 583 Grid filtered = preProcess(cqout);
Chris@31 584
Chris@302 585 if (filtered.empty()) return;
Chris@170 586
Chris@298 587 const InstrumentPack &pack(getPack(m_instrument));
Chris@104 588
Chris@178 589 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 590 Feature f;
Chris@178 591 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 592 f.values.push_back(float(filtered[i][j]));
Chris@178 593 }
Chris@178 594 fs[m_fcqOutputNo].push_back(f);
Chris@178 595 }
Chris@178 596
Chris@34 597 int width = filtered.size();
Chris@34 598
Chris@297 599 int iterations = (m_mode == HighQualityMode ? 20 : 10);
Chris@34 600
Chris@176 601 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 602
Chris@297 603 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
Chris@170 604 int shiftCount = 1;
Chris@170 605 if (wantShifts) {
Chris@170 606 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 607 }
Chris@170 608
Chris@170 609 vector<vector<int> > localBestShifts;
Chris@170 610 if (wantShifts) {
Chris@170 611 localBestShifts =
Chris@176 612 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 613 }
Chris@170 614
Chris@170 615 vector<bool> present(width, false);
Chris@37 616
Chris@123 617 #pragma omp parallel for
Chris@123 618 for (int i = 0; i < width; ++i) {
Chris@104 619
Chris@170 620 double sum = 0.0;
Chris@176 621 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 622 sum += filtered.at(i).at(j);
Chris@170 623 }
Chris@170 624 if (sum < 1e-5) continue;
Chris@170 625
Chris@170 626 present[i] = true;
Chris@170 627
Chris@297 628 EM em(&pack, m_mode == HighQualityMode);
Chris@170 629
Chris@183 630 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 631 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 632
Chris@170 633 for (int j = 0; j < iterations; ++j) {
Chris@170 634 em.iterate(filtered.at(i).data());
Chris@37 635 }
Chris@37 636
Chris@170 637 const float *pitchDist = em.getPitchDistribution();
Chris@170 638 const float *const *shiftDist = em.getShifts();
Chris@37 639
Chris@176 640 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 641
Chris@170 642 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 643
Chris@170 644 int bestShift = 0;
Chris@179 645 float bestShiftValue = 0.0;
Chris@170 646 if (wantShifts) {
Chris@170 647 for (int k = 0; k < shiftCount; ++k) {
Chris@179 648 float value = shiftDist[k][j];
Chris@179 649 if (k == 0 || value > bestShiftValue) {
Chris@179 650 bestShiftValue = value;
Chris@170 651 bestShift = k;
Chris@170 652 }
Chris@170 653 }
Chris@170 654 localBestShifts[i][j] = bestShift;
Chris@170 655 }
Chris@123 656 }
Chris@123 657 }
Chris@166 658
Chris@166 659 for (int i = 0; i < width; ++i) {
Chris@37 660
Chris@170 661 if (!present[i]) {
Chris@170 662 // silent column
Chris@176 663 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 664 m_postFilter[j]->push(0.0);
Chris@170 665 }
Chris@168 666 m_pianoRoll.push_back(map<int, double>());
Chris@170 667 if (wantShifts) {
Chris@168 668 m_pianoRollShifts.push_back(map<int, int>());
Chris@168 669 }
Chris@166 670 continue;
Chris@166 671 }
Chris@166 672
Chris@294 673 vector<double> filtered = postProcess
Chris@294 674 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 675
Chris@294 676 Feature f;
Chris@294 677 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@294 678 float v(filtered[j]);
Chris@294 679 if (v < pack.levelThreshold) v = 0.f;
Chris@294 680 f.values.push_back(v);
Chris@294 681 }
Chris@294 682 fs[m_pitchOutputNo].push_back(f);
Chris@166 683
Chris@168 684 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 685
Chris@123 686 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 687 fi != noteFeatures.end(); ++fi) {
Chris@123 688 fs[m_notesOutputNo].push_back(*fi);
Chris@40 689 }
Chris@34 690 }
Chris@31 691 }
Chris@31 692
Chris@32 693 Silvet::Grid
Chris@32 694 Silvet::preProcess(const Grid &in)
Chris@32 695 {
Chris@32 696 int width = in.size();
Chris@32 697
Chris@165 698 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 699
Chris@165 700 // need to be careful that col spacing is an integer number of samples!
Chris@165 701 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 702
Chris@32 703 Grid out;
Chris@32 704
Chris@58 705 // We count the CQ latency in terms of processing hops, but
Chris@58 706 // actually it probably isn't an exact number of hops so this
Chris@58 707 // isn't quite accurate. But the small constant offset is
Chris@165 708 // practically irrelevant compared to the jitter from the frame
Chris@165 709 // size we reduce to in a moment
Chris@33 710 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 711
Chris@298 712 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 713
Chris@32 714 for (int i = 0; i < width; ++i) {
Chris@32 715
Chris@33 716 if (m_columnCount < latentColumns) {
Chris@33 717 ++m_columnCount;
Chris@33 718 continue;
Chris@33 719 }
Chris@33 720
Chris@32 721 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 722 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 723
Chris@32 724 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 725
Chris@32 726 if (select) {
Chris@32 727 vector<double> inCol = in[i];
Chris@176 728 vector<double> outCol(pack.templateHeight);
Chris@32 729
Chris@178 730 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@298 731 // lowest 55 of them (assuming binsPerSemitone == 5).
Chris@178 732 //
Chris@297 733 // In draft and live mode the CQ is an octave shorter,
Chris@300 734 // returning 540 bins or equivalent, so we instead pad
Chris@300 735 // them with an additional 5 or equivalent zeros.
Chris@178 736 //
Chris@178 737 // We also need to reverse the column as we go, since the
Chris@178 738 // raw CQ has the high frequencies first and we need it
Chris@178 739 // the other way around.
Chris@32 740
Chris@298 741 int bps = (m_mode == LiveMode ?
Chris@298 742 binsPerSemitoneLive : binsPerSemitoneNormal);
Chris@298 743
Chris@297 744 if (m_mode == HighQualityMode) {
Chris@178 745 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@298 746 int ix = inCol.size() - j - (11 * bps);
Chris@178 747 outCol[j] = inCol[ix];
Chris@178 748 }
Chris@178 749 } else {
Chris@298 750 for (int j = 0; j < bps; ++j) {
Chris@178 751 outCol[j] = 0.0;
Chris@178 752 }
Chris@298 753 for (int j = bps; j < pack.templateHeight; ++j) {
Chris@298 754 int ix = inCol.size() - j + (bps-1);
Chris@178 755 outCol[j] = inCol[ix];
Chris@178 756 }
Chris@46 757 }
Chris@32 758
Chris@46 759 vector<double> noiseLevel1 =
Chris@298 760 MedianFilter<double>::filter(8 * bps, outCol);
Chris@176 761 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 762 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 763 }
Chris@32 764
Chris@46 765 vector<double> noiseLevel2 =
Chris@298 766 MedianFilter<double>::filter(8 * bps, noiseLevel1);
Chris@176 767 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 768 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 769 }
Chris@32 770
Chris@165 771 out.push_back(outCol);
Chris@32 772 }
Chris@32 773
Chris@32 774 ++m_columnCount;
Chris@32 775 }
Chris@32 776
Chris@32 777 return out;
Chris@32 778 }
Chris@32 779
Chris@294 780 vector<double>
Chris@170 781 Silvet::postProcess(const vector<double> &pitches,
Chris@170 782 const vector<int> &bestShifts,
Chris@170 783 bool wantShifts)
Chris@166 784 {
Chris@298 785 const InstrumentPack &pack(getPack(m_instrument));
Chris@176 786
Chris@41 787 vector<double> filtered;
Chris@41 788
Chris@176 789 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 790 m_postFilter[j]->push(pitches[j]);
Chris@41 791 filtered.push_back(m_postFilter[j]->get());
Chris@41 792 }
Chris@41 793
Chris@41 794 // Threshold for level and reduce number of candidate pitches
Chris@41 795
Chris@41 796 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 797
Chris@41 798 ValueIndexMap strengths;
Chris@166 799
Chris@176 800 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 801 double strength = filtered[j];
Chris@183 802 if (strength < pack.levelThreshold) continue;
Chris@168 803 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 804 }
Chris@166 805
Chris@168 806 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 807
Chris@168 808 map<int, double> active;
Chris@168 809 map<int, int> activeShifts;
Chris@168 810
Chris@183 811 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 812
Chris@168 813 --si;
Chris@168 814
Chris@168 815 double strength = si->first;
Chris@168 816 int j = si->second;
Chris@168 817
Chris@168 818 active[j] = strength;
Chris@168 819
Chris@170 820 if (wantShifts) {
Chris@170 821 activeShifts[j] = bestShifts[j];
Chris@167 822 }
Chris@41 823 }
Chris@41 824
Chris@168 825 m_pianoRoll.push_back(active);
Chris@170 826
Chris@170 827 if (wantShifts) {
Chris@168 828 m_pianoRollShifts.push_back(activeShifts);
Chris@41 829 }
Chris@294 830
Chris@294 831 return filtered;
Chris@166 832 }
Chris@166 833
Chris@166 834 Vamp::Plugin::FeatureList
Chris@168 835 Silvet::noteTrack(int shiftCount)
Chris@166 836 {
Chris@41 837 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 838 // report notes that have just ended (i.e. that are absent in the
Chris@168 839 // latest active set but present in the prior set in the piano
Chris@41 840 // roll) -- any notes that ended earlier will have been reported
Chris@41 841 // already, and if they haven't ended, we don't know their
Chris@41 842 // duration.
Chris@41 843
Chris@168 844 int width = m_pianoRoll.size() - 1;
Chris@168 845
Chris@168 846 const map<int, double> &active = m_pianoRoll[width];
Chris@41 847
Chris@165 848 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 849
Chris@165 850 // only keep notes >= 100ms or thereabouts
Chris@165 851 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 852 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 853
Chris@41 854 FeatureList noteFeatures;
Chris@41 855
Chris@41 856 if (width < durationThreshold + 1) {
Chris@41 857 return noteFeatures;
Chris@41 858 }
Chris@41 859
Chris@150 860 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 861
Chris@55 862 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 863 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 864
Chris@55 865 int note = ni->first;
Chris@41 866
Chris@41 867 if (active.find(note) != active.end()) {
Chris@41 868 // the note is still playing
Chris@41 869 continue;
Chris@41 870 }
Chris@41 871
Chris@41 872 // the note was playing but just ended
Chris@41 873 int end = width;
Chris@41 874 int start = end-1;
Chris@41 875
Chris@41 876 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 877 --start;
Chris@41 878 }
Chris@41 879 ++start;
Chris@41 880
Chris@169 881 if ((end - start) < durationThreshold) {
Chris@41 882 continue;
Chris@41 883 }
Chris@41 884
Chris@169 885 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 886 }
Chris@41 887
Chris@62 888 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 889
Chris@41 890 return noteFeatures;
Chris@41 891 }
Chris@41 892
Chris@169 893 void
Chris@169 894 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 895 FeatureList &noteFeatures)
Chris@169 896 {
Chris@169 897 int partStart = start;
Chris@169 898 int partShift = 0;
Chris@169 899 int partVelocity = 0;
Chris@169 900
Chris@252 901 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 902
Chris@169 903 for (int i = start; i != end; ++i) {
Chris@169 904
Chris@169 905 double strength = m_pianoRoll[i][note];
Chris@169 906
Chris@169 907 int shift = 0;
Chris@169 908
Chris@169 909 if (shiftCount > 1) {
Chris@169 910
Chris@169 911 shift = m_pianoRollShifts[i][note];
Chris@169 912
Chris@169 913 if (i == partStart) {
Chris@169 914 partShift = shift;
Chris@169 915 }
Chris@169 916
Chris@169 917 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 918
Chris@169 919 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 920
Chris@169 921 // pitch has changed, emit an intermediate note
Chris@252 922 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 923 i,
Chris@252 924 note,
Chris@252 925 partShift,
Chris@252 926 shiftCount,
Chris@252 927 partVelocity));
Chris@169 928 partStart = i;
Chris@169 929 partShift = shift;
Chris@169 930 partVelocity = 0;
Chris@169 931 }
Chris@169 932 }
Chris@169 933
Chris@246 934 int v = round(strength * 2);
Chris@169 935 if (v > partVelocity) {
Chris@169 936 partVelocity = v;
Chris@169 937 }
Chris@169 938 }
Chris@169 939
Chris@169 940 if (end >= partStart + partThreshold) {
Chris@252 941 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 942 end,
Chris@252 943 note,
Chris@252 944 partShift,
Chris@252 945 shiftCount,
Chris@252 946 partVelocity));
Chris@169 947 }
Chris@169 948 }
Chris@252 949
Chris@252 950 Silvet::Feature
Chris@252 951 Silvet::makeNoteFeature(int start,
Chris@252 952 int end,
Chris@252 953 int note,
Chris@252 954 int shift,
Chris@252 955 int shiftCount,
Chris@252 956 int velocity)
Chris@252 957 {
Chris@252 958 double columnDuration = 1.0 / m_colsPerSec;
Chris@252 959 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@252 960
Chris@252 961 Feature f;
Chris@252 962
Chris@252 963 f.hasTimestamp = true;
Chris@285 964 f.timestamp = m_startTime + RealTime::fromSeconds
Chris@252 965 (columnDuration * (start - postFilterLatency) + 0.02);
Chris@252 966
Chris@252 967 f.hasDuration = true;
Chris@252 968 f.duration = RealTime::fromSeconds
Chris@252 969 (columnDuration * (end - start));
Chris@252 970
Chris@252 971 f.values.clear();
Chris@252 972
Chris@252 973 f.values.push_back
Chris@252 974 (noteFrequency(note, shift, shiftCount));
Chris@252 975
Chris@252 976 float inputGain = getInputGainAt(f.timestamp);
Chris@252 977 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 978 velocity = round(velocity / inputGain);
Chris@252 979 if (velocity > 127) velocity = 127;
Chris@252 980 if (velocity < 1) velocity = 1;
Chris@252 981 f.values.push_back(velocity);
Chris@252 982
Chris@252 983 f.label = noteName(note, shift, shiftCount);
Chris@252 984
Chris@252 985 return f;
Chris@252 986 }
Chris@252 987
Chris@252 988 float
Chris@252 989 Silvet::getInputGainAt(RealTime t)
Chris@252 990 {
Chris@252 991 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 992
Chris@252 993 if (i == m_inputGains.end()) {
Chris@252 994 if (i != m_inputGains.begin()) {
Chris@252 995 --i;
Chris@252 996 } else {
Chris@252 997 return 1.f; // no data
Chris@252 998 }
Chris@252 999 }
Chris@252 1000
Chris@252 1001 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 1002
Chris@252 1003 return i->second;
Chris@252 1004 }
Chris@252 1005