annotate src/Silvet.cpp @ 309:07ee4ebea57c

Add chromagram output
author Chris Cannam
date Mon, 19 Jan 2015 11:23:07 +0000
parents 04a3c152e590
children 99af9557dfc1
rev   line source
Chris@31 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@31 2
Chris@31 3 /*
Chris@31 4 Silvet
Chris@31 5
Chris@31 6 A Vamp plugin for note transcription.
Chris@31 7 Centre for Digital Music, Queen Mary University of London.
Chris@31 8
Chris@31 9 This program is free software; you can redistribute it and/or
Chris@31 10 modify it under the terms of the GNU General Public License as
Chris@31 11 published by the Free Software Foundation; either version 2 of the
Chris@31 12 License, or (at your option) any later version. See the file
Chris@31 13 COPYING included with this distribution for more information.
Chris@31 14 */
Chris@31 15
Chris@31 16 #include "Silvet.h"
Chris@34 17 #include "EM.h"
Chris@31 18
Chris@152 19 #include <cq/CQSpectrogram.h>
Chris@31 20
Chris@152 21 #include "MedianFilter.h"
Chris@152 22 #include "constant-q-cpp/src/dsp/Resampler.h"
Chris@246 23 #include "flattendynamics-ladspa.h"
Chris@31 24
Chris@31 25 #include <vector>
Chris@31 26
Chris@32 27 #include <cstdio>
Chris@32 28
Chris@31 29 using std::vector;
Chris@48 30 using std::cout;
Chris@31 31 using std::cerr;
Chris@31 32 using std::endl;
Chris@40 33 using Vamp::RealTime;
Chris@31 34
Chris@31 35 static int processingSampleRate = 44100;
Chris@31 36 static int processingBPO = 60;
Chris@170 37
Chris@272 38 static int minInputSampleRate = 100;
Chris@272 39 static int maxInputSampleRate = 192000;
Chris@272 40
Chris@31 41 Silvet::Silvet(float inputSampleRate) :
Chris@31 42 Plugin(inputSampleRate),
Chris@161 43 m_instruments(InstrumentPack::listInstrumentPacks()),
Chris@31 44 m_resampler(0),
Chris@246 45 m_flattener(0),
Chris@110 46 m_cq(0),
Chris@162 47 m_hqMode(true),
Chris@166 48 m_fineTuning(false),
Chris@178 49 m_instrument(0),
Chris@178 50 m_colsPerSec(50)
Chris@31 51 {
Chris@31 52 }
Chris@31 53
Chris@31 54 Silvet::~Silvet()
Chris@31 55 {
Chris@31 56 delete m_resampler;
Chris@246 57 delete m_flattener;
Chris@31 58 delete m_cq;
Chris@41 59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 60 delete m_postFilter[i];
Chris@41 61 }
Chris@31 62 }
Chris@31 63
Chris@31 64 string
Chris@31 65 Silvet::getIdentifier() const
Chris@31 66 {
Chris@31 67 return "silvet";
Chris@31 68 }
Chris@31 69
Chris@31 70 string
Chris@31 71 Silvet::getName() const
Chris@31 72 {
Chris@31 73 return "Silvet Note Transcription";
Chris@31 74 }
Chris@31 75
Chris@31 76 string
Chris@31 77 Silvet::getDescription() const
Chris@31 78 {
Chris@191 79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
Chris@31 80 }
Chris@31 81
Chris@31 82 string
Chris@31 83 Silvet::getMaker() const
Chris@31 84 {
Chris@191 85 return "Queen Mary, University of London";
Chris@31 86 }
Chris@31 87
Chris@31 88 int
Chris@31 89 Silvet::getPluginVersion() const
Chris@31 90 {
Chris@309 91 return 3;
Chris@31 92 }
Chris@31 93
Chris@31 94 string
Chris@31 95 Silvet::getCopyright() const
Chris@31 96 {
Chris@191 97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
Chris@31 98 }
Chris@31 99
Chris@31 100 Silvet::InputDomain
Chris@31 101 Silvet::getInputDomain() const
Chris@31 102 {
Chris@31 103 return TimeDomain;
Chris@31 104 }
Chris@31 105
Chris@31 106 size_t
Chris@31 107 Silvet::getPreferredBlockSize() const
Chris@31 108 {
Chris@31 109 return 0;
Chris@31 110 }
Chris@31 111
Chris@31 112 size_t
Chris@31 113 Silvet::getPreferredStepSize() const
Chris@31 114 {
Chris@31 115 return 0;
Chris@31 116 }
Chris@31 117
Chris@31 118 size_t
Chris@31 119 Silvet::getMinChannelCount() const
Chris@31 120 {
Chris@31 121 return 1;
Chris@31 122 }
Chris@31 123
Chris@31 124 size_t
Chris@31 125 Silvet::getMaxChannelCount() const
Chris@31 126 {
Chris@31 127 return 1;
Chris@31 128 }
Chris@31 129
Chris@31 130 Silvet::ParameterList
Chris@31 131 Silvet::getParameterDescriptors() const
Chris@31 132 {
Chris@31 133 ParameterList list;
Chris@110 134
Chris@110 135 ParameterDescriptor desc;
Chris@110 136 desc.identifier = "mode";
Chris@110 137 desc.name = "Processing mode";
Chris@110 138 desc.unit = "";
Chris@271 139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
Chris@110 140 desc.minValue = 0;
Chris@110 141 desc.maxValue = 1;
Chris@113 142 desc.defaultValue = 1;
Chris@110 143 desc.isQuantized = true;
Chris@110 144 desc.quantizeStep = 1;
Chris@166 145 desc.valueNames.push_back("Draft (faster)");
Chris@165 146 desc.valueNames.push_back("Intensive (higher quality)");
Chris@161 147 list.push_back(desc);
Chris@161 148
Chris@176 149 desc.identifier = "instrument";
Chris@176 150 desc.name = "Instrument";
Chris@161 151 desc.unit = "";
Chris@271 152 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
Chris@161 153 desc.minValue = 0;
Chris@162 154 desc.maxValue = m_instruments.size()-1;
Chris@162 155 desc.defaultValue = 0;
Chris@161 156 desc.isQuantized = true;
Chris@161 157 desc.quantizeStep = 1;
Chris@161 158 desc.valueNames.clear();
Chris@162 159 for (int i = 0; i < int(m_instruments.size()); ++i) {
Chris@162 160 desc.valueNames.push_back(m_instruments[i].name);
Chris@162 161 }
Chris@166 162 list.push_back(desc);
Chris@161 163
Chris@166 164 desc.identifier = "finetune";
Chris@166 165 desc.name = "Return fine pitch estimates";
Chris@166 166 desc.unit = "";
Chris@271 167 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
Chris@166 168 desc.minValue = 0;
Chris@166 169 desc.maxValue = 1;
Chris@166 170 desc.defaultValue = 0;
Chris@166 171 desc.isQuantized = true;
Chris@166 172 desc.quantizeStep = 1;
Chris@166 173 desc.valueNames.clear();
Chris@110 174 list.push_back(desc);
Chris@110 175
Chris@31 176 return list;
Chris@31 177 }
Chris@31 178
Chris@31 179 float
Chris@31 180 Silvet::getParameter(string identifier) const
Chris@31 181 {
Chris@110 182 if (identifier == "mode") {
Chris@110 183 return m_hqMode ? 1.f : 0.f;
Chris@166 184 } else if (identifier == "finetune") {
Chris@166 185 return m_fineTuning ? 1.f : 0.f;
Chris@176 186 } else if (identifier == "instrument") {
Chris@162 187 return m_instrument;
Chris@110 188 }
Chris@31 189 return 0;
Chris@31 190 }
Chris@31 191
Chris@31 192 void
Chris@31 193 Silvet::setParameter(string identifier, float value)
Chris@31 194 {
Chris@110 195 if (identifier == "mode") {
Chris@110 196 m_hqMode = (value > 0.5);
Chris@166 197 } else if (identifier == "finetune") {
Chris@166 198 m_fineTuning = (value > 0.5);
Chris@176 199 } else if (identifier == "instrument") {
Chris@162 200 m_instrument = lrintf(value);
Chris@110 201 }
Chris@31 202 }
Chris@31 203
Chris@31 204 Silvet::ProgramList
Chris@31 205 Silvet::getPrograms() const
Chris@31 206 {
Chris@31 207 ProgramList list;
Chris@31 208 return list;
Chris@31 209 }
Chris@31 210
Chris@31 211 string
Chris@31 212 Silvet::getCurrentProgram() const
Chris@31 213 {
Chris@31 214 return "";
Chris@31 215 }
Chris@31 216
Chris@31 217 void
Chris@31 218 Silvet::selectProgram(string name)
Chris@31 219 {
Chris@31 220 }
Chris@31 221
Chris@31 222 Silvet::OutputList
Chris@31 223 Silvet::getOutputDescriptors() const
Chris@31 224 {
Chris@31 225 OutputList list;
Chris@31 226
Chris@31 227 OutputDescriptor d;
Chris@51 228 d.identifier = "notes";
Chris@51 229 d.name = "Note transcription";
Chris@271 230 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
Chris@41 231 d.unit = "Hz";
Chris@31 232 d.hasFixedBinCount = true;
Chris@31 233 d.binCount = 2;
Chris@41 234 d.binNames.push_back("Frequency");
Chris@31 235 d.binNames.push_back("Velocity");
Chris@31 236 d.hasKnownExtents = false;
Chris@31 237 d.isQuantized = false;
Chris@31 238 d.sampleType = OutputDescriptor::VariableSampleRate;
Chris@246 239 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
Chris@31 240 d.hasDuration = true;
Chris@32 241 m_notesOutputNo = list.size();
Chris@32 242 list.push_back(d);
Chris@32 243
Chris@178 244 d.identifier = "timefreq";
Chris@178 245 d.name = "Time-frequency distribution";
Chris@271 246 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
Chris@178 247 d.unit = "";
Chris@178 248 d.hasFixedBinCount = true;
Chris@178 249 d.binCount = m_instruments[0].templateHeight;
Chris@178 250 d.binNames.clear();
Chris@178 251 if (m_cq) {
Chris@294 252 char name[50];
Chris@178 253 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
Chris@178 254 // We have a 600-bin (10 oct 60-bin CQ) of which the
Chris@178 255 // lowest-frequency 55 bins have been dropped, for a
Chris@178 256 // 545-bin template. The native CQ bins go high->low
Chris@178 257 // frequency though, so these are still the first 545 bins
Chris@178 258 // as reported by getBinFrequency, though in reverse order
Chris@178 259 float freq = m_cq->getBinFrequency
Chris@178 260 (m_instruments[0].templateHeight - i - 1);
Chris@178 261 sprintf(name, "%.1f Hz", freq);
Chris@178 262 d.binNames.push_back(name);
Chris@178 263 }
Chris@178 264 }
Chris@178 265 d.hasKnownExtents = false;
Chris@178 266 d.isQuantized = false;
Chris@178 267 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@178 268 d.sampleRate = m_colsPerSec;
Chris@178 269 d.hasDuration = false;
Chris@178 270 m_fcqOutputNo = list.size();
Chris@178 271 list.push_back(d);
Chris@178 272
Chris@294 273 d.identifier = "pitchactivation";
Chris@294 274 d.name = "Pitch activation distribution";
Chris@294 275 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
Chris@294 276 d.unit = "";
Chris@294 277 d.hasFixedBinCount = true;
Chris@294 278 d.binCount = m_instruments[0].templateNoteCount;
Chris@294 279 d.binNames.clear();
Chris@294 280 if (m_cq) {
Chris@294 281 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@294 282 d.binNames.push_back(noteName(i, 0, 1));
Chris@294 283 }
Chris@294 284 }
Chris@294 285 d.hasKnownExtents = false;
Chris@294 286 d.isQuantized = false;
Chris@294 287 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@294 288 d.sampleRate = m_colsPerSec;
Chris@294 289 d.hasDuration = false;
Chris@294 290 m_pitchOutputNo = list.size();
Chris@294 291 list.push_back(d);
Chris@294 292
Chris@309 293 d.identifier = "chroma";
Chris@309 294 d.name = "Pitch chroma distribution";
Chris@309 295 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
Chris@309 296 d.unit = "";
Chris@309 297 d.hasFixedBinCount = true;
Chris@309 298 d.binCount = 12;
Chris@309 299 d.binNames.clear();
Chris@309 300 if (m_cq) {
Chris@309 301 for (int i = 0; i < 12; ++i) {
Chris@309 302 d.binNames.push_back(chromaName(i));
Chris@309 303 }
Chris@309 304 }
Chris@309 305 d.hasKnownExtents = false;
Chris@309 306 d.isQuantized = false;
Chris@309 307 d.sampleType = OutputDescriptor::FixedSampleRate;
Chris@309 308 d.sampleRate = m_colsPerSec;
Chris@309 309 d.hasDuration = false;
Chris@309 310 m_chromaOutputNo = list.size();
Chris@309 311 list.push_back(d);
Chris@309 312
Chris@31 313 return list;
Chris@31 314 }
Chris@31 315
Chris@38 316 std::string
Chris@309 317 Silvet::chromaName(int pitch) const
Chris@38 318 {
Chris@38 319 static const char *names[] = {
Chris@38 320 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
Chris@38 321 };
Chris@38 322
Chris@309 323 return names[pitch];
Chris@309 324 }
Chris@309 325
Chris@309 326 std::string
Chris@309 327 Silvet::noteName(int note, int shift, int shiftCount) const
Chris@309 328 {
Chris@309 329 string n = chromaName(note % 12);
Chris@38 330
Chris@175 331 int oct = (note + 9) / 12;
Chris@38 332
Chris@175 333 char buf[30];
Chris@175 334
Chris@175 335 float pshift = 0.f;
Chris@175 336 if (shiftCount > 1) {
Chris@175 337 // see noteFrequency below
Chris@175 338 pshift =
Chris@175 339 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 340 }
Chris@175 341
Chris@175 342 if (pshift > 0.f) {
Chris@309 343 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
Chris@175 344 } else if (pshift < 0.f) {
Chris@309 345 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
Chris@175 346 } else {
Chris@309 347 sprintf(buf, "%s%d", n.c_str(), oct);
Chris@175 348 }
Chris@38 349
Chris@38 350 return buf;
Chris@38 351 }
Chris@38 352
Chris@41 353 float
Chris@168 354 Silvet::noteFrequency(int note, int shift, int shiftCount) const
Chris@41 355 {
Chris@169 356 // Convert shift number to a pitch shift. The given shift number
Chris@169 357 // is an offset into the template array, which starts with some
Chris@169 358 // zeros, followed by the template, then some trailing zeros.
Chris@169 359 //
Chris@169 360 // Example: if we have templateMaxShift == 2 and thus shiftCount
Chris@169 361 // == 5, then the number will be in the range 0-4 and the template
Chris@169 362 // will have 2 zeros at either end. Thus number 2 represents the
Chris@169 363 // template "as recorded", for a pitch shift of 0; smaller indices
Chris@169 364 // represent moving the template *up* in pitch (by introducing
Chris@169 365 // zeros at the start, which is the low-frequency end), for a
Chris@169 366 // positive pitch shift; and higher values represent moving it
Chris@169 367 // down in pitch, for a negative pitch shift.
Chris@169 368
Chris@175 369 float pshift = 0.f;
Chris@175 370 if (shiftCount > 1) {
Chris@175 371 pshift =
Chris@175 372 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
Chris@175 373 }
Chris@169 374
Chris@169 375 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
Chris@41 376 }
Chris@41 377
Chris@31 378 bool
Chris@31 379 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
Chris@31 380 {
Chris@272 381 if (m_inputSampleRate < minInputSampleRate ||
Chris@272 382 m_inputSampleRate > maxInputSampleRate) {
Chris@272 383 cerr << "Silvet::initialise: Unsupported input sample rate "
Chris@272 384 << m_inputSampleRate << " (supported min " << minInputSampleRate
Chris@272 385 << ", max " << maxInputSampleRate << ")" << endl;
Chris@272 386 return false;
Chris@272 387 }
Chris@272 388
Chris@31 389 if (channels < getMinChannelCount() ||
Chris@272 390 channels > getMaxChannelCount()) {
Chris@272 391 cerr << "Silvet::initialise: Unsupported channel count " << channels
Chris@272 392 << " (supported min " << getMinChannelCount() << ", max "
Chris@272 393 << getMaxChannelCount() << ")" << endl;
Chris@272 394 return false;
Chris@272 395 }
Chris@31 396
Chris@31 397 if (stepSize != blockSize) {
Chris@31 398 cerr << "Silvet::initialise: Step size must be the same as block size ("
Chris@31 399 << stepSize << " != " << blockSize << ")" << endl;
Chris@31 400 return false;
Chris@31 401 }
Chris@31 402
Chris@31 403 m_blockSize = blockSize;
Chris@31 404
Chris@31 405 reset();
Chris@31 406
Chris@31 407 return true;
Chris@31 408 }
Chris@31 409
Chris@31 410 void
Chris@31 411 Silvet::reset()
Chris@31 412 {
Chris@31 413 delete m_resampler;
Chris@246 414 delete m_flattener;
Chris@31 415 delete m_cq;
Chris@31 416
Chris@31 417 if (m_inputSampleRate != processingSampleRate) {
Chris@31 418 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
Chris@31 419 } else {
Chris@31 420 m_resampler = 0;
Chris@31 421 }
Chris@31 422
Chris@246 423 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
Chris@246 424 m_flattener->reset();
Chris@246 425
Chris@173 426 double minFreq = 27.5;
Chris@173 427
Chris@173 428 if (!m_hqMode) {
Chris@173 429 // We don't actually return any notes from the bottom octave,
Chris@173 430 // so we can just pad with zeros
Chris@173 431 minFreq *= 2;
Chris@173 432 }
Chris@173 433
Chris@154 434 CQParameters params(processingSampleRate,
Chris@173 435 minFreq,
Chris@154 436 processingSampleRate / 3,
Chris@154 437 processingBPO);
Chris@154 438
Chris@155 439 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
Chris@155 440 // drops the FFT size to 512 from 1024 and alters
Chris@155 441 // some other processing parameters, making
Chris@155 442 // everything much, much slower. Could be a flaw
Chris@155 443 // in the CQ parameter calculations, must check
Chris@154 444 params.atomHopFactor = 0.3;
Chris@154 445 params.threshold = 0.0005;
Chris@172 446 params.window = CQParameters::Hann;
Chris@154 447
Chris@154 448 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
Chris@31 449
Chris@165 450 m_colsPerSec = m_hqMode ? 50 : 25;
Chris@165 451
Chris@41 452 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
Chris@41 453 delete m_postFilter[i];
Chris@41 454 }
Chris@41 455 m_postFilter.clear();
Chris@176 456 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
Chris@41 457 m_postFilter.push_back(new MedianFilter<double>(3));
Chris@41 458 }
Chris@41 459 m_pianoRoll.clear();
Chris@246 460 m_inputGains.clear();
Chris@32 461 m_columnCount = 0;
Chris@272 462 m_resampledCount = 0;
Chris@40 463 m_startTime = RealTime::zeroTime;
Chris@31 464 }
Chris@31 465
Chris@31 466 Silvet::FeatureSet
Chris@31 467 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
Chris@31 468 {
Chris@40 469 if (m_columnCount == 0) {
Chris@40 470 m_startTime = timestamp;
Chris@40 471 }
Chris@246 472
Chris@246 473 vector<float> flattened(m_blockSize);
Chris@246 474 float gain = 1.f;
Chris@246 475 m_flattener->connectInputPort
Chris@246 476 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
Chris@246 477 m_flattener->connectOutputPort
Chris@246 478 (FlattenDynamics::AudioOutputPort, &flattened[0]);
Chris@246 479 m_flattener->connectOutputPort
Chris@246 480 (FlattenDynamics::GainOutputPort, &gain);
Chris@246 481 m_flattener->process(m_blockSize);
Chris@246 482
Chris@252 483 m_inputGains[timestamp] = gain;
Chris@40 484
Chris@31 485 vector<double> data;
Chris@40 486 for (int i = 0; i < m_blockSize; ++i) {
Chris@246 487 double d = flattened[i];
Chris@235 488 data.push_back(d);
Chris@40 489 }
Chris@31 490
Chris@31 491 if (m_resampler) {
Chris@272 492
Chris@31 493 data = m_resampler->process(data.data(), data.size());
Chris@272 494
Chris@272 495 int hadCount = m_resampledCount;
Chris@272 496 m_resampledCount += data.size();
Chris@272 497
Chris@272 498 int resamplerLatency = m_resampler->getLatency();
Chris@272 499
Chris@272 500 if (hadCount < resamplerLatency) {
Chris@272 501 int stillToDrop = resamplerLatency - hadCount;
Chris@272 502 if (stillToDrop >= int(data.size())) {
Chris@272 503 return FeatureSet();
Chris@272 504 } else {
Chris@272 505 data = vector<double>(data.begin() + stillToDrop, data.end());
Chris@272 506 }
Chris@272 507 }
Chris@31 508 }
Chris@272 509
Chris@32 510 Grid cqout = m_cq->process(data);
Chris@51 511 FeatureSet fs = transcribe(cqout);
Chris@51 512 return fs;
Chris@34 513 }
Chris@34 514
Chris@34 515 Silvet::FeatureSet
Chris@34 516 Silvet::getRemainingFeatures()
Chris@34 517 {
Chris@145 518 Grid cqout = m_cq->getRemainingOutput();
Chris@51 519 FeatureSet fs = transcribe(cqout);
Chris@51 520 return fs;
Chris@34 521 }
Chris@34 522
Chris@34 523 Silvet::FeatureSet
Chris@34 524 Silvet::transcribe(const Grid &cqout)
Chris@34 525 {
Chris@32 526 Grid filtered = preProcess(cqout);
Chris@31 527
Chris@32 528 FeatureSet fs;
Chris@32 529
Chris@104 530 if (filtered.empty()) return fs;
Chris@170 531
Chris@170 532 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@104 533
Chris@178 534 for (int i = 0; i < (int)filtered.size(); ++i) {
Chris@178 535 Feature f;
Chris@178 536 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 537 f.values.push_back(float(filtered[i][j]));
Chris@178 538 }
Chris@178 539 fs[m_fcqOutputNo].push_back(f);
Chris@178 540 }
Chris@178 541
Chris@34 542 int width = filtered.size();
Chris@34 543
Chris@164 544 int iterations = m_hqMode ? 20 : 10;
Chris@34 545
Chris@176 546 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
Chris@170 547
Chris@170 548 bool wantShifts = m_hqMode && m_fineTuning;
Chris@170 549 int shiftCount = 1;
Chris@170 550 if (wantShifts) {
Chris@170 551 shiftCount = pack.templateMaxShift * 2 + 1;
Chris@170 552 }
Chris@170 553
Chris@170 554 vector<vector<int> > localBestShifts;
Chris@170 555 if (wantShifts) {
Chris@170 556 localBestShifts =
Chris@176 557 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
Chris@170 558 }
Chris@170 559
Chris@305 560 double columnThreshold = 1e-5;
Chris@305 561
Chris@123 562 #pragma omp parallel for
Chris@123 563 for (int i = 0; i < width; ++i) {
Chris@104 564
Chris@170 565 double sum = 0.0;
Chris@176 566 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@170 567 sum += filtered.at(i).at(j);
Chris@170 568 }
Chris@305 569 if (sum < columnThreshold) continue;
Chris@170 570
Chris@170 571 EM em(&pack, m_hqMode);
Chris@170 572
Chris@183 573 em.setPitchSparsity(pack.pitchSparsity);
Chris@213 574 em.setSourceSparsity(pack.sourceSparsity);
Chris@183 575
Chris@170 576 for (int j = 0; j < iterations; ++j) {
Chris@170 577 em.iterate(filtered.at(i).data());
Chris@37 578 }
Chris@37 579
Chris@170 580 const float *pitchDist = em.getPitchDistribution();
Chris@170 581 const float *const *shiftDist = em.getShifts();
Chris@37 582
Chris@176 583 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@104 584
Chris@170 585 localPitches[i][j] = pitchDist[j] * sum;
Chris@170 586
Chris@170 587 int bestShift = 0;
Chris@179 588 float bestShiftValue = 0.0;
Chris@170 589 if (wantShifts) {
Chris@170 590 for (int k = 0; k < shiftCount; ++k) {
Chris@179 591 float value = shiftDist[k][j];
Chris@179 592 if (k == 0 || value > bestShiftValue) {
Chris@179 593 bestShiftValue = value;
Chris@170 594 bestShift = k;
Chris@170 595 }
Chris@170 596 }
Chris@170 597 localBestShifts[i][j] = bestShift;
Chris@170 598 }
Chris@123 599 }
Chris@123 600 }
Chris@166 601
Chris@166 602 for (int i = 0; i < width; ++i) {
Chris@37 603
Chris@309 604 // This returns a filtered column, and pushes the
Chris@309 605 // up-to-max-polyphony activation column to m_pianoRoll
Chris@294 606 vector<double> filtered = postProcess
Chris@294 607 (localPitches[i], localBestShifts[i], wantShifts);
Chris@294 608
Chris@309 609 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
Chris@309 610 float inputGain = getInputGainAt(timestamp);
Chris@309 611
Chris@294 612 Feature f;
Chris@294 613 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 614 float v = filtered[j];
Chris@294 615 if (v < pack.levelThreshold) v = 0.f;
Chris@309 616 f.values.push_back(v / inputGain);
Chris@294 617 }
Chris@294 618 fs[m_pitchOutputNo].push_back(f);
Chris@309 619
Chris@309 620 f.values.clear();
Chris@309 621 f.values.resize(12);
Chris@309 622 for (int j = 0; j < (int)filtered.size(); ++j) {
Chris@309 623 f.values[j % 12] += filtered[j] / inputGain;
Chris@309 624 }
Chris@309 625 fs[m_chromaOutputNo].push_back(f);
Chris@166 626
Chris@168 627 FeatureList noteFeatures = noteTrack(shiftCount);
Chris@38 628
Chris@123 629 for (FeatureList::const_iterator fi = noteFeatures.begin();
Chris@123 630 fi != noteFeatures.end(); ++fi) {
Chris@123 631 fs[m_notesOutputNo].push_back(*fi);
Chris@40 632 }
Chris@34 633 }
Chris@34 634
Chris@32 635 return fs;
Chris@31 636 }
Chris@31 637
Chris@32 638 Silvet::Grid
Chris@32 639 Silvet::preProcess(const Grid &in)
Chris@32 640 {
Chris@32 641 int width = in.size();
Chris@32 642
Chris@165 643 int spacing = processingSampleRate / m_colsPerSec;
Chris@32 644
Chris@165 645 // need to be careful that col spacing is an integer number of samples!
Chris@165 646 assert(spacing * m_colsPerSec == processingSampleRate);
Chris@32 647
Chris@32 648 Grid out;
Chris@32 649
Chris@58 650 // We count the CQ latency in terms of processing hops, but
Chris@58 651 // actually it probably isn't an exact number of hops so this
Chris@58 652 // isn't quite accurate. But the small constant offset is
Chris@165 653 // practically irrelevant compared to the jitter from the frame
Chris@165 654 // size we reduce to in a moment
Chris@33 655 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
Chris@33 656
Chris@176 657 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 658
Chris@32 659 for (int i = 0; i < width; ++i) {
Chris@32 660
Chris@33 661 if (m_columnCount < latentColumns) {
Chris@33 662 ++m_columnCount;
Chris@33 663 continue;
Chris@33 664 }
Chris@33 665
Chris@32 666 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
Chris@32 667 int sampleNo = m_columnCount * m_cq->getColumnHop();
Chris@32 668
Chris@32 669 bool select = (sampleNo / spacing != prevSampleNo / spacing);
Chris@32 670
Chris@32 671 if (select) {
Chris@32 672 vector<double> inCol = in[i];
Chris@176 673 vector<double> outCol(pack.templateHeight);
Chris@32 674
Chris@178 675 // In HQ mode, the CQ returns 600 bins and we ignore the
Chris@178 676 // lowest 55 of them.
Chris@178 677 //
Chris@178 678 // In draft mode the CQ is an octave shorter, returning
Chris@178 679 // 540 bins, so we instead pad them with an additional 5
Chris@178 680 // zeros.
Chris@178 681 //
Chris@178 682 // We also need to reverse the column as we go, since the
Chris@178 683 // raw CQ has the high frequencies first and we need it
Chris@178 684 // the other way around.
Chris@32 685
Chris@178 686 if (m_hqMode) {
Chris@178 687 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@178 688 int ix = inCol.size() - j - 55;
Chris@178 689 outCol[j] = inCol[ix];
Chris@178 690 }
Chris@178 691 } else {
Chris@178 692 for (int j = 0; j < 5; ++j) {
Chris@178 693 outCol[j] = 0.0;
Chris@178 694 }
Chris@178 695 for (int j = 5; j < pack.templateHeight; ++j) {
Chris@178 696 int ix = inCol.size() - j + 4;
Chris@178 697 outCol[j] = inCol[ix];
Chris@178 698 }
Chris@46 699 }
Chris@32 700
Chris@46 701 vector<double> noiseLevel1 =
Chris@46 702 MedianFilter<double>::filter(40, outCol);
Chris@176 703 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 704 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
Chris@46 705 }
Chris@32 706
Chris@46 707 vector<double> noiseLevel2 =
Chris@46 708 MedianFilter<double>::filter(40, noiseLevel1);
Chris@176 709 for (int j = 0; j < pack.templateHeight; ++j) {
Chris@46 710 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
Chris@32 711 }
Chris@32 712
Chris@165 713 out.push_back(outCol);
Chris@32 714 }
Chris@32 715
Chris@32 716 ++m_columnCount;
Chris@32 717 }
Chris@32 718
Chris@32 719 return out;
Chris@32 720 }
Chris@32 721
Chris@294 722 vector<double>
Chris@170 723 Silvet::postProcess(const vector<double> &pitches,
Chris@170 724 const vector<int> &bestShifts,
Chris@170 725 bool wantShifts)
Chris@166 726 {
Chris@176 727 const InstrumentPack &pack = m_instruments[m_instrument];
Chris@176 728
Chris@41 729 vector<double> filtered;
Chris@41 730
Chris@176 731 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@170 732 m_postFilter[j]->push(pitches[j]);
Chris@41 733 filtered.push_back(m_postFilter[j]->get());
Chris@41 734 }
Chris@41 735
Chris@41 736 // Threshold for level and reduce number of candidate pitches
Chris@41 737
Chris@41 738 typedef std::multimap<double, int> ValueIndexMap;
Chris@41 739
Chris@41 740 ValueIndexMap strengths;
Chris@166 741
Chris@176 742 for (int j = 0; j < pack.templateNoteCount; ++j) {
Chris@166 743 double strength = filtered[j];
Chris@183 744 if (strength < pack.levelThreshold) continue;
Chris@168 745 strengths.insert(ValueIndexMap::value_type(strength, j));
Chris@168 746 }
Chris@166 747
Chris@168 748 ValueIndexMap::const_iterator si = strengths.end();
Chris@167 749
Chris@168 750 map<int, double> active;
Chris@168 751 map<int, int> activeShifts;
Chris@168 752
Chris@183 753 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
Chris@168 754
Chris@168 755 --si;
Chris@168 756
Chris@168 757 double strength = si->first;
Chris@168 758 int j = si->second;
Chris@168 759
Chris@168 760 active[j] = strength;
Chris@168 761
Chris@170 762 if (wantShifts) {
Chris@170 763 activeShifts[j] = bestShifts[j];
Chris@167 764 }
Chris@41 765 }
Chris@41 766
Chris@168 767 m_pianoRoll.push_back(active);
Chris@170 768
Chris@170 769 if (wantShifts) {
Chris@168 770 m_pianoRollShifts.push_back(activeShifts);
Chris@41 771 }
Chris@294 772
Chris@294 773 return filtered;
Chris@166 774 }
Chris@166 775
Chris@166 776 Vamp::Plugin::FeatureList
Chris@168 777 Silvet::noteTrack(int shiftCount)
Chris@166 778 {
Chris@41 779 // Minimum duration pruning, and conversion to notes. We can only
Chris@41 780 // report notes that have just ended (i.e. that are absent in the
Chris@168 781 // latest active set but present in the prior set in the piano
Chris@41 782 // roll) -- any notes that ended earlier will have been reported
Chris@41 783 // already, and if they haven't ended, we don't know their
Chris@41 784 // duration.
Chris@41 785
Chris@168 786 int width = m_pianoRoll.size() - 1;
Chris@168 787
Chris@168 788 const map<int, double> &active = m_pianoRoll[width];
Chris@41 789
Chris@165 790 double columnDuration = 1.0 / m_colsPerSec;
Chris@165 791
Chris@165 792 // only keep notes >= 100ms or thereabouts
Chris@165 793 int durationThreshold = floor(0.1 / columnDuration); // columns
Chris@165 794 if (durationThreshold < 1) durationThreshold = 1;
Chris@41 795
Chris@41 796 FeatureList noteFeatures;
Chris@41 797
Chris@41 798 if (width < durationThreshold + 1) {
Chris@41 799 return noteFeatures;
Chris@41 800 }
Chris@41 801
Chris@150 802 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
Chris@150 803
Chris@55 804 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
Chris@41 805 ni != m_pianoRoll[width-1].end(); ++ni) {
Chris@41 806
Chris@55 807 int note = ni->first;
Chris@41 808
Chris@41 809 if (active.find(note) != active.end()) {
Chris@41 810 // the note is still playing
Chris@41 811 continue;
Chris@41 812 }
Chris@41 813
Chris@41 814 // the note was playing but just ended
Chris@41 815 int end = width;
Chris@41 816 int start = end-1;
Chris@41 817
Chris@41 818 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
Chris@41 819 --start;
Chris@41 820 }
Chris@41 821 ++start;
Chris@41 822
Chris@169 823 if ((end - start) < durationThreshold) {
Chris@41 824 continue;
Chris@41 825 }
Chris@41 826
Chris@169 827 emitNote(start, end, note, shiftCount, noteFeatures);
Chris@41 828 }
Chris@41 829
Chris@62 830 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
Chris@41 831
Chris@41 832 return noteFeatures;
Chris@41 833 }
Chris@41 834
Chris@169 835 void
Chris@169 836 Silvet::emitNote(int start, int end, int note, int shiftCount,
Chris@169 837 FeatureList &noteFeatures)
Chris@169 838 {
Chris@169 839 int partStart = start;
Chris@169 840 int partShift = 0;
Chris@169 841 int partVelocity = 0;
Chris@169 842
Chris@252 843 int partThreshold = floor(0.05 * m_colsPerSec);
Chris@169 844
Chris@169 845 for (int i = start; i != end; ++i) {
Chris@169 846
Chris@169 847 double strength = m_pianoRoll[i][note];
Chris@169 848
Chris@169 849 int shift = 0;
Chris@169 850
Chris@169 851 if (shiftCount > 1) {
Chris@169 852
Chris@169 853 shift = m_pianoRollShifts[i][note];
Chris@169 854
Chris@169 855 if (i == partStart) {
Chris@169 856 partShift = shift;
Chris@169 857 }
Chris@169 858
Chris@169 859 if (i > partStart + partThreshold && shift != partShift) {
Chris@169 860
Chris@169 861 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
Chris@169 862
Chris@169 863 // pitch has changed, emit an intermediate note
Chris@252 864 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 865 i,
Chris@252 866 note,
Chris@252 867 partShift,
Chris@252 868 shiftCount,
Chris@252 869 partVelocity));
Chris@169 870 partStart = i;
Chris@169 871 partShift = shift;
Chris@169 872 partVelocity = 0;
Chris@169 873 }
Chris@169 874 }
Chris@169 875
Chris@246 876 int v = round(strength * 2);
Chris@169 877 if (v > partVelocity) {
Chris@169 878 partVelocity = v;
Chris@169 879 }
Chris@169 880 }
Chris@169 881
Chris@169 882 if (end >= partStart + partThreshold) {
Chris@252 883 noteFeatures.push_back(makeNoteFeature(partStart,
Chris@252 884 end,
Chris@252 885 note,
Chris@252 886 partShift,
Chris@252 887 shiftCount,
Chris@252 888 partVelocity));
Chris@169 889 }
Chris@169 890 }
Chris@252 891
Chris@309 892 RealTime
Chris@309 893 Silvet::getColumnTimestamp(int column)
Chris@309 894 {
Chris@309 895 double columnDuration = 1.0 / m_colsPerSec;
Chris@309 896 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
Chris@309 897
Chris@309 898 return m_startTime + RealTime::fromSeconds
Chris@309 899 (columnDuration * (column - postFilterLatency) + 0.02);
Chris@309 900 }
Chris@309 901
Chris@252 902 Silvet::Feature
Chris@252 903 Silvet::makeNoteFeature(int start,
Chris@252 904 int end,
Chris@252 905 int note,
Chris@252 906 int shift,
Chris@252 907 int shiftCount,
Chris@252 908 int velocity)
Chris@252 909 {
Chris@252 910 Feature f;
Chris@252 911
Chris@252 912 f.hasTimestamp = true;
Chris@309 913 f.timestamp = getColumnTimestamp(start);
Chris@252 914
Chris@252 915 f.hasDuration = true;
Chris@309 916 f.duration = getColumnTimestamp(end) - f.timestamp;
Chris@252 917
Chris@252 918 f.values.clear();
Chris@252 919
Chris@252 920 f.values.push_back
Chris@252 921 (noteFrequency(note, shift, shiftCount));
Chris@252 922
Chris@252 923 float inputGain = getInputGainAt(f.timestamp);
Chris@252 924 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
Chris@252 925 velocity = round(velocity / inputGain);
Chris@252 926 if (velocity > 127) velocity = 127;
Chris@252 927 if (velocity < 1) velocity = 1;
Chris@252 928 f.values.push_back(velocity);
Chris@252 929
Chris@252 930 f.label = noteName(note, shift, shiftCount);
Chris@252 931
Chris@252 932 return f;
Chris@252 933 }
Chris@252 934
Chris@252 935 float
Chris@252 936 Silvet::getInputGainAt(RealTime t)
Chris@252 937 {
Chris@252 938 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
Chris@252 939
Chris@252 940 if (i == m_inputGains.end()) {
Chris@252 941 if (i != m_inputGains.begin()) {
Chris@252 942 --i;
Chris@252 943 } else {
Chris@252 944 return 1.f; // no data
Chris@252 945 }
Chris@252 946 }
Chris@252 947
Chris@252 948 // cerr << "gain at time " << t << " = " << i->second << endl;
Chris@252 949
Chris@252 950 return i->second;
Chris@252 951 }
Chris@252 952