comparison src/Silvet.cpp @ 330:8f5cfd7dbaa5 livemode

Merge
author Chris Cannam
date Tue, 28 Apr 2015 18:56:54 +0100
parents 447ccdbfc6c0 92293058368a
children e8e37f471650
comparison
equal deleted inserted replaced
329:447ccdbfc6c0 330:8f5cfd7dbaa5
22 #include "constant-q-cpp/src/dsp/Resampler.h" 22 #include "constant-q-cpp/src/dsp/Resampler.h"
23 #include "flattendynamics-ladspa.h" 23 #include "flattendynamics-ladspa.h"
24 #include "LiveInstruments.h" 24 #include "LiveInstruments.h"
25 25
26 #include <vector> 26 #include <vector>
27 #include <future>
27 28
28 #include <cstdio> 29 #include <cstdio>
29 30
30 using std::vector; 31 using std::vector;
31 using std::cout; 32 using std::cout;
32 using std::cerr; 33 using std::cerr;
33 using std::endl; 34 using std::endl;
35 using std::pair;
36 using std::future;
37 using std::async;
34 using Vamp::RealTime; 38 using Vamp::RealTime;
35 39
36 static int processingSampleRate = 44100; 40 static int processingSampleRate = 44100;
37 41
38 static int binsPerSemitoneLive = 1; 42 static int binsPerSemitoneLive = 1;
39 static int binsPerSemitoneNormal = 5; 43 static int binsPerSemitoneNormal = 5;
40 44
41 static int minInputSampleRate = 100; 45 static int minInputSampleRate = 100;
42 static int maxInputSampleRate = 192000; 46 static int maxInputSampleRate = 192000;
47
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
43 49
44 Silvet::Silvet(float inputSampleRate) : 50 Silvet::Silvet(float inputSampleRate) :
45 Plugin(inputSampleRate), 51 Plugin(inputSampleRate),
46 m_instruments(InstrumentPack::listInstrumentPacks()), 52 m_instruments(InstrumentPack::listInstrumentPacks()),
47 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)), 53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
48 m_resampler(0), 54 m_resampler(0),
49 m_flattener(0), 55 m_flattener(0),
50 m_cq(0), 56 m_cq(0),
51 m_mode(HighQualityMode), 57 m_mode(defaultMode),
52 m_fineTuning(false), 58 m_fineTuning(false),
53 m_instrument(0), 59 m_instrument(0),
54 m_colsPerSec(50) 60 m_colsPerSec(50),
61 m_haveStartTime(false)
55 { 62 {
56 } 63 }
57 64
58 Silvet::~Silvet() 65 Silvet::~Silvet()
59 { 66 {
141 desc.name = "Processing mode"; 148 desc.name = "Processing mode";
142 desc.unit = ""; 149 desc.unit = "";
143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results."; 150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
144 desc.minValue = 0; 151 desc.minValue = 0;
145 desc.maxValue = 2; 152 desc.maxValue = 2;
146 desc.defaultValue = 1; 153 desc.defaultValue = int(defaultMode);
147 desc.isQuantized = true; 154 desc.isQuantized = true;
148 desc.quantizeStep = 1; 155 desc.quantizeStep = 1;
149 desc.valueNames.push_back("Draft (faster)"); 156 desc.valueNames.push_back("Draft (faster)");
150 desc.valueNames.push_back("Intensive (higher quality)"); 157 desc.valueNames.push_back("Intensive (higher quality)");
151 desc.valueNames.push_back("Live (lower latency)"); 158 desc.valueNames.push_back("Live (lower latency)");
293 d.sampleRate = m_colsPerSec; 300 d.sampleRate = m_colsPerSec;
294 d.hasDuration = false; 301 d.hasDuration = false;
295 m_pitchOutputNo = list.size(); 302 m_pitchOutputNo = list.size();
296 list.push_back(d); 303 list.push_back(d);
297 304
305 d.identifier = "chroma";
306 d.name = "Pitch chroma distribution";
307 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
308 d.unit = "";
309 d.hasFixedBinCount = true;
310 d.binCount = 12;
311 d.binNames.clear();
312 if (m_cq) {
313 for (int i = 0; i < 12; ++i) {
314 d.binNames.push_back(chromaName(i));
315 }
316 }
317 d.hasKnownExtents = false;
318 d.isQuantized = false;
319 d.sampleType = OutputDescriptor::FixedSampleRate;
320 d.sampleRate = m_colsPerSec;
321 d.hasDuration = false;
322 m_chromaOutputNo = list.size();
323 list.push_back(d);
324
298 d.identifier = "templates"; 325 d.identifier = "templates";
299 d.name = "Templates"; 326 d.name = "Templates";
300 d.description = "Constant-Q spectral templates for the selected instrument pack."; 327 d.description = "Constant-Q spectral templates for the selected instrument pack.";
301 d.unit = ""; 328 d.unit = "";
302 d.hasFixedBinCount = true; 329 d.hasFixedBinCount = true;
326 353
327 return list; 354 return list;
328 } 355 }
329 356
330 std::string 357 std::string
331 Silvet::noteName(int note, int shift, int shiftCount) const 358 Silvet::chromaName(int pitch) const
332 { 359 {
333 static const char *names[] = { 360 static const char *names[] = {
334 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#" 361 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
335 }; 362 };
336 363
337 const char *n = names[note % 12]; 364 return names[pitch];
365 }
366
367 std::string
368 Silvet::noteName(int note, int shift, int shiftCount) const
369 {
370 string n = chromaName(note % 12);
338 371
339 int oct = (note + 9) / 12; 372 int oct = (note + 9) / 12;
340 373
341 char buf[30]; 374 char buf[30];
342 375
346 pshift = 379 pshift =
347 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount; 380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
348 } 381 }
349 382
350 if (pshift > 0.f) { 383 if (pshift > 0.f) {
351 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100))); 384 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
352 } else if (pshift < 0.f) { 385 } else if (pshift < 0.f) {
353 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100))); 386 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
354 } else { 387 } else {
355 sprintf(buf, "%s%d", n, oct); 388 sprintf(buf, "%s%d", n.c_str(), oct);
356 } 389 }
357 390
358 return buf; 391 return buf;
359 } 392 }
360 393
461 CQParameters params(processingSampleRate, 494 CQParameters params(processingSampleRate,
462 minFreq, 495 minFreq,
463 maxFreq, 496 maxFreq,
464 bpo); 497 bpo);
465 498
466 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower 499 // For params.q, the MIREX code uses 0.8, but it seems that with
467 // drops the FFT size to 512 from 1024 and alters 500 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT
468 // some other processing parameters, making 501 // size to 512 from 1024 and alters some other processing
469 // everything much, much slower. Could be a flaw 502 // parameters, making everything much, much slower. Could be a
470 // in the CQ parameter calculations, must check 503 // flaw in the CQ parameter calculations, must check. For
471 params.atomHopFactor = 0.3; 504 // atomHopFactor == 1, q == 0.8 is fine
505 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8);
506 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0);
472 params.threshold = 0.0005; 507 params.threshold = 0.0005;
508 params.decimator =
509 (m_mode == LiveMode ?
510 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
473 params.window = CQParameters::Hann; 511 params.window = CQParameters::Hann;
474 512
475 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear); 513 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
476 514
477 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl; 515 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
490 m_pianoRoll.clear(); 528 m_pianoRoll.clear();
491 m_inputGains.clear(); 529 m_inputGains.clear();
492 m_columnCount = 0; 530 m_columnCount = 0;
493 m_resampledCount = 0; 531 m_resampledCount = 0;
494 m_startTime = RealTime::zeroTime; 532 m_startTime = RealTime::zeroTime;
533 m_haveStartTime = false;
495 } 534 }
496 535
497 Silvet::FeatureSet 536 Silvet::FeatureSet
498 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp) 537 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
499 { 538 {
500 FeatureSet fs; 539 FeatureSet fs;
501 540
502 if (m_columnCount == 0) { 541 if (!m_haveStartTime) {
542
503 m_startTime = timestamp; 543 m_startTime = timestamp;
544 m_haveStartTime = true;
545
504 insertTemplateFeatures(fs); 546 insertTemplateFeatures(fs);
505 } 547 }
506 548
507 vector<float> flattened(m_blockSize); 549 vector<float> flattened(m_blockSize);
508 float gain = 1.f; 550 float gain = 1.f;
595 fs[m_fcqOutputNo].push_back(f); 637 fs[m_fcqOutputNo].push_back(f);
596 } 638 }
597 639
598 int width = filtered.size(); 640 int width = filtered.size();
599 641
600 int iterations = (m_mode == HighQualityMode ? 20 : 10); 642 Grid localPitches(width);
601
602 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
603 643
604 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning; 644 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
605 int shiftCount = 1; 645 int shiftCount = 1;
606 if (wantShifts) { 646 if (wantShifts) {
607 shiftCount = pack.templateMaxShift * 2 + 1; 647 shiftCount = pack.templateMaxShift * 2 + 1;
608 } 648 }
609 649
610 vector<vector<int> > localBestShifts; 650 vector<vector<int> > localBestShifts;
611 if (wantShifts) { 651 if (wantShifts) {
612 localBestShifts = 652 localBestShifts = vector<vector<int> >(width);
613 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0)); 653 }
614 } 654
615 655 #ifndef MAX_EM_THREADS
616 double columnThreshold = 1e-5; 656 #define MAX_EM_THREADS 8
617 657 #endif
618 if (m_mode == LiveMode) { 658
619 columnThreshold /= 20; 659 int emThreadCount = MAX_EM_THREADS;
620 } 660 if (m_mode == LiveMode && pack.templates.size() == 1) {
621 661 // The EM step is probably not slow enough to merit it
622 #pragma omp parallel for 662 emThreadCount = 1;
623 for (int i = 0; i < width; ++i) { 663 }
624 664
625 double sum = 0.0; 665 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
626 for (int j = 0; j < pack.templateHeight; ++j) { 666 if (emThreadCount > 1) {
627 sum += filtered.at(i).at(j); 667 for (int i = 0; i < width; ) {
628 } 668 typedef future<pair<vector<double>, vector<int>>> EMFuture;
629 if (sum < columnThreshold) continue; 669 vector<EMFuture> results;
630 670 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
631 EM em(&pack, m_mode == HighQualityMode); 671 results.push_back
632 672 (async(std::launch::async,
633 em.setPitchSparsity(pack.pitchSparsity); 673 [&](int index) {
634 em.setSourceSparsity(pack.sourceSparsity); 674 return applyEM(pack, filtered.at(index), wantShifts);
635 675 }, i + j));
636 for (int j = 0; j < iterations; ++j) { 676 }
637 em.iterate(filtered.at(i).data()); 677 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
638 } 678 auto out = results[j].get();
639 679 localPitches[i+j] = out.first;
640 const float *pitchDist = em.getPitchDistribution(); 680 if (wantShifts) localBestShifts[i+j] = out.second;
641 const float *const *shiftDist = em.getShifts(); 681 }
642 682 i += emThreadCount;
643 for (int j = 0; j < pack.templateNoteCount; ++j) { 683 }
644 684 }
645 localPitches[i][j] = pitchDist[j] * sum; 685 #endif
646 686
647 int bestShift = 0; 687 if (emThreadCount == 1) {
648 float bestShiftValue = 0.0; 688 for (int i = 0; i < width; ++i) {
649 if (wantShifts) { 689 auto out = applyEM(pack, filtered.at(i), wantShifts);
650 for (int k = 0; k < shiftCount; ++k) { 690 localPitches[i] = out.first;
651 float value = shiftDist[k][j]; 691 if (wantShifts) localBestShifts[i] = out.second;
652 if (k == 0 || value > bestShiftValue) {
653 bestShiftValue = value;
654 bestShift = k;
655 }
656 }
657 localBestShifts[i][j] = bestShift;
658 }
659 } 692 }
660 } 693 }
661 694
662 for (int i = 0; i < width; ++i) { 695 for (int i = 0; i < width; ++i) {
663 696
697 // This returns a filtered column, and pushes the
698 // up-to-max-polyphony activation column to m_pianoRoll
664 vector<double> filtered = postProcess 699 vector<double> filtered = postProcess
665 (localPitches[i], localBestShifts[i], wantShifts); 700 (localPitches[i], localBestShifts[i], wantShifts);
666 701
702 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
703 float inputGain = getInputGainAt(timestamp);
704
667 Feature f; 705 Feature f;
668 for (int j = 0; j < (int)filtered.size(); ++j) { 706 for (int j = 0; j < (int)filtered.size(); ++j) {
669 float v(filtered[j]); 707 float v = filtered[j];
670 if (v < pack.levelThreshold) v = 0.f; 708 if (v < pack.levelThreshold) v = 0.f;
671 f.values.push_back(v); 709 f.values.push_back(v / inputGain);
672 } 710 }
673 fs[m_pitchOutputNo].push_back(f); 711 fs[m_pitchOutputNo].push_back(f);
712
713 f.values.clear();
714 f.values.resize(12);
715 for (int j = 0; j < (int)filtered.size(); ++j) {
716 f.values[j % 12] += filtered[j] / inputGain;
717 }
718 fs[m_chromaOutputNo].push_back(f);
674 719
675 FeatureList noteFeatures = noteTrack(shiftCount); 720 FeatureList noteFeatures = noteTrack(shiftCount);
676 721
677 for (FeatureList::const_iterator fi = noteFeatures.begin(); 722 for (FeatureList::const_iterator fi = noteFeatures.begin();
678 fi != noteFeatures.end(); ++fi) { 723 fi != noteFeatures.end(); ++fi) {
679 fs[m_notesOutputNo].push_back(*fi); 724 fs[m_notesOutputNo].push_back(*fi);
680 } 725 }
681 } 726 }
727 }
728
729 pair<vector<double>, vector<int> >
730 Silvet::applyEM(const InstrumentPack &pack,
731 const vector<double> &column,
732 bool wantShifts)
733 {
734 double columnThreshold = 1e-5;
735
736 if (m_mode == LiveMode) {
737 columnThreshold /= 20;
738 }
739
740 vector<double> pitches(pack.templateNoteCount, 0.0);
741 vector<int> bestShifts;
742
743 double sum = 0.0;
744 for (int j = 0; j < pack.templateHeight; ++j) {
745 sum += column.at(j);
746 }
747 if (sum < columnThreshold) return { pitches, bestShifts };
748
749 EM em(&pack, m_mode == HighQualityMode);
750
751 em.setPitchSparsity(pack.pitchSparsity);
752 em.setSourceSparsity(pack.sourceSparsity);
753
754 int iterations = (m_mode == HighQualityMode ? 20 : 10);
755
756 for (int j = 0; j < iterations; ++j) {
757 em.iterate(column.data());
758 }
759
760 const float *pitchDist = em.getPitchDistribution();
761 const float *const *shiftDist = em.getShifts();
762
763 int shiftCount = 1;
764 if (wantShifts) {
765 shiftCount = pack.templateMaxShift * 2 + 1;
766 }
767
768 for (int j = 0; j < pack.templateNoteCount; ++j) {
769
770 pitches[j] = pitchDist[j] * sum;
771
772 int bestShift = 0;
773 float bestShiftValue = 0.0;
774 if (wantShifts) {
775 for (int k = 0; k < shiftCount; ++k) {
776 float value = shiftDist[k][j];
777 if (k == 0 || value > bestShiftValue) {
778 bestShiftValue = value;
779 bestShift = k;
780 }
781 }
782 bestShifts.push_back(bestShift);
783 }
784 }
785
786 return { pitches, bestShifts };
682 } 787 }
683 788
684 Silvet::Grid 789 Silvet::Grid
685 Silvet::preProcess(const Grid &in) 790 Silvet::preProcess(const Grid &in)
686 { 791 {
780 for (int j = 0; j < pack.templateNoteCount; ++j) { 885 for (int j = 0; j < pack.templateNoteCount; ++j) {
781 m_postFilter[j]->push(pitches[j]); 886 m_postFilter[j]->push(pitches[j]);
782 filtered.push_back(m_postFilter[j]->get()); 887 filtered.push_back(m_postFilter[j]->get());
783 } 888 }
784 889
890 if (m_mode == LiveMode) {
891 // In live mode with only a 12-bpo CQ, we are very likely to
892 // get clusters of two or three high scores at a time for
893 // neighbouring semitones. Eliminate these by picking only the
894 // peaks. This means we can't recognise actual semitone chords
895 // if they ever appear, but it's not as if live mode is good
896 // enough for that to be a big deal anyway.
897 for (int j = 0; j < pack.templateNoteCount; ++j) {
898 if (j > 0 && j + 1 < pack.templateNoteCount &&
899 filtered[j] >= filtered[j-1] &&
900 filtered[j] >= filtered[j+1]) {
901 } else {
902 filtered[j] = 0.0;
903 }
904 }
905 }
906
785 // Threshold for level and reduce number of candidate pitches 907 // Threshold for level and reduce number of candidate pitches
786 908
787 typedef std::multimap<double, int> ValueIndexMap; 909 typedef std::multimap<double, int> ValueIndexMap;
788 910
789 ValueIndexMap strengths; 911 ValueIndexMap strengths;
922 } 1044 }
923 } 1045 }
924 1046
925 int v; 1047 int v;
926 if (m_mode == LiveMode) { 1048 if (m_mode == LiveMode) {
927 v = round(strength * 30); 1049 v = round(strength * 20);
928 } else { 1050 } else {
929 v = round(strength * 2); 1051 v = round(strength * 2);
930 } 1052 }
931 if (v > partVelocity) { 1053 if (v > partVelocity) {
932 partVelocity = v; 1054 partVelocity = v;
941 shiftCount, 1063 shiftCount,
942 partVelocity)); 1064 partVelocity));
943 } 1065 }
944 } 1066 }
945 1067
1068 RealTime
1069 Silvet::getColumnTimestamp(int column)
1070 {
1071 double columnDuration = 1.0 / m_colsPerSec;
1072 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
1073
1074 return m_startTime + RealTime::fromSeconds
1075 (columnDuration * (column - postFilterLatency) + 0.02);
1076 }
1077
946 Silvet::Feature 1078 Silvet::Feature
947 Silvet::makeNoteFeature(int start, 1079 Silvet::makeNoteFeature(int start,
948 int end, 1080 int end,
949 int note, 1081 int note,
950 int shift, 1082 int shift,
951 int shiftCount, 1083 int shiftCount,
952 int velocity) 1084 int velocity)
953 { 1085 {
954 double columnDuration = 1.0 / m_colsPerSec;
955 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
956
957 Feature f; 1086 Feature f;
958 1087
959 f.hasTimestamp = true; 1088 f.hasTimestamp = true;
960 f.timestamp = m_startTime + RealTime::fromSeconds 1089 f.timestamp = getColumnTimestamp(start);
961 (columnDuration * (start - postFilterLatency) + 0.02);
962 1090
963 f.hasDuration = true; 1091 f.hasDuration = true;
964 f.duration = RealTime::fromSeconds 1092 f.duration = getColumnTimestamp(end) - f.timestamp;
965 (columnDuration * (end - start));
966 1093
967 f.values.clear(); 1094 f.values.clear();
968 1095
969 f.values.push_back 1096 f.values.push_back
970 (noteFrequency(note, shift, shiftCount)); 1097 (noteFrequency(note, shift, shiftCount));