Mercurial > hg > silvet
comparison src/Silvet.cpp @ 330:8f5cfd7dbaa5 livemode
Merge
author | Chris Cannam |
---|---|
date | Tue, 28 Apr 2015 18:56:54 +0100 |
parents | 447ccdbfc6c0 92293058368a |
children | e8e37f471650 |
comparison
equal
deleted
inserted
replaced
329:447ccdbfc6c0 | 330:8f5cfd7dbaa5 |
---|---|
22 #include "constant-q-cpp/src/dsp/Resampler.h" | 22 #include "constant-q-cpp/src/dsp/Resampler.h" |
23 #include "flattendynamics-ladspa.h" | 23 #include "flattendynamics-ladspa.h" |
24 #include "LiveInstruments.h" | 24 #include "LiveInstruments.h" |
25 | 25 |
26 #include <vector> | 26 #include <vector> |
27 #include <future> | |
27 | 28 |
28 #include <cstdio> | 29 #include <cstdio> |
29 | 30 |
30 using std::vector; | 31 using std::vector; |
31 using std::cout; | 32 using std::cout; |
32 using std::cerr; | 33 using std::cerr; |
33 using std::endl; | 34 using std::endl; |
35 using std::pair; | |
36 using std::future; | |
37 using std::async; | |
34 using Vamp::RealTime; | 38 using Vamp::RealTime; |
35 | 39 |
36 static int processingSampleRate = 44100; | 40 static int processingSampleRate = 44100; |
37 | 41 |
38 static int binsPerSemitoneLive = 1; | 42 static int binsPerSemitoneLive = 1; |
39 static int binsPerSemitoneNormal = 5; | 43 static int binsPerSemitoneNormal = 5; |
40 | 44 |
41 static int minInputSampleRate = 100; | 45 static int minInputSampleRate = 100; |
42 static int maxInputSampleRate = 192000; | 46 static int maxInputSampleRate = 192000; |
47 | |
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode; | |
43 | 49 |
44 Silvet::Silvet(float inputSampleRate) : | 50 Silvet::Silvet(float inputSampleRate) : |
45 Plugin(inputSampleRate), | 51 Plugin(inputSampleRate), |
46 m_instruments(InstrumentPack::listInstrumentPacks()), | 52 m_instruments(InstrumentPack::listInstrumentPacks()), |
47 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)), | 53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)), |
48 m_resampler(0), | 54 m_resampler(0), |
49 m_flattener(0), | 55 m_flattener(0), |
50 m_cq(0), | 56 m_cq(0), |
51 m_mode(HighQualityMode), | 57 m_mode(defaultMode), |
52 m_fineTuning(false), | 58 m_fineTuning(false), |
53 m_instrument(0), | 59 m_instrument(0), |
54 m_colsPerSec(50) | 60 m_colsPerSec(50), |
61 m_haveStartTime(false) | |
55 { | 62 { |
56 } | 63 } |
57 | 64 |
58 Silvet::~Silvet() | 65 Silvet::~Silvet() |
59 { | 66 { |
141 desc.name = "Processing mode"; | 148 desc.name = "Processing mode"; |
142 desc.unit = ""; | 149 desc.unit = ""; |
143 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results."; | 150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results."; |
144 desc.minValue = 0; | 151 desc.minValue = 0; |
145 desc.maxValue = 2; | 152 desc.maxValue = 2; |
146 desc.defaultValue = 1; | 153 desc.defaultValue = int(defaultMode); |
147 desc.isQuantized = true; | 154 desc.isQuantized = true; |
148 desc.quantizeStep = 1; | 155 desc.quantizeStep = 1; |
149 desc.valueNames.push_back("Draft (faster)"); | 156 desc.valueNames.push_back("Draft (faster)"); |
150 desc.valueNames.push_back("Intensive (higher quality)"); | 157 desc.valueNames.push_back("Intensive (higher quality)"); |
151 desc.valueNames.push_back("Live (lower latency)"); | 158 desc.valueNames.push_back("Live (lower latency)"); |
293 d.sampleRate = m_colsPerSec; | 300 d.sampleRate = m_colsPerSec; |
294 d.hasDuration = false; | 301 d.hasDuration = false; |
295 m_pitchOutputNo = list.size(); | 302 m_pitchOutputNo = list.size(); |
296 list.push_back(d); | 303 list.push_back(d); |
297 | 304 |
305 d.identifier = "chroma"; | |
306 d.name = "Pitch chroma distribution"; | |
307 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins."; | |
308 d.unit = ""; | |
309 d.hasFixedBinCount = true; | |
310 d.binCount = 12; | |
311 d.binNames.clear(); | |
312 if (m_cq) { | |
313 for (int i = 0; i < 12; ++i) { | |
314 d.binNames.push_back(chromaName(i)); | |
315 } | |
316 } | |
317 d.hasKnownExtents = false; | |
318 d.isQuantized = false; | |
319 d.sampleType = OutputDescriptor::FixedSampleRate; | |
320 d.sampleRate = m_colsPerSec; | |
321 d.hasDuration = false; | |
322 m_chromaOutputNo = list.size(); | |
323 list.push_back(d); | |
324 | |
298 d.identifier = "templates"; | 325 d.identifier = "templates"; |
299 d.name = "Templates"; | 326 d.name = "Templates"; |
300 d.description = "Constant-Q spectral templates for the selected instrument pack."; | 327 d.description = "Constant-Q spectral templates for the selected instrument pack."; |
301 d.unit = ""; | 328 d.unit = ""; |
302 d.hasFixedBinCount = true; | 329 d.hasFixedBinCount = true; |
326 | 353 |
327 return list; | 354 return list; |
328 } | 355 } |
329 | 356 |
330 std::string | 357 std::string |
331 Silvet::noteName(int note, int shift, int shiftCount) const | 358 Silvet::chromaName(int pitch) const |
332 { | 359 { |
333 static const char *names[] = { | 360 static const char *names[] = { |
334 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#" | 361 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#" |
335 }; | 362 }; |
336 | 363 |
337 const char *n = names[note % 12]; | 364 return names[pitch]; |
365 } | |
366 | |
367 std::string | |
368 Silvet::noteName(int note, int shift, int shiftCount) const | |
369 { | |
370 string n = chromaName(note % 12); | |
338 | 371 |
339 int oct = (note + 9) / 12; | 372 int oct = (note + 9) / 12; |
340 | 373 |
341 char buf[30]; | 374 char buf[30]; |
342 | 375 |
346 pshift = | 379 pshift = |
347 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount; | 380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount; |
348 } | 381 } |
349 | 382 |
350 if (pshift > 0.f) { | 383 if (pshift > 0.f) { |
351 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100))); | 384 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100))); |
352 } else if (pshift < 0.f) { | 385 } else if (pshift < 0.f) { |
353 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100))); | 386 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100))); |
354 } else { | 387 } else { |
355 sprintf(buf, "%s%d", n, oct); | 388 sprintf(buf, "%s%d", n.c_str(), oct); |
356 } | 389 } |
357 | 390 |
358 return buf; | 391 return buf; |
359 } | 392 } |
360 | 393 |
461 CQParameters params(processingSampleRate, | 494 CQParameters params(processingSampleRate, |
462 minFreq, | 495 minFreq, |
463 maxFreq, | 496 maxFreq, |
464 bpo); | 497 bpo); |
465 | 498 |
466 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower | 499 // For params.q, the MIREX code uses 0.8, but it seems that with |
467 // drops the FFT size to 512 from 1024 and alters | 500 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT |
468 // some other processing parameters, making | 501 // size to 512 from 1024 and alters some other processing |
469 // everything much, much slower. Could be a flaw | 502 // parameters, making everything much, much slower. Could be a |
470 // in the CQ parameter calculations, must check | 503 // flaw in the CQ parameter calculations, must check. For |
471 params.atomHopFactor = 0.3; | 504 // atomHopFactor == 1, q == 0.8 is fine |
505 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8); | |
506 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0); | |
472 params.threshold = 0.0005; | 507 params.threshold = 0.0005; |
508 params.decimator = | |
509 (m_mode == LiveMode ? | |
510 CQParameters::FasterDecimator : CQParameters::BetterDecimator); | |
473 params.window = CQParameters::Hann; | 511 params.window = CQParameters::Hann; |
474 | 512 |
475 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear); | 513 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear); |
476 | 514 |
477 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl; | 515 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl; |
490 m_pianoRoll.clear(); | 528 m_pianoRoll.clear(); |
491 m_inputGains.clear(); | 529 m_inputGains.clear(); |
492 m_columnCount = 0; | 530 m_columnCount = 0; |
493 m_resampledCount = 0; | 531 m_resampledCount = 0; |
494 m_startTime = RealTime::zeroTime; | 532 m_startTime = RealTime::zeroTime; |
533 m_haveStartTime = false; | |
495 } | 534 } |
496 | 535 |
497 Silvet::FeatureSet | 536 Silvet::FeatureSet |
498 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp) | 537 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp) |
499 { | 538 { |
500 FeatureSet fs; | 539 FeatureSet fs; |
501 | 540 |
502 if (m_columnCount == 0) { | 541 if (!m_haveStartTime) { |
542 | |
503 m_startTime = timestamp; | 543 m_startTime = timestamp; |
544 m_haveStartTime = true; | |
545 | |
504 insertTemplateFeatures(fs); | 546 insertTemplateFeatures(fs); |
505 } | 547 } |
506 | 548 |
507 vector<float> flattened(m_blockSize); | 549 vector<float> flattened(m_blockSize); |
508 float gain = 1.f; | 550 float gain = 1.f; |
595 fs[m_fcqOutputNo].push_back(f); | 637 fs[m_fcqOutputNo].push_back(f); |
596 } | 638 } |
597 | 639 |
598 int width = filtered.size(); | 640 int width = filtered.size(); |
599 | 641 |
600 int iterations = (m_mode == HighQualityMode ? 20 : 10); | 642 Grid localPitches(width); |
601 | |
602 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0)); | |
603 | 643 |
604 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning; | 644 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning; |
605 int shiftCount = 1; | 645 int shiftCount = 1; |
606 if (wantShifts) { | 646 if (wantShifts) { |
607 shiftCount = pack.templateMaxShift * 2 + 1; | 647 shiftCount = pack.templateMaxShift * 2 + 1; |
608 } | 648 } |
609 | 649 |
610 vector<vector<int> > localBestShifts; | 650 vector<vector<int> > localBestShifts; |
611 if (wantShifts) { | 651 if (wantShifts) { |
612 localBestShifts = | 652 localBestShifts = vector<vector<int> >(width); |
613 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0)); | 653 } |
614 } | 654 |
615 | 655 #ifndef MAX_EM_THREADS |
616 double columnThreshold = 1e-5; | 656 #define MAX_EM_THREADS 8 |
617 | 657 #endif |
618 if (m_mode == LiveMode) { | 658 |
619 columnThreshold /= 20; | 659 int emThreadCount = MAX_EM_THREADS; |
620 } | 660 if (m_mode == LiveMode && pack.templates.size() == 1) { |
621 | 661 // The EM step is probably not slow enough to merit it |
622 #pragma omp parallel for | 662 emThreadCount = 1; |
623 for (int i = 0; i < width; ++i) { | 663 } |
624 | 664 |
625 double sum = 0.0; | 665 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1)) |
626 for (int j = 0; j < pack.templateHeight; ++j) { | 666 if (emThreadCount > 1) { |
627 sum += filtered.at(i).at(j); | 667 for (int i = 0; i < width; ) { |
628 } | 668 typedef future<pair<vector<double>, vector<int>>> EMFuture; |
629 if (sum < columnThreshold) continue; | 669 vector<EMFuture> results; |
630 | 670 for (int j = 0; j < emThreadCount && i + j < width; ++j) { |
631 EM em(&pack, m_mode == HighQualityMode); | 671 results.push_back |
632 | 672 (async(std::launch::async, |
633 em.setPitchSparsity(pack.pitchSparsity); | 673 [&](int index) { |
634 em.setSourceSparsity(pack.sourceSparsity); | 674 return applyEM(pack, filtered.at(index), wantShifts); |
635 | 675 }, i + j)); |
636 for (int j = 0; j < iterations; ++j) { | 676 } |
637 em.iterate(filtered.at(i).data()); | 677 for (int j = 0; j < emThreadCount && i + j < width; ++j) { |
638 } | 678 auto out = results[j].get(); |
639 | 679 localPitches[i+j] = out.first; |
640 const float *pitchDist = em.getPitchDistribution(); | 680 if (wantShifts) localBestShifts[i+j] = out.second; |
641 const float *const *shiftDist = em.getShifts(); | 681 } |
642 | 682 i += emThreadCount; |
643 for (int j = 0; j < pack.templateNoteCount; ++j) { | 683 } |
644 | 684 } |
645 localPitches[i][j] = pitchDist[j] * sum; | 685 #endif |
646 | 686 |
647 int bestShift = 0; | 687 if (emThreadCount == 1) { |
648 float bestShiftValue = 0.0; | 688 for (int i = 0; i < width; ++i) { |
649 if (wantShifts) { | 689 auto out = applyEM(pack, filtered.at(i), wantShifts); |
650 for (int k = 0; k < shiftCount; ++k) { | 690 localPitches[i] = out.first; |
651 float value = shiftDist[k][j]; | 691 if (wantShifts) localBestShifts[i] = out.second; |
652 if (k == 0 || value > bestShiftValue) { | |
653 bestShiftValue = value; | |
654 bestShift = k; | |
655 } | |
656 } | |
657 localBestShifts[i][j] = bestShift; | |
658 } | |
659 } | 692 } |
660 } | 693 } |
661 | 694 |
662 for (int i = 0; i < width; ++i) { | 695 for (int i = 0; i < width; ++i) { |
663 | 696 |
697 // This returns a filtered column, and pushes the | |
698 // up-to-max-polyphony activation column to m_pianoRoll | |
664 vector<double> filtered = postProcess | 699 vector<double> filtered = postProcess |
665 (localPitches[i], localBestShifts[i], wantShifts); | 700 (localPitches[i], localBestShifts[i], wantShifts); |
666 | 701 |
702 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1); | |
703 float inputGain = getInputGainAt(timestamp); | |
704 | |
667 Feature f; | 705 Feature f; |
668 for (int j = 0; j < (int)filtered.size(); ++j) { | 706 for (int j = 0; j < (int)filtered.size(); ++j) { |
669 float v(filtered[j]); | 707 float v = filtered[j]; |
670 if (v < pack.levelThreshold) v = 0.f; | 708 if (v < pack.levelThreshold) v = 0.f; |
671 f.values.push_back(v); | 709 f.values.push_back(v / inputGain); |
672 } | 710 } |
673 fs[m_pitchOutputNo].push_back(f); | 711 fs[m_pitchOutputNo].push_back(f); |
712 | |
713 f.values.clear(); | |
714 f.values.resize(12); | |
715 for (int j = 0; j < (int)filtered.size(); ++j) { | |
716 f.values[j % 12] += filtered[j] / inputGain; | |
717 } | |
718 fs[m_chromaOutputNo].push_back(f); | |
674 | 719 |
675 FeatureList noteFeatures = noteTrack(shiftCount); | 720 FeatureList noteFeatures = noteTrack(shiftCount); |
676 | 721 |
677 for (FeatureList::const_iterator fi = noteFeatures.begin(); | 722 for (FeatureList::const_iterator fi = noteFeatures.begin(); |
678 fi != noteFeatures.end(); ++fi) { | 723 fi != noteFeatures.end(); ++fi) { |
679 fs[m_notesOutputNo].push_back(*fi); | 724 fs[m_notesOutputNo].push_back(*fi); |
680 } | 725 } |
681 } | 726 } |
727 } | |
728 | |
729 pair<vector<double>, vector<int> > | |
730 Silvet::applyEM(const InstrumentPack &pack, | |
731 const vector<double> &column, | |
732 bool wantShifts) | |
733 { | |
734 double columnThreshold = 1e-5; | |
735 | |
736 if (m_mode == LiveMode) { | |
737 columnThreshold /= 20; | |
738 } | |
739 | |
740 vector<double> pitches(pack.templateNoteCount, 0.0); | |
741 vector<int> bestShifts; | |
742 | |
743 double sum = 0.0; | |
744 for (int j = 0; j < pack.templateHeight; ++j) { | |
745 sum += column.at(j); | |
746 } | |
747 if (sum < columnThreshold) return { pitches, bestShifts }; | |
748 | |
749 EM em(&pack, m_mode == HighQualityMode); | |
750 | |
751 em.setPitchSparsity(pack.pitchSparsity); | |
752 em.setSourceSparsity(pack.sourceSparsity); | |
753 | |
754 int iterations = (m_mode == HighQualityMode ? 20 : 10); | |
755 | |
756 for (int j = 0; j < iterations; ++j) { | |
757 em.iterate(column.data()); | |
758 } | |
759 | |
760 const float *pitchDist = em.getPitchDistribution(); | |
761 const float *const *shiftDist = em.getShifts(); | |
762 | |
763 int shiftCount = 1; | |
764 if (wantShifts) { | |
765 shiftCount = pack.templateMaxShift * 2 + 1; | |
766 } | |
767 | |
768 for (int j = 0; j < pack.templateNoteCount; ++j) { | |
769 | |
770 pitches[j] = pitchDist[j] * sum; | |
771 | |
772 int bestShift = 0; | |
773 float bestShiftValue = 0.0; | |
774 if (wantShifts) { | |
775 for (int k = 0; k < shiftCount; ++k) { | |
776 float value = shiftDist[k][j]; | |
777 if (k == 0 || value > bestShiftValue) { | |
778 bestShiftValue = value; | |
779 bestShift = k; | |
780 } | |
781 } | |
782 bestShifts.push_back(bestShift); | |
783 } | |
784 } | |
785 | |
786 return { pitches, bestShifts }; | |
682 } | 787 } |
683 | 788 |
684 Silvet::Grid | 789 Silvet::Grid |
685 Silvet::preProcess(const Grid &in) | 790 Silvet::preProcess(const Grid &in) |
686 { | 791 { |
780 for (int j = 0; j < pack.templateNoteCount; ++j) { | 885 for (int j = 0; j < pack.templateNoteCount; ++j) { |
781 m_postFilter[j]->push(pitches[j]); | 886 m_postFilter[j]->push(pitches[j]); |
782 filtered.push_back(m_postFilter[j]->get()); | 887 filtered.push_back(m_postFilter[j]->get()); |
783 } | 888 } |
784 | 889 |
890 if (m_mode == LiveMode) { | |
891 // In live mode with only a 12-bpo CQ, we are very likely to | |
892 // get clusters of two or three high scores at a time for | |
893 // neighbouring semitones. Eliminate these by picking only the | |
894 // peaks. This means we can't recognise actual semitone chords | |
895 // if they ever appear, but it's not as if live mode is good | |
896 // enough for that to be a big deal anyway. | |
897 for (int j = 0; j < pack.templateNoteCount; ++j) { | |
898 if (j > 0 && j + 1 < pack.templateNoteCount && | |
899 filtered[j] >= filtered[j-1] && | |
900 filtered[j] >= filtered[j+1]) { | |
901 } else { | |
902 filtered[j] = 0.0; | |
903 } | |
904 } | |
905 } | |
906 | |
785 // Threshold for level and reduce number of candidate pitches | 907 // Threshold for level and reduce number of candidate pitches |
786 | 908 |
787 typedef std::multimap<double, int> ValueIndexMap; | 909 typedef std::multimap<double, int> ValueIndexMap; |
788 | 910 |
789 ValueIndexMap strengths; | 911 ValueIndexMap strengths; |
922 } | 1044 } |
923 } | 1045 } |
924 | 1046 |
925 int v; | 1047 int v; |
926 if (m_mode == LiveMode) { | 1048 if (m_mode == LiveMode) { |
927 v = round(strength * 30); | 1049 v = round(strength * 20); |
928 } else { | 1050 } else { |
929 v = round(strength * 2); | 1051 v = round(strength * 2); |
930 } | 1052 } |
931 if (v > partVelocity) { | 1053 if (v > partVelocity) { |
932 partVelocity = v; | 1054 partVelocity = v; |
941 shiftCount, | 1063 shiftCount, |
942 partVelocity)); | 1064 partVelocity)); |
943 } | 1065 } |
944 } | 1066 } |
945 | 1067 |
1068 RealTime | |
1069 Silvet::getColumnTimestamp(int column) | |
1070 { | |
1071 double columnDuration = 1.0 / m_colsPerSec; | |
1072 int postFilterLatency = int(m_postFilter[0]->getSize() / 2); | |
1073 | |
1074 return m_startTime + RealTime::fromSeconds | |
1075 (columnDuration * (column - postFilterLatency) + 0.02); | |
1076 } | |
1077 | |
946 Silvet::Feature | 1078 Silvet::Feature |
947 Silvet::makeNoteFeature(int start, | 1079 Silvet::makeNoteFeature(int start, |
948 int end, | 1080 int end, |
949 int note, | 1081 int note, |
950 int shift, | 1082 int shift, |
951 int shiftCount, | 1083 int shiftCount, |
952 int velocity) | 1084 int velocity) |
953 { | 1085 { |
954 double columnDuration = 1.0 / m_colsPerSec; | |
955 int postFilterLatency = int(m_postFilter[0]->getSize() / 2); | |
956 | |
957 Feature f; | 1086 Feature f; |
958 | 1087 |
959 f.hasTimestamp = true; | 1088 f.hasTimestamp = true; |
960 f.timestamp = m_startTime + RealTime::fromSeconds | 1089 f.timestamp = getColumnTimestamp(start); |
961 (columnDuration * (start - postFilterLatency) + 0.02); | |
962 | 1090 |
963 f.hasDuration = true; | 1091 f.hasDuration = true; |
964 f.duration = RealTime::fromSeconds | 1092 f.duration = getColumnTimestamp(end) - f.timestamp; |
965 (columnDuration * (end - start)); | |
966 | 1093 |
967 f.values.clear(); | 1094 f.values.clear(); |
968 | 1095 |
969 f.values.push_back | 1096 f.values.push_back |
970 (noteFrequency(note, shift, shiftCount)); | 1097 (noteFrequency(note, shift, shiftCount)); |