comparison src/SubsequenceMatchVampPlugin.cpp @ 246:aac9ad4064ea subsequence tip

Fix incorrect handling of silent tail in the non-subsequence MATCH phase; some debug output changes
author Chris Cannam
date Fri, 24 Jul 2020 14:29:55 +0100
parents f68277668ad4
children
comparison
equal deleted inserted replaced
245:91796976e8c6 246:aac9ad4064ea
29 using std::vector; 29 using std::vector;
30 using std::cerr; 30 using std::cerr;
31 using std::cout; 31 using std::cout;
32 using std::endl; 32 using std::endl;
33 33
34 //#define DEBUG_SUBSEQUENCE_MATCH 1
35
34 // We want to ensure our freq map / crossover bin are always valid 36 // We want to ensure our freq map / crossover bin are always valid
35 // with a fixed FFT length in seconds, so must reject low sample rates 37 // with a fixed FFT length in seconds, so must reject low sample rates
36 static float sampleRateMin = 5000.f; 38 static float sampleRateMin = 5000.f;
37 39
38 static float defaultStepTime = 0.020f; 40 static float defaultStepTime = 0.020f;
44 Plugin(inputSampleRate), 46 Plugin(inputSampleRate),
45 m_stepSize(int(inputSampleRate * defaultStepTime + 0.001)), 47 m_stepSize(int(inputSampleRate * defaultStepTime + 0.001)),
46 m_stepTime(defaultStepTime), 48 m_stepTime(defaultStepTime),
47 m_blockSize(2048), 49 m_blockSize(2048),
48 m_coarseDownsample(defaultCoarseDownsample), 50 m_coarseDownsample(defaultCoarseDownsample),
51 m_downsamplePeaks(false),
49 m_serialise(false), 52 m_serialise(false),
50 m_smooth(false), 53 m_smooth(false),
51 m_channelCount(0), 54 m_channelCount(0),
52 m_params(defaultStepTime), 55 m_params(defaultStepTime),
53 m_defaultParams(defaultStepTime), 56 m_defaultParams(defaultStepTime),
136 desc.unit = "Hz"; 139 desc.unit = "Hz";
137 list.push_back(desc); 140 list.push_back(desc);
138 141
139 desc.identifier = "freq2"; 142 desc.identifier = "freq2";
140 desc.name = "Tuning frequency of second input"; 143 desc.name = "Tuning frequency of second input";
141 desc.description = "Tuning frequency (concert A) for the other audio"; 144 desc.description = "Tuning frequency (concert A) for the other audio.";
142 desc.minValue = 220.0; 145 desc.minValue = 220.0;
143 desc.maxValue = 880.0; 146 desc.maxValue = 880.0;
144 desc.defaultValue = float(m_defaultFeParams.referenceFrequency); 147 desc.defaultValue = float(m_defaultFeParams.referenceFrequency);
145 desc.isQuantized = false; 148 desc.isQuantized = false;
146 desc.unit = "Hz"; 149 desc.unit = "Hz";
176 desc.defaultValue = float(defaultCoarseDownsample); 179 desc.defaultValue = float(defaultCoarseDownsample);
177 desc.isQuantized = true; 180 desc.isQuantized = true;
178 desc.quantizeStep = 1; 181 desc.quantizeStep = 1;
179 list.push_back(desc); 182 list.push_back(desc);
180 183
184 desc.identifier = "downsamplemethod";
185 desc.name = "Coarse alignment downsample method";
186 desc.description = "Downsample method for features used in first coarse subsequence-alignment step";
187 desc.minValue = 0;
188 desc.maxValue = 1;
189 desc.defaultValue = 0;
190 desc.isQuantized = true;
191 desc.quantizeStep = 1;
192 desc.valueNames.clear();
193 desc.valueNames.push_back("Average");
194 desc.valueNames.push_back("Peak");
195 list.push_back(desc);
196
181 desc.identifier = "usechroma"; 197 desc.identifier = "usechroma";
182 desc.name = "Feature type"; 198 desc.name = "Feature type";
183 desc.description = "Whether to use warped spectrogram or chroma frequency map"; 199 desc.description = "Whether to use warped spectrogram or chroma frequency map";
184 desc.minValue = 0; 200 desc.minValue = 0;
185 desc.maxValue = 1; 201 desc.maxValue = 1;
193 209
194 desc.valueNames.clear(); 210 desc.valueNames.clear();
195 211
196 desc.identifier = "usespecdiff"; 212 desc.identifier = "usespecdiff";
197 desc.name = "Use feature difference"; 213 desc.name = "Use feature difference";
198 desc.description = "Whether to use half-wave rectified feature-to-feature difference instead of straight spectral or chroma feature"; 214 desc.description = "Whether to use half-wave rectified feature-to-feature difference instead of straight spectral or chroma feature (does not apply to downsampled features)";
199 desc.minValue = 0; 215 desc.minValue = 0;
200 desc.maxValue = 1; 216 desc.maxValue = 1;
201 desc.defaultValue = float(m_defaultFcParams.order); 217 desc.defaultValue = float(m_defaultFcParams.order);
202 desc.isQuantized = true; 218 desc.isQuantized = true;
203 desc.quantizeStep = 1; 219 desc.quantizeStep = 1;
383 return float(m_feParams.minFrequency); 399 return float(m_feParams.minFrequency);
384 } else if (name == "maxfreq") { 400 } else if (name == "maxfreq") {
385 return float(m_feParams.maxFrequency); 401 return float(m_feParams.maxFrequency);
386 } else if (name == "coarsedownsample") { 402 } else if (name == "coarsedownsample") {
387 return float(m_coarseDownsample); 403 return float(m_coarseDownsample);
404 } else if (name == "downsamplemethod") {
405 return m_downsamplePeaks ? 1.0 : 0.0;
388 } 406 }
389 407
390 return 0.0; 408 return 0.0;
391 } 409 }
392 410
429 m_feParams.minFrequency = value; 447 m_feParams.minFrequency = value;
430 } else if (name == "maxfreq") { 448 } else if (name == "maxfreq") {
431 m_feParams.maxFrequency = value; 449 m_feParams.maxFrequency = value;
432 } else if (name == "coarsedownsample") { 450 } else if (name == "coarsedownsample") {
433 m_coarseDownsample = int(value + 0.1); 451 m_coarseDownsample = int(value + 0.1);
434 } 452 } else if (name == "downsamplemethod") {
453 m_downsamplePeaks = (value > 0.5);
454 }
455 }
456
457 SubsequenceMatchVampPlugin::InputDomain
458 SubsequenceMatchVampPlugin::getInputDomain() const
459 {
460 return FrequencyDomain;
435 } 461 }
436 462
437 size_t 463 size_t
438 SubsequenceMatchVampPlugin::getPreferredStepSize() const 464 SubsequenceMatchVampPlugin::getPreferredStepSize() const
439 { 465 {
442 468
443 size_t 469 size_t
444 SubsequenceMatchVampPlugin::getPreferredBlockSize() const 470 SubsequenceMatchVampPlugin::getPreferredBlockSize() const
445 { 471 {
446 return m_defaultFeParams.fftSize; 472 return m_defaultFeParams.fftSize;
473 }
474
475 size_t
476 SubsequenceMatchVampPlugin::getMinChannelCount() const
477 {
478 return 2;
479 }
480
481 size_t
482 SubsequenceMatchVampPlugin::getMaxChannelCount() const
483 {
484 return 2;
447 } 485 }
448 486
449 bool 487 bool
450 SubsequenceMatchVampPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize) 488 SubsequenceMatchVampPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
451 { 489 {
567 } 605 }
568 606
569 return {}; 607 return {};
570 } 608 }
571 609
572 featureseq_t 610 size_t
573 SubsequenceMatchVampPlugin::downsample(const featureseq_t &ff) 611 SubsequenceMatchVampPlugin::findNonEmptyLength(const featureseq_t &ff)
574 { 612 {
575 if (ff.empty()) { 613 bool haveNonEmpty = false;
576 return ff;
577 }
578
579 size_t lastNonEmpty = 0; 614 size_t lastNonEmpty = 0;
580 for (size_t i = ff.size(); i > 0; ) { 615 for (size_t i = ff.size(); i > 0; ) {
581 --i; 616 --i;
582 if (MatchPipeline::isAboveEndingThreshold(ff[i])) { 617 if (MatchPipeline::isAboveEndingThreshold(ff[i])) {
618 haveNonEmpty = true;
583 lastNonEmpty = i; 619 lastNonEmpty = i;
584 break; 620 break;
585 } 621 }
622 }
623 if (haveNonEmpty) {
624 return lastNonEmpty + 1;
625 } else {
626 return 0;
627 }
628 }
629
630 featureseq_t
631 SubsequenceMatchVampPlugin::downsample(const featureseq_t &ff,
632 size_t inLength)
633 {
634 if (ff.empty()) {
635 return ff;
586 } 636 }
587 637
588 FeatureConditioner::Parameters fcParams(m_fcParams); 638 FeatureConditioner::Parameters fcParams(m_fcParams);
589 fcParams.order = FeatureConditioner::OutputFeatures; // not the difference 639 fcParams.order = FeatureConditioner::OutputFeatures; // not the difference
590 FeatureConditioner fc(fcParams); 640 FeatureConditioner fc(fcParams);
592 int featureSize = m_featureExtractors[0].getFeatureSize(); 642 int featureSize = m_featureExtractors[0].getFeatureSize();
593 643
594 featureseq_t d; 644 featureseq_t d;
595 645
596 size_t i = 0; 646 size_t i = 0;
597 while (i < lastNonEmpty) { 647 while (i < inLength) {
598 feature_t acc(featureSize, 0); 648 feature_t acc(featureSize, 0);
599 int j = 0; 649 int j = 0;
600 while (j < m_coarseDownsample) { 650 while (j < m_coarseDownsample) {
601 if (i >= ff.size()) break; 651 if (i >= ff.size()) break;
602 feature_t feature = fc.process(ff[i]); 652 feature_t feature = fc.process(ff[i]);
603 for (int k = 0; k < featureSize; ++k) { 653 if (m_downsamplePeaks) {
604 acc[k] += feature[k]; 654 for (int k = 0; k < featureSize; ++k) {
655 if (feature[k] > acc[k]) {
656 acc[k] = feature[k];
657 }
658 }
659 } else {
660 for (int k = 0; k < featureSize; ++k) {
661 acc[k] += feature[k];
662 }
605 } 663 }
606 ++i; 664 ++i;
607 ++j; 665 ++j;
608 } 666 }
609 if (j > 0) { 667 if (!m_downsamplePeaks && j > 0) {
610 for (int k = 0; k < featureSize; ++k) { 668 for (int k = 0; k < featureSize; ++k) {
611 acc[k] /= float(j); 669 acc[k] /= float(j);
612 } 670 }
613 } 671 }
614 d.push_back(acc); 672 d.push_back(acc);
662 } 720 }
663 721
664 SubsequenceMatchVampPlugin::FeatureSet 722 SubsequenceMatchVampPlugin::FeatureSet
665 SubsequenceMatchVampPlugin::performAlignment() 723 SubsequenceMatchVampPlugin::performAlignment()
666 { 724 {
667 featureseq_t downsampledRef = downsample(m_features[0]); 725 size_t refLength = findNonEmptyLength(m_features[0]);
668 726 featureseq_t downsampledRef = downsample(m_features[0], refLength);
669 cerr << "SubsequenceMatchVampPlugin: reference downsampled sequence length = " << downsampledRef.size() << endl; 727
728 #ifdef DEBUG_SUBSEQUENCE_MATCH
729 cerr << "SubsequenceMatchVampPlugin: reference downsampled sequence length = " << downsampledRef.size() << " (from " << refLength << " non-empty of " << m_features[0].size() << " total)" << endl;
730 #endif
670 731
671 FullDTW dtw(m_fdParams, m_dParams); 732 FullDTW dtw(m_fdParams, m_dParams);
672 733
673 FeatureSet returnFeatures; 734 FeatureSet returnFeatures;
674 int featureSize = m_featureExtractors[0].getFeatureSize(); 735 int featureSize = m_featureExtractors[0].getFeatureSize();
675 736
676 int rate = int(m_inputSampleRate + 0.5); 737 int rate = int(m_inputSampleRate + 0.5);
677 738
678 for (size_t c = 1; c < m_channelCount; ++c) { 739 for (size_t c = 1; c < m_channelCount; ++c) {
679 740
680 featureseq_t downsampledOther = downsample(m_features[c]); 741 size_t otherLength = findNonEmptyLength(m_features[c]);
681 742 featureseq_t downsampledOther = downsample(m_features[c], otherLength);
682 cerr << "SubsequenceMatchVampPlugin: other downsampled sequence length = " << downsampledOther.size() << endl; 743
744 #ifdef DEBUG_SUBSEQUENCE_MATCH
745 cerr << "SubsequenceMatchVampPlugin: other downsampled sequence length = " << downsampledOther.size() << " (from " << otherLength << " non-empty of " << m_features[c].size() << " total)" << endl;
746 #endif
683 747
684 vector<size_t> subsequenceAlignment = dtw.align(downsampledRef, 748 vector<size_t> subsequenceAlignment = dtw.align(downsampledRef,
685 downsampledOther); 749 downsampledOther);
686 750
687 if (subsequenceAlignment.empty()) { 751 if (subsequenceAlignment.empty()) {
689 continue; 753 continue;
690 } 754 }
691 755
692 int64_t first = subsequenceAlignment[0]; 756 int64_t first = subsequenceAlignment[0];
693 int64_t last = subsequenceAlignment[subsequenceAlignment.size()-1]; 757 int64_t last = subsequenceAlignment[subsequenceAlignment.size()-1];
694 cerr << "Subsequence alignment span: " << first << " to " << last << endl; 758
695 759 #ifdef DEBUG_SUBSEQUENCE_MATCH
760 cerr << "Subsequence alignment maps 0 -> " << subsequenceAlignment.size()-1 << " to " << first << " -> " << last << endl;
761 #endif
696 762
697 if (last <= first) { 763 if (last <= first) {
698 cerr << "NOTE: Invalid span (" << first << " to " << last 764 cerr << "NOTE: Invalid span (" << first << " to " << last
699 << "), reverting to aligning against whole of reference" 765 << "), reverting to aligning against whole of reference"
700 << endl; 766 << endl;
727 793
728 featureseq_t referenceSubsequence 794 featureseq_t referenceSubsequence
729 (m_features[0].begin() + firstAtOriginalRate, 795 (m_features[0].begin() + firstAtOriginalRate,
730 m_features[0].begin() + lastAtOriginalRate); 796 m_features[0].begin() + lastAtOriginalRate);
731 797
798 #ifdef DEBUG_SUBSEQUENCE_MATCH
799 cerr << "Reference subsequence length = " << referenceSubsequence.size()
800 << endl;
801 cerr << "Other sequence length = " << otherLength << endl;
802 #endif
803
732 MatchPipeline pipeline(m_feParams, 804 MatchPipeline pipeline(m_feParams,
733 m_fcParams, 805 m_fcParams,
734 m_dParams, 806 m_dParams,
735 m_params, 807 m_params,
736 m_secondReferenceFrequency); 808 m_secondReferenceFrequency);
737 809
738 for (size_t i = 0; i < referenceSubsequence.size() && 810 size_t sequenceLength = std::max(referenceSubsequence.size(),
739 i < m_features[c].size(); ++i) { 811 otherLength);
812
813 #ifdef DEBUG_SUBSEQUENCE_MATCH
814 cerr << "MATCH input sequences have length " << sequenceLength << endl;
815 #endif
816
817 for (size_t i = 0; i < sequenceLength; ++i) {
740 feature_t f1(featureSize, 0); 818 feature_t f1(featureSize, 0);
741 feature_t f2(featureSize, 0); 819 feature_t f2(featureSize, 0);
742 if (i < referenceSubsequence.size()) { 820 if (i < referenceSubsequence.size()) {
743 f1 = referenceSubsequence[i]; 821 f1 = referenceSubsequence[i];
744 } 822 }
745 if (i < m_features[c].size()) { 823 if (i < otherLength) {
746 f2 = m_features[c][i]; 824 f2 = m_features[c][i];
747 } 825 }
748 pipeline.feedFeatures(f1, f2); 826 pipeline.feedFeatures(f1, f2);
749 } 827 }
750 828
753 vector<int> pathx; 831 vector<int> pathx;
754 vector<int> pathy; 832 vector<int> pathy;
755 int len = pipeline.retrievePath(m_smooth, pathx, pathy); 833 int len = pipeline.retrievePath(m_smooth, pathx, pathy);
756 834
757 int prevy = 0; 835 int prevy = 0;
836
837 #ifdef DEBUG_SUBSEQUENCE_MATCH
838 cerr << "MATCH path has length " << len;
839 if (len > 0) {
840 cerr << " and goes from ("
841 << pathx[0] << ", " << pathy[0] << ") to ("
842 << pathx[len-1] << ", " << pathy[len-1] << ")";
843 if (len > 2) {
844 cerr << " with penultimate point at ("
845 << pathx[len-2] << ", " << pathy[len-2] << ")";
846 }
847 cerr << endl;
848 } else {
849 cerr << endl;
850 }
851 #endif
758 852
759 for (int i = 0; i < len; ++i) { 853 for (int i = 0; i < len; ++i) {
760 854
761 int x = pathx[i]; 855 int x = pathx[i];
762 int y = pathy[i] + int(first * m_coarseDownsample); 856 int y = pathy[i] + int(first * m_coarseDownsample);
763 857
764 Vamp::RealTime xt = Vamp::RealTime::frame2RealTime 858 Vamp::RealTime xt = Vamp::RealTime::frame2RealTime
765 (x * m_stepSize, rate) + m_startTime; 859 (x * m_stepSize, rate) + m_startTime;
766 Vamp::RealTime yt = Vamp::RealTime::frame2RealTime 860 Vamp::RealTime yt = Vamp::RealTime::frame2RealTime
767 (y * m_stepSize, rate) + m_startTime; 861 (y * m_stepSize, rate) + m_startTime;
768 862
783 877
784 prevy = y; 878 prevy = y;
785 } 879 }
786 } 880 }
787 881
882 #ifdef DEBUG_SUBSEQUENCE_MATCH
883 cerr << endl;
884 #endif
885
788 return returnFeatures; 886 return returnFeatures;
789 } 887 }