Mercurial > hg > pyin
comparison PYinVamp.cpp @ 132:926c292fa3ff fixedlag
fixed lag smoothing for pitch track working
author | Matthias Mauch <mail@matthiasmauch.net> |
---|---|
date | Fri, 03 Jul 2015 17:34:38 +0100 |
parents | b877df85ad9e |
children | 83978b93aac1 |
comparison
equal
deleted
inserted
replaced
131:b877df85ad9e | 132:926c292fa3ff |
---|---|
11 COPYING included with this distribution for more information. | 11 COPYING included with this distribution for more information. |
12 */ | 12 */ |
13 | 13 |
14 #include "PYinVamp.h" | 14 #include "PYinVamp.h" |
15 #include "MonoNote.h" | 15 #include "MonoNote.h" |
16 #include "MonoPitch.h" | |
17 #include "MonoPitchHMM.h" | 16 #include "MonoPitchHMM.h" |
18 | 17 |
19 #include "vamp-sdk/FFT.h" | 18 #include "vamp-sdk/FFT.h" |
20 | 19 |
21 #include <vector> | 20 #include <vector> |
43 m_oVoicedProb(0), | 42 m_oVoicedProb(0), |
44 m_oCandidateSalience(0), | 43 m_oCandidateSalience(0), |
45 m_oSmoothedPitchTrack(0), | 44 m_oSmoothedPitchTrack(0), |
46 m_oNotes(0), | 45 m_oNotes(0), |
47 m_threshDistr(2.0f), | 46 m_threshDistr(2.0f), |
48 m_fixedLag(0.0f), | 47 m_fixedLag(1.0f), |
49 m_outputUnvoiced(0.0f), | 48 m_outputUnvoiced(0.0f), |
50 m_preciseTime(0.0f), | 49 m_preciseTime(0.0f), |
51 m_lowAmp(0.1f), | 50 m_lowAmp(0.1f), |
52 m_onsetSensitivity(0.7f), | 51 m_onsetSensitivity(0.7f), |
53 m_pruneThresh(0.1f), | 52 m_pruneThresh(0.1f), |
54 m_pitchHmm(), | 53 m_pitchHmm(0), |
55 m_pitchProb(0), | 54 m_pitchProb(0), |
56 m_timestamp(0), | 55 m_timestamp(0), |
57 m_level(0) | 56 m_level(0) |
58 { | 57 { |
59 } | 58 } |
440 PYinVamp::reset() | 439 PYinVamp::reset() |
441 { | 440 { |
442 m_yin.setThresholdDistr(m_threshDistr); | 441 m_yin.setThresholdDistr(m_threshDistr); |
443 m_yin.setFrameSize(m_blockSize); | 442 m_yin.setFrameSize(m_blockSize); |
444 m_yin.setFast(!m_preciseTime); | 443 m_yin.setFast(!m_preciseTime); |
444 | |
445 if (m_fixedLag == 1.f) m_pitchHmm = MonoPitchHMM(100); | |
446 else m_pitchHmm = MonoPitchHMM(0); | |
445 | 447 |
446 m_pitchProb.clear(); | 448 m_pitchProb.clear(); |
447 m_timestamp.clear(); | 449 m_timestamp.clear(); |
448 m_level.clear(); | 450 m_level.clear(); |
449 /* | 451 /* |
491 tempPitchProb.push_back(pair<double, double> | 493 tempPitchProb.push_back(pair<double, double> |
492 (tempPitch, yo.freqProb[iCandidate].second*factor)); | 494 (tempPitch, yo.freqProb[iCandidate].second*factor)); |
493 } | 495 } |
494 } | 496 } |
495 | 497 |
496 if (m_fixedLag == 0.f) | 498 vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb); |
497 { | 499 if (m_timestamp.empty()) |
498 vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb); | 500 { |
499 if (m_timestamp.empty()) | 501 m_pitchHmm.initialise(tempObsProb); |
502 } else { | |
503 m_pitchHmm.process(tempObsProb); | |
504 } | |
505 | |
506 m_pitchProb.push_back(tempPitchProb); | |
507 m_timestamp.push_back(timestamp); | |
508 | |
509 int lag = m_pitchHmm.m_fixedLag; | |
510 | |
511 if (m_fixedLag == 1.f) | |
512 { | |
513 if (m_timestamp.size() == lag + 1) | |
500 { | 514 { |
501 m_pitchHmm.initialise(tempObsProb); | 515 m_timestamp.pop_front(); |
502 } else { | 516 m_pitchProb.pop_front(); |
503 m_pitchHmm.process(tempObsProb); | 517 |
518 Feature f; | |
519 f.hasTimestamp = true; | |
520 vector<int> rawPitchPath = m_pitchHmm.track(); | |
521 float freq = m_pitchHmm.nearestFreq(rawPitchPath[0], | |
522 m_pitchProb[0]); | |
523 f.timestamp = m_timestamp[0]; | |
524 f.values.clear(); | |
525 | |
526 // different output modes | |
527 if (freq < 0 && (m_outputUnvoiced==0)) | |
528 { | |
529 | |
530 } else { | |
531 if (m_outputUnvoiced == 1) | |
532 { | |
533 f.values.push_back(fabs(freq)); | |
534 } else { | |
535 f.values.push_back(freq); | |
536 } | |
537 fs[m_oSmoothedPitchTrack].push_back(f); | |
538 } | |
504 } | 539 } |
505 m_pitchProb.push_back(tempPitchProb); | 540 } |
506 } else { | 541 |
507 // Damn, so I need the hmm right here! Sadly it isn't defined here yet. | |
508 // Perhaps I could re-design the whole shebang | |
509 } | |
510 m_timestamp.push_back(timestamp); | |
511 | 542 |
512 // F0 CANDIDATES | 543 // F0 CANDIDATES |
513 Feature f; | 544 Feature f; |
514 f.hasTimestamp = true; | 545 f.hasTimestamp = true; |
515 f.timestamp = timestamp; | 546 f.timestamp = timestamp; |
558 return fs; | 589 return fs; |
559 } | 590 } |
560 | 591 |
561 // ================== P I T C H T R A C K ================================= | 592 // ================== P I T C H T R A C K ================================= |
562 | 593 |
563 vector<int> rawPitchPath = m_pitchHmm.finalise(); | 594 vector<int> rawPitchPath = m_pitchHmm.track(); |
564 vector<float> mpOut; | 595 vector<float> mpOut; |
565 | 596 |
566 for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) | 597 for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) |
567 { | 598 { |
568 float freq = pitchState2Freq(rawPitchPath[iFrame], m_pitchProb[iFrame]); | 599 float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame], |
600 m_pitchProb[iFrame]); | |
569 mpOut.push_back(freq); // for note processing below | 601 mpOut.push_back(freq); // for note processing below |
570 | 602 |
571 f.timestamp = m_timestamp[iFrame]; | 603 f.timestamp = m_timestamp[iFrame]; |
572 // std::cerr << f.timestamp << std::endl; | |
573 f.values.clear(); | 604 f.values.clear(); |
574 | 605 |
575 // different output modes | 606 // different output modes |
576 if (freq < 0 && (m_outputUnvoiced==0)) continue; | 607 if (freq < 0 && (m_outputUnvoiced==0)) continue; |
577 if (m_outputUnvoiced == 1) | 608 if (m_outputUnvoiced == 1) |
580 } else { | 611 } else { |
581 f.values.push_back(freq); | 612 f.values.push_back(freq); |
582 } | 613 } |
583 fs[m_oSmoothedPitchTrack].push_back(f); | 614 fs[m_oSmoothedPitchTrack].push_back(f); |
584 } | 615 } |
585 | 616 |
586 // for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) | 617 // ======================== N O T E S ====================================== |
618 // MonoNote mn; | |
619 // std::vector<std::vector<std::pair<double, double> > > smoothedPitch; | |
620 // for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) { | |
621 // std::vector<std::pair<double, double> > temp; | |
622 // if (mpOut[iFrame] > 0) | |
623 // { | |
624 // double tempPitch = 12 * | |
625 // std::log(mpOut[iFrame]/440)/std::log(2.) + 69; | |
626 // temp.push_back(std::pair<double,double>(tempPitch, .9)); | |
627 // } | |
628 // smoothedPitch.push_back(temp); | |
629 // } | |
630 // // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb); | |
631 // vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch); | |
632 | |
633 // // turning feature into a note feature | |
634 // f.hasTimestamp = true; | |
635 // f.hasDuration = true; | |
636 // f.values.clear(); | |
637 | |
638 // int onsetFrame = 0; | |
639 // bool isVoiced = 0; | |
640 // bool oldIsVoiced = 0; | |
641 // size_t nFrame = m_pitchProb.size(); | |
642 | |
643 // float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize; | |
644 | |
645 // // the body of the loop below should be in a function/method | |
646 // std::vector<float> notePitchTrack; // collects pitches for one note at a time | |
647 // for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) | |
587 // { | 648 // { |
588 // if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue; | 649 // isVoiced = mnOut[iFrame].noteState < 3 |
589 | 650 // && smoothedPitch[iFrame].size() > 0 |
590 // if (m_outputUnvoiced == 1) | 651 // && (iFrame >= nFrame-2 |
652 // || ((m_level[iFrame]/m_level[iFrame+2]) > | |
653 // m_onsetSensitivity)); | |
654 // if (isVoiced && iFrame != nFrame-1) | |
591 // { | 655 // { |
592 // f.values.push_back(fabs(mpOut[iFrame])); | 656 // if (oldIsVoiced == 0) // beginning of a note |
593 // } else { | 657 // { |
594 // f.values.push_back(mpOut[iFrame]); | 658 // onsetFrame = iFrame; |
659 // } | |
660 // float pitch = smoothedPitch[iFrame][0].first; | |
661 // notePitchTrack.push_back(pitch); // add to the note's pitch track | |
662 // } else { // not currently voiced | |
663 // if (oldIsVoiced == 1) // end of note | |
664 // { | |
665 // if (notePitchTrack.size() >= minNoteFrames) | |
666 // { | |
667 // std::sort(notePitchTrack.begin(), notePitchTrack.end()); | |
668 // float medianPitch = notePitchTrack[notePitchTrack.size()/2]; | |
669 // float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440; | |
670 // f.values.clear(); | |
671 // f.values.push_back(medianFreq); | |
672 // f.timestamp = m_timestamp[onsetFrame]; | |
673 // f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame]; | |
674 // fs[m_oNotes].push_back(f); | |
675 // } | |
676 // notePitchTrack.clear(); | |
677 // } | |
595 // } | 678 // } |
596 | 679 // oldIsVoiced = isVoiced; |
597 // fs[m_oSmoothedPitchTrack].push_back(f); | |
598 // } | 680 // } |
599 | |
600 // ======================== N O T E S ====================================== | |
601 MonoNote mn; | |
602 std::vector<std::vector<std::pair<double, double> > > smoothedPitch; | |
603 for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) { | |
604 std::vector<std::pair<double, double> > temp; | |
605 if (mpOut[iFrame] > 0) | |
606 { | |
607 double tempPitch = 12 * | |
608 std::log(mpOut[iFrame]/440)/std::log(2.) + 69; | |
609 temp.push_back(std::pair<double,double>(tempPitch, .9)); | |
610 } | |
611 smoothedPitch.push_back(temp); | |
612 } | |
613 // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb); | |
614 vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch); | |
615 | |
616 // turning feature into a note feature | |
617 f.hasTimestamp = true; | |
618 f.hasDuration = true; | |
619 f.values.clear(); | |
620 | |
621 int onsetFrame = 0; | |
622 bool isVoiced = 0; | |
623 bool oldIsVoiced = 0; | |
624 size_t nFrame = m_pitchProb.size(); | |
625 | |
626 float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize; | |
627 | |
628 // the body of the loop below should be in a function/method | |
629 std::vector<float> notePitchTrack; // collects pitches for one note at a time | |
630 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) | |
631 { | |
632 isVoiced = mnOut[iFrame].noteState < 3 | |
633 && smoothedPitch[iFrame].size() > 0 | |
634 && (iFrame >= nFrame-2 | |
635 || ((m_level[iFrame]/m_level[iFrame+2]) > | |
636 m_onsetSensitivity)); | |
637 if (isVoiced && iFrame != nFrame-1) | |
638 { | |
639 if (oldIsVoiced == 0) // beginning of a note | |
640 { | |
641 onsetFrame = iFrame; | |
642 } | |
643 float pitch = smoothedPitch[iFrame][0].first; | |
644 notePitchTrack.push_back(pitch); // add to the note's pitch track | |
645 } else { // not currently voiced | |
646 if (oldIsVoiced == 1) // end of note | |
647 { | |
648 if (notePitchTrack.size() >= minNoteFrames) | |
649 { | |
650 std::sort(notePitchTrack.begin(), notePitchTrack.end()); | |
651 float medianPitch = notePitchTrack[notePitchTrack.size()/2]; | |
652 float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440; | |
653 f.values.clear(); | |
654 f.values.push_back(medianFreq); | |
655 f.timestamp = m_timestamp[onsetFrame]; | |
656 f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame]; | |
657 fs[m_oNotes].push_back(f); | |
658 } | |
659 notePitchTrack.clear(); | |
660 } | |
661 } | |
662 oldIsVoiced = isVoiced; | |
663 } | |
664 return fs; | 681 return fs; |
665 } | 682 } |
666 | |
667 float | |
668 PYinVamp::pitchState2Freq(int state, vector<pair<double, double> > pitchProb) | |
669 { | |
670 float hmmFreq = m_pitchHmm.m_freqs[state]; | |
671 float bestFreq = 0; | |
672 float leastDist = 10000; | |
673 if (hmmFreq > 0) | |
674 { | |
675 // This was a Yin estimate, so try to get original pitch estimate back | |
676 // ... a bit hacky, since we could have directly saved the frequency | |
677 // that was assigned to the HMM bin in hmm.calculateObsProb -- but would | |
678 // have had to rethink the interface of that method. | |
679 for (size_t iPt = 0; iPt < pitchProb.size(); ++iPt) | |
680 { | |
681 float freq = 440. * | |
682 std::pow(2, | |
683 (pitchProb[iPt].first - 69)/12); | |
684 float dist = std::abs(hmmFreq-freq); | |
685 if (dist < leastDist) | |
686 { | |
687 leastDist = dist; | |
688 bestFreq = freq; | |
689 } | |
690 } | |
691 } else { | |
692 bestFreq = hmmFreq; | |
693 } | |
694 return bestFreq; | |
695 } |