comparison PYinVamp.cpp @ 132:926c292fa3ff fixedlag

fixed lag smoothing for pitch track working
author Matthias Mauch <mail@matthiasmauch.net>
date Fri, 03 Jul 2015 17:34:38 +0100
parents b877df85ad9e
children 83978b93aac1
comparison
equal deleted inserted replaced
131:b877df85ad9e 132:926c292fa3ff
11 COPYING included with this distribution for more information. 11 COPYING included with this distribution for more information.
12 */ 12 */
13 13
14 #include "PYinVamp.h" 14 #include "PYinVamp.h"
15 #include "MonoNote.h" 15 #include "MonoNote.h"
16 #include "MonoPitch.h"
17 #include "MonoPitchHMM.h" 16 #include "MonoPitchHMM.h"
18 17
19 #include "vamp-sdk/FFT.h" 18 #include "vamp-sdk/FFT.h"
20 19
21 #include <vector> 20 #include <vector>
43 m_oVoicedProb(0), 42 m_oVoicedProb(0),
44 m_oCandidateSalience(0), 43 m_oCandidateSalience(0),
45 m_oSmoothedPitchTrack(0), 44 m_oSmoothedPitchTrack(0),
46 m_oNotes(0), 45 m_oNotes(0),
47 m_threshDistr(2.0f), 46 m_threshDistr(2.0f),
48 m_fixedLag(0.0f), 47 m_fixedLag(1.0f),
49 m_outputUnvoiced(0.0f), 48 m_outputUnvoiced(0.0f),
50 m_preciseTime(0.0f), 49 m_preciseTime(0.0f),
51 m_lowAmp(0.1f), 50 m_lowAmp(0.1f),
52 m_onsetSensitivity(0.7f), 51 m_onsetSensitivity(0.7f),
53 m_pruneThresh(0.1f), 52 m_pruneThresh(0.1f),
54 m_pitchHmm(), 53 m_pitchHmm(0),
55 m_pitchProb(0), 54 m_pitchProb(0),
56 m_timestamp(0), 55 m_timestamp(0),
57 m_level(0) 56 m_level(0)
58 { 57 {
59 } 58 }
440 PYinVamp::reset() 439 PYinVamp::reset()
441 { 440 {
442 m_yin.setThresholdDistr(m_threshDistr); 441 m_yin.setThresholdDistr(m_threshDistr);
443 m_yin.setFrameSize(m_blockSize); 442 m_yin.setFrameSize(m_blockSize);
444 m_yin.setFast(!m_preciseTime); 443 m_yin.setFast(!m_preciseTime);
444
445 if (m_fixedLag == 1.f) m_pitchHmm = MonoPitchHMM(100);
446 else m_pitchHmm = MonoPitchHMM(0);
445 447
446 m_pitchProb.clear(); 448 m_pitchProb.clear();
447 m_timestamp.clear(); 449 m_timestamp.clear();
448 m_level.clear(); 450 m_level.clear();
449 /* 451 /*
491 tempPitchProb.push_back(pair<double, double> 493 tempPitchProb.push_back(pair<double, double>
492 (tempPitch, yo.freqProb[iCandidate].second*factor)); 494 (tempPitch, yo.freqProb[iCandidate].second*factor));
493 } 495 }
494 } 496 }
495 497
496 if (m_fixedLag == 0.f) 498 vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb);
497 { 499 if (m_timestamp.empty())
498 vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb); 500 {
499 if (m_timestamp.empty()) 501 m_pitchHmm.initialise(tempObsProb);
502 } else {
503 m_pitchHmm.process(tempObsProb);
504 }
505
506 m_pitchProb.push_back(tempPitchProb);
507 m_timestamp.push_back(timestamp);
508
509 int lag = m_pitchHmm.m_fixedLag;
510
511 if (m_fixedLag == 1.f)
512 {
513 if (m_timestamp.size() == lag + 1)
500 { 514 {
501 m_pitchHmm.initialise(tempObsProb); 515 m_timestamp.pop_front();
502 } else { 516 m_pitchProb.pop_front();
503 m_pitchHmm.process(tempObsProb); 517
518 Feature f;
519 f.hasTimestamp = true;
520 vector<int> rawPitchPath = m_pitchHmm.track();
521 float freq = m_pitchHmm.nearestFreq(rawPitchPath[0],
522 m_pitchProb[0]);
523 f.timestamp = m_timestamp[0];
524 f.values.clear();
525
526 // different output modes
527 if (freq < 0 && (m_outputUnvoiced==0))
528 {
529
530 } else {
531 if (m_outputUnvoiced == 1)
532 {
533 f.values.push_back(fabs(freq));
534 } else {
535 f.values.push_back(freq);
536 }
537 fs[m_oSmoothedPitchTrack].push_back(f);
538 }
504 } 539 }
505 m_pitchProb.push_back(tempPitchProb); 540 }
506 } else { 541
507 // Damn, so I need the hmm right here! Sadly it isn't defined here yet.
508 // Perhaps I could re-design the whole shabang
509 }
510 m_timestamp.push_back(timestamp);
511 542
512 // F0 CANDIDATES 543 // F0 CANDIDATES
513 Feature f; 544 Feature f;
514 f.hasTimestamp = true; 545 f.hasTimestamp = true;
515 f.timestamp = timestamp; 546 f.timestamp = timestamp;
558 return fs; 589 return fs;
559 } 590 }
560 591
561 // ================== P I T C H T R A C K ================================= 592 // ================== P I T C H T R A C K =================================
562 593
563 vector<int> rawPitchPath = m_pitchHmm.finalise(); 594 vector<int> rawPitchPath = m_pitchHmm.track();
564 vector<float> mpOut; 595 vector<float> mpOut;
565 596
566 for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) 597 for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame)
567 { 598 {
568 float freq = pitchState2Freq(rawPitchPath[iFrame], m_pitchProb[iFrame]); 599 float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame],
600 m_pitchProb[iFrame]);
569 mpOut.push_back(freq); // for note processing below 601 mpOut.push_back(freq); // for note processing below
570 602
571 f.timestamp = m_timestamp[iFrame]; 603 f.timestamp = m_timestamp[iFrame];
572 // std::cerr << f.timestamp << std::endl;
573 f.values.clear(); 604 f.values.clear();
574 605
575 // different output modes 606 // different output modes
576 if (freq < 0 && (m_outputUnvoiced==0)) continue; 607 if (freq < 0 && (m_outputUnvoiced==0)) continue;
577 if (m_outputUnvoiced == 1) 608 if (m_outputUnvoiced == 1)
580 } else { 611 } else {
581 f.values.push_back(freq); 612 f.values.push_back(freq);
582 } 613 }
583 fs[m_oSmoothedPitchTrack].push_back(f); 614 fs[m_oSmoothedPitchTrack].push_back(f);
584 } 615 }
585 616
586 // for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) 617 // ======================== N O T E S ======================================
618 // MonoNote mn;
619 // std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
620 // for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
621 // std::vector<std::pair<double, double> > temp;
622 // if (mpOut[iFrame] > 0)
623 // {
624 // double tempPitch = 12 *
625 // std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
626 // temp.push_back(std::pair<double,double>(tempPitch, .9));
627 // }
628 // smoothedPitch.push_back(temp);
629 // }
630 // // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
631 // vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
632
633 // // turning feature into a note feature
634 // f.hasTimestamp = true;
635 // f.hasDuration = true;
636 // f.values.clear();
637
638 // int onsetFrame = 0;
639 // bool isVoiced = 0;
640 // bool oldIsVoiced = 0;
641 // size_t nFrame = m_pitchProb.size();
642
643 // float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
644
645 // // the body of the loop below should be in a function/method
646 // std::vector<float> notePitchTrack; // collects pitches for one note at a time
647 // for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
587 // { 648 // {
588 // if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue; 649 // isVoiced = mnOut[iFrame].noteState < 3
589 650 // && smoothedPitch[iFrame].size() > 0
590 // if (m_outputUnvoiced == 1) 651 // && (iFrame >= nFrame-2
652 // || ((m_level[iFrame]/m_level[iFrame+2]) >
653 // m_onsetSensitivity));
654 // if (isVoiced && iFrame != nFrame-1)
591 // { 655 // {
592 // f.values.push_back(fabs(mpOut[iFrame])); 656 // if (oldIsVoiced == 0) // beginning of a note
593 // } else { 657 // {
594 // f.values.push_back(mpOut[iFrame]); 658 // onsetFrame = iFrame;
659 // }
660 // float pitch = smoothedPitch[iFrame][0].first;
661 // notePitchTrack.push_back(pitch); // add to the note's pitch track
662 // } else { // not currently voiced
663 // if (oldIsVoiced == 1) // end of note
664 // {
665 // if (notePitchTrack.size() >= minNoteFrames)
666 // {
667 // std::sort(notePitchTrack.begin(), notePitchTrack.end());
668 // float medianPitch = notePitchTrack[notePitchTrack.size()/2];
669 // float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
670 // f.values.clear();
671 // f.values.push_back(medianFreq);
672 // f.timestamp = m_timestamp[onsetFrame];
673 // f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
674 // fs[m_oNotes].push_back(f);
675 // }
676 // notePitchTrack.clear();
677 // }
595 // } 678 // }
596 679 // oldIsVoiced = isVoiced;
597 // fs[m_oSmoothedPitchTrack].push_back(f);
598 // } 680 // }
599
600 // ======================== N O T E S ======================================
601 MonoNote mn;
602 std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
603 for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
604 std::vector<std::pair<double, double> > temp;
605 if (mpOut[iFrame] > 0)
606 {
607 double tempPitch = 12 *
608 std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
609 temp.push_back(std::pair<double,double>(tempPitch, .9));
610 }
611 smoothedPitch.push_back(temp);
612 }
613 // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
614 vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
615
616 // turning feature into a note feature
617 f.hasTimestamp = true;
618 f.hasDuration = true;
619 f.values.clear();
620
621 int onsetFrame = 0;
622 bool isVoiced = 0;
623 bool oldIsVoiced = 0;
624 size_t nFrame = m_pitchProb.size();
625
626 float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
627
628 // the body of the loop below should be in a function/method
629 std::vector<float> notePitchTrack; // collects pitches for one note at a time
630 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
631 {
632 isVoiced = mnOut[iFrame].noteState < 3
633 && smoothedPitch[iFrame].size() > 0
634 && (iFrame >= nFrame-2
635 || ((m_level[iFrame]/m_level[iFrame+2]) >
636 m_onsetSensitivity));
637 if (isVoiced && iFrame != nFrame-1)
638 {
639 if (oldIsVoiced == 0) // beginning of a note
640 {
641 onsetFrame = iFrame;
642 }
643 float pitch = smoothedPitch[iFrame][0].first;
644 notePitchTrack.push_back(pitch); // add to the note's pitch track
645 } else { // not currently voiced
646 if (oldIsVoiced == 1) // end of note
647 {
648 if (notePitchTrack.size() >= minNoteFrames)
649 {
650 std::sort(notePitchTrack.begin(), notePitchTrack.end());
651 float medianPitch = notePitchTrack[notePitchTrack.size()/2];
652 float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
653 f.values.clear();
654 f.values.push_back(medianFreq);
655 f.timestamp = m_timestamp[onsetFrame];
656 f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
657 fs[m_oNotes].push_back(f);
658 }
659 notePitchTrack.clear();
660 }
661 }
662 oldIsVoiced = isVoiced;
663 }
664 return fs; 681 return fs;
665 } 682 }
666
667 float
668 PYinVamp::pitchState2Freq(int state, vector<pair<double, double> > pitchProb) // maps an HMM state index back to a frequency; pitchProb is taken by value, so it is copied per call
669 {
670 float hmmFreq = m_pitchHmm.m_freqs[state]; // frequency the HMM stores for this state
671 float bestFreq = 0; // stays 0 if hmmFreq > 0 but no candidate is closer than the initial leastDist
672 float leastDist = 10000; // smallest |hmmFreq - freq| seen so far; candidates at this distance or more are never selected
673 if (hmmFreq > 0)
674 {
675 // This was a Yin estimate, so try to get the original pitch estimate back
676 // ... a bit hacky, since we could have directly saved the frequency
677 // that was assigned to the HMM bin in hmm.calculateObsProb -- but we would
678 // have had to rethink the interface of that method.
679 for (size_t iPt = 0; iPt < pitchProb.size(); ++iPt)
680 {
681 float freq = 440. * // candidate pitch p -> 440 * 2^((p - 69) / 12); presumably p is a MIDI-style note number (confirm against caller)
682 std::pow(2,
683 (pitchProb[iPt].first - 69)/12);
684 float dist = std::abs(hmmFreq-freq);
685 if (dist < leastDist) // strict '<': on ties the earlier candidate is kept
686 {
687 leastDist = dist;
688 bestFreq = freq;
689 }
690 }
691 } else {
692 bestFreq = hmmFreq; // non-positive state frequency is returned unchanged
693 }
694 return bestFreq;
695 }