comparison PYinVamp.cpp @ 133:83978b93aac1 fixedlag

ah, didn't commit when I stopped working... what did I do?
author Matthias Mauch <mail@matthiasmauch.net>
date Mon, 13 Jul 2015 12:10:06 +0100
parents 926c292fa3ff
children 72bda34e0e64
--- PYinVamp.cpp@132:926c292fa3ff
+++ PYinVamp.cpp@133:83978b93aac1
@@ -51,11 +51,12 @@
     m_onsetSensitivity(0.7f),
     m_pruneThresh(0.1f),
     m_pitchHmm(0),
     m_pitchProb(0),
     m_timestamp(0),
-    m_level(0)
+    m_level(0),
+    m_pitchTrack(0)
 {
 }
 
 PYinVamp::~PYinVamp()
 {
@@ -446,22 +447,25 @@
     else m_pitchHmm = MonoPitchHMM(0);
 
     m_pitchProb.clear();
     m_timestamp.clear();
     m_level.clear();
+    m_pitchTrack.clear();
     /*
     std::cerr << "PYinVamp::reset"
               << ", blockSize = " << m_blockSize
               << std::endl;
     */
 }
 
 PYinVamp::FeatureSet
 PYinVamp::process(const float *const *inputBuffers, RealTime timestamp)
 {
+    std::cerr << timestamp << std::endl;
     int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
-    timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
+    timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset,
+                                                           lrintf(m_inputSampleRate));
 
     FeatureSet fs;
 
     float rms = 0;
 
@@ -506,11 +510,11 @@
     m_pitchProb.push_back(tempPitchProb);
     m_timestamp.push_back(timestamp);
 
     int lag = m_pitchHmm.m_fixedLag;
 
-    if (m_fixedLag == 1.f)
+    if (m_fixedLag == 1.f) // do fixed-lag smoothing instead of full Viterbi
     {
         if (m_timestamp.size() == lag + 1)
         {
             m_timestamp.pop_front();
             m_pitchProb.pop_front();
@@ -518,10 +522,11 @@
             Feature f;
             f.hasTimestamp = true;
             vector<int> rawPitchPath = m_pitchHmm.track();
             float freq = m_pitchHmm.nearestFreq(rawPitchPath[0],
                                                 m_pitchProb[0]);
+            m_pitchTrack.push_back(freq);
             f.timestamp = m_timestamp[0];
             f.values.clear();
 
             // different output modes
             if (freq < 0 && (m_outputUnvoiced==0))
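
Aside (not part of the changeset): the m_fixedLag branch above performs fixed-lag smoothing. The decision for the oldest buffered frame is committed as soon as lag further frames have been observed, instead of waiting for the end of the input as full Viterbi decoding would. The stand-alone sketch below shows only that window bookkeeping and is illustrative: a running median stands in for the MonoPitchHMM::track() decoding step, and all names and values are invented for the example.

    #include <algorithm>
    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t lag = 3;     // how many future frames to wait for
        std::deque<float> window;      // frames still awaiting a decision
        std::vector<float> output;     // committed (smoothed) values

        const float input[] = { 1.f, 1.f, 9.f, 1.f, 1.f, 2.f, 2.f, 2.f };
        for (float x : input) {
            window.push_back(x);
            if (window.size() == lag + 1) {
                // decide the oldest frame using the lag frames that follow it;
                // the median here is only a stand-in for the HMM decoding step
                std::vector<float> sorted(window.begin(), window.end());
                std::sort(sorted.begin(), sorted.end());
                output.push_back(sorted[sorted.size() / 2]);
                window.pop_front();    // that frame is now committed
            }
        }
        for (float v : output) std::cout << v << " ";  // prints: 1 1 2 2 2
        std::cout << std::endl;                        // last lag frames stay pending
        return 0;
    }
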
@@ -590,18 +595,17 @@
     }
 
     // ================== P I T C H T R A C K =================================
 
     vector<int> rawPitchPath = m_pitchHmm.track();
-    vector<float> mpOut;
 
     for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame)
     {
         float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame],
                                             m_pitchProb[iFrame]);
-        mpOut.push_back(freq); // for note processing below
+        m_pitchTrack.push_back(freq); // for note processing below
 
         f.timestamp = m_timestamp[iFrame];
         f.values.clear();
 
         // different output modes
         if (freq < 0 && (m_outputUnvoiced==0)) continue;
@@ -613,70 +617,92 @@
         }
         fs[m_oSmoothedPitchTrack].push_back(f);
     }
 
     // ======================== N O T E S ======================================
-    // MonoNote mn;
-    // std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
-    // for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
-    //     std::vector<std::pair<double, double> > temp;
-    //     if (mpOut[iFrame] > 0)
-    //     {
-    //         double tempPitch = 12 *
-    //             std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
-    //         temp.push_back(std::pair<double,double>(tempPitch, .9));
-    //     }
-    //     smoothedPitch.push_back(temp);
-    // }
-    // // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
-    // vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
-
-    // // turning feature into a note feature
-    // f.hasTimestamp = true;
-    // f.hasDuration = true;
-    // f.values.clear();
-
-    // int onsetFrame = 0;
-    // bool isVoiced = 0;
-    // bool oldIsVoiced = 0;
-    // size_t nFrame = m_pitchProb.size();
-
-    // float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
-
-    // // the body of the loop below should be in a function/method
-    // std::vector<float> notePitchTrack; // collects pitches for one note at a time
-    // for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
-    // {
-    //     isVoiced = mnOut[iFrame].noteState < 3
-    //         && smoothedPitch[iFrame].size() > 0
-    //         && (iFrame >= nFrame-2
-    //             || ((m_level[iFrame]/m_level[iFrame+2]) >
-    //                 m_onsetSensitivity));
-    //     if (isVoiced && iFrame != nFrame-1)
-    //     {
-    //         if (oldIsVoiced == 0) // beginning of a note
-    //         {
-    //             onsetFrame = iFrame;
-    //         }
-    //         float pitch = smoothedPitch[iFrame][0].first;
-    //         notePitchTrack.push_back(pitch); // add to the note's pitch track
-    //     } else { // not currently voiced
-    //         if (oldIsVoiced == 1) // end of note
-    //         {
-    //             if (notePitchTrack.size() >= minNoteFrames)
-    //             {
-    //                 std::sort(notePitchTrack.begin(), notePitchTrack.end());
-    //                 float medianPitch = notePitchTrack[notePitchTrack.size()/2];
-    //                 float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
-    //                 f.values.clear();
-    //                 f.values.push_back(medianFreq);
-    //                 f.timestamp = m_timestamp[onsetFrame];
-    //                 f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
-    //                 fs[m_oNotes].push_back(f);
-    //             }
-    //             notePitchTrack.clear();
-    //         }
-    //     }
-    //     oldIsVoiced = isVoiced;
-    // }
+    MonoNote mn;
+    std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
+    for (size_t iFrame = 0; iFrame < m_pitchTrack.size(); ++iFrame) {
+        std::vector<std::pair<double, double> > temp;
+        if (m_pitchTrack[iFrame] > 0)
+        {
+            double tempPitch = 12 *
+                std::log(m_pitchTrack[iFrame]/440)/std::log(2.) + 69;
+            temp.push_back(std::pair<double,double>(tempPitch, .9));
+            // std::cerr << "tempPitch: " << tempPitch << std::endl;
+        }
+        // std::cerr << "temp size: " << temp.size() << std::endl;
+        smoothedPitch.push_back(temp);
+    }
+
+    vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
+    std::cerr << "mnOut size: " << mnOut.size() << std::endl;
+    std::cerr << "m_pitchTrack size: " << m_pitchTrack.size() << std::endl;
+
+    // turning feature into a note feature
+    f.hasTimestamp = true;
+    f.hasDuration = true;
+    f.values.clear();
+
+    int onsetFrame = 0;
+    bool isVoiced = 0;
+    bool oldIsVoiced = 0;
+    size_t nFrame = m_pitchTrack.size();
+
+    float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
+
+    // the body of the loop below should be in a function/method
+    // but what does it actually do??
+    // * takes the result of the note tracking HMM
+    // * collects contiguously pitched pitches
+    // * writes a note once it notices the voiced segment has ended
+    // complications:
+    // * it needs a lookahead of two frames for m_level (wtf was I thinking)
+    // * it needs to know the timestamp (which can be guessed from the frame no)
+    // *
+    int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
+    RealTime timestampOffset = Vamp::RealTime::frame2RealTime(offset,
+                                                              lrintf(m_inputSampleRate));
+
+    std::vector<float> notePitchTrack; // collects pitches for 1 note at a time
+    for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
+    {
+        isVoiced = mnOut[iFrame].noteState < 3
+            && smoothedPitch[iFrame].size() > 0
+            && (iFrame >= nFrame-2
+                || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
+        if (isVoiced && iFrame != nFrame-1)
+        {
+            if (oldIsVoiced == 0) // beginning of a note
+            {
+                onsetFrame = iFrame;
+            }
+            float pitch = smoothedPitch[iFrame][0].first;
+            notePitchTrack.push_back(pitch); // add to the note's pitch track
+        } else { // not currently voiced
+            if (oldIsVoiced == 1) // end of note
+            {
+                if (notePitchTrack.size() >= minNoteFrames)
+                {
+                    std::sort(notePitchTrack.begin(), notePitchTrack.end());
+                    float medianPitch = notePitchTrack[notePitchTrack.size()/2];
+                    float medianFreq =
+                        std::pow(2,(medianPitch - 69) / 12) * 440;
+                    f.values.clear();
+                    f.values.push_back(medianFreq);
+                    RealTime start = RealTime::frame2RealTime(
+                        onsetFrame * m_stepSize, lrintf(m_inputSampleRate)) +
+                        timestampOffset;
+                    RealTime end = RealTime::frame2RealTime(
+                        iFrame * m_stepSize, lrintf(m_inputSampleRate)) +
+                        timestampOffset;
+                    f.timestamp = start;
+                    f.duration = end - start;
+                    fs[m_oNotes].push_back(f);
+                }
+                notePitchTrack.clear();
+            }
+        }
+        oldIsVoiced = isVoiced;
+    }
     return fs;
 }
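
Aside (not part of the changeset): the note features above take the median of each note's pitch track in MIDI note-number space and convert it back to a frequency, and they derive onset time and duration from frame indices. The stand-alone sketch below shows just those conversions; the step size and sample rate are example values, not taken from the plugin, and the half/quarter-block timestamp offset added above is ignored.

    #include <cmath>
    #include <cstdio>

    // Hz -> MIDI note number, as in the smoothedPitch computation above
    static double hzToMidi(double hz)   { return 12.0 * std::log(hz / 440.0) / std::log(2.0) + 69.0; }

    // MIDI note number -> Hz, as in the medianFreq computation above
    static double midiToHz(double midi) { return std::pow(2.0, (midi - 69.0) / 12.0) * 440.0; }

    // frame index -> seconds, mirroring what frame2RealTime(frame * stepSize, sampleRate) computes
    static double frameToSec(int frame, int stepSize, double sampleRate)
    {
        return frame * stepSize / sampleRate;
    }

    int main()
    {
        std::printf("440 Hz  -> MIDI %.1f\n", hzToMidi(440.0));              // 69.0 (A4)
        std::printf("MIDI 60 -> %.2f Hz\n", midiToHz(60.0));                 // 261.63 (C4)
        std::printf("frame 100 -> %.3f s\n", frameToSec(100, 256, 44100.0)); // 0.580, with stepSize 256 at 44100 Hz
        return 0;
    }
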