Mercurial > hg > pyin
changeset 141:72bda34e0e64 fixedlag
Merge from branch vamp-fft-revision
author | Chris Cannam |
---|---|
date | Fri, 24 Mar 2017 14:50:44 +0000 |
parents | 83978b93aac1 (diff) c2b426f4d841 (current diff) |
children | afcdcb64c8ab |
files | LocalCandidatePYIN.cpp Makefile.inc Makefile.osx MonoPitchHMM.cpp MonoPitchHMM.h PYinVamp.cpp SparseHMM.cpp YinUtil.cpp |
diffstat | 15 files changed, 422 insertions(+), 308 deletions(-) [+] |
line wrap: on
line diff
--- a/LocalCandidatePYIN.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/LocalCandidatePYIN.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -12,7 +12,7 @@ */ #include "LocalCandidatePYIN.h" -#include "MonoPitch.h" +#include "MonoPitchHMM.h" #include "YinUtil.h" #include "vamp-sdk/FFT.h" @@ -348,7 +348,7 @@ } // MONO-PITCH STUFF - MonoPitch mp; + MonoPitchHMM hmm(0); size_t nFrame = m_timestamp.size(); vector<vector<float> > pitchTracks; vector<float> freqSum = vector<float>(m_nCandidate); @@ -362,11 +362,11 @@ for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) { pitchTracks.push_back(vector<float>(nFrame)); - vector<vector<pair<double,double> > > tempPitchProb; + vector<pair<double,double> > tempPitchProb; + vector<vector<double> > tempObsProb; float centrePitch = 45 + 3 * iCandidate; for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) { - tempPitchProb.push_back(vector<pair<double,double> >()); float sumProb = 0; float pitch = 0; float prob = 0; @@ -377,17 +377,27 @@ boost::math::pdf(normalDist, pitch-centrePitch) / maxNormalDist * 2; sumProb += prob; - tempPitchProb[iFrame].push_back( + tempPitchProb.push_back( pair<double,double>(pitch,prob)); } for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb) { - tempPitchProb[iFrame][iProb].second /= sumProb; + tempPitchProb[iProb].second /= sumProb; } + tempObsProb.push_back(hmm.calculateObsProb(tempPitchProb)); } - vector<float> mpOut = mp.process(tempPitchProb); - for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) + vector<int> rawPitchPath = hmm.decodeViterbi(tempObsProb); + vector<float> mpOut; + + for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) + { + float freq = hmm.nearestFreq(rawPitchPath[iFrame], + m_pitchProb[iFrame]); + mpOut.push_back(freq); // for note processing below + } + + for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) { if (mpOut[iFrame] > 0) {
--- a/Makefile.inc Fri Aug 19 13:40:11 2016 +0100 +++ b/Makefile.inc Fri Mar 24 14:50:44 2017 +0000 @@ -15,7 +15,6 @@ Yin.cpp \ YinUtil.cpp \ MonoNote.cpp \ - MonoPitch.cpp \ MonoNoteParameters.cpp \ SparseHMM.cpp \ MonoNoteHMM.cpp \ @@ -66,15 +65,14 @@ # DO NOT DELETE libmain.o: PYinVamp.h Yin.h MeanFilter.h YinVamp.h LocalCandidatePYIN.h -LocalCandidatePYIN.o: LocalCandidatePYIN.h Yin.h MeanFilter.h MonoPitch.h +LocalCandidatePYIN.o: LocalCandidatePYIN.h Yin.h MeanFilter.h LocalCandidatePYIN.o: MonoPitchHMM.h SparseHMM.h YinUtil.h MonoNote.o: MonoNote.h MonoNoteHMM.h MonoNoteParameters.h SparseHMM.h MonoNoteHMM.o: MonoNoteHMM.h MonoNoteParameters.h SparseHMM.h MonoNoteParameters.o: MonoNoteParameters.h -MonoPitch.o: MonoPitch.h MonoPitchHMM.h SparseHMM.h MonoPitchHMM.o: MonoPitchHMM.h SparseHMM.h PYinVamp.o: PYinVamp.h Yin.h MeanFilter.h MonoNote.h MonoNoteHMM.h -PYinVamp.o: MonoNoteParameters.h SparseHMM.h MonoPitch.h MonoPitchHMM.h +PYinVamp.o: MonoNoteParameters.h SparseHMM.h MonoPitchHMM.h SparseHMM.o: SparseHMM.h Yin.o: Yin.h MeanFilter.h YinUtil.h YinUtil.o: YinUtil.h MeanFilter.h @@ -91,7 +89,6 @@ LocalCandidatePYIN.o: Yin.h MeanFilter.h MonoNote.o: MonoNoteHMM.h MonoNoteParameters.h SparseHMM.h MonoNoteHMM.o: MonoNoteParameters.h SparseHMM.h -MonoPitch.o: MonoPitchHMM.h SparseHMM.h MonoPitchHMM.o: SparseHMM.h PYinVamp.o: Yin.h MeanFilter.h Yin.o: MeanFilter.h
--- a/Makefile.osx Fri Aug 19 13:40:11 2016 +0100 +++ b/Makefile.osx Fri Mar 24 14:50:44 2017 +0000 @@ -2,7 +2,7 @@ CFLAGS := $(ARCHFLAGS) -O3 -I../vamp-plugin-sdk -I../../vamp-plugin-sdk -I/usr/local/boost -Wall -fPIC CXXFLAGS := $(CFLAGS) -LDFLAGS := -L../vamp-plugin-sdk -L../../vamp-plugin-sdk -lvamp-sdk $(ARCHFLAGS) +LDFLAGS := -L../vamp-plugin-sdk -L../vamp-plugin-sdk -lvamp-sdk $(ARCHFLAGS) -L/usr/local/lib PLUGIN_LDFLAGS := -dynamiclib $(LDFLAGS) -exported_symbols_list vamp-plugin.list TEST_LDFLAGS := $(LDFLAGS) -lboost_unit_test_framework PLUGIN_EXT := .dylib
--- a/MonoNote.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/MonoNote.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -22,7 +22,7 @@ using std::pair; MonoNote::MonoNote() : - hmm() + hmm(0) { } @@ -39,11 +39,9 @@ obsProb.push_back(hmm.calculateObsProb(pitchProb[iFrame])); } - vector<double> *scale = new vector<double>(pitchProb.size()); - vector<MonoNote::FrameOutput> out; - vector<int> path = hmm.decodeViterbi(obsProb, scale); + vector<int> path = hmm.decodeViterbi(obsProb); for (size_t iFrame = 0; iFrame < path.size(); ++iFrame) { @@ -54,8 +52,6 @@ stateKind = (path[iFrame]) % hmm.par.nSPP + 1; out.push_back(FrameOutput(iFrame, currPitch, stateKind)); - // std::cerr << path[iFrame] << " -- "<< pitchProb[iFrame][0].first << " -- "<< currPitch << " -- " << stateKind << std::endl; } - delete scale; return(out); }
--- a/MonoNoteHMM.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/MonoNoteHMM.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -21,7 +21,8 @@ using std::vector; using std::pair; -MonoNoteHMM::MonoNoteHMM() : +MonoNoteHMM::MonoNoteHMM(int fixedLag) : + SparseHMM(fixedLag), par() { build(); @@ -36,14 +37,13 @@ // what is the probability of pitched double pIsPitched = 0; - for (size_t iCandidate = 0; iCandidate < nCandidate; ++iCandidate) + for (size_t iCand = 0; iCand < nCandidate; ++iCand) { - // pIsPitched = pitchProb[iCandidate].second > pIsPitched ? pitchProb[iCandidate].second : pIsPitched; - pIsPitched += pitchProb[iCandidate].second; + pIsPitched += pitchProb[iCand].second; } - // pIsPitched = std::pow(pIsPitched, (1-par.priorWeight)) * std::pow(par.priorPitchedProb, par.priorWeight); - pIsPitched = pIsPitched * (1-par.priorWeight) + par.priorPitchedProb * par.priorWeight; + pIsPitched = pIsPitched * (1-par.priorWeight) + + par.priorPitchedProb * par.priorWeight; vector<double> out = vector<double>(par.n); double tempProbSum = 0; @@ -58,14 +58,15 @@ double minDist = 10000.0; double minDistProb = 0; size_t minDistCandidate = 0; - for (size_t iCandidate = 0; iCandidate < nCandidate; ++iCandidate) + for (size_t iCand = 0; iCand < nCandidate; ++iCand) { - double currDist = std::abs(getMidiPitch(i)-pitchProb[iCandidate].first); + double currDist = std::abs(getMidiPitch(i)- + pitchProb[iCand].first); if (currDist < minDist) { minDist = currDist; - minDistProb = pitchProb[iCandidate].second; - minDistCandidate = iCandidate; + minDistProb = pitchProb[iCand].second; + minDistCandidate = iCand; } } tempProb = std::pow(minDistProb, par.yinTrust) * @@ -107,6 +108,8 @@ // 3. attack state // ... + m_nState = par.n; + // observation distributions for (size_t iState = 0; iState < par.n; ++iState) { @@ -114,9 +117,9 @@ if (iState % par.nSPP == 2) { // silent state starts tracking - init.push_back(1.0/(par.nS * par.nPPS)); + m_init.push_back(1.0/(par.nS * par.nPPS)); } else { - init.push_back(0.0); + m_init.push_back(0.0); } } @@ -137,27 +140,27 @@ size_t index = iPitch * par.nSPP; // transitions from attack state - from.push_back(index); - to.push_back(index); - transProb.push_back(par.pAttackSelftrans); + m_from.push_back(index); + m_to.push_back(index); + m_transProb.push_back(par.pAttackSelftrans); - from.push_back(index); - to.push_back(index+1); - transProb.push_back(1-par.pAttackSelftrans); + m_from.push_back(index); + m_to.push_back(index+1); + m_transProb.push_back(1-par.pAttackSelftrans); // transitions from stable state - from.push_back(index+1); - to.push_back(index+1); // to itself - transProb.push_back(par.pStableSelftrans); + m_from.push_back(index+1); + m_to.push_back(index+1); // to itself + m_transProb.push_back(par.pStableSelftrans); - from.push_back(index+1); - to.push_back(index+2); // to silent - transProb.push_back(par.pStable2Silent); + m_from.push_back(index+1); + m_to.push_back(index+2); // to silent + m_transProb.push_back(par.pStable2Silent); // the "easy" transitions from silent state - from.push_back(index+2); - to.push_back(index+2); - transProb.push_back(par.pSilentSelftrans); + m_from.push_back(index+2); + m_to.push_back(index+2); + m_transProb.push_back(par.pSilentSelftrans); // the more complicated transitions from the silent @@ -171,7 +174,7 @@ double semitoneDistance = std::abs(fromPitch - toPitch) * 1.0 / par.nPPS; - // if (std::fmod(semitoneDistance, 1) == 0 && semitoneDistance > par.minSemitoneDistance) + if (semitoneDistance == 0 || (semitoneDistance > par.minSemitoneDistance && semitoneDistance < par.maxJump)) @@ -184,15 +187,19 @@ tempTransProbSilent.push_back(tempWeightSilent); - from.push_back(index+2); - to.push_back(toIndex); + m_from.push_back(index+2); + m_to.push_back(toIndex); } } for (size_t i = 0; i < tempTransProbSilent.size(); ++i) { - transProb.push_back((1-par.pSilentSelftrans) * tempTransProbSilent[i]/probSumSilent); + m_transProb.push_back((1-par.pSilentSelftrans) * + tempTransProbSilent[i]/probSumSilent); } } + m_nTrans = m_transProb.size(); + m_delta = vector<double>(m_nState); + m_oldDelta = vector<double>(m_nState); } double
--- a/MonoNoteHMM.h Fri Aug 19 13:40:11 2016 +0100 +++ b/MonoNoteHMM.h Fri Mar 24 14:50:44 2017 +0000 @@ -27,8 +27,9 @@ class MonoNoteHMM : public SparseHMM { public: - MonoNoteHMM(); + MonoNoteHMM(int fixedLag); const std::vector<double> calculateObsProb(const vector<pair<double, double> >); + double getMidiPitch(size_t index); double getFrequency(size_t index); void build();
--- a/MonoPitch.cpp Fri Aug 19 13:40:11 2016 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ - -/* - pYIN - A fundamental frequency estimator for monophonic audio - Centre for Digital Music, Queen Mary, University of London. - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. See the file - COPYING included with this distribution for more information. -*/ - -#include "MonoPitch.h" -#include "MonoPitchHMM.h" -#include <vector> - -#include <cstdio> -#include <cmath> -#include <complex> - -using std::vector; -using std::pair; - -MonoPitch::MonoPitch() : - hmm() -{ -} - -MonoPitch::~MonoPitch() -{ -} - -const vector<float> -MonoPitch::process(const vector<vector<pair<double, double> > > pitchProb) -{ - // std::cerr << "before observation prob calculation" << std::endl; - vector<vector<double> > obsProb; - for (size_t iFrame = 0; iFrame < pitchProb.size(); ++iFrame) - { - obsProb.push_back(hmm.calculateObsProb(pitchProb[iFrame])); - } - - vector<double> *scale = new vector<double>(0); - - vector<float> out; - - // std::cerr << "before Viterbi decoding" << obsProb.size() << "ng" << obsProb[1].size() << std::endl; - vector<int> path = hmm.decodeViterbi(obsProb, scale); - // std::cerr << "after Viterbi decoding" << std::endl; - - for (size_t iFrame = 0; iFrame < path.size(); ++iFrame) - { - // std::cerr << path[iFrame] << " " << hmm.m_freqs[path[iFrame]] << std::endl; - float hmmFreq = hmm.m_freqs[path[iFrame]]; - float bestFreq = 0; - float leastDist = 10000; - if (hmmFreq > 0) - { - // This was a Yin estimate, so try to get original pitch estimate back - // ... a bit hacky, since we could have direclty saved the frequency - // that was assigned to the HMM bin in hmm.calculateObsProb -- but would - // have had to rethink the interface of that method. - for (size_t iPitch = 0; iPitch < pitchProb[iFrame].size(); ++iPitch) - { - float freq = 440. * std::pow(2, (pitchProb[iFrame][iPitch].first - 69)/12); - float dist = std::abs(hmmFreq-freq); - if (dist < leastDist) - { - leastDist = dist; - bestFreq = freq; - } - } - } else { - bestFreq = hmmFreq; - } - out.push_back(bestFreq); - } - delete scale; - return(out); -}
--- a/MonoPitch.h Fri Aug 19 13:40:11 2016 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ - -/* - pYIN - A fundamental frequency estimator for monophonic audio - Centre for Digital Music, Queen Mary, University of London. - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. See the file - COPYING included with this distribution for more information. -*/ - -#ifndef _MONOPITCH_H_ -#define _MONOPITCH_H_ - -#include "MonoPitchHMM.h" - -#include <iostream> -#include <vector> -#include <exception> - -using std::vector; -using std::pair; - -class MonoPitch { -public: - MonoPitch(); - virtual ~MonoPitch(); - - // pitchProb is a frame-wise vector carrying a vector of pitch-probability pairs - const vector<float> process(const vector<vector<pair<double, double> > > pitchProb); -private: - MonoPitchHMM hmm; -}; - -#endif
--- a/MonoPitchHMM.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/MonoPitchHMM.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -17,11 +17,13 @@ #include <cstdio> #include <cmath> +#include <iostream> using std::vector; using std::pair; -MonoPitchHMM::MonoPitchHMM() : +MonoPitchHMM::MonoPitchHMM(int fixedLag) : + SparseHMM(fixedLag), m_minFreq(61.735), m_nBPS(5), m_nPitch(0), @@ -32,6 +34,7 @@ { m_transitionWidth = 5*(m_nBPS/2) + 1; m_nPitch = 69 * m_nBPS; + m_nState = 2 * m_nPitch; // voiced and unvoiced m_freqs = vector<double>(2*m_nPitch); for (int iPitch = 0; iPitch < m_nPitch; ++iPitch) { @@ -83,7 +86,7 @@ MonoPitchHMM::build() { // INITIAL VECTOR - init = vector<double>(2*m_nPitch, 1.0 / 2*m_nPitch); + m_init = vector<double>(2*m_nPitch, 1.0 / 2*m_nPitch); // TRANSITIONS for (int iPitch = 0; iPitch < int(m_nPitch); ++iPitch) @@ -112,22 +115,22 @@ // TRANSITIONS TO CLOSE PITCH for (int i = minNextPitch; i <= maxNextPitch; ++i) { - from.push_back(iPitch); - to.push_back(i); - transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans); + m_from.push_back(iPitch); + m_to.push_back(i); + m_transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans); - from.push_back(iPitch); - to.push_back(i+m_nPitch); - transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans)); + m_from.push_back(iPitch); + m_to.push_back(i+m_nPitch); + m_transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans)); - from.push_back(iPitch+m_nPitch); - to.push_back(i+m_nPitch); - transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans); + m_from.push_back(iPitch+m_nPitch); + m_to.push_back(i+m_nPitch); + m_transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans); // transProb.push_back(weights[i-minNextPitch] / weightSum * 0.5); - from.push_back(iPitch+m_nPitch); - to.push_back(i); - transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans)); + m_from.push_back(iPitch+m_nPitch); + m_to.push_back(i); + m_transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans)); // transProb.push_back(weights[i-minNextPitch] / weightSum * 0.5); } @@ -149,5 +152,45 @@ // for (int i = 0; i < from.size(); ++i) { // std::cerr << "P(["<< from[i] << " --> " << to[i] << "]) = " << transProb[i] << std::endl; // } - + m_nTrans = m_transProb.size(); + m_delta = vector<double>(m_nState); + m_oldDelta = vector<double>(m_nState); } + +/* +Takes a state number and a pitch-prob vector, then finds the pitch that would +have been closest to the pitch of the state. Easy to understand? ;) +*/ +const float +MonoPitchHMM::nearestFreq(int state, vector<pair<double, double> > pitchProb) +{ + float hmmFreq = m_freqs[state]; + // std::cerr << "hmmFreq " << hmmFreq << std::endl; + float bestFreq = 0; + float leastDist = 10000; + if (hmmFreq > 0) + { + // This was a Yin estimate, so try to get original pitch estimate back + // ... a bit hacky, since we could have direclty saved the frequency + // that was assigned to the HMM bin in hmm.calculateObsProb -- but would + // have had to rethink the interface of that method. + + // std::cerr << "pitch prob size " << pitchProb.size() << std::endl; + + for (size_t iPt = 0; iPt < pitchProb.size(); ++iPt) + { + float freq = 440. * + std::pow(2, + (pitchProb[iPt].first - 69)/12); + float dist = std::abs(hmmFreq-freq); + if (dist < leastDist) + { + leastDist = dist; + bestFreq = freq; + } + } + } else { + bestFreq = hmmFreq; + } + return bestFreq; +}
--- a/MonoPitchHMM.h Fri Aug 19 13:40:11 2016 +0100 +++ b/MonoPitchHMM.h Fri Mar 24 14:50:44 2017 +0000 @@ -26,8 +26,9 @@ class MonoPitchHMM : public SparseHMM { public: - MonoPitchHMM(); + MonoPitchHMM(int fixedLag); const std::vector<double> calculateObsProb(const vector<pair<double, double> >); + const float nearestFreq(int state, vector<pair<double, double> > pitchProb); void build(); double m_minFreq; // 82.40689f/2 int m_nBPS;
--- a/PYinVamp.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/PYinVamp.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -13,7 +13,7 @@ #include "PYinVamp.h" #include "MonoNote.h" -#include "MonoPitch.h" +#include "MonoPitchHMM.h" #include <vector> #include <algorithm> @@ -42,14 +42,17 @@ m_oSmoothedPitchTrack(0), m_oNotes(0), m_threshDistr(2.0f), + m_fixedLag(1.0f), m_outputUnvoiced(0.0f), m_preciseTime(0.0f), m_lowAmp(0.1f), m_onsetSensitivity(0.7f), m_pruneThresh(0.1f), + m_pitchHmm(0), m_pitchProb(0), m_timestamp(0), - m_level(0) + m_level(0), + m_pitchTrack(0) { } @@ -151,6 +154,19 @@ d.valueNames.push_back("Single Value 0.20"); list.push_back(d); + d.valueNames.clear(); + + d.identifier = "fixedlag"; + d.name = "Fixed-lag smoothing"; + d.description = "Use fixed lag smoothing, not full Viterbi smoothing."; + d.unit = ""; + d.minValue = 0.0f; + d.maxValue = 1.0f; + d.defaultValue = 0.0f; + d.isQuantized = true; + d.quantizeStep = 1.0f; + list.push_back(d); + d.identifier = "outputunvoiced"; d.valueNames.clear(); d.name = "Output estimates classified as unvoiced?"; @@ -220,6 +236,9 @@ if (identifier == "threshdistr") { return m_threshDistr; } + if (identifier == "fixedlag") { + return m_fixedLag; + } if (identifier == "outputunvoiced") { return m_outputUnvoiced; } @@ -245,6 +264,10 @@ { m_threshDistr = value; } + if (identifier == "fixedlag") + { + m_fixedLag = value; + } if (identifier == "outputunvoiced") { m_outputUnvoiced = value; @@ -417,10 +440,14 @@ m_yin.setThresholdDistr(m_threshDistr); m_yin.setFrameSize(m_blockSize); m_yin.setFast(!m_preciseTime); + + if (m_fixedLag == 1.f) m_pitchHmm = MonoPitchHMM(100); + else m_pitchHmm = MonoPitchHMM(0); m_pitchProb.clear(); m_timestamp.clear(); m_level.clear(); + m_pitchTrack.clear(); /* std::cerr << "PYinVamp::reset" << ", blockSize = " << m_blockSize @@ -431,8 +458,10 @@ PYinVamp::FeatureSet PYinVamp::process(const float *const *inputBuffers, RealTime timestamp) { + std::cerr << timestamp << std::endl; int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4; - timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate)); + timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, + lrintf(m_inputSampleRate)); FeatureSet fs; @@ -453,8 +482,6 @@ m_level.push_back(yo.rms); - // First, get the things out of the way that we don't want to output - // immediately, but instead save for later. vector<pair<double, double> > tempPitchProb; for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate) { @@ -469,9 +496,53 @@ (tempPitch, yo.freqProb[iCandidate].second*factor)); } } + + vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb); + if (m_timestamp.empty()) + { + m_pitchHmm.initialise(tempObsProb); + } else { + m_pitchHmm.process(tempObsProb); + } + m_pitchProb.push_back(tempPitchProb); m_timestamp.push_back(timestamp); + int lag = m_pitchHmm.m_fixedLag; + + if (m_fixedLag == 1.f) // do fixed-lag smoothing instead of full Viterbi + { + if (int(m_timestamp.size()) == lag + 1) + { + m_timestamp.pop_front(); + m_pitchProb.pop_front(); + + Feature f; + f.hasTimestamp = true; + vector<int> rawPitchPath = m_pitchHmm.track(); + float freq = m_pitchHmm.nearestFreq(rawPitchPath[0], + m_pitchProb[0]); + m_pitchTrack.push_back(freq); + f.timestamp = m_timestamp[0]; + f.values.clear(); + + // different output modes + if (freq < 0 && (m_outputUnvoiced==0)) + { + + } else { + if (m_outputUnvoiced == 1) + { + f.values.push_back(fabs(freq)); + } else { + f.values.push_back(freq); + } + fs[m_oSmoothedPitchTrack].push_back(f); + } + } + } + + // F0 CANDIDATES Feature f; f.hasTimestamp = true; @@ -521,39 +592,49 @@ return fs; } - // MONO-PITCH STUFF - MonoPitch mp; - vector<float> mpOut = mp.process(m_pitchProb); - for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) + // ================== P I T C H T R A C K ================================= + + vector<int> rawPitchPath = m_pitchHmm.track(); + + for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) { - if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue; + float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame], + m_pitchProb[iFrame]); + m_pitchTrack.push_back(freq); // for note processing below + f.timestamp = m_timestamp[iFrame]; f.values.clear(); + + // different output modes + if (freq < 0 && (m_outputUnvoiced==0)) continue; if (m_outputUnvoiced == 1) { - f.values.push_back(fabs(mpOut[iFrame])); + f.values.push_back(fabs(freq)); } else { - f.values.push_back(mpOut[iFrame]); + f.values.push_back(freq); } - fs[m_oSmoothedPitchTrack].push_back(f); } - // MONO-NOTE STUFF -// std::cerr << "Mono Note Stuff" << std::endl; + // ======================== N O T E S ====================================== MonoNote mn; std::vector<std::vector<std::pair<double, double> > > smoothedPitch; - for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) { + for (size_t iFrame = 0; iFrame < m_pitchTrack.size(); ++iFrame) { std::vector<std::pair<double, double> > temp; - if (mpOut[iFrame] > 0) + if (m_pitchTrack[iFrame] > 0) { - double tempPitch = 12 * std::log(mpOut[iFrame]/440)/std::log(2.) + 69; + double tempPitch = 12 * + std::log(m_pitchTrack[iFrame]/440)/std::log(2.) + 69; temp.push_back(std::pair<double,double>(tempPitch, .9)); + // std::cerr << "tempPitch: " << tempPitch << std::endl; } + // std::cerr << "temp size: " << temp.size() << std::endl; smoothedPitch.push_back(temp); } - // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb); + vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch); + std::cerr << "mnOut size: " << mnOut.size() << std::endl; + std::cerr << "m_pitchTrack size: " << m_pitchTrack.size() << std::endl; // turning feature into a note feature f.hasTimestamp = true; @@ -563,18 +644,30 @@ int onsetFrame = 0; bool isVoiced = 0; bool oldIsVoiced = 0; - size_t nFrame = m_pitchProb.size(); + size_t nFrame = m_pitchTrack.size(); float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize; - std::vector<float> notePitchTrack; // collects pitches for one note at a time + // the body of the loop below should be in a function/method + // but what does it actually do?? + // * takes the result of the note tracking HMM + // * collects contiguously pitched pitches + // * writes a note once it notices the voiced segment has ended + // complications: + // * it needs a lookahead of two frames for m_level (wtf was I thinking) + // * it needs to know the timestamp (which can be guessed from the frame no) + // * + int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4; + RealTime timestampOffset = Vamp::RealTime::frame2RealTime(offset, + lrintf(m_inputSampleRate)); + + std::vector<float> notePitchTrack; // collects pitches for 1 note at a time for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) { - isVoiced = mnOut[iFrame].noteState < 3 - && smoothedPitch[iFrame].size() > 0 - && (iFrame >= nFrame-2 - || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity)); - // std::cerr << m_level[iFrame]/m_level[iFrame-1] << " " << isVoiced << std::endl; + isVoiced = mnOut[iFrame].noteState < 3 + && smoothedPitch[iFrame].size() > 0 + && (iFrame >= nFrame-2 + || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity)); if (isVoiced && iFrame != nFrame-1) { if (oldIsVoiced == 0) // beginning of a note @@ -586,16 +679,22 @@ } else { // not currently voiced if (oldIsVoiced == 1) // end of note { - // std::cerr << notePitchTrack.size() << " " << minNoteFrames << std::endl; if (notePitchTrack.size() >= minNoteFrames) { std::sort(notePitchTrack.begin(), notePitchTrack.end()); float medianPitch = notePitchTrack[notePitchTrack.size()/2]; - float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440; + float medianFreq = + std::pow(2,(medianPitch - 69) / 12) * 440; f.values.clear(); f.values.push_back(medianFreq); - f.timestamp = m_timestamp[onsetFrame]; - f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame]; + RealTime start = RealTime::frame2RealTime( + onsetFrame * m_stepSize, lrintf(m_inputSampleRate)) + + timestampOffset; + RealTime end = RealTime::frame2RealTime( + iFrame * m_stepSize, lrintf(m_inputSampleRate)) + + timestampOffset; + f.timestamp = start; + f.duration = end - start; fs[m_oNotes].push_back(f); } notePitchTrack.clear();
--- a/PYinVamp.h Fri Aug 19 13:40:11 2016 +0100 +++ b/PYinVamp.h Fri Mar 24 14:50:44 2017 +0000 @@ -17,6 +17,7 @@ #include <vamp-sdk/Plugin.h> #include "Yin.h" +#include "MonoPitchHMM.h" class PYinVamp : public Vamp::Plugin { @@ -71,14 +72,23 @@ mutable int m_oNotes; float m_threshDistr; + float m_fixedLag; float m_outputUnvoiced; float m_preciseTime; float m_lowAmp; float m_onsetSensitivity; float m_pruneThresh; - vector<vector<pair<double, double> > > m_pitchProb; - vector<Vamp::RealTime> m_timestamp; + + MonoPitchHMM m_pitchHmm; + + deque<vector<pair<double, double> > > m_pitchProb; + deque<Vamp::RealTime> m_timestamp; vector<float> m_level; + vector<float> m_pitchTrack; + + // for note writing + // vector<float> m_notePitchTrack; // contains pitches of one current note + // bool m_oldIsVoiced; }; #endif
--- a/SparseHMM.cpp Fri Aug 19 13:40:11 2016 +0100 +++ b/SparseHMM.cpp Fri Mar 24 14:50:44 2017 +0000 @@ -19,6 +19,22 @@ using std::vector; using std::pair; +SparseHMM::SparseHMM(int fixedLag) : + m_fixedLag(fixedLag), + m_nState(0), + m_nTrans(0), + m_init(0), + m_from(0), + m_to(0), + m_transProb(0), + m_scale(0), + m_psi(0), + m_delta(0), + m_oldDelta(0) +{ + +} + const vector<double> SparseHMM::calculateObsProb(const vector<pair<double, double> > ) { @@ -26,103 +42,144 @@ return(vector<double>()); } +void +SparseHMM::build() +{ } + const std::vector<int> -SparseHMM::decodeViterbi(std::vector<vector<double> > obsProb, - vector<double> *scale) +SparseHMM::decodeViterbi(std::vector<vector<double> > obsProb) { - if (obsProb.size() < 1) { + size_t nFrame = obsProb.size(); + if (nFrame < 1) { return vector<int>(); } - size_t nState = init.size(); - size_t nFrame = obsProb.size(); - - // check for consistency - size_t nTrans = transProb.size(); - - // declaring variables - std::vector<double> delta = std::vector<double>(nState); - std::vector<double> oldDelta = std::vector<double>(nState); - vector<vector<int> > psi; // "matrix" of remembered indices of the best transitions - vector<int> path = vector<int>(nFrame, nState-1); // the final output path (current assignment arbitrary, makes sense only for Chordino, where nChord-1 is the "no chord" label) + initialise(obsProb[0]); + + // rest of forward step + for (size_t iFrame = 1; iFrame < nFrame; ++iFrame) + { + process(obsProb[iFrame]); + } + + vector<int> path = track(); + return(path); +} + +void +SparseHMM::reset() +{ + m_scale.clear(); + m_psi.clear(); + for (size_t i = 0; i < m_delta.size(); ++i) m_delta[i] = 0; + for (size_t i = 0; i < m_oldDelta.size(); ++i) m_oldDelta[i] = 0; +} + +void +SparseHMM::initialise(vector<double> firstObs) +{ + reset(); double deltasum = 0; // initialise first frame - for (size_t iState = 0; iState < nState; ++iState) + for (size_t iState = 0; iState < m_nState; ++iState) { - oldDelta[iState] = init[iState] * obsProb[0][iState]; - // std::cerr << iState << " ----- " << init[iState] << std::endl; - deltasum += oldDelta[iState]; + m_oldDelta[iState] = m_init[iState] * firstObs[iState]; + deltasum += m_oldDelta[iState]; } - for (size_t iState = 0; iState < nState; ++iState) + for (size_t iState = 0; iState < m_nState; ++iState) { - oldDelta[iState] /= deltasum; // normalise (scale) - // std::cerr << oldDelta[iState] << std::endl; + m_oldDelta[iState] /= deltasum; // normalise (scale) } - scale->push_back(1.0/deltasum); - psi.push_back(vector<int>(nState,0)); + m_scale.push_back(1.0/deltasum); + m_psi.push_back(vector<int>(m_nState,0)); +} - // rest of forward step - for (size_t iFrame = 1; iFrame < nFrame; ++iFrame) +int +SparseHMM::process(vector<double> newObs) +{ + vector<int> tempPsi = vector<int>(m_nState,0); + + // calculate best previous state for every current state + size_t fromState; + size_t toState; + double currentTransProb; + double currentValue; + + // this is the "sparse" loop + for (size_t iTrans = 0; iTrans < m_nTrans; ++iTrans) { - deltasum = 0; - psi.push_back(vector<int>(nState,0)); - - // calculate best previous state for every current state - size_t fromState; - size_t toState; - double currentTransProb; - double currentValue; + fromState = m_from[iTrans]; + toState = m_to[iTrans]; + currentTransProb = m_transProb[iTrans]; - // this is the "sparse" loop - for (size_t iTrans = 0; iTrans < nTrans; ++iTrans) + currentValue = m_oldDelta[fromState] * currentTransProb; + if (currentValue > m_delta[toState]) { - fromState = from[iTrans]; - toState = to[iTrans]; - currentTransProb = transProb[iTrans]; - - currentValue = oldDelta[fromState] * currentTransProb; - if (currentValue > delta[toState]) - { - delta[toState] = currentValue; // will be multiplied by the right obs later! - psi[iFrame][toState] = fromState; - } - } - - for (size_t jState = 0; jState < nState; ++jState) - { - delta[jState] *= obsProb[iFrame][jState]; - deltasum += delta[jState]; - } - - if (deltasum > 0) - { - for (size_t iState = 0; iState < nState; ++iState) - { - oldDelta[iState] = delta[iState] / deltasum; // normalise (scale) - delta[iState] = 0; - } - scale->push_back(1.0/deltasum); - } else - { - std::cerr << "WARNING: Viterbi has been fed some zero probabilities, at least they become zero at frame " << iFrame << " in combination with the model." << std::endl; - for (size_t iState = 0; iState < nState; ++iState) - { - oldDelta[iState] = 1.0/nState; - delta[iState] = 0; - } - scale->push_back(1.0); + // will be multiplied by the right obs later! + m_delta[toState] = currentValue; + tempPsi[toState] = fromState; } } + m_psi.push_back(tempPsi); + + double deltasum = 0; + for (size_t jState = 0; jState < m_nState; ++jState) + { + m_delta[jState] *= newObs[jState]; + deltasum += m_delta[jState]; + } + + if (deltasum > 0) + { + for (size_t iState = 0; iState < m_nState; ++iState) + { + m_oldDelta[iState] = m_delta[iState] / deltasum;// normalise (scale) + m_delta[iState] = 0; + } + m_scale.push_back(1.0/deltasum); + } else + { + std::cerr << "WARNING: Viterbi has been fed some zero " + "probabilities, at least they become zero " + "in combination with the model." << std::endl; + for (size_t iState = 0; iState < m_nState; ++iState) + { + m_oldDelta[iState] = 1.0/m_nState; + m_delta[iState] = 0; + } + m_scale.push_back(1.0); + } + + if (m_fixedLag > 0 && m_psi.size() > m_fixedLag) + { + m_psi.pop_front(); + m_scale.pop_front(); + } + + // std::cerr << m_fixedLag << " " << m_psi.size() << std::endl; + + return 0; +} + +const vector<int> +SparseHMM::track() +{ // initialise backward step + size_t nFrame = m_psi.size(); + + // The final output path (current assignment arbitrary, makes sense only for + // Chordino, where nChord-1 is the "no chord" label) + vector<int> path = vector<int>(nFrame, m_nState-1); + double bestValue = 0; - for (size_t iState = 0; iState < nState; ++iState) + for (size_t iState = 0; iState < m_nState; ++iState) { - double currentValue = oldDelta[iState]; + double currentValue = m_oldDelta[iState]; if (currentValue > bestValue) { bestValue = currentValue; @@ -130,16 +187,11 @@ } } - // rest of backward step + // Rest of backward step for (int iFrame = nFrame-2; iFrame != -1; --iFrame) { - path[iFrame] = psi[iFrame+1][path[iFrame+1]]; + path[iFrame] = m_psi[iFrame+1][path[iFrame+1]]; } - - // for (size_t iState = 0; iState < nState; ++iState) - // { - // // std::cerr << psi[2][iState] << std::endl; - // } - + return path; -} +} \ No newline at end of file
--- a/SparseHMM.h Fri Aug 19 13:40:11 2016 +0100 +++ b/SparseHMM.h Fri Mar 24 14:50:44 2017 +0000 @@ -15,21 +15,39 @@ #define _SPARSEHMM_H_ #include <vector> +#include <queue> #include <cstdio> using std::vector; +using std::deque; using std::pair; class SparseHMM { public: - virtual const std::vector<double> calculateObsProb(const vector<pair<double, double> >); - const std::vector<int> decodeViterbi(std::vector<vector<double> > obs, - vector<double> *scale); - vector<double> init; - vector<size_t> from; - vector<size_t> to; - vector<double> transProb; + SparseHMM(int fixedLag); + virtual const std::vector<double> + calculateObsProb(const vector<pair<double, double> >); + virtual void build(); + const std::vector<int> decodeViterbi(std::vector<vector<double> > obs); + void reset(); + void initialise(vector<double> firstObs); + int process(vector<double> newObs); + const vector<int> track(); + // "sparse" HMM definition + int m_fixedLag; + int m_nState; + int m_nTrans; + vector<double> m_init; + vector<size_t> m_from; + vector<size_t> m_to; + vector<double> m_transProb; + + // variables for decoding + deque<double> m_scale; + deque<vector<int> > m_psi; + vector<double> m_delta; + vector<double> m_oldDelta; }; #endif
--- a/win32-build/pyin.pro Fri Aug 19 13:40:11 2016 +0100 +++ b/win32-build/pyin.pro Fri Mar 24 14:50:44 2017 +0000 @@ -13,7 +13,6 @@ ../Yin.cpp \ ../SparseHMM.cpp \ ../MonoPitchHMM.cpp \ - ../MonoPitch.cpp \ ../MonoNoteParameters.cpp \ ../MonoNoteHMM.cpp \ ../MonoNote.cpp \ @@ -27,7 +26,6 @@ ../Yin.h \ ../SparseHMM.h \ ../MonoPitchHMM.h \ - ../MonoPitch.h \ ../MonoNoteParameters.h \ ../MonoNoteHMM.h \ ../MonoNote.h \