changeset 130:080fe18f5ebf fixedlag

refactored Viterbi * perhaps I even discovered a bug (probablity sum was not reset for every frame)
author Matthias Mauch <mail@matthiasmauch.net>
date Fri, 03 Jul 2015 12:22:44 +0100
parents 72b3892c1d51
children b877df85ad9e
files Makefile.osx MonoNote.cpp MonoNoteHMM.cpp MonoPitch.cpp MonoPitchHMM.cpp PYinVamp.cpp PYinVamp.h SparseHMM.cpp SparseHMM.h
diffstat 9 files changed, 226 insertions(+), 145 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile.osx	Wed Jul 01 19:13:05 2015 +0100
+++ b/Makefile.osx	Fri Jul 03 12:22:44 2015 +0100
@@ -2,7 +2,7 @@
 CFLAGS := $(ARCHFLAGS) -O3 -I../vamp-plugin-sdk -I/usr/local/include -Wall -fPIC 
 CXXFLAGS := $(CFLAGS)
 
-LDFLAGS := -L../vamp-plugin-sdk -L../vamp-plugin-sdk -lvamp-sdk $(ARCHFLAGS) 
+LDFLAGS := -L../vamp-plugin-sdk -L../vamp-plugin-sdk -lvamp-sdk $(ARCHFLAGS) -L/usr/local/lib 
 PLUGIN_LDFLAGS := -dynamiclib $(LDFLAGS) -exported_symbols_list vamp-plugin.list
 TEST_LDFLAGS := $(LDFLAGS) -lboost_unit_test_framework
 PLUGIN_EXT := .dylib
--- a/MonoNote.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/MonoNote.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -39,11 +39,9 @@
         obsProb.push_back(hmm.calculateObsProb(pitchProb[iFrame]));
     }
     
-    vector<double> *scale = new vector<double>(pitchProb.size());
-    
     vector<MonoNote::FrameOutput> out; 
     
-    vector<int> path = hmm.decodeViterbi(obsProb, scale);
+    vector<int> path = hmm.decodeViterbi(obsProb);
     
     for (size_t iFrame = 0; iFrame < path.size(); ++iFrame)
     {
@@ -56,6 +54,5 @@
         out.push_back(FrameOutput(iFrame, currPitch, stateKind));
         // std::cerr << path[iFrame] << " -- "<< pitchProb[iFrame][0].first << " -- "<< currPitch << " -- " << stateKind << std::endl;
     }
-    delete scale;
     return(out);
 }
--- a/MonoNoteHMM.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/MonoNoteHMM.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -107,6 +107,8 @@
     //    3. attack state
     //    ...
     
+    m_nState = par.n;
+
     // observation distributions
     for (size_t iState = 0; iState < par.n; ++iState)
     {
@@ -114,9 +116,9 @@
         if (iState % par.nSPP == 2)
         {
             // silent state starts tracking
-            init.push_back(1.0/(par.nS * par.nPPS));
+            m_init.push_back(1.0/(par.nS * par.nPPS));
         } else {
-            init.push_back(0.0);            
+            m_init.push_back(0.0);            
         }
     }
 
@@ -137,27 +139,27 @@
         size_t index = iPitch * par.nSPP;
 
         // transitions from attack state
-        from.push_back(index);
-        to.push_back(index);
-        transProb.push_back(par.pAttackSelftrans);
+        m_from.push_back(index);
+        m_to.push_back(index);
+        m_transProb.push_back(par.pAttackSelftrans);
 
-        from.push_back(index);
-        to.push_back(index+1);
-        transProb.push_back(1-par.pAttackSelftrans);
+        m_from.push_back(index);
+        m_to.push_back(index+1);
+        m_transProb.push_back(1-par.pAttackSelftrans);
 
         // transitions from stable state
-        from.push_back(index+1);
-        to.push_back(index+1); // to itself
-        transProb.push_back(par.pStableSelftrans);
+        m_from.push_back(index+1);
+        m_to.push_back(index+1); // to itself
+        m_transProb.push_back(par.pStableSelftrans);
         
-        from.push_back(index+1);
-        to.push_back(index+2); // to silent
-        transProb.push_back(par.pStable2Silent);
+        m_from.push_back(index+1);
+        m_to.push_back(index+2); // to silent
+        m_transProb.push_back(par.pStable2Silent);
 
         // the "easy" transitions from silent state
-        from.push_back(index+2);
-        to.push_back(index+2);
-        transProb.push_back(par.pSilentSelftrans);
+        m_from.push_back(index+2);
+        m_to.push_back(index+2);
+        m_transProb.push_back(par.pSilentSelftrans);
         
         
         // the more complicated transitions from the silent
@@ -184,15 +186,18 @@
 
                 tempTransProbSilent.push_back(tempWeightSilent);
 
-                from.push_back(index+2);
-                to.push_back(toIndex);
+                m_from.push_back(index+2);
+                m_to.push_back(toIndex);
             }
         }
         for (size_t i = 0; i < tempTransProbSilent.size(); ++i)
         {
-            transProb.push_back((1-par.pSilentSelftrans) * tempTransProbSilent[i]/probSumSilent);
+            m_transProb.push_back((1-par.pSilentSelftrans) * tempTransProbSilent[i]/probSumSilent);
         }
     }
+    m_nTrans = m_transProb.size();
+    m_delta = vector<double>(m_nState);
+    m_oldDelta = vector<double>(m_nState);
 }
 
 double
--- a/MonoPitch.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/MonoPitch.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -40,13 +40,11 @@
     {
         obsProb.push_back(hmm.calculateObsProb(pitchProb[iFrame]));
     }
-    
-    vector<double> *scale = new vector<double>(0);
-    
+        
     vector<float> out; 
     
     // std::cerr << "before Viterbi decoding" << obsProb.size() << "ng" << obsProb[1].size() << std::endl;
-    vector<int> path = hmm.decodeViterbi(obsProb, scale);
+    vector<int> path = hmm.decodeViterbi(obsProb);
     // std::cerr << "after Viterbi decoding" << std::endl;
     
     for (size_t iFrame = 0; iFrame < path.size(); ++iFrame)
@@ -76,6 +74,5 @@
         }
         out.push_back(bestFreq);
     }
-    delete scale;
     return(out);
 }
--- a/MonoPitchHMM.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/MonoPitchHMM.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -17,11 +17,13 @@
 
 #include <cstdio>
 #include <cmath>
+#include <iostream>
 
 using std::vector;
 using std::pair;
 
 MonoPitchHMM::MonoPitchHMM() :
+SparseHMM(),
 m_minFreq(61.735),
 m_nBPS(5),
 m_nPitch(0),
@@ -32,6 +34,7 @@
 {
     m_transitionWidth = 5*(m_nBPS/2) + 1;
     m_nPitch = 69 * m_nBPS;
+    m_nState = 2 * m_nPitch; // voiced and unvoiced
     m_freqs = vector<double>(2*m_nPitch);
     for (size_t iPitch = 0; iPitch < m_nPitch; ++iPitch)
     {
@@ -83,7 +86,7 @@
 MonoPitchHMM::build()
 {
     // INITIAL VECTOR
-    init = vector<double>(2*m_nPitch, 1.0 / 2*m_nPitch);
+    m_init = vector<double>(2*m_nPitch, 1.0 / 2*m_nPitch);
     
     // TRANSITIONS
     for (size_t iPitch = 0; iPitch < m_nPitch; ++iPitch)
@@ -112,22 +115,22 @@
         // TRANSITIONS TO CLOSE PITCH
         for (size_t i = minNextPitch; i <= maxNextPitch; ++i)
         {
-            from.push_back(iPitch);
-            to.push_back(i);
-            transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans);
+            m_from.push_back(iPitch);
+            m_to.push_back(i);
+            m_transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans);
 
-            from.push_back(iPitch);
-            to.push_back(i+m_nPitch);
-            transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans));
+            m_from.push_back(iPitch);
+            m_to.push_back(i+m_nPitch);
+            m_transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans));
 
-            from.push_back(iPitch+m_nPitch);
-            to.push_back(i+m_nPitch);
-            transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans);
+            m_from.push_back(iPitch+m_nPitch);
+            m_to.push_back(i+m_nPitch);
+            m_transProb.push_back(weights[i-minNextPitch] / weightSum * m_selfTrans);
             // transProb.push_back(weights[i-minNextPitch] / weightSum * 0.5);
             
-            from.push_back(iPitch+m_nPitch);
-            to.push_back(i);
-            transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans));
+            m_from.push_back(iPitch+m_nPitch);
+            m_to.push_back(i);
+            m_transProb.push_back(weights[i-minNextPitch] / weightSum * (1-m_selfTrans));
             // transProb.push_back(weights[i-minNextPitch] / weightSum * 0.5);
         }
 
@@ -149,5 +152,7 @@
     // for (size_t i = 0; i < from.size(); ++i) {
     //     std::cerr << "P(["<< from[i] << " --> " << to[i] << "]) = " << transProb[i] << std::endl;
     // }
-    
+    m_nTrans = m_transProb.size();
+    m_delta = vector<double>(m_nState);
+    m_oldDelta = vector<double>(m_nState);
 }
--- a/PYinVamp.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/PYinVamp.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -44,6 +44,7 @@
     m_oSmoothedPitchTrack(0),
     m_oNotes(0),
     m_threshDistr(2.0f),
+    m_fixedLag(0.0f),
     m_outputUnvoiced(0.0f),
     m_preciseTime(0.0f),
     m_lowAmp(0.1f),
@@ -153,6 +154,19 @@
     d.valueNames.push_back("Single Value 0.20");
     list.push_back(d);
 
+    d.valueNames.clear();
+
+    d.identifier = "fixedlag";
+    d.name = "Fixed-lag smoothing";
+    d.description = "Use fixed lag smoothing, not full Viterbi smoothing.";
+    d.unit = "";
+    d.minValue = 0.0f;
+    d.maxValue = 1.0f;
+    d.defaultValue = 0.0f;
+    d.isQuantized = true;
+    d.quantizeStep = 1.0f;
+    list.push_back(d);
+
     d.identifier = "outputunvoiced";
     d.valueNames.clear();
     d.name = "Output estimates classified as unvoiced?";
@@ -222,6 +236,9 @@
     if (identifier == "threshdistr") {
             return m_threshDistr;
     }
+    if (identifier == "fixedlag") {
+            return m_fixedLag;
+    }
     if (identifier == "outputunvoiced") {
             return m_outputUnvoiced;
     }
@@ -247,6 +264,10 @@
     {
         m_threshDistr = value;
     }
+    if (identifier == "fixedlag")
+    {
+        m_fixedLag = value;
+    }
     if (identifier == "outputunvoiced")
     {
         m_outputUnvoiced = value;
@@ -455,8 +476,6 @@
 
     m_level.push_back(yo.rms);
 
-    // First, get the things out of the way that we don't want to output 
-    // immediately, but instead save for later.
     vector<pair<double, double> > tempPitchProb;
     for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
     {
@@ -471,7 +490,14 @@
                 (tempPitch, yo.freqProb[iCandidate].second*factor));
         }
     }
-    m_pitchProb.push_back(tempPitchProb);
+
+    if (m_fixedLag == 0.f)
+    {
+        m_pitchProb.push_back(tempPitchProb);
+    } else {
+        // Damn, so I need the hmm right here! Sadly it isn't defined here yet.
+        // Perhaps I could re-design the whole shabang 
+    }
     m_timestamp.push_back(timestamp);
 
     // F0 CANDIDATES
@@ -549,7 +575,8 @@
         std::vector<std::pair<double, double> > temp;
         if (mpOut[iFrame] > 0)
         {
-            double tempPitch = 12 * std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
+            double tempPitch = 12 * 
+                               std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
             temp.push_back(std::pair<double,double>(tempPitch, .9));
         }
         smoothedPitch.push_back(temp);
@@ -569,14 +596,15 @@
 
     float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
     
+    // the body of the loop below should be in a function/method
     std::vector<float> notePitchTrack; // collects pitches for one note at a time
     for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
     {
         isVoiced = mnOut[iFrame].noteState < 3
                    && smoothedPitch[iFrame].size() > 0
                    && (iFrame >= nFrame-2
-                       || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
-        // std::cerr << m_level[iFrame]/m_level[iFrame-1] << " " << isVoiced << std::endl;
+                       || ((m_level[iFrame]/m_level[iFrame+2]) > 
+                        m_onsetSensitivity));
         if (isVoiced && iFrame != nFrame-1)
         {
             if (oldIsVoiced == 0) // beginning of a note
@@ -588,7 +616,6 @@
         } else { // not currently voiced
             if (oldIsVoiced == 1) // end of note
             {
-                // std::cerr << notePitchTrack.size() << " " << minNoteFrames << std::endl;
                 if (notePitchTrack.size() >= minNoteFrames)
                 {
                     std::sort(notePitchTrack.begin(), notePitchTrack.end());
--- a/PYinVamp.h	Wed Jul 01 19:13:05 2015 +0100
+++ b/PYinVamp.h	Fri Jul 03 12:22:44 2015 +0100
@@ -71,6 +71,7 @@
     mutable int m_oNotes;
 
     float m_threshDistr;
+    float m_fixedLag;
     float m_outputUnvoiced;
     float m_preciseTime;
     float m_lowAmp;
--- a/SparseHMM.cpp	Wed Jul 01 19:13:05 2015 +0100
+++ b/SparseHMM.cpp	Fri Jul 03 12:22:44 2015 +0100
@@ -19,6 +19,21 @@
 using std::vector;
 using std::pair;
 
+SparseHMM::SparseHMM() :
+    m_nState(0),
+    m_nTrans(0),
+    m_init(0),
+    m_from(0),
+    m_to(0),
+    m_transProb(0),
+    m_scale(0),
+    m_psi(0),
+    m_delta(0),
+    m_oldDelta(0)
+{
+
+}
+
 const vector<double>
 SparseHMM::calculateObsProb(const vector<pair<double, double> > data)
 {
@@ -26,103 +41,126 @@
     return(vector<double>());
 }
 
+void
+SparseHMM::build()
+{ }
+
 const std::vector<int> 
-SparseHMM::decodeViterbi(std::vector<vector<double> > obsProb,
-                         vector<double> *scale) 
+SparseHMM::decodeViterbi(std::vector<vector<double> > obsProb) 
 {
-    if (obsProb.size() < 1) {
+    size_t nFrame = obsProb.size();
+    if (nFrame < 1) {
         return vector<int>();
     }
-
-    size_t nState = init.size();
-    size_t nFrame = obsProb.size();
     
-    // check for consistency    
-    size_t nTrans = transProb.size();
     
-    // declaring variables
-    std::vector<double> delta = std::vector<double>(nState);
-    std::vector<double> oldDelta = std::vector<double>(nState);
-    vector<vector<int> > psi; //  "matrix" of remembered indices of the best transitions
-    vector<int> path = vector<int>(nFrame, nState-1); // the final output path (current assignment arbitrary, makes sense only for Chordino, where nChord-1 is the "no chord" label)
-
-    double deltasum = 0;
-
-    // initialise first frame
-    for (size_t iState = 0; iState < nState; ++iState)
-    {
-        oldDelta[iState] = init[iState] * obsProb[0][iState];
-        // std::cerr << iState << " ----- " << init[iState] << std::endl;
-        deltasum += oldDelta[iState];
-    }
-
-    for (size_t iState = 0; iState < nState; ++iState)
-    {
-        oldDelta[iState] /= deltasum; // normalise (scale)
-        // std::cerr << oldDelta[iState] << std::endl;
-    }
-
-    scale->push_back(1.0/deltasum);
-    psi.push_back(vector<int>(nState,0));
+    initialise(obsProb[0]);
 
     // rest of forward step
     for (size_t iFrame = 1; iFrame < nFrame; ++iFrame)
     {
-        deltasum = 0;
-        psi.push_back(vector<int>(nState,0));
+        process(obsProb[iFrame]);
+    }
 
-        // calculate best previous state for every current state
-        size_t fromState;
-        size_t toState;
-        double currentTransProb;
-        double currentValue;
+    vector<int> path = finalise();
+    return(path);
+}
+
+
+void
+SparseHMM::initialise(vector<double> firstObs)
+{
+    double deltasum = 0;
+
+    // initialise first frame
+    for (size_t iState = 0; iState < m_nState; ++iState)
+    {
+        m_oldDelta[iState] = m_init[iState] * firstObs[iState];
+        deltasum += m_oldDelta[iState];
+    }
+
+    for (size_t iState = 0; iState < m_nState; ++iState)
+    {
+        m_oldDelta[iState] /= deltasum; // normalise (scale)
+    }
+
+    m_scale.push_back(1.0/deltasum);
+    m_psi.push_back(vector<int>(m_nState,0));
+}
+
+int
+SparseHMM::process(vector<double> newObs)
+{
+    vector<int> tempPsi = vector<int>(m_nState,0);
+
+    // calculate best previous state for every current state
+    size_t fromState;
+    size_t toState;
+    double currentTransProb;
+    double currentValue;
+    
+    // this is the "sparse" loop
+    for (size_t iTrans = 0; iTrans < m_nTrans; ++iTrans)
+    {
+        fromState = m_from[iTrans];
+        toState = m_to[iTrans];
+        currentTransProb = m_transProb[iTrans];
         
-        // this is the "sparse" loop
-        for (size_t iTrans = 0; iTrans < nTrans; ++iTrans)
+        currentValue = m_oldDelta[fromState] * currentTransProb;
+        if (currentValue > m_delta[toState])
         {
-            fromState = from[iTrans];
-            toState = to[iTrans];
-            currentTransProb = transProb[iTrans];
-            
-            currentValue = oldDelta[fromState] * currentTransProb;
-            if (currentValue > delta[toState])
-            {
-                delta[toState] = currentValue; // will be multiplied by the right obs later!
-                psi[iFrame][toState] = fromState;
-            }            
-        }
-        
-        for (size_t jState = 0; jState < nState; ++jState)
-        {
-            delta[jState] *= obsProb[iFrame][jState];
-            deltasum += delta[jState];
-        }
-
-        if (deltasum > 0)
-        {
-            for (size_t iState = 0; iState < nState; ++iState)
-            {
-                oldDelta[iState] = delta[iState] / deltasum; // normalise (scale)
-                delta[iState] = 0;
-            }
-            scale->push_back(1.0/deltasum);
-        } else
-        {
-            std::cerr << "WARNING: Viterbi has been fed some zero probabilities, at least they become zero at frame " <<  iFrame << " in combination with the model." << std::endl;
-            for (size_t iState = 0; iState < nState; ++iState)
-            {
-                oldDelta[iState] = 1.0/nState;
-                delta[iState] = 0;
-            }
-            scale->push_back(1.0);
+            // will be multiplied by the right obs later!
+            m_delta[toState] = currentValue;
+            tempPsi[toState] = fromState;
         }
     }
+    m_psi.push_back(tempPsi);
 
+
+    double deltasum = 0;
+    for (size_t jState = 0; jState < m_nState; ++jState)
+    {
+        m_delta[jState] *= newObs[jState];
+        deltasum += m_delta[jState];
+    }
+
+    if (deltasum > 0)
+    {
+        for (size_t iState = 0; iState < m_nState; ++iState)
+        {
+            m_oldDelta[iState] = m_delta[iState] / deltasum;// normalise (scale)
+            m_delta[iState] = 0;
+        }
+        m_scale.push_back(1.0/deltasum);
+    } else
+    {
+        std::cerr << "WARNING: Viterbi has been fed some zero "
+            "probabilities, at least they become zero "
+            "in combination with the model." << std::endl;
+        for (size_t iState = 0; iState < m_nState; ++iState)
+        {
+            m_oldDelta[iState] = 1.0/m_nState;
+            m_delta[iState] = 0;
+        }
+        m_scale.push_back(1.0);
+    }
+    return 0;
+}
+
+vector<int> 
+SparseHMM::finalise()
+{
     // initialise backward step
+    size_t nFrame = m_psi.size();
+
+    // The final output path (current assignment arbitrary, makes sense only for 
+    // Chordino, where nChord-1 is the "no chord" label)
+    vector<int> path = vector<int>(nFrame, m_nState-1);
+
     double bestValue = 0;
-    for (size_t iState = 0; iState < nState; ++iState)
+    for (size_t iState = 0; iState < m_nState; ++iState)
     {
-        double currentValue = oldDelta[iState];
+        double currentValue = m_oldDelta[iState];
         if (currentValue > bestValue)
         {
             bestValue = currentValue;            
@@ -130,16 +168,11 @@
         }
     }
 
-    // rest of backward step
+    // Rest of backward step
     for (int iFrame = nFrame-2; iFrame != -1; --iFrame)
     {
-        path[iFrame] = psi[iFrame+1][path[iFrame+1]];
+        path[iFrame] = m_psi[iFrame+1][path[iFrame+1]];
     }
-    
-    // for (size_t iState = 0; iState < nState; ++iState)
-    // {
-    //     // std::cerr << psi[2][iState] << std::endl;
-    // }
-    
+        
     return path;
-}
+}
\ No newline at end of file
--- a/SparseHMM.h	Wed Jul 01 19:13:05 2015 +0100
+++ b/SparseHMM.h	Fri Jul 03 12:22:44 2015 +0100
@@ -15,21 +15,37 @@
 #define _SPARSEHMM_H_
 
 #include <vector>
+#include <queue>
 #include <cstdio>
 
 using std::vector;
+using std::deque;
 using std::pair;
 
 class SparseHMM
 {
 public:
-    virtual const std::vector<double> calculateObsProb(const vector<pair<double, double> >);
-    const std::vector<int> decodeViterbi(std::vector<vector<double> > obs, 
-                                   vector<double> *scale);
-    vector<double> init;
-    vector<size_t> from;
-    vector<size_t> to;
-    vector<double> transProb;
+    SparseHMM();
+    virtual const std::vector<double>
+                           calculateObsProb(const vector<pair<double, double> >);
+    virtual void           build();
+    const std::vector<int> decodeViterbi(std::vector<vector<double> > obs);
+    void                   initialise(vector<double> firstObs);
+    int                    process(vector<double> newObs);
+    vector<int>            finalise();
+    // "sparse" HMM definition
+    int m_nState;
+    int m_nTrans;
+    vector<double> m_init;
+    vector<size_t> m_from;
+    vector<size_t> m_to;
+    vector<double> m_transProb;
+
+    // variables for decoding
+    deque<double> m_scale;
+    deque<vector<int> > m_psi;
+    vector<double> m_delta;
+    vector<double> m_oldDelta;
 };
 
 #endif