# HG changeset patch # User Chris Cannam # Date 1495029010 -3600 # Node ID 8404827a4b028b30d57ac12fbee476eb901ec7d1 # Parent 0432723faf031101a3d05942b9c6d0328bc5ba86 Avoid calculating a temporary obsprob matrix for note tracking; + some tidying diff -r 0432723faf03 -r 8404827a4b02 MonoNote.cpp --- a/MonoNote.cpp Fri Mar 24 15:35:32 2017 +0000 +++ b/MonoNote.cpp Wed May 17 14:50:10 2017 +0100 @@ -33,16 +33,37 @@ const vector MonoNote::process(const vector > > pitchProb) { - vector > obsProb; - for (size_t iFrame = 0; iFrame < pitchProb.size(); ++iFrame) - { - obsProb.push_back(hmm.calculateObsProb(pitchProb[iFrame])); + // Previously, this built up a single matrix of probabilities, by + // calling calculateObsProb to get a column for each frame in + // pitchProb. + // + // The number of distinct states depends on MonoNoteParameters, + // but the defaults have 3 states per pitch, 3 pitches per MIDI + // note, and 69 MIDI notes, giving 621 states per frame. With a + // frame step size of 256 at 44100Hz sample rate, a 3-minute song + // has about 30K frames leading to a 20 million element + // probability matrix. + // + // Since the matrix is very sparse, we can avoid some of this by + // feeding the (sparse implementation of) HMM one column at a + // time. 
+ + vector path; + + if (!pitchProb.empty()) { + + hmm.initialise(hmm.calculateObsProb(pitchProb[0])); + + for (size_t iFrame = 1; iFrame < pitchProb.size(); ++iFrame) + { + hmm.process(hmm.calculateObsProb(pitchProb[iFrame])); + } + + path = hmm.track(); } vector out; - - vector path = hmm.decodeViterbi(obsProb); - + for (size_t iFrame = 0; iFrame < path.size(); ++iFrame) { double currPitch = -1; @@ -53,5 +74,6 @@ out.push_back(FrameOutput(iFrame, currPitch, stateKind)); } + return(out); } diff -r 0432723faf03 -r 8404827a4b02 MonoNoteHMM.cpp --- a/MonoNoteHMM.cpp Fri Mar 24 15:35:32 2017 +0000 +++ b/MonoNoteHMM.cpp Wed May 17 14:50:10 2017 +0100 @@ -92,7 +92,7 @@ out[i] = (1-pIsPitched) / (par.nPPS * par.nS); } } - + return(out); } diff -r 0432723faf03 -r 8404827a4b02 PYinVamp.cpp --- a/PYinVamp.cpp Fri Mar 24 15:35:32 2017 +0000 +++ b/PYinVamp.cpp Wed May 17 14:50:10 2017 +0100 @@ -322,7 +322,6 @@ d.description = "Estimated fundamental frequency candidates."; d.unit = "Hz"; d.hasFixedBinCount = false; - // d.binCount = 1; d.hasKnownExtents = true; d.minValue = m_fmin; d.maxValue = 500; @@ -335,10 +334,9 @@ d.identifier = "f0probs"; d.name = "Candidate Probabilities"; - d.description = "Probabilities of estimated fundamental frequency candidates."; + d.description = "Probabilities of estimated fundamental frequency candidates."; d.unit = ""; d.hasFixedBinCount = false; - // d.binCount = 1; d.hasKnownExtents = true; d.minValue = 0; d.maxValue = 1; @@ -382,13 +380,11 @@ d.identifier = "smoothedpitchtrack"; d.name = "Smoothed Pitch Track"; - d.description = "."; + d.description = "Frame-by-frame pitch estimate after smoothing"; d.unit = "Hz"; d.hasFixedBinCount = true; d.binCount = 1; d.hasKnownExtents = false; - // d.minValue = 0; - // d.maxValue = 1; d.isQuantized = false; d.sampleType = OutputDescriptor::FixedSampleRate; d.sampleRate = (m_inputSampleRate / m_stepSize); @@ -399,7 +395,6 @@ d.identifier = "notes"; d.name = "Notes"; d.description = "Derived 
fixed-pitch note frequencies"; - // d.unit = "MIDI unit"; d.unit = "Hz"; d.hasFixedBinCount = true; d.binCount = 1; @@ -420,11 +415,6 @@ if (channels < getMinChannelCount() || channels > getMaxChannelCount()) return false; -/* - std::cerr << "PYinVamp::initialise: channels = " << channels - << ", stepSize = " << stepSize << ", blockSize = " << blockSize - << std::endl; -*/ m_channels = channels; m_stepSize = stepSize; m_blockSize = blockSize; @@ -448,17 +438,11 @@ m_timestamp.clear(); m_level.clear(); m_pitchTrack.clear(); -/* - std::cerr << "PYinVamp::reset" - << ", blockSize = " << m_blockSize - << std::endl; -*/ } PYinVamp::FeatureSet PYinVamp::process(const float *const *inputBuffers, RealTime timestamp) { -// std::cerr << timestamp << std::endl; int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4; timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate)); @@ -542,7 +526,6 @@ } } - // F0 CANDIDATES Feature f; f.hasTimestamp = true; @@ -584,16 +567,20 @@ PYinVamp::getRemainingFeatures() { FeatureSet fs; - Feature f; - f.hasTimestamp = true; - f.hasDuration = false; - + if (m_pitchProb.empty()) { return fs; } + Feature f; + f.hasTimestamp = true; + f.hasDuration = false; + // ================== P I T C H T R A C K ================================= + // NB we do this even in fixed-lag mode, as we still have the last + // lag's-worth of pitch probs to consume + vector rawPitchPath = m_pitchHmm.track(); for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) @@ -601,10 +588,10 @@ float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame], m_pitchProb[iFrame]); m_pitchTrack.push_back(freq); // for note processing below - + f.timestamp = m_timestamp[iFrame]; f.values.clear(); - + // different output modes if (freq < 0 && (m_outputUnvoiced==0)) continue; if (m_outputUnvoiced == 1) @@ -615,9 +602,15 @@ } fs[m_oSmoothedPitchTrack].push_back(f); } - - // ======================== N O T E S 
====================================== - MonoNote mn; + + addNoteFeatures(fs); + + return fs; +} + +void +PYinVamp::addNoteFeatures(FeatureSet &fs) +{ std::vector > > smoothedPitch; for (size_t iFrame = 0; iFrame < m_pitchTrack.size(); ++iFrame) { std::vector > temp; @@ -626,17 +619,18 @@ double tempPitch = 12 * std::log(m_pitchTrack[iFrame]/440)/std::log(2.) + 69; temp.push_back(std::pair(tempPitch, .9)); - // std::cerr << "tempPitch: " << tempPitch << std::endl; } - // std::cerr << "temp size: " << temp.size() << std::endl; smoothedPitch.push_back(temp); } + MonoNote mn; vector mnOut = mn.process(smoothedPitch); + std::cerr << "mnOut size: " << mnOut.size() << std::endl; std::cerr << "m_pitchTrack size: " << m_pitchTrack.size() << std::endl; // turning feature into a note feature + Feature f; f.hasTimestamp = true; f.hasDuration = true; f.values.clear(); @@ -702,5 +696,4 @@ } oldIsVoiced = isVoiced; } - return fs; } diff -r 0432723faf03 -r 8404827a4b02 PYinVamp.h --- a/PYinVamp.h Fri Mar 24 15:35:32 2017 +0000 +++ b/PYinVamp.h Wed May 17 14:50:10 2017 +0100 @@ -86,9 +86,7 @@ vector m_level; vector m_pitchTrack; - // for note writing - // vector m_notePitchTrack; // contains pitches of one current note - // bool m_oldIsVoiced; + void addNoteFeatures(FeatureSet &fs); }; #endif diff -r 0432723faf03 -r 8404827a4b02 SparseHMM.cpp --- a/SparseHMM.cpp Fri Mar 24 15:35:32 2017 +0000 +++ b/SparseHMM.cpp Wed May 17 14:50:10 2017 +0100 @@ -35,13 +35,6 @@ } -vector -SparseHMM::calculateObsProb(const vector > ) -{ - // dummy (virtual?) 
implementation to be overloaded - return(vector()); -} - void SparseHMM::build() { } @@ -126,7 +119,6 @@ } m_psi.push_back(tempPsi); - double deltasum = 0; for (int jState = 0; jState < m_nState; ++jState) { diff -r 0432723faf03 -r 8404827a4b02 SparseHMM.h --- a/SparseHMM.h Fri Mar 24 15:35:32 2017 +0000 +++ b/SparseHMM.h Wed May 17 14:50:10 2017 +0100 @@ -25,15 +25,18 @@ class SparseHMM { public: - SparseHMM(int fixedLag); - virtual std::vector - calculateObsProb(const vector >); + SparseHMM(int fixedLag); // set fixedLag == 0 when doing full Viterbi + + virtual std::vector calculateObsProb + (const vector >) = 0; + virtual void build(); std::vector decodeViterbi(std::vector > obs); void reset(); void initialise(vector firstObs); int process(vector newObs); vector track(); + // "sparse" HMM definition int m_fixedLag; int m_nState;