matthiasm@32: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ matthiasm@32: matthiasm@32: /* matthiasm@32: pYIN - A fundamental frequency estimator for monophonic audio matthiasm@32: Centre for Digital Music, Queen Mary, University of London. matthiasm@32: matthiasm@32: This program is free software; you can redistribute it and/or matthiasm@32: modify it under the terms of the GNU General Public License as matthiasm@32: published by the Free Software Foundation; either version 2 of the matthiasm@32: License, or (at your option) any later version. See the file matthiasm@32: COLocalCandidatePYING included with this distribution for more information. matthiasm@32: */ matthiasm@32: matthiasm@32: #include "LocalCandidatePYIN.h" mail@132: #include "MonoPitchHMM.h" matthiasm@32: #include "YinUtil.h" matthiasm@32: matthiasm@32: #include "vamp-sdk/FFT.h" matthiasm@32: matthiasm@32: #include matthiasm@32: #include matthiasm@32: matthiasm@32: #include matthiasm@32: #include matthiasm@32: // #include matthiasm@32: #include matthiasm@32: #include Chris@39: #include matthiasm@32: matthiasm@46: #include matthiasm@46: matthiasm@32: using std::string; matthiasm@32: using std::vector; Chris@39: using std::map; matthiasm@32: using Vamp::RealTime; matthiasm@32: matthiasm@32: matthiasm@32: LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) : matthiasm@32: Plugin(inputSampleRate), matthiasm@32: m_channels(0), matthiasm@32: m_stepSize(256), matthiasm@32: m_blockSize(2048), matthiasm@32: m_fmin(40), matthiasm@32: m_fmax(700), matthiasm@32: m_oPitchTrackCandidates(0), matthiasm@32: m_threshDistr(2.0f), matthiasm@32: m_outputUnvoiced(0.0f), matthiasm@70: m_preciseTime(0.0f), matthiasm@32: m_pitchProb(0), matthiasm@32: m_timestamp(0), Chris@136: m_nCandidate(13), Chris@136: m_yinUtil(0) matthiasm@32: { matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::~LocalCandidatePYIN() matthiasm@32: { Chris@136: delete m_yinUtil; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getIdentifier() const matthiasm@32: { matthiasm@32: return "localcandidatepyin"; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getName() const matthiasm@32: { matthiasm@32: return "Local Candidate PYIN"; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getDescription() const matthiasm@32: { matthiasm@32: return "Monophonic pitch and note tracking based on a probabilistic Yin extension."; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getMaker() const matthiasm@32: { matthiasm@32: return "Matthias Mauch"; matthiasm@32: } matthiasm@32: matthiasm@32: int matthiasm@32: LocalCandidatePYIN::getPluginVersion() const matthiasm@32: { matthiasm@32: // Increment this each time you release a version that behaves matthiasm@32: // differently from the previous one Chris@143: return 3; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getCopyright() const matthiasm@32: { matthiasm@32: return "GPL"; matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::InputDomain matthiasm@32: LocalCandidatePYIN::getInputDomain() const matthiasm@32: { matthiasm@32: return TimeDomain; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getPreferredBlockSize() const matthiasm@32: { matthiasm@32: return 2048; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getPreferredStepSize() const matthiasm@32: { matthiasm@32: return 256; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getMinChannelCount() const matthiasm@32: { matthiasm@32: return 1; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getMaxChannelCount() const matthiasm@32: { matthiasm@32: return 1; matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::ParameterList matthiasm@32: LocalCandidatePYIN::getParameterDescriptors() const matthiasm@32: { matthiasm@32: ParameterList list; matthiasm@32: matthiasm@32: ParameterDescriptor d; matthiasm@32: matthiasm@32: d.identifier = "threshdistr"; matthiasm@32: d.name = "Yin threshold distribution"; matthiasm@32: d.description = "."; matthiasm@32: d.unit = ""; matthiasm@32: d.minValue = 0.0f; matthiasm@32: d.maxValue = 7.0f; matthiasm@32: d.defaultValue = 2.0f; matthiasm@32: d.isQuantized = true; matthiasm@32: d.quantizeStep = 1.0f; matthiasm@32: d.valueNames.push_back("Uniform"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.10)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.15)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.20)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.30)"); matthiasm@32: d.valueNames.push_back("Single Value 0.10"); matthiasm@32: d.valueNames.push_back("Single Value 0.15"); matthiasm@32: d.valueNames.push_back("Single Value 0.20"); matthiasm@32: list.push_back(d); matthiasm@32: matthiasm@32: d.identifier = "outputunvoiced"; matthiasm@32: d.valueNames.clear(); matthiasm@32: d.name = "Output estimates classified as unvoiced?"; matthiasm@32: d.description = "."; matthiasm@32: d.unit = ""; matthiasm@32: d.minValue = 0.0f; matthiasm@32: d.maxValue = 2.0f; matthiasm@32: d.defaultValue = 0.0f; matthiasm@32: d.isQuantized = true; matthiasm@32: d.quantizeStep = 1.0f; matthiasm@32: d.valueNames.push_back("No"); matthiasm@32: d.valueNames.push_back("Yes"); matthiasm@32: d.valueNames.push_back("Yes, as negative frequencies"); matthiasm@32: list.push_back(d); matthiasm@32: matthiasm@70: d.identifier = "precisetime"; matthiasm@70: d.valueNames.clear(); matthiasm@70: d.name = "Use non-standard precise YIN timing (slow)."; matthiasm@70: d.description = "."; matthiasm@70: d.unit = ""; matthiasm@70: d.minValue = 0.0f; matthiasm@70: d.maxValue = 1.0f; matthiasm@70: d.defaultValue = 0.0f; matthiasm@70: d.isQuantized = true; matthiasm@70: d.quantizeStep = 1.0f; matthiasm@70: list.push_back(d); matthiasm@70: matthiasm@32: return list; matthiasm@32: } matthiasm@32: matthiasm@32: float matthiasm@32: LocalCandidatePYIN::getParameter(string identifier) const matthiasm@32: { matthiasm@32: if (identifier == "threshdistr") { matthiasm@32: return m_threshDistr; matthiasm@32: } matthiasm@32: if (identifier == "outputunvoiced") { matthiasm@32: return m_outputUnvoiced; matthiasm@32: } matthiasm@70: if (identifier == "precisetime") { matthiasm@70: return m_preciseTime; matthiasm@70: } matthiasm@32: return 0.f; matthiasm@32: } matthiasm@32: matthiasm@32: void matthiasm@32: LocalCandidatePYIN::setParameter(string identifier, float value) matthiasm@32: { matthiasm@32: if (identifier == "threshdistr") matthiasm@32: { matthiasm@32: m_threshDistr = value; matthiasm@32: } matthiasm@32: if (identifier == "outputunvoiced") matthiasm@32: { matthiasm@32: m_outputUnvoiced = value; matthiasm@32: } matthiasm@70: if (identifier == "precisetime") matthiasm@70: { matthiasm@70: m_preciseTime = value; matthiasm@70: } matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::ProgramList matthiasm@32: LocalCandidatePYIN::getPrograms() const matthiasm@32: { matthiasm@32: ProgramList list; matthiasm@32: return list; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getCurrentProgram() const matthiasm@32: { matthiasm@32: return ""; // no programs matthiasm@32: } matthiasm@32: matthiasm@32: void Chris@138: LocalCandidatePYIN::selectProgram(string) matthiasm@32: { matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::OutputList matthiasm@32: LocalCandidatePYIN::getOutputDescriptors() const matthiasm@32: { matthiasm@32: OutputList outputs; matthiasm@32: matthiasm@32: OutputDescriptor d; matthiasm@32: matthiasm@32: d.identifier = "pitchtrackcandidates"; matthiasm@32: d.name = "Pitch track candidates"; matthiasm@32: d.description = "Multiple candidate pitch tracks."; matthiasm@32: d.unit = "Hz"; matthiasm@32: d.hasFixedBinCount = false; matthiasm@32: d.hasKnownExtents = true; matthiasm@32: d.minValue = m_fmin; Chris@39: d.maxValue = 500; //!!!??? matthiasm@32: d.isQuantized = false; matthiasm@32: d.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@32: d.sampleRate = (m_inputSampleRate / m_stepSize); matthiasm@32: d.hasDuration = false; matthiasm@32: outputs.push_back(d); matthiasm@32: matthiasm@32: return outputs; matthiasm@32: } matthiasm@32: matthiasm@32: bool matthiasm@32: LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize) matthiasm@32: { matthiasm@32: if (channels < getMinChannelCount() || matthiasm@32: channels > getMaxChannelCount()) return false; matthiasm@32: matthiasm@32: /* matthiasm@32: std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels matthiasm@32: << ", stepSize = " << stepSize << ", blockSize = " << blockSize matthiasm@32: << std::endl; matthiasm@32: */ matthiasm@32: m_channels = channels; matthiasm@32: m_stepSize = stepSize; matthiasm@32: m_blockSize = blockSize; Chris@136: Chris@136: m_yinUtil = new YinUtil(m_blockSize/2); matthiasm@32: matthiasm@32: reset(); matthiasm@32: matthiasm@32: return true; matthiasm@32: } matthiasm@32: matthiasm@32: void matthiasm@32: LocalCandidatePYIN::reset() matthiasm@32: { matthiasm@32: m_pitchProb.clear(); matthiasm@32: m_timestamp.clear(); matthiasm@32: /* matthiasm@32: std::cerr << "LocalCandidatePYIN::reset" matthiasm@32: << ", blockSize = " << m_blockSize matthiasm@32: << std::endl; matthiasm@32: */ matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::FeatureSet matthiasm@32: LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp) matthiasm@32: { matthiasm@77: int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4; matthiasm@77: timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate)); matthiasm@32: matthiasm@32: double *dInputBuffers = new double[m_blockSize]; matthiasm@32: for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i]; matthiasm@32: matthiasm@32: size_t yinBufferSize = m_blockSize/2; matthiasm@32: double* yinBuffer = new double[yinBufferSize]; Chris@136: if (!m_preciseTime) m_yinUtil->fastDifference(dInputBuffers, yinBuffer); Chris@136: else m_yinUtil->slowDifference(dInputBuffers, yinBuffer); matthiasm@32: matthiasm@32: delete [] dInputBuffers; matthiasm@32: Chris@136: m_yinUtil->cumulativeDifference(yinBuffer); matthiasm@32: matthiasm@46: float minFrequency = 60; matthiasm@46: float maxFrequency = 900; Chris@136: vector peakProbability = m_yinUtil->yinProb(yinBuffer, Chris@136: m_threshDistr, Chris@136: m_inputSampleRate/maxFrequency, Chris@136: m_inputSampleRate/minFrequency); matthiasm@46: matthiasm@46: vector > tempPitchProb; matthiasm@46: for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf) matthiasm@32: { matthiasm@46: if (peakProbability[iBuf] > 0) matthiasm@32: { matthiasm@46: double currentF0 = matthiasm@46: m_inputSampleRate * (1.0 / Chris@136: m_yinUtil->parabolicInterpolation(yinBuffer, iBuf)); matthiasm@46: double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69; matthiasm@46: tempPitchProb.push_back(pair(tempPitch, peakProbability[iBuf])); matthiasm@32: } matthiasm@32: } matthiasm@46: m_pitchProb.push_back(tempPitchProb); matthiasm@32: m_timestamp.push_back(timestamp); matthiasm@32: matthiasm@76: delete[] yinBuffer; matthiasm@76: Chris@39: return FeatureSet(); matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::FeatureSet matthiasm@32: LocalCandidatePYIN::getRemainingFeatures() matthiasm@32: { Chris@39: // timestamp -> candidate number -> value Chris@39: map > featureValues; matthiasm@32: matthiasm@37: // std::cerr << "in remaining features" << std::endl; matthiasm@32: matthiasm@32: if (m_pitchProb.empty()) { Chris@39: return FeatureSet(); matthiasm@32: } matthiasm@32: matthiasm@32: // MONO-PITCH STUFF mail@132: MonoPitchHMM hmm(0); matthiasm@32: size_t nFrame = m_timestamp.size(); matthiasm@32: vector > pitchTracks; matthiasm@32: vector freqSum = vector(m_nCandidate); matthiasm@32: vector freqNumber = vector(m_nCandidate); matthiasm@32: vector freqMean = vector(m_nCandidate); matthiasm@44: matthiasm@46: boost::math::normal normalDist(0, 8); // semitones sd matthiasm@46: float maxNormalDist = boost::math::pdf(normalDist, 0); matthiasm@46: matthiasm@110: // Viterbi-decode multiple times with different frequencies emphasised matthiasm@32: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) matthiasm@32: { matthiasm@32: pitchTracks.push_back(vector(nFrame)); mail@132: vector > tempPitchProb; mail@132: vector > tempObsProb; matthiasm@46: float centrePitch = 45 + 3 * iCandidate; matthiasm@109: matthiasm@46: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) { matthiasm@46: float sumProb = 0; matthiasm@46: float pitch = 0; matthiasm@46: float prob = 0; matthiasm@109: for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb) matthiasm@109: { matthiasm@109: pitch = m_pitchProb[iFrame][iProb].first; matthiasm@109: prob = m_pitchProb[iFrame][iProb].second * matthiasm@109: boost::math::pdf(normalDist, pitch-centrePitch) / matthiasm@109: maxNormalDist * 2; matthiasm@46: sumProb += prob; mail@132: tempPitchProb.push_back( matthiasm@109: pair(pitch,prob)); matthiasm@46: } matthiasm@109: for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb) matthiasm@109: { mail@132: tempPitchProb[iProb].second /= sumProb; matthiasm@46: } mail@132: tempObsProb.push_back(hmm.calculateObsProb(tempPitchProb)); matthiasm@46: } matthiasm@109: mail@132: vector rawPitchPath = hmm.decodeViterbi(tempObsProb); mail@132: vector mpOut; mail@132: mail@132: for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) mail@132: { mail@132: float freq = hmm.nearestFreq(rawPitchPath[iFrame], Chris@141: m_pitchProb[iFrame]); mail@132: mpOut.push_back(freq); // for note processing below mail@132: } mail@132: Chris@141: for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame) matthiasm@32: { matthiasm@32: if (mpOut[iFrame] > 0) { matthiasm@109: matthiasm@32: pitchTracks[iCandidate][iFrame] = mpOut[iFrame]; matthiasm@32: freqSum[iCandidate] += mpOut[iFrame]; matthiasm@32: freqNumber[iCandidate]++; matthiasm@32: } matthiasm@32: } matthiasm@32: freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate]; matthiasm@32: } matthiasm@32: matthiasm@37: // find near duplicate pitch tracks matthiasm@34: vector duplicates; matthiasm@34: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) { matthiasm@34: for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) { matthiasm@34: size_t countEqual = 0; matthiasm@34: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) matthiasm@34: { matthiasm@46: if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) || matthiasm@46: fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01) matthiasm@34: countEqual++; matthiasm@34: } matthiasm@46: // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl; matthiasm@34: if (countEqual * 1.0 / nFrame > 0.8) { matthiasm@34: if (freqNumber[iCandidate] > freqNumber[jCandidate]) { matthiasm@34: duplicates.push_back(jCandidate); matthiasm@46: } else if (iCandidate < jCandidate) { matthiasm@34: duplicates.push_back(iCandidate); matthiasm@34: } matthiasm@34: } matthiasm@34: } matthiasm@34: } matthiasm@34: matthiasm@37: // now find non-duplicate pitch tracks Chris@39: map candidateActuals; Chris@39: map candidateLabels; Chris@39: matthiasm@46: vector > outputFrequencies; matthiasm@60: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector()); matthiasm@46: matthiasm@32: int actualCandidateNumber = 0; matthiasm@110: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) matthiasm@110: { matthiasm@34: bool isDuplicate = false; matthiasm@34: for (size_t i = 0; i < duplicates.size(); ++i) { matthiasm@110: matthiasm@34: if (duplicates[i] == iCandidate) { matthiasm@34: isDuplicate = true; matthiasm@34: break; matthiasm@34: } matthiasm@34: } matthiasm@46: if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame) matthiasm@32: { matthiasm@32: std::ostringstream convert; matthiasm@32: convert << actualCandidateNumber++; Chris@39: candidateLabels[iCandidate] = convert.str(); Chris@39: candidateActuals[iCandidate] = actualCandidateNumber; matthiasm@46: // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl; matthiasm@32: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) matthiasm@32: { matthiasm@32: if (pitchTracks[iCandidate][iFrame] > 0) matthiasm@32: { matthiasm@46: // featureValues[m_timestamp[iFrame]][iCandidate] = matthiasm@46: // pitchTracks[iCandidate][iFrame]; matthiasm@46: outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]); matthiasm@60: } else { matthiasm@60: outputFrequencies[iFrame].push_back(0); matthiasm@32: } matthiasm@32: } matthiasm@32: } matthiasm@43: // fs[m_oPitchTrackCandidates].push_back(f); matthiasm@32: } matthiasm@32: Chris@39: // adapt our features so as to return a stack of candidate values Chris@39: // per frame Chris@39: Chris@39: FeatureSet fs; Chris@39: matthiasm@46: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){ Chris@39: Feature f; Chris@39: f.hasTimestamp = true; matthiasm@46: f.timestamp = m_timestamp[iFrame]; matthiasm@46: f.values = outputFrequencies[iFrame]; Chris@39: fs[0].push_back(f); Chris@39: } matthiasm@46: matthiasm@46: // I stopped using Chris's map stuff below because I couldn't get my head around it matthiasm@46: // matthiasm@46: // for (map >::const_iterator i = matthiasm@46: // featureValues.begin(); i != featureValues.end(); ++i) { matthiasm@46: // Feature f; matthiasm@46: // f.hasTimestamp = true; matthiasm@46: // f.timestamp = i->first; matthiasm@46: // int nextCandidate = candidateActuals.begin()->second; matthiasm@46: // for (map::const_iterator j = matthiasm@46: // i->second.begin(); j != i->second.end(); ++j) { matthiasm@46: // while (candidateActuals[j->first] > nextCandidate) { matthiasm@46: // f.values.push_back(0); matthiasm@46: // ++nextCandidate; matthiasm@46: // } matthiasm@46: // f.values.push_back(j->second); matthiasm@46: // nextCandidate = j->first + 1; matthiasm@46: // } matthiasm@46: // //!!! can't use labels? matthiasm@46: // fs[0].push_back(f); matthiasm@46: // } matthiasm@32: matthiasm@32: return fs; matthiasm@32: }