matthiasm@32: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ matthiasm@32: matthiasm@32: /* matthiasm@32: pYIN - A fundamental frequency estimator for monophonic audio matthiasm@32: Centre for Digital Music, Queen Mary, University of London. matthiasm@32: matthiasm@32: This program is free software; you can redistribute it and/or matthiasm@32: modify it under the terms of the GNU General Public License as matthiasm@32: published by the Free Software Foundation; either version 2 of the matthiasm@32: License, or (at your option) any later version. See the file matthiasm@32: COLocalCandidatePYING included with this distribution for more information. matthiasm@32: */ matthiasm@32: matthiasm@32: #include "LocalCandidatePYIN.h" matthiasm@32: #include "MonoPitch.h" matthiasm@32: #include "YinUtil.h" matthiasm@32: matthiasm@32: #include "vamp-sdk/FFT.h" matthiasm@32: matthiasm@32: #include matthiasm@32: #include matthiasm@32: matthiasm@32: #include matthiasm@32: #include matthiasm@32: // #include matthiasm@32: #include matthiasm@32: #include Chris@39: #include matthiasm@32: matthiasm@46: #include matthiasm@46: matthiasm@32: using std::string; matthiasm@32: using std::vector; Chris@39: using std::map; matthiasm@32: using Vamp::RealTime; matthiasm@32: matthiasm@32: matthiasm@32: LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) : matthiasm@32: Plugin(inputSampleRate), matthiasm@32: m_channels(0), matthiasm@32: m_stepSize(256), matthiasm@32: m_blockSize(2048), matthiasm@32: m_fmin(40), matthiasm@32: m_fmax(700), matthiasm@32: m_yin(2048, inputSampleRate, 0.0), matthiasm@32: m_oPitchTrackCandidates(0), matthiasm@32: m_threshDistr(2.0f), matthiasm@32: m_outputUnvoiced(0.0f), matthiasm@32: m_pitchProb(0), matthiasm@32: m_timestamp(0), matthiasm@48: m_nCandidate(13) matthiasm@32: { matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::~LocalCandidatePYIN() matthiasm@32: { matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getIdentifier() const matthiasm@32: { matthiasm@32: return "localcandidatepyin"; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getName() const matthiasm@32: { matthiasm@32: return "Local Candidate PYIN"; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getDescription() const matthiasm@32: { matthiasm@32: return "Monophonic pitch and note tracking based on a probabilistic Yin extension."; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getMaker() const matthiasm@32: { matthiasm@32: return "Matthias Mauch"; matthiasm@32: } matthiasm@32: matthiasm@32: int matthiasm@32: LocalCandidatePYIN::getPluginVersion() const matthiasm@32: { matthiasm@32: // Increment this each time you release a version that behaves matthiasm@32: // differently from the previous one matthiasm@32: return 1; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getCopyright() const matthiasm@32: { matthiasm@32: return "GPL"; matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::InputDomain matthiasm@32: LocalCandidatePYIN::getInputDomain() const matthiasm@32: { matthiasm@32: return TimeDomain; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getPreferredBlockSize() const matthiasm@32: { matthiasm@32: return 2048; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getPreferredStepSize() const matthiasm@32: { matthiasm@32: return 256; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getMinChannelCount() const matthiasm@32: { matthiasm@32: return 1; matthiasm@32: } matthiasm@32: matthiasm@32: size_t matthiasm@32: LocalCandidatePYIN::getMaxChannelCount() const matthiasm@32: { matthiasm@32: return 1; matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::ParameterList matthiasm@32: LocalCandidatePYIN::getParameterDescriptors() const matthiasm@32: { matthiasm@32: ParameterList list; matthiasm@32: matthiasm@32: ParameterDescriptor d; matthiasm@32: matthiasm@32: d.identifier = "threshdistr"; matthiasm@32: d.name = "Yin threshold distribution"; matthiasm@32: d.description = "."; matthiasm@32: d.unit = ""; matthiasm@32: d.minValue = 0.0f; matthiasm@32: d.maxValue = 7.0f; matthiasm@32: d.defaultValue = 2.0f; matthiasm@32: d.isQuantized = true; matthiasm@32: d.quantizeStep = 1.0f; matthiasm@32: d.valueNames.push_back("Uniform"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.10)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.15)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.20)"); matthiasm@32: d.valueNames.push_back("Beta (mean 0.30)"); matthiasm@32: d.valueNames.push_back("Single Value 0.10"); matthiasm@32: d.valueNames.push_back("Single Value 0.15"); matthiasm@32: d.valueNames.push_back("Single Value 0.20"); matthiasm@32: list.push_back(d); matthiasm@32: matthiasm@32: d.identifier = "outputunvoiced"; matthiasm@32: d.valueNames.clear(); matthiasm@32: d.name = "Output estimates classified as unvoiced?"; matthiasm@32: d.description = "."; matthiasm@32: d.unit = ""; matthiasm@32: d.minValue = 0.0f; matthiasm@32: d.maxValue = 2.0f; matthiasm@32: d.defaultValue = 0.0f; matthiasm@32: d.isQuantized = true; matthiasm@32: d.quantizeStep = 1.0f; matthiasm@32: d.valueNames.push_back("No"); matthiasm@32: d.valueNames.push_back("Yes"); matthiasm@32: d.valueNames.push_back("Yes, as negative frequencies"); matthiasm@32: list.push_back(d); matthiasm@32: matthiasm@32: return list; matthiasm@32: } matthiasm@32: matthiasm@32: float matthiasm@32: LocalCandidatePYIN::getParameter(string identifier) const matthiasm@32: { matthiasm@32: if (identifier == "threshdistr") { matthiasm@32: return m_threshDistr; matthiasm@32: } matthiasm@32: if (identifier == "outputunvoiced") { matthiasm@32: return m_outputUnvoiced; matthiasm@32: } matthiasm@32: return 0.f; matthiasm@32: } matthiasm@32: matthiasm@32: void matthiasm@32: LocalCandidatePYIN::setParameter(string identifier, float value) matthiasm@32: { matthiasm@32: if (identifier == "threshdistr") matthiasm@32: { matthiasm@32: m_threshDistr = value; matthiasm@32: } matthiasm@32: if (identifier == "outputunvoiced") matthiasm@32: { matthiasm@32: m_outputUnvoiced = value; matthiasm@32: } matthiasm@32: matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::ProgramList matthiasm@32: LocalCandidatePYIN::getPrograms() const matthiasm@32: { matthiasm@32: ProgramList list; matthiasm@32: return list; matthiasm@32: } matthiasm@32: matthiasm@32: string matthiasm@32: LocalCandidatePYIN::getCurrentProgram() const matthiasm@32: { matthiasm@32: return ""; // no programs matthiasm@32: } matthiasm@32: matthiasm@32: void matthiasm@32: LocalCandidatePYIN::selectProgram(string name) matthiasm@32: { matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::OutputList matthiasm@32: LocalCandidatePYIN::getOutputDescriptors() const matthiasm@32: { matthiasm@32: OutputList outputs; matthiasm@32: matthiasm@32: OutputDescriptor d; matthiasm@32: matthiasm@32: int outputNumber = 0; matthiasm@32: matthiasm@32: d.identifier = "pitchtrackcandidates"; matthiasm@32: d.name = "Pitch track candidates"; matthiasm@32: d.description = "Multiple candidate pitch tracks."; matthiasm@32: d.unit = "Hz"; matthiasm@32: d.hasFixedBinCount = false; matthiasm@32: d.hasKnownExtents = true; matthiasm@32: d.minValue = m_fmin; Chris@39: d.maxValue = 500; //!!!??? matthiasm@32: d.isQuantized = false; matthiasm@32: d.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@32: d.sampleRate = (m_inputSampleRate / m_stepSize); matthiasm@32: d.hasDuration = false; matthiasm@32: outputs.push_back(d); matthiasm@32: matthiasm@32: return outputs; matthiasm@32: } matthiasm@32: matthiasm@32: bool matthiasm@32: LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize) matthiasm@32: { matthiasm@32: if (channels < getMinChannelCount() || matthiasm@32: channels > getMaxChannelCount()) return false; matthiasm@32: matthiasm@32: /* matthiasm@32: std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels matthiasm@32: << ", stepSize = " << stepSize << ", blockSize = " << blockSize matthiasm@32: << std::endl; matthiasm@32: */ matthiasm@32: m_channels = channels; matthiasm@32: m_stepSize = stepSize; matthiasm@32: m_blockSize = blockSize; matthiasm@32: matthiasm@32: reset(); matthiasm@32: matthiasm@32: return true; matthiasm@32: } matthiasm@32: matthiasm@32: void matthiasm@32: LocalCandidatePYIN::reset() matthiasm@32: { matthiasm@32: m_yin.setThresholdDistr(m_threshDistr); matthiasm@32: m_yin.setFrameSize(m_blockSize); matthiasm@32: matthiasm@32: m_pitchProb.clear(); matthiasm@32: m_timestamp.clear(); matthiasm@32: /* matthiasm@32: std::cerr << "LocalCandidatePYIN::reset" matthiasm@32: << ", blockSize = " << m_blockSize matthiasm@32: << std::endl; matthiasm@32: */ matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::FeatureSet matthiasm@32: LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp) matthiasm@32: { matthiasm@60: timestamp = timestamp + Vamp::RealTime::frame2RealTime(m_blockSize/2, lrintf(m_inputSampleRate)); matthiasm@32: matthiasm@32: double *dInputBuffers = new double[m_blockSize]; matthiasm@32: for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i]; matthiasm@32: matthiasm@32: size_t yinBufferSize = m_blockSize/2; matthiasm@32: double* yinBuffer = new double[yinBufferSize]; matthiasm@60: YinUtil::slowDifference(dInputBuffers, yinBuffer, yinBufferSize); matthiasm@32: matthiasm@32: delete [] dInputBuffers; matthiasm@32: matthiasm@32: YinUtil::cumulativeDifference(yinBuffer, yinBufferSize); matthiasm@32: matthiasm@46: float minFrequency = 60; matthiasm@46: float maxFrequency = 900; matthiasm@46: vector peakProbability = YinUtil::yinProb(yinBuffer, matthiasm@46: m_threshDistr, matthiasm@46: yinBufferSize, matthiasm@46: m_inputSampleRate/maxFrequency, matthiasm@46: m_inputSampleRate/minFrequency); matthiasm@46: matthiasm@46: vector > tempPitchProb; matthiasm@46: for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf) matthiasm@32: { matthiasm@46: if (peakProbability[iBuf] > 0) matthiasm@32: { matthiasm@46: double currentF0 = matthiasm@46: m_inputSampleRate * (1.0 / matthiasm@46: YinUtil::parabolicInterpolation(yinBuffer, iBuf, yinBufferSize)); matthiasm@46: double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69; matthiasm@46: if (tempPitch != tempPitch) std::cerr << "AAAAAAAAA! " << currentF0 << " " << (m_inputSampleRate * 1.0 / iBuf) << std::endl; matthiasm@46: tempPitchProb.push_back(pair(tempPitch, peakProbability[iBuf])); matthiasm@32: } matthiasm@32: } matthiasm@46: m_pitchProb.push_back(tempPitchProb); matthiasm@32: m_timestamp.push_back(timestamp); matthiasm@32: Chris@39: return FeatureSet(); matthiasm@32: } matthiasm@32: matthiasm@32: LocalCandidatePYIN::FeatureSet matthiasm@32: LocalCandidatePYIN::getRemainingFeatures() matthiasm@32: { Chris@39: // timestamp -> candidate number -> value Chris@39: map > featureValues; matthiasm@32: matthiasm@37: // std::cerr << "in remaining features" << std::endl; matthiasm@32: matthiasm@32: if (m_pitchProb.empty()) { Chris@39: return FeatureSet(); matthiasm@32: } matthiasm@32: matthiasm@32: // MONO-PITCH STUFF matthiasm@32: MonoPitch mp; matthiasm@32: size_t nFrame = m_timestamp.size(); matthiasm@32: vector > pitchTracks; matthiasm@32: vector freqSum = vector(m_nCandidate); matthiasm@32: vector freqNumber = vector(m_nCandidate); matthiasm@32: vector freqMean = vector(m_nCandidate); matthiasm@44: matthiasm@46: boost::math::normal normalDist(0, 8); // semitones sd matthiasm@46: float maxNormalDist = boost::math::pdf(normalDist, 0); matthiasm@46: matthiasm@32: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) matthiasm@32: { matthiasm@32: pitchTracks.push_back(vector(nFrame)); matthiasm@46: vector > > tempPitchProb; matthiasm@46: float centrePitch = 45 + 3 * iCandidate; matthiasm@46: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) { matthiasm@60: tempPitchProb.push_back(vector >()); matthiasm@46: float sumProb = 0; matthiasm@46: float pitch = 0; matthiasm@46: float prob = 0; matthiasm@46: for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb) { matthiasm@46: pitch = m_pitchProb[iFrame][iProb].first; matthiasm@46: // std::cerr << pitch << " " << m_pitchProb[iFrame][iProb].second << std::endl; matthiasm@48: prob = m_pitchProb[iFrame][iProb].second * boost::math::pdf(normalDist, pitch-centrePitch) / maxNormalDist * 2; matthiasm@46: sumProb += prob; matthiasm@46: tempPitchProb[iFrame].push_back(pair(pitch,prob)); matthiasm@46: // std::cerr << m_timestamp[iFrame] << " " << iCandidate << " " << centrePitch << " " << pitch << " " << prob << std::endl; matthiasm@46: } matthiasm@46: for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb) { matthiasm@46: tempPitchProb[iFrame][iProb].second /= sumProb; matthiasm@46: } matthiasm@46: } matthiasm@46: vector mpOut = mp.process(tempPitchProb); matthiasm@44: float prevFreq = 0; matthiasm@32: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) matthiasm@32: { matthiasm@32: if (mpOut[iFrame] > 0) { matthiasm@46: // if (prevFreq>0 && fabs(log2(mpOut[iFrame]/prevFreq)) > 0.1) { matthiasm@46: // for (int jFrame = iFrame; jFrame != -1; --jFrame) { matthiasm@46: // // hack: setting all freqs to 0 -- will be eliminated later matthiasm@46: // pitchTracks[iCandidate][jFrame] = 0; matthiasm@46: // } matthiasm@46: // break; matthiasm@46: // } matthiasm@32: pitchTracks[iCandidate][iFrame] = mpOut[iFrame]; matthiasm@32: freqSum[iCandidate] += mpOut[iFrame]; matthiasm@32: freqNumber[iCandidate]++; matthiasm@44: prevFreq = mpOut[iFrame]; matthiasm@32: } matthiasm@32: } matthiasm@32: freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate]; matthiasm@32: } matthiasm@32: matthiasm@37: // find near duplicate pitch tracks matthiasm@34: vector duplicates; matthiasm@34: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) { matthiasm@34: for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) { matthiasm@34: size_t countEqual = 0; matthiasm@34: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) matthiasm@34: { matthiasm@46: if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) || matthiasm@46: fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01) matthiasm@34: countEqual++; matthiasm@34: } matthiasm@46: // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl; matthiasm@34: if (countEqual * 1.0 / nFrame > 0.8) { matthiasm@34: if (freqNumber[iCandidate] > freqNumber[jCandidate]) { matthiasm@34: duplicates.push_back(jCandidate); matthiasm@46: } else if (iCandidate < jCandidate) { matthiasm@34: duplicates.push_back(iCandidate); matthiasm@34: } matthiasm@34: } matthiasm@34: } matthiasm@34: } matthiasm@34: matthiasm@37: // now find non-duplicate pitch tracks Chris@39: map candidateActuals; Chris@39: map candidateLabels; Chris@39: matthiasm@46: vector > outputFrequencies; matthiasm@60: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector()); matthiasm@46: matthiasm@32: int actualCandidateNumber = 0; matthiasm@32: for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) { matthiasm@34: bool isDuplicate = false; matthiasm@34: for (size_t i = 0; i < duplicates.size(); ++i) { matthiasm@37: // std::cerr << duplicates[i] << std::endl; matthiasm@34: if (duplicates[i] == iCandidate) { matthiasm@34: isDuplicate = true; matthiasm@34: break; matthiasm@34: } matthiasm@34: } matthiasm@46: if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame) matthiasm@32: { matthiasm@32: std::ostringstream convert; matthiasm@32: convert << actualCandidateNumber++; Chris@39: candidateLabels[iCandidate] = convert.str(); Chris@39: candidateActuals[iCandidate] = actualCandidateNumber; matthiasm@46: // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl; matthiasm@32: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) matthiasm@32: { matthiasm@32: if (pitchTracks[iCandidate][iFrame] > 0) matthiasm@32: { matthiasm@46: // featureValues[m_timestamp[iFrame]][iCandidate] = matthiasm@46: // pitchTracks[iCandidate][iFrame]; matthiasm@46: outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]); matthiasm@60: } else { matthiasm@60: outputFrequencies[iFrame].push_back(0); matthiasm@32: } matthiasm@32: } matthiasm@32: } matthiasm@43: // fs[m_oPitchTrackCandidates].push_back(f); matthiasm@32: } matthiasm@32: Chris@39: // adapt our features so as to return a stack of candidate values Chris@39: // per frame Chris@39: Chris@39: FeatureSet fs; Chris@39: matthiasm@46: for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){ Chris@39: Feature f; Chris@39: f.hasTimestamp = true; matthiasm@46: f.timestamp = m_timestamp[iFrame]; matthiasm@46: f.values = outputFrequencies[iFrame]; Chris@39: fs[0].push_back(f); Chris@39: } matthiasm@46: matthiasm@46: // I stopped using Chris's map stuff below because I couldn't get my head around it matthiasm@46: // matthiasm@46: // for (map >::const_iterator i = matthiasm@46: // featureValues.begin(); i != featureValues.end(); ++i) { matthiasm@46: // Feature f; matthiasm@46: // f.hasTimestamp = true; matthiasm@46: // f.timestamp = i->first; matthiasm@46: // int nextCandidate = candidateActuals.begin()->second; matthiasm@46: // for (map::const_iterator j = matthiasm@46: // i->second.begin(); j != i->second.end(); ++j) { matthiasm@46: // while (candidateActuals[j->first] > nextCandidate) { matthiasm@46: // f.values.push_back(0); matthiasm@46: // ++nextCandidate; matthiasm@46: // } matthiasm@46: // f.values.push_back(j->second); matthiasm@46: // nextCandidate = j->first + 1; matthiasm@46: // } matthiasm@46: // //!!! can't use labels? matthiasm@46: // fs[0].push_back(f); matthiasm@46: // } matthiasm@32: matthiasm@32: return fs; matthiasm@32: }