view LocalCandidatePYIN.cpp @ 137:109c3a2ad930 vamp-fft-revision

Make use of new Vamp FFT interface. This reduces the runtime of the regression test from 5.7 to 2.2 seconds on this machine, but it does need the right version of the SDK, which is currently only available in the vampipe branch.
author Chris Cannam
date Fri, 19 Aug 2016 13:26:40 +0100
parents 7cbf40306c10
children d71170f5ba76
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    pYIN - A fundamental frequency estimator for monophonic audio
    Centre for Digital Music, Queen Mary, University of London.
    
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COLocalCandidatePYING included with this distribution for more information.
*/

#include "LocalCandidatePYIN.h"
#include "MonoPitch.h"
#include "YinUtil.h"

#include "vamp-sdk/FFT.h"

#include <vector>
#include <algorithm>

#include <cstdio>
#include <sstream>
// #include <iostream>
#include <cmath>
#include <complex>
#include <map>

#include <boost/math/distributions.hpp>

using std::string;
using std::vector;
using std::map;
using Vamp::RealTime;


LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_channels(0),
    m_stepSize(256),
    m_blockSize(2048),
    m_fmin(40),
    m_fmax(700),
    m_oPitchTrackCandidates(0),
    m_threshDistr(2.0f),
    m_outputUnvoiced(0.0f),
    m_preciseTime(0.0f),
    m_pitchProb(0),
    m_timestamp(0),
    m_nCandidate(13),
    m_yinUtil(0)
{
}

LocalCandidatePYIN::~LocalCandidatePYIN()
{
    delete m_yinUtil;
}

string
LocalCandidatePYIN::getIdentifier() const
{
    return "localcandidatepyin";
}

string
LocalCandidatePYIN::getName() const
{
    return "Local Candidate PYIN";
}

string
LocalCandidatePYIN::getDescription() const
{
    return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
}

string
LocalCandidatePYIN::getMaker() const
{
    return "Matthias Mauch";
}

int
LocalCandidatePYIN::getPluginVersion() const
{
    // Increment this each time you release a version that behaves
    // differently from the previous one
    return 2;
}

string
LocalCandidatePYIN::getCopyright() const
{
    return "GPL";
}

LocalCandidatePYIN::InputDomain
LocalCandidatePYIN::getInputDomain() const
{
    return TimeDomain;
}

size_t
LocalCandidatePYIN::getPreferredBlockSize() const
{
    return 2048;
}

size_t 
LocalCandidatePYIN::getPreferredStepSize() const
{
    return 256;
}

size_t
LocalCandidatePYIN::getMinChannelCount() const
{
    return 1;
}

size_t
LocalCandidatePYIN::getMaxChannelCount() const
{
    return 1;
}

LocalCandidatePYIN::ParameterList
LocalCandidatePYIN::getParameterDescriptors() const
{
    ParameterList list;
    
    ParameterDescriptor d;

    d.identifier = "threshdistr";
    d.name = "Yin threshold distribution";
    d.description = ".";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 7.0f;
    d.defaultValue = 2.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    d.valueNames.push_back("Uniform");
    d.valueNames.push_back("Beta (mean 0.10)");
    d.valueNames.push_back("Beta (mean 0.15)");
    d.valueNames.push_back("Beta (mean 0.20)");
    d.valueNames.push_back("Beta (mean 0.30)");
    d.valueNames.push_back("Single Value 0.10");
    d.valueNames.push_back("Single Value 0.15");
    d.valueNames.push_back("Single Value 0.20");
    list.push_back(d);

    d.identifier = "outputunvoiced";
    d.valueNames.clear();
    d.name = "Output estimates classified as unvoiced?";
    d.description = ".";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 2.0f;
    d.defaultValue = 0.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    d.valueNames.push_back("No");
    d.valueNames.push_back("Yes");
    d.valueNames.push_back("Yes, as negative frequencies");
    list.push_back(d);

    d.identifier = "precisetime";
    d.valueNames.clear();
    d.name = "Use non-standard precise YIN timing (slow).";
    d.description = ".";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 1.0f;
    d.defaultValue = 0.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    list.push_back(d);

    return list;
}

float
LocalCandidatePYIN::getParameter(string identifier) const
{
    if (identifier == "threshdistr") {
            return m_threshDistr;
    }
    if (identifier == "outputunvoiced") {
            return m_outputUnvoiced;
    }
    if (identifier == "precisetime") {
            return m_preciseTime;
    }
    return 0.f;
}

void
LocalCandidatePYIN::setParameter(string identifier, float value) 
{
    if (identifier == "threshdistr")
    {
        m_threshDistr = value;
    }
    if (identifier == "outputunvoiced")
    {
        m_outputUnvoiced = value;
    }
    if (identifier == "precisetime")
    {
        m_preciseTime = value;
    }
}

LocalCandidatePYIN::ProgramList
LocalCandidatePYIN::getPrograms() const
{
    ProgramList list;
    return list;
}

string
LocalCandidatePYIN::getCurrentProgram() const
{
    return ""; // no programs
}

void
LocalCandidatePYIN::selectProgram(string name)
{
}

LocalCandidatePYIN::OutputList
LocalCandidatePYIN::getOutputDescriptors() const
{
    OutputList outputs;

    OutputDescriptor d;

    d.identifier = "pitchtrackcandidates";
    d.name = "Pitch track candidates";
    d.description = "Multiple candidate pitch tracks.";
    d.unit = "Hz";
    d.hasFixedBinCount = false;
    d.hasKnownExtents = true;
    d.minValue = m_fmin;
    d.maxValue = 500; //!!!???
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);

    return outputs;
}

bool
LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount() ||
	channels > getMaxChannelCount()) return false;

/*
    std::cerr << "LocalCandidatePYIN::initialise: channels = " << channels
          << ", stepSize = " << stepSize << ", blockSize = " << blockSize
          << std::endl;
*/
    m_channels = channels;
    m_stepSize = stepSize;
    m_blockSize = blockSize;

    m_yinUtil = new YinUtil(m_blockSize/2);
    
    reset();

    return true;
}

void
LocalCandidatePYIN::reset()
{    
    m_pitchProb.clear();
    m_timestamp.clear();
/*    
    std::cerr << "LocalCandidatePYIN::reset"
          << ", blockSize = " << m_blockSize
          << std::endl;
*/
}

LocalCandidatePYIN::FeatureSet
LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp)
{
    int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
    timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
    
    double *dInputBuffers = new double[m_blockSize];
    for (size_t i = 0; i < m_blockSize; ++i) dInputBuffers[i] = inputBuffers[0][i];
    
    size_t yinBufferSize = m_blockSize/2;
    double* yinBuffer = new double[yinBufferSize];
    if (!m_preciseTime) m_yinUtil->fastDifference(dInputBuffers, yinBuffer);
    else m_yinUtil->slowDifference(dInputBuffers, yinBuffer);    
    
    delete [] dInputBuffers;

    m_yinUtil->cumulativeDifference(yinBuffer);
    
    float minFrequency = 60;
    float maxFrequency = 900;
    vector<double> peakProbability = m_yinUtil->yinProb(yinBuffer, 
                                                        m_threshDistr, 
                                                        m_inputSampleRate/maxFrequency, 
                                                        m_inputSampleRate/minFrequency);

    vector<pair<double, double> > tempPitchProb;
    for (size_t iBuf = 0; iBuf < yinBufferSize; ++iBuf)
    {
        if (peakProbability[iBuf] > 0)
        {
            double currentF0 = 
                m_inputSampleRate * (1.0 /
                m_yinUtil->parabolicInterpolation(yinBuffer, iBuf));
            double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69;
            tempPitchProb.push_back(pair<double, double>(tempPitch, peakProbability[iBuf]));
        }
    }
    m_pitchProb.push_back(tempPitchProb);
    m_timestamp.push_back(timestamp);

    delete[] yinBuffer;

    return FeatureSet();
}

LocalCandidatePYIN::FeatureSet
LocalCandidatePYIN::getRemainingFeatures()
{
    // timestamp -> candidate number -> value
    map<RealTime, map<int, float> > featureValues;

    // std::cerr << "in remaining features" << std::endl;

    if (m_pitchProb.empty()) {
        return FeatureSet();
    }

    // MONO-PITCH STUFF
    MonoPitch mp;
    size_t nFrame = m_timestamp.size();
    vector<vector<float> > pitchTracks;
    vector<float> freqSum = vector<float>(m_nCandidate);
    vector<float> freqNumber = vector<float>(m_nCandidate);
    vector<float> freqMean = vector<float>(m_nCandidate);
    
    boost::math::normal normalDist(0, 8); // semitones sd
    float maxNormalDist = boost::math::pdf(normalDist, 0);
    
    // Viterbi-decode multiple times with different frequencies emphasised
    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
    {
        pitchTracks.push_back(vector<float>(nFrame));
        vector<vector<pair<double,double> > > tempPitchProb;
        float centrePitch = 45 + 3 * iCandidate;

        for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) {
            tempPitchProb.push_back(vector<pair<double,double> >());
            float sumProb = 0;
            float pitch = 0;
            float prob = 0;
            for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
            {
                pitch = m_pitchProb[iFrame][iProb].first;                
                prob  = m_pitchProb[iFrame][iProb].second * 
                    boost::math::pdf(normalDist, pitch-centrePitch) /
                    maxNormalDist * 2;
                sumProb += prob;
                tempPitchProb[iFrame].push_back(
                    pair<double,double>(pitch,prob));
            }
            for (size_t iProb = 0; iProb < m_pitchProb[iFrame].size(); ++iProb)
            {
                tempPitchProb[iFrame][iProb].second /= sumProb;
            }
        }

        vector<float> mpOut = mp.process(tempPitchProb);
        float prevFreq = 0;
        for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
        {
            if (mpOut[iFrame] > 0) {

                pitchTracks[iCandidate][iFrame] = mpOut[iFrame];
                freqSum[iCandidate] += mpOut[iFrame];
                freqNumber[iCandidate]++;
                prevFreq = mpOut[iFrame];

            }
        }
        freqMean[iCandidate] = freqSum[iCandidate]*1.0/freqNumber[iCandidate];
    }

    // find near duplicate pitch tracks
    vector<size_t> duplicates;
    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) {
        for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) {
            size_t countEqual = 0;
            for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) 
            {
                if ((pitchTracks[jCandidate][iFrame] == 0 && pitchTracks[iCandidate][iFrame] == 0) ||
                fabs(pitchTracks[iCandidate][iFrame]/pitchTracks[jCandidate][iFrame]-1)<0.01)
                countEqual++;
            }
            // std::cerr << "proportion equal: " << (countEqual * 1.0 / nFrame) << std::endl;    
            if (countEqual * 1.0 / nFrame > 0.8) {
                if (freqNumber[iCandidate] > freqNumber[jCandidate]) {
                    duplicates.push_back(jCandidate);
                } else if (iCandidate < jCandidate) {
                    duplicates.push_back(iCandidate);
                }
            }
        }
    }

    // now find non-duplicate pitch tracks
    map<int, int> candidateActuals;
    map<int, std::string> candidateLabels;

    vector<vector<float> > outputFrequencies;
    for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) outputFrequencies.push_back(vector<float>());

    int actualCandidateNumber = 0;
    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
    {
        bool isDuplicate = false;
        for (size_t i = 0; i < duplicates.size(); ++i) {
    
            if (duplicates[i] == iCandidate) {
                isDuplicate = true;
                break;
            }
        }
        if (!isDuplicate && freqNumber[iCandidate] > 0.5*nFrame)
        {
            std::ostringstream convert;
            convert << actualCandidateNumber++;
            candidateLabels[iCandidate] = convert.str();
            candidateActuals[iCandidate] = actualCandidateNumber;
            // std::cerr << iCandidate << " " << actualCandidateNumber << " " << freqNumber[iCandidate] << " " << freqMean[iCandidate] << std::endl;
            for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) 
            {
                if (pitchTracks[iCandidate][iFrame] > 0)
                {
                    // featureValues[m_timestamp[iFrame]][iCandidate] = 
                    //     pitchTracks[iCandidate][iFrame];
                    outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]);
                } else {
                    outputFrequencies[iFrame].push_back(0);
                }
            }
        }
        // fs[m_oPitchTrackCandidates].push_back(f);
    }

    // adapt our features so as to return a stack of candidate values
    // per frame

    FeatureSet fs;

    for (size_t iFrame = 0; iFrame < nFrame; ++iFrame){
        Feature f;
        f.hasTimestamp = true;
        f.timestamp = m_timestamp[iFrame];
        f.values = outputFrequencies[iFrame];
        fs[0].push_back(f);
    }
    
    // I stopped using Chris's map stuff below because I couldn't get my head around it
    //
    // for (map<RealTime, map<int, float> >::const_iterator i =
    //          featureValues.begin(); i != featureValues.end(); ++i) {
    //     Feature f;
    //     f.hasTimestamp = true;
    //     f.timestamp = i->first;
    //     int nextCandidate = candidateActuals.begin()->second;
    //     for (map<int, float>::const_iterator j = 
    //              i->second.begin(); j != i->second.end(); ++j) {
    //         while (candidateActuals[j->first] > nextCandidate) {
    //             f.values.push_back(0);
    //             ++nextCandidate;
    //         }
    //         f.values.push_back(j->second);
    //         nextCandidate = j->first + 1;
    //     }
    //     //!!! can't use labels?
    //     fs[0].push_back(f);
    // }

    return fs;
}