view src/TuningDifference.cpp @ 69:c0b78dcc08e6 feature-alternatives tip

Some experimental feature adjustments - try using diffs between chroma cells (no better at first glance)
author Chris Cannam
date Fri, 16 Aug 2019 12:28:58 +0100
parents 7bdcc40deebe
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  Centre for Digital Music, Queen Mary University of London.
    
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#include "TuningDifference.h"

#include <iostream>

#include <cmath>
#include <cstdio>
#include <climits>

#include <algorithm>
#include <numeric>

using namespace std;

static double pitchToFrequency(int pitch,
			       double centsOffset = 0.,
			       double concertA = 440.)
{
    double p = double(pitch) + (centsOffset / 100.);
    return concertA * pow(2.0, (p - 69.0) / 12.0); 
}

static double frequencyForCentsAbove440(double cents)
{
    return pitchToFrequency(69, cents, 440.);
}

static float defaultMaxDuration = 0.f;
static int defaultMaxSemis = 5;
static bool defaultFineTuning = true;

TuningDifference::TuningDifference(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_channelCount(0),
    m_bpo(120),
    m_blockSize(0),
    m_frameCount(0),
    m_maxDuration(defaultMaxDuration),
    m_maxSemis(defaultMaxSemis),
    m_fineTuning(defaultFineTuning)
{
}

TuningDifference::~TuningDifference()
{
}

string
TuningDifference::getIdentifier() const
{
    return "tuning-difference";
}

string
TuningDifference::getName() const
{
    return "Tuning Difference";
}

string
TuningDifference::getDescription() const
{
    return "Estimate the tuning frequencies of a set of recordings at once, by comparing them to a reference recording of the same music whose tuning frequency is known";
}

string
TuningDifference::getMaker() const
{
    return "Chris Cannam";
}

int
TuningDifference::getPluginVersion() const
{
    // Increment this each time you release a version that behaves
    // differently from the previous one
    return 3;
}

string
TuningDifference::getCopyright() const
{
    // This function is not ideally named.  It does not necessarily
    // need to say who made the plugin -- getMaker does that -- but it
    // should indicate the terms under which it is distributed.  For
    // example, "Copyright (year). All Rights Reserved", or "GPL"
    return "GPL";
}

TuningDifference::InputDomain
TuningDifference::getInputDomain() const
{
    return TimeDomain;
}

size_t
TuningDifference::getPreferredBlockSize() const
{
    return 0;
}

size_t 
TuningDifference::getPreferredStepSize() const
{
    return 0;
}

size_t
TuningDifference::getMinChannelCount() const
{
    return 2;
}

size_t
TuningDifference::getMaxChannelCount() const
{
    return 1000;
}

TuningDifference::ParameterList
TuningDifference::getParameterDescriptors() const
{
    ParameterList list;

    ParameterDescriptor desc;

    desc.identifier = "maxduration";
    desc.name = "Maximum duration to analyse";
    desc.description = "The maximum duration (in seconds) to consider from either input file, always taken from the start of the input. Zero means there is no limit.";
    desc.minValue = 0;
    desc.maxValue = 3600;
    desc.defaultValue = defaultMaxDuration;
    desc.isQuantized = false;
    desc.unit = "s";
    list.push_back(desc);
    
    desc.identifier = "maxrange";
    desc.name = "Maximum range in semitones";
    desc.description = "The maximum difference in semitones that will be searched.";
    desc.minValue = 1;
    desc.maxValue = 11;
    desc.defaultValue = defaultMaxSemis;
    desc.isQuantized = true;
    desc.quantizeStep = 1;
    desc.unit = "semitones";
    list.push_back(desc);
    
    desc.identifier = "finetuning";
    desc.name = "Fine tuning";
    desc.description = "Use a fine tuning stage to increase nominal resolution from 10 cents to 1 cent.";
    desc.minValue = 0;
    desc.maxValue = 1;
    desc.defaultValue = (defaultFineTuning ? 1.f : 0.f);
    desc.isQuantized = true;
    desc.quantizeStep = 1;
    desc.unit = "";
    list.push_back(desc);

    return list;
}

float
TuningDifference::getParameter(string id) const
{
    if (id == "maxduration") {
        return m_maxDuration;
    } else if (id == "maxrange") {
        return float(m_maxSemis);
    } else if (id == "finetuning") {
        return m_fineTuning ? 1.f : 0.f;
    }
    return 0;
}

void
TuningDifference::setParameter(string id, float value) 
{
    if (id == "maxduration") {
        m_maxDuration = value;
    } else if (id == "maxrange") {
        m_maxSemis = int(roundf(value));
    } else if (id == "finetuning") {
        m_fineTuning = (value > 0.5f);
    }
}

TuningDifference::ProgramList
TuningDifference::getPrograms() const
{
    ProgramList list;
    return list;
}

string
TuningDifference::getCurrentProgram() const
{
    return ""; // no programs
}

void
TuningDifference::selectProgram(string)
{
}

TuningDifference::OutputList
TuningDifference::getOutputDescriptors() const
{
    OutputList list;

    OutputDescriptor d;
    d.identifier = "cents";
    d.name = "Tuning Differences";
    d.description = "A single feature vector containing a value for each input channel after the first (reference) channel, containing the difference in averaged frequency profile between that channel and the reference channel, in cents. A positive value means the corresponding channel is higher than the reference.";
    d.unit = "cents";
    d.hasFixedBinCount = true;
    if (m_channelCount > 1) {
        d.binCount = m_channelCount - 1;
    } else {
        d.binCount = 1;
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.hasDuration = false;
    m_outputs[d.identifier] = int(list.size());
    list.push_back(d);

    d.identifier = "tuningfreq";
    d.name = "Relative Tuning Frequencies";
    d.description = "A single feature vector containing a value for each input channel after the first (reference) channel, containing the tuning frequency of that channel, if the reference channel is assumed to contain the same music as it at a tuning frequency of A=440Hz.";
    d.unit = "hz";
    d.hasFixedBinCount = true;
    if (m_channelCount > 1) {
        d.binCount = m_channelCount - 1;
    } else {
        d.binCount = 1;
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.hasDuration = false;
    m_outputs[d.identifier] = int(list.size());
    list.push_back(d);

    d.identifier = "reffeature";
    d.name = "Reference Feature";
    d.description = "Chroma feature from reference channel.";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = m_bpo;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = 1;
    d.hasDuration = false;
    m_outputs[d.identifier] = int(list.size());
    list.push_back(d);

    d.identifier = "otherfeature";
    d.name = "Other Features";
    d.description = "Series of chroma feature vectors from the non-reference audio channels, before rotation.";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = m_bpo;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = 1;
    d.hasDuration = false;
    m_outputs[d.identifier] = int(list.size());
    list.push_back(d);

    d.identifier = "rotfeature";
    d.name = "Other Features at Rotated Frequency";
    d.description = "Series of chroma feature vectors from the non-reference audio channels, calculated with the tuning frequency obtained from rotation matching. Note that this does not take into account any fine tuning, only the basic rotation match.";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = m_bpo;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = 1;
    d.hasDuration = false;
    m_outputs[d.identifier] = int(list.size());
    list.push_back(d);

    return list;
}

bool
TuningDifference::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount()) return false;
    if (stepSize != blockSize) return false;
    if (m_blockSize > INT_MAX) return false;

    m_channelCount = int(channels);
    m_blockSize = int(blockSize);

    reset();
    
    return true;
}

void
TuningDifference::reset()
{
    Chromagram::Parameters params(paramsForTuningFrequency(440.));
    m_reference.clear();
    m_refChroma.reset(new Chromagram(params));
    m_refTotals = TFeature(m_bpo, 0.0);
    m_refFeatures.clear();
    m_otherChroma.clear();
    for (int i = 1; i < m_channelCount; ++i) {
        m_otherChroma.push_back(std::make_shared<Chromagram>(params));
    }
    m_otherTotals = vector<TFeature>(m_channelCount-1, TFeature(m_bpo, 0.0));
    m_frameCount = 0;
}

template<typename T>
void addTo(vector<T> &a, const vector<T> &b)
{
    int n = int(b.size());

    for (int i = 0; i < n; ++i) {
        int j = (i == 0 ? n-1 : i-1);
        T diff = b[i] - b[j];
        a[i] += diff;
    }
}

template<typename T>
T distance(const vector<T> &a, const vector<T> &b)
{
    return inner_product(a.begin(), a.end(), b.begin(), T(),
			 plus<T>(), [](T x, T y) { return fabs(x - y); });
}

TuningDifference::TFeature
TuningDifference::computeFeatureFromTotals(const TFeature &totals) const
{
    if (m_frameCount == 0) return totals;
    
    TFeature feature(m_bpo);
    double max = 0.0;

    for (int i = 0; i < m_bpo; ++i) {
	double value = totals[i] / m_frameCount;
	feature[i] = value;
        if (fabs(value) > max) {
            max = fabs(value);
        }
    }

    if (max > 0.0) {
        for (int i = 0; i < m_bpo; ++i) {
            feature[i] /= max;
        }
    }
    
    return feature;
}

Chromagram::Parameters
TuningDifference::paramsForTuningFrequency(double hz) const
{
    Chromagram::Parameters params(m_inputSampleRate);
    params.lowestOctave = 2;
    params.octaveCount = 4;
    params.binsPerOctave = m_bpo;
    params.tuningFrequency = hz;
    params.atomHopFactor = 0.5;
    params.window = CQParameters::Hann;
    return params;
}

TuningDifference::TFeature
TuningDifference::computeFeatureFromSignal(const Signal &signal,
                                           double hz) const
{
    Chromagram chromagram(paramsForTuningFrequency(hz));

    TFeature totals(m_bpo, 0.0);

    cerr << "computeFeatureFromSignal: hz = " << hz << ", frame count = " << m_frameCount << endl;
    
    for (int i = 0; i < m_frameCount; ++i) {
	Signal::const_iterator first = signal.begin() + i * m_blockSize;
	Signal::const_iterator last = first + m_blockSize;
	if (last > signal.end()) last = signal.end();
	CQBase::RealSequence input(first, last);
	input.resize(m_blockSize);
	CQBase::RealBlock block = chromagram.process(input);
	for (const auto &v: block) addTo(totals, v);
    }

    return computeFeatureFromTotals(totals);
}

TuningDifference::FeatureSet
TuningDifference::process(const float *const *inputBuffers, Vamp::RealTime)
{
    if (m_maxDuration > 0) {
        int maxFrames = int((m_maxDuration * m_inputSampleRate) /
                            float(m_blockSize));
        if (m_frameCount > maxFrames) return FeatureSet();
    }

    CQBase::RealBlock block;
    CQBase::RealSequence input;

    input = CQBase::RealSequence
        (inputBuffers[0], inputBuffers[0] + m_blockSize);
    block = m_refChroma->process(input);
    for (const auto &v: block) addTo(m_refTotals, v);

    if (m_fineTuning) {
        m_reference.insert(m_reference.end(),
                           inputBuffers[0],
                           inputBuffers[0] + m_blockSize);
    }

    for (int c = 1; c < m_channelCount; ++c) {
        input = CQBase::RealSequence
            (inputBuffers[c], inputBuffers[c] + m_blockSize);
        block = m_otherChroma[c-1]->process(input);
        for (const auto &v: block) addTo(m_otherTotals[c-1], v);
    }
    
    ++m_frameCount;
    return FeatureSet();
}

TuningDifference::FeatureSet
TuningDifference::getRemainingFeatures()
{
    FeatureSet fs;
    if (m_frameCount == 0) return fs;

    m_refFeatures[0] = computeFeatureFromTotals(m_refTotals);

    Feature f;
    f.hasTimestamp = true;
    f.timestamp = Vamp::RealTime::zeroTime;
    f.values.clear();
    fs[m_outputs["cents"]].push_back(f);
    fs[m_outputs["tuningfreq"]].push_back(f);

    for (int c = 1; c < m_channelCount; ++c) {
        getRemainingFeaturesForChannel(c, fs);
    }

    return fs;
}

void
TuningDifference::getRemainingFeaturesForChannel(int channel,
                                                 FeatureSet &fs)
{
    TFeature otherFeature =
        computeFeatureFromTotals(m_otherTotals[channel-1]);

    Feature f;
    f.hasTimestamp = true;
    f.timestamp = Vamp::RealTime::zeroTime;

    f.values.clear();
    for (auto v: m_refFeatures[0]) f.values.push_back(float(v));
    fs[m_outputs["reffeature"]].push_back(f);

    f.values.clear();
    for (auto v: otherFeature) f.values.push_back(float(v));
    fs[m_outputs["otherfeature"]].push_back(f); 
   
    int rotation = findBestRotation(m_refFeatures[0], otherFeature);

    int coarseCents = -(rotation * 1200) / m_bpo;

    cerr << "channel " << channel << ": rotation " << rotation << " -> cents " << coarseCents << endl;

    TFeature rotatedFeature = otherFeature;
    if (rotation != 0) {
        rotateFeature(rotatedFeature, rotation);
    }

    f.values.clear();
    for (auto v: rotatedFeature) f.values.push_back(float(v));
    fs[m_outputs["rotfeature"]].push_back(f);

    if (m_fineTuning) {
    
        pair<int, double> fine =
            findFineFrequency(rotatedFeature, coarseCents);

        int fineCents = fine.first;
        double fineHz = fine.second;

        fs[m_outputs["cents"]][0].values.push_back(float(fineCents));
        fs[m_outputs["tuningfreq"]][0].values.push_back(float(fineHz));
    
        cerr << "channel " << channel << ": overall best Hz = " << fineHz << endl;

    } else {

        fs[m_outputs["cents"]][0].values.push_back(float(coarseCents));
        fs[m_outputs["tuningfreq"]][0].values.push_back
            (float(frequencyForCentsAbove440(coarseCents)));
    }        
}

void
TuningDifference::rotateFeature(TFeature &r, int rotation) const
{
    if (rotation < 0) {
        rotate(r.begin(), r.begin() - rotation, r.end());
    } else {
        rotate(r.begin(), r.end() - rotation, r.end());
    }
}

double
TuningDifference::featureDistance(const TFeature &ref,
                                  const TFeature &other,
                                  int rotation) const
{
    if (rotation == 0) {
	return distance(ref, other);
    } else {
	// A positive rotation pushes the tuning frequency up for this
	// chroma, negative one pulls it down. If a positive rotation
	// makes this chroma match an un-rotated reference, then this
	// chroma must have initially been lower than the reference.
	TFeature r(other);
        rotateFeature(r, rotation);
	return distance(ref, r);
    }
}

int
TuningDifference::findBestRotation(const TFeature &ref,
                                   const TFeature &other) const
{
    map<double, int> dists;

    int maxRotation = (m_bpo * m_maxSemis) / 12;

    for (int r = -maxRotation; r <= maxRotation; ++r) {
	double dist = featureDistance(ref, other, r);
	dists[dist] = r;
    }

    int best = dists.begin()->second;

    return best;
}

pair<int, double>
TuningDifference::findFineFrequency(const TFeature &rotatedOtherFeature,
                                    int coarseCents)
{
    int coarseResolution = 1200 / m_bpo;
    int searchDistance = coarseResolution/2 - 1;

    int bestCents = coarseCents;
    double bestHz = frequencyForCentsAbove440(coarseCents);

    cerr << "findFineFrequency: coarse frequency is " << bestHz << endl;
    cerr << "searchDistance = " << searchDistance << endl;

    double bestScore = 0;
    bool firstScore = true;
    
    for (int sign = -1; sign <= 1; sign += 2) {
	for (int offset = (sign < 0 ? 0 : 1);
             offset <= searchDistance;
             ++offset) {

	    int fineCents = coarseCents + sign * offset;
	    double fineHz = frequencyForCentsAbove440(fineCents);

	    cerr << "trying with fineCents = " << fineCents << "..." << endl;

            // compare the rotated "other" chroma with a reference
            // chroma shifted by the offset in the opposite direction

            int compensatingCents = -sign * offset;
            TFeature compensatedReference;

            if (m_refFeatures.find(compensatingCents) == m_refFeatures.end()) {
                double compensatingHz = frequencyForCentsAbove440
                    (compensatingCents);
            
                compensatedReference = computeFeatureFromSignal
                    (m_reference, compensatingHz);

                m_refFeatures[compensatingCents] = compensatedReference;

            } else {

                compensatedReference = m_refFeatures[compensatingCents];
            }

	    double fineScore = featureDistance(compensatedReference,
                                               rotatedOtherFeature,
                                               0); // we are rotated already

	    cerr << "fine offset = " << offset << ", cents = " << fineCents
		 << ", Hz = " << fineHz << ", score " << fineScore
		 << " (best score so far " << bestScore << ")" << endl;
	    
	    if ((fineScore < bestScore) || firstScore) {
		cerr << "is good!" << endl;
		bestScore = fineScore;
		bestCents = fineCents;
		bestHz = fineHz;
                firstScore = false;
	    } else {
		break;
	    }
	}
    }
    
    return pair<int, double>(bestCents, bestHz);
}