Mercurial > hg > silvet

/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  Silvet

  A Vamp plugin for note transcription.
  Centre for Digital Music, Queen Mary University of London.

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#include "Silvet.h"
#include "EM.h"

#include "maths/MedianFilter.h"
#include "maths/MathUtilities.h"
#include "dsp/rateconversion/Resampler.h"

#include "constant-q-cpp/cpp-qm-dsp/CQInterpolated.h"

#include <vector>

#include <cstdio>

using std::vector;
using std::cout;
using std::cerr;
using std::endl;
using Vamp::RealTime;

static int processingSampleRate = 44100;
static int processingBPO = 60;
static int processingHeight = 545;
static int processingNotes = 88;

Silvet::Silvet(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_resampler(0),
    m_cq(0)
{
}

Silvet::~Silvet()
{
    delete m_resampler;
    delete m_cq;
    for (int i = 0; i < (int)m_postFilter.size(); ++i) {
        delete m_postFilter[i];
    }
}

string
Silvet::getIdentifier() const
{
    return "silvet";
}

string
Silvet::getName() const
{
    return "Silvet Note Transcription";
}

string
Silvet::getDescription() const
{
    // Return something helpful here!
    return "";
}

string
Silvet::getMaker() const
{
    // Your name here
    return "";
}

int
Silvet::getPluginVersion() const
{
    return 1;
}

string
Silvet::getCopyright() const
{
    // This function is not ideally named.  It does not necessarily
    // need to say who made the plugin -- getMaker does that -- but it
    // should indicate the terms under which it is distributed.  For
    // example, "Copyright (year). All Rights Reserved", or "GPL"
    return "";
}

Silvet::InputDomain
Silvet::getInputDomain() const
{
    return TimeDomain;
}

size_t
Silvet::getPreferredBlockSize() const
{
    return 0;
}

size_t
Silvet::getPreferredStepSize() const
{
    return 0;
}

size_t
Silvet::getMinChannelCount() const
{
    return 1;
}

size_t
Silvet::getMaxChannelCount() const
{
    return 1;
}

Silvet::ParameterList
Silvet::getParameterDescriptors() const
{
    ParameterList list;
    return list;
}

float
Silvet::getParameter(string identifier) const
{
    return 0;
}

void
Silvet::setParameter(string identifier, float value)
{
}

Silvet::ProgramList
Silvet::getPrograms() const
{
    ProgramList list;
    return list;
}

string
Silvet::getCurrentProgram() const
{
    return "";
}

void
Silvet::selectProgram(string name)
{
}

Silvet::OutputList
Silvet::getOutputDescriptors() const
{
    OutputList list;

    OutputDescriptor d;
    d.identifier = "notes";
    d.name = "Note transcription";
    d.description = "Overall note transcription across all instruments";
    d.unit = "Hz";
    d.hasFixedBinCount = true;
    d.binCount = 2;
    d.binNames.push_back("Frequency");
    d.binNames.push_back("Velocity");
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.sampleRate = m_inputSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
    d.hasDuration = true;
    m_notesOutputNo = list.size();
    list.push_back(d);

    d.identifier = "cq";
    d.name = "Raw constant-Q";
    d.description = "Unfiltered constant-Q time-frequency distribution";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = processingHeight + 55;
    d.binNames.clear();
    if (m_cq) {
        char name[20];
        for (int i = 0; i < processingHeight + 55; ++i) {
            float freq = m_cq->getBinFrequency(i);
            sprintf(name, "%.1f Hz", freq);
            d.binNames.push_back(name);
        }
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = m_inputSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
    d.hasDuration = false;
    m_cqOutputNo = list.size();
    list.push_back(d);

    d.identifier = "inputgrid";
    d.name = "Filtered constant-Q";
    d.description = "Filtered constant-Q time-frequency distribution used as input to the PLCA step";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = processingHeight;
    d.binNames.clear();
    if (m_cq) {
        char name[20];
        for (int i = 0; i < processingHeight; ++i) {
            float freq = m_cq->getBinFrequency(i + 55);
            sprintf(name, "%.1f Hz", freq);
            d.binNames.push_back(name);
        }
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = 25;
    d.hasDuration = false;
    m_fcqOutputNo = list.size();
    list.push_back(d);

    d.identifier = "pitches";
    d.name = "Pitch activation";
    d.description = "Estimated pitch activation matrix";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = processingNotes;
    d.binNames.clear();
    for (int i = 0; i < processingNotes; ++i) {
        d.binNames.push_back(noteName(i));
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = 25;
    d.hasDuration = false;
    m_pitchOutputNo = list.size();
    list.push_back(d);

    return list;
}

std::string
Silvet::noteName(int i) const
{
    static const char *names[] = {
        "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
    };

    const char *n = names[i % 12];

    int oct = (i + 9) / 12;

    char buf[20];
    sprintf(buf, "%s%d", n, oct);

    return buf;
}

float
Silvet::noteFrequency(int note) const
{
    return float(27.5 * pow(2.0, note / 12.0));
}

bool
Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount() ||
	channels > getMaxChannelCount()) return false;

    if (stepSize != blockSize) {
	cerr << "Silvet::initialise: Step size must be the same as block size ("
	     << stepSize << " != " << blockSize << ")" << endl;
	return false;
    }

    m_blockSize = blockSize;

    reset();

    return true;
}

void
Silvet::reset()
{
    delete m_resampler;
    delete m_cq;

    if (m_inputSampleRate != processingSampleRate) {
	m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
    } else {
	m_resampler = 0;
    }

    m_cq = new CQInterpolated
	(processingSampleRate, 27.5, processingSampleRate / 3, processingBPO,
         CQInterpolated::Linear);

    for (int i = 0; i < (int)m_postFilter.size(); ++i) {
        delete m_postFilter[i];
    }
    m_postFilter.clear();
    for (int i = 0; i < processingNotes; ++i) {
        m_postFilter.push_back(new MedianFilter<double>(3));
    }
    m_pianoRoll.clear();
    m_columnCount = 0;
    m_reducedColumnCount = 0;
    m_startTime = RealTime::zeroTime;
}

Silvet::FeatureSet
Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
{
    if (m_columnCount == 0) {
        m_startTime = timestamp;
    }

    vector<double> data;
    for (int i = 0; i < m_blockSize; ++i) {
        data.push_back(inputBuffers[0][i]);
    }

    if (m_resampler) {
	data = m_resampler->process(data.data(), data.size());
    }

    Grid cqout = m_cq->process(data);
    FeatureSet fs = transcribe(cqout);

    for (int i = 0; i < (int)cqout.size(); ++i) {
        Feature f;
        for (int j = 0; j < (int)cqout[i].size(); ++j) {
            f.values.push_back(float(cqout[i][j]));
        }
        fs[m_cqOutputNo].push_back(f);
    }

    return fs;
}

Silvet::FeatureSet
Silvet::getRemainingFeatures()
{
    Grid cqout = m_cq->getRemainingBlocks();
    FeatureSet fs = transcribe(cqout);

    for (int i = 0; i < (int)cqout.size(); ++i) {
        Feature f;
        for (int j = 0; j < (int)cqout[i].size(); ++j) {
            f.values.push_back(float(cqout[i][j]));
        }
        fs[m_cqOutputNo].push_back(f);
    }

    return fs;
}

Silvet::FeatureSet
Silvet::transcribe(const Grid &cqout)
{
    Grid filtered = preProcess(cqout);

    FeatureSet fs;

    for (int i = 0; i < (int)filtered.size(); ++i) {
        Feature f;
        for (int j = 0; j < processingHeight; ++j) {
            f.values.push_back(float(filtered[i][j]));
        }
        fs[m_fcqOutputNo].push_back(f);
    }

    int width = filtered.size();

    int iterations = 12;

    for (int i = 0; i < width; ++i) {

        double sum = 0.0;
        for (int j = 0; j < processingHeight; ++j) {
            sum += filtered[i][j];
        }

        if (sum < 1e-5) continue;

        EM em;
        for (int j = 0; j < iterations; ++j) {
            em.iterate(filtered[i]);
        }

        vector<double> pitches = em.getPitchDistribution();

        for (int j = 0; j < processingNotes; ++j) {
            pitches[j] *= sum;
        }

        Feature f;
        for (int j = 0; j < processingNotes; ++j) {
            f.values.push_back(float(pitches[j]));
        }
        fs[m_pitchOutputNo].push_back(f);

        FeatureList noteFeatures = postProcess(pitches);

        for (FeatureList::const_iterator fi = noteFeatures.begin();
             fi != noteFeatures.end(); ++fi) {
            fs[m_notesOutputNo].push_back(*fi);
        }
    }

    return fs;
}

Silvet::Grid
Silvet::preProcess(const Grid &in)
{
    int width = in.size();

    // reduce to 100 columns per second, or one column every 441 samples

    int spacing = processingSampleRate / 100;

    Grid out;

    // We count the CQ latency in terms of processing hops, but
    // actually it probably isn't an exact number of hops so this
    // isn't quite accurate. But the small constant offset is
    // practically irrelevant compared to the jitter from the 40ms
    // frame size we reduce to in a moment
    int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();

    for (int i = 0; i < width; ++i) {

        if (m_columnCount < latentColumns) {
            ++m_columnCount;
            continue;
        }

        int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
        int sampleNo = m_columnCount * m_cq->getColumnHop();

        bool select = (sampleNo / spacing != prevSampleNo / spacing);

        if (select) {
            vector<double> inCol = in[i];
            vector<double> outCol(processingHeight);

            // we reverse the column as we go (the CQ output is
            // "upside-down", with high frequencies at the start of
            // each column, and we want it the other way around) and
            // then ignore the first 55 (lowest-frequency) bins,
            // giving us 545 bins instead of 600

            for (int j = 0; j < processingHeight; ++j) {
                int ix = inCol.size() - j - 55;
                outCol[j] = inCol[ix];
            }

            vector<double> noiseLevel1 =
                MedianFilter<double>::filter(40, outCol);
            for (int j = 0; j < processingHeight; ++j) {
                noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
            }

            vector<double> noiseLevel2 =
                MedianFilter<double>::filter(40, noiseLevel1);
            for (int j = 0; j < processingHeight; ++j) {
                outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
            }

            // then we only use every fourth filtered column, for 25
            // columns per second in the eventual grid

            if (m_reducedColumnCount % 4 == 0) {
                out.push_back(outCol);
            }

            ++m_reducedColumnCount;
        }

        ++m_columnCount;
    }

    return out;
}

Vamp::Plugin::FeatureList
Silvet::postProcess(const vector<double> &pitches)
{
    vector<double> filtered;

    for (int j = 0; j < processingNotes; ++j) {
        m_postFilter[j]->push(pitches[j]);
        filtered.push_back(m_postFilter[j]->get());
    }

    int postFilterLatency = int(m_postFilter[0]->getSize() / 2);

    // Threshold for level and reduce number of candidate pitches

    int polyphony = 5;
    double threshold = 4.8;

    typedef std::multimap<double, int> ValueIndexMap;

    ValueIndexMap strengths;
    for (int j = 0; j < processingNotes; ++j) {
        strengths.insert(ValueIndexMap::value_type(filtered[j], j));
    }

    map<int, double> active;
    ValueIndexMap::const_iterator si = strengths.end();
    while (int(active.size()) < polyphony) {
        --si;
        if (si->first < threshold) break;
        cerr << si->second << " : " << si->first << endl;
        active[si->second] = si->first;
        if (si == strengths.begin()) break;
    }

    // Minimum duration pruning, and conversion to notes. We can only
    // report notes that have just ended (i.e. that are absent in the
    // latest active set but present in the last set in the piano
    // roll) -- any notes that ended earlier will have been reported
    // already, and if they haven't ended, we don't know their
    // duration.

    int width = m_pianoRoll.size();

    int durationThreshold = 2; // columns

    FeatureList noteFeatures;

    if (width < durationThreshold + 1) {
        m_pianoRoll.push_back(active);
        return noteFeatures;
    }

    // we have 25 columns per second
    double columnDuration = 1.0 / 25.0;

    for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
         ni != m_pianoRoll[width-1].end(); ++ni) {

        int note = ni->first;

        if (active.find(note) != active.end()) {
            // the note is still playing
            continue;
        }

        // the note was playing but just ended
        int end = width;
        int start = end-1;

        double maxStrength = 0.0;

        while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
            double strength = m_pianoRoll[start][note];
            if (strength > maxStrength) {
                maxStrength = strength;
            }
            --start;
        }
        ++start;

        int duration = width - start;
//        cerr << "duration " << duration << " for just-ended note " << note << endl;
        if (duration < durationThreshold) {
            // spurious
            continue;
        }

        int velocity = maxStrength * 2;
        if (velocity > 127) velocity = 127;

        cerr << "Found a genuine note, starting at " << columnDuration * start << " with duration " << columnDuration * duration << endl;

        Feature nf;
        nf.hasTimestamp = true;
        nf.timestamp = RealTime::fromSeconds
            (columnDuration * (start - postFilterLatency));
        nf.hasDuration = true;
        nf.duration = RealTime::fromSeconds
            (columnDuration * duration);
        nf.values.push_back(noteFrequency(note));
        nf.values.push_back(velocity);
        nf.label = noteName(note);
        noteFeatures.push_back(nf);
    }

    m_pianoRoll.push_back(active);

//    cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;

    return noteFeatures;
}
author	Chris Cannam
date	Tue, 06 May 2014 11:26:33 +0100
parents	5314d3361dfb
children	81eaba98985b 840c0d703bbb