Mercurial > hg > silvet
view src/Silvet.cpp @ 189:3de7c871d9c8 noteagent
Fixes to mono feeder; use it for monophonic instruments
author | Chris Cannam |
---|---|
date | Thu, 29 May 2014 10:30:08 +0100 |
parents | 462b165c8c0f |
children | 28cbc7eaf415 |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  Silvet

  A Vamp plugin for note transcription.
  Centre for Digital Music, Queen Mary University of London.

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#include "Silvet.h"
#include "EM.h"

#include <cq/CQSpectrogram.h>

#include "MedianFilter.h"
#include "AgentFeederPoly.h"
#include "AgentFeederMono.h"
#include "NoteHypothesis.h"

#include "constant-q-cpp/src/dsp/Resampler.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <set>
#include <vector>

using std::vector;
using std::cout;
using std::cerr;
using std::endl;

using Vamp::RealTime;

// Sample rate at which the constant-Q transform is run. Input at any
// other rate is resampled to this rate (see reset() and process()).
static const int processingSampleRate = 44100;

// Number of constant-Q bins per octave.
static const int processingBPO = 60;

// Convert a shift number to a pitch shift in semitones.
//
// The given shift number is an offset into the template array, which
// starts with some zeros, followed by the template, then some
// trailing zeros.
//
// Example: if we have templateMaxShift == 2 and thus shiftCount
// == 5, then the number will be in the range 0-4 and the template
// will have 2 zeros at either end. Thus number 2 represents the
// template "as recorded", for a pitch shift of 0; smaller indices
// represent moving the template *up* in pitch (by introducing
// zeros at the start, which is the low-frequency end), for a
// positive pitch shift; and higher values represent moving it
// down in pitch, for a negative pitch shift.
//
// Returns 0 when shiftCount <= 1 (i.e. when shifts are not in use).
static float
shiftToPitchOffset(int shift, int shiftCount)
{
    if (shiftCount <= 1) {
        return 0.f;
    }
    return float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
}

// Construct the plugin for the given input sample rate. No processing
// objects are created here; that happens in reset(), which is called
// from initialise().
Silvet::Silvet(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_instruments(InstrumentPack::listInstrumentPacks()),
    m_resampler(0),
    m_cq(0),
    m_hqMode(true),
    m_fineTuning(false),
    m_instrument(0),
    m_colsPerSec(50),
    m_agentFeeder(0)
{
    // Give the remaining members defined values so that nothing reads
    // an indeterminate value if the host calls methods in an unusual
    // order. The output numbers match the order in which
    // getOutputDescriptors() adds the outputs ("notes", then
    // "timefreq"); that function overwrites them when called.
    m_blockSize = 0;
    m_columnCountIn = 0;
    m_columnCountOut = 0;
    m_notesOutputNo = 0;
    m_fcqOutputNo = 1;
}

// Release the resampler, constant-Q transform, post filters and
// agent feeder owned by this object.
Silvet::~Silvet()
{
    delete m_resampler;
    delete m_cq;
    for (int i = 0; i < (int)m_postFilter.size(); ++i) {
        delete m_postFilter[i];
    }
    delete m_agentFeeder;
}

string
Silvet::getIdentifier() const
{
    return "silvet";
}

string
Silvet::getName() const
{
    return "Silvet Note Transcription";
}

string
Silvet::getDescription() const
{
    // Return something helpful here!
    return "";
}

string
Silvet::getMaker() const
{
    // Your name here
    return "";
}

int
Silvet::getPluginVersion() const
{
    return 1;
}

string
Silvet::getCopyright() const
{
    // This function is not ideally named.  It does not necessarily
    // need to say who made the plugin -- getMaker does that -- but it
    // should indicate the terms under which it is distributed.  For
    // example, "Copyright (year). All Rights Reserved", or "GPL"
    return "";
}

Silvet::InputDomain
Silvet::getInputDomain() const
{
    return TimeDomain;
}

size_t
Silvet::getPreferredBlockSize() const
{
    // 0: no block size preference is advertised
    return 0;
}

size_t
Silvet::getPreferredStepSize() const
{
    // 0: no step size preference is advertised
    return 0;
}

size_t
Silvet::getMinChannelCount() const
{
    return 1;
}

size_t
Silvet::getMaxChannelCount() const
{
    return 1;
}

// Describe the three plugin parameters: "mode" (draft or intensive),
// "instrument" (index into the list of instrument packs) and
// "finetune" (on/off).
Silvet::ParameterList
Silvet::getParameterDescriptors() const
{
    ParameterList list;

    ParameterDescriptor desc;
    desc.identifier = "mode";
    desc.name = "Processing mode";
    desc.unit = "";
    desc.description = "Determines the tradeoff of processing speed against transcription quality";
    desc.minValue = 0;
    desc.maxValue = 1;
    desc.defaultValue = 1;
    desc.isQuantized = true;
    desc.quantizeStep = 1;
    desc.valueNames.push_back("Draft (faster)");
    desc.valueNames.push_back("Intensive (higher quality)");
    list.push_back(desc);

    desc.identifier = "instrument";
    desc.name = "Instrument";
    desc.unit = "";
    desc.description = "The instrument known to be present in the recording, if there is only one";
    desc.minValue = 0;
    desc.maxValue = m_instruments.size()-1;
    desc.defaultValue = 0;
    desc.isQuantized = true;
    desc.quantizeStep = 1;
    desc.valueNames.clear();
    for (int i = 0; i < int(m_instruments.size()); ++i) {
        desc.valueNames.push_back(m_instruments[i].name);
    }
    list.push_back(desc);

    desc.identifier = "finetune";
    desc.name = "Return fine pitch estimates";
    desc.unit = "";
    desc.description = "Return pitch estimates at finer than semitone resolution (works only in Intensive mode)";
    desc.minValue = 0;
    desc.maxValue = 1;
    desc.defaultValue = 0;
    desc.isQuantized = true;
    desc.quantizeStep = 1;
    desc.valueNames.clear();
    list.push_back(desc);

    return list;
}

// Return the current value of the named parameter, or 0 if the
// identifier is not recognised.
float
Silvet::getParameter(string identifier) const
{
    if (identifier == "mode") {
        return m_hqMode ? 1.f : 0.f;
    } else if (identifier == "finetune") {
        return m_fineTuning ? 1.f : 0.f;
    } else if (identifier == "instrument") {
        return m_instrument;
    }
    return 0;
}

// Set the named parameter. Unrecognised identifiers are ignored.
void
Silvet::setParameter(string identifier, float value)
{
    if (identifier == "mode") {
        m_hqMode = (value > 0.5);
    } else if (identifier == "finetune") {
        m_fineTuning = (value > 0.5);
    } else if (identifier == "instrument") {
        // m_instrument is used unchecked as an index into
        // m_instruments throughout this file, so keep it within the
        // valid range whatever the host passes in.
        int index = int(lrintf(value));
        int maxIndex = int(m_instruments.size()) - 1;
        if (index > maxIndex) index = maxIndex;
        if (index < 0) index = 0;
        m_instrument = index;
    }
}

Silvet::ProgramList
Silvet::getPrograms() const
{
    ProgramList list;
    return list;
}

string
Silvet::getCurrentProgram() const
{
    return "";
}

void
Silvet::selectProgram(string name)
{
}

// Describe the two outputs, "notes" and "timefreq", and record their
// positions in the output list in m_notesOutputNo and m_fcqOutputNo
// for use when features are returned.
Silvet::OutputList
Silvet::getOutputDescriptors() const
{
    OutputList list;

    OutputDescriptor d;
    d.identifier = "notes";
    d.name = "Note transcription";
    d.description = "Overall note transcription across selected instruments";
    d.unit = "Hz";
    d.hasFixedBinCount = true;
    d.binCount = 2;
    d.binNames.push_back("Frequency");
    d.binNames.push_back("Velocity");
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.sampleRate = m_inputSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
    d.hasDuration = true;
    m_notesOutputNo = list.size();
    list.push_back(d);

    d.identifier = "timefreq";
    d.name = "Time-frequency distribution";
    d.description = "Filtered constant-Q time-frequency distribution used as input to the expectation-maximisation algorithm";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = m_instruments[0].templateHeight;
    d.binNames.clear();
    if (m_cq) {
        char name[20];
        for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
            // We have a 600-bin (10 oct 60-bin CQ) of which the
            // lowest-frequency 55 bins have been dropped, for a
            // 545-bin template. The native CQ bins go high->low
            // frequency though, so these are still the first 545 bins
            // as reported by getBinFrequency, though in reverse order
            float freq = m_cq->getBinFrequency
                (m_instruments[0].templateHeight - i - 1);
            // bounded write: never overrun the fixed-size buffer
            snprintf(name, sizeof(name), "%.1f Hz", freq);
            d.binNames.push_back(name);
        }
    }
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = m_colsPerSec;
    d.hasDuration = false;
    m_fcqOutputNo = list.size();
    list.push_back(d);

    return list;
}

// Return a label for the given note number, consisting of note name
// and octave number, followed by "+Nc" or "-Nc" (a number of cents)
// if the shift corresponds to a non-zero pitch offset. Note number 0
// is labelled "A0".
std::string
Silvet::noteName(int note, int shift, int shiftCount) const
{
    static const char *names[] = {
        "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
    };

    const char *n = names[note % 12];

    int oct = (note + 9) / 12;

    char buf[30];

    float pshift = shiftToPitchOffset(shift, shiftCount);

    if (pshift > 0.f) {
        snprintf(buf, sizeof(buf), "%s%d+%dc", n, oct,
                 int(round(pshift * 100)));
    } else if (pshift < 0.f) {
        snprintf(buf, sizeof(buf), "%s%d-%dc", n, oct,
                 int(round((-pshift) * 100)));
    } else {
        snprintf(buf, sizeof(buf), "%s%d", n, oct);
    }

    return buf;
}

// Return the frequency in Hz of the given note number, adjusted by
// the pitch offset implied by the given shift. Note number 0 with no
// shift is 27.5 Hz and each note is one equal-tempered semitone.
float
Silvet::noteFrequency(int note, int shift, int shiftCount) const
{
    float pshift = shiftToPitchOffset(shift, shiftCount);

    return float(27.5 * pow(2.0, (note + pshift) / 12.0));
}

// Return the frequency of the equal-tempered semitone (relative to
// 27.5 Hz) that is nearest to the given frequency.
float
Silvet::roundToMidiFrequency(float freq) const
{
    // n is our note number, not actually MIDI note number as we have
    // a different origin
    float n = 12.0 * (log(freq / 27.5) / log(2.0));
    return 27.5 * pow(2.0, round(n) / 12.0);
}

// Prepare for processing. Returns false if the channel count is
// unsupported or if the step size differs from the block size;
// otherwise records the block size, (re)builds the processing objects
// via reset() and returns true.
bool
Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount() ||
        channels > getMaxChannelCount()) return false;

    if (stepSize != blockSize) {
        cerr << "Silvet::initialise: Step size must be the same as block size ("
             << stepSize << " != " << blockSize << ")" << endl;
        return false;
    }

    m_blockSize = blockSize;

    reset();

    return true;
}

// Discard all processing state and rebuild the resampler (if one is
// needed), constant-Q transform, post filters and agent feeder
// according to the current parameter values.
void
Silvet::reset()
{
    // Null each pointer as it is released, so that the destructor
    // cannot delete it a second time if one of the allocations below
    // throws.
    delete m_resampler;
    m_resampler = 0;
    delete m_cq;
    m_cq = 0;
    delete m_agentFeeder;
    m_agentFeeder = 0;

    if (m_inputSampleRate != processingSampleRate) {
        m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
    }

    double minFreq = 27.5;

    if (!m_hqMode) {
        // We don't actually return any notes from the bottom octave,
        // so we can just pad with zeros
        minFreq *= 2;
    }

    CQParameters params(processingSampleRate,
                        minFreq,
                        processingSampleRate / 3,
                        processingBPO);

    params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
                     // drops the FFT size to 512 from 1024 and alters
                     // some other processing parameters, making
                     // everything much, much slower. Could be a flaw
                     // in the CQ parameter calculations, must check
    params.atomHopFactor = 0.3;
    params.threshold = 0.0005;
    params.window = CQParameters::Hann;

    m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);

    m_colsPerSec = m_hqMode ? 50 : 25;

    for (int i = 0; i < (int)m_postFilter.size(); ++i) {
        delete m_postFilter[i];
    }
    m_postFilter.clear();
    for (int i = 0; i < m_instruments[m_instrument].templateNoteCount; ++i) {
//!!!        m_postFilter.push_back(new MedianFilter<double>(3));
        m_postFilter.push_back(new MedianFilter<double>(1));//!!!
    }

    m_columnCountIn = 0;
    m_columnCountOut = 0;
    m_startTime = RealTime::zeroTime;

    // Instruments with a maximum polyphony of 1 get the mono feeder
    if (m_instruments[m_instrument].maxPolyphony == 1) {
        m_agentFeeder = new AgentFeederMono<NoteHypothesis>();
    } else {
        m_agentFeeder = new AgentFeederPoly<NoteHypothesis>();
    }
}

// Process one block of m_blockSize samples from the first input
// channel: resample if necessary, run the constant-Q transform and
// transcribe whatever columns it produces.
Silvet::FeatureSet
Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
{
    if (m_columnCountIn == 0) {
        m_startTime = timestamp;
    }

    // Widen the float input to double in a single allocation
    vector<double> data(inputBuffers[0], inputBuffers[0] + m_blockSize);

    if (m_resampler) {
        data = m_resampler->process(data.data(), data.size());
    }

    Grid cqout = m_cq->process(data);
    FeatureSet fs = transcribe(cqout);
    return fs;
}

// Transcribe any output remaining in the constant-Q transform, then
// tell the agent feeder that input has finished and return the notes
// it yields.
Silvet::FeatureSet
Silvet::getRemainingFeatures()
{
    Grid cqout = m_cq->getRemainingOutput();
    FeatureSet fs = transcribe(cqout);

    m_agentFeeder->finish();

    FeatureList noteFeatures = obtainNotes();
    for (FeatureList::const_iterator fi = noteFeatures.begin();
         fi != noteFeatures.end(); ++fi) {
        fs[m_notesOutputNo].push_back(*fi);
    }

    return fs;
}

// Transcribe a grid of constant-Q columns. The columns are reduced
// and filtered by preProcess(); each resulting column is returned on
// the "timefreq" output, run through EM to estimate a pitch
// distribution, and then passed to postProcess(). Any notes obtained
// as a result are returned on the "notes" output.
Silvet::FeatureSet
Silvet::transcribe(const Grid &cqout)
{
    Grid filtered = preProcess(cqout);

    FeatureSet fs;

    if (filtered.empty()) return fs;

    const InstrumentPack &pack = m_instruments[m_instrument];

    for (int i = 0; i < (int)filtered.size(); ++i) {
        Feature f;
        for (int j = 0; j < pack.templateHeight; ++j) {
            f.values.push_back(float(filtered[i][j]));
        }
        fs[m_fcqOutputNo].push_back(f);
    }

    int width = filtered.size();

    int iterations = m_hqMode ? 20 : 10;

    //!!! pitches or notes? [terminology]
    Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));

    bool wantShifts = m_hqMode;
    int shiftCount = 1;
    if (wantShifts) {
        shiftCount = pack.templateMaxShift * 2 + 1;
    }

    // Always hold one row per column, so that localBestShifts[i] is a
    // valid object to pass to postProcess() even when shifts are not
    // wanted. In that case the rows are simply left empty.
    vector<vector<int> > localBestShifts
        (width, vector<int>(wantShifts ? pack.templateNoteCount : 0, 0));

    // One flag per column, written from within the parallel loop
    // below. This must not be a vector<bool>: that packs its elements
    // into shared words, so writes to different elements from
    // different threads would be a data race.
    vector<char> present(width, 0);

#pragma omp parallel for
    for (int i = 0; i < width; ++i) {

        double sum = 0.0;
        for (int j = 0; j < pack.templateHeight; ++j) {
            sum += filtered.at(i).at(j);
        }

        // Columns with (almost) no energy are treated as silent
        if (sum < 1e-5) continue;

        present[i] = 1;

        EM em(&pack, m_hqMode);

        em.setPitchSparsity(pack.pitchSparsity);

        for (int j = 0; j < iterations; ++j) {
            em.iterate(filtered.at(i).data());
        }

        const float *pitchDist = em.getPitchDistribution();
        const float *const *shiftDist = em.getShifts();

        for (int j = 0; j < pack.templateNoteCount; ++j) {

            localPitches[i][j] = pitchDist[j] * sum;

            int bestShift = 0;
            float bestShiftValue = 0.0;
            if (wantShifts) {
                for (int k = 0; k < shiftCount; ++k) {
                    float value = shiftDist[k][j];
                    if (k == 0 || value > bestShiftValue) {
                        bestShiftValue = value;
                        bestShift = k;
                    }
                }
                localBestShifts[i][j] = bestShift;
            }
        }
    }

    // The remaining work updates shared state (post filters, agent
    // feeder, column counter), so it is done serially, in column order
    for (int i = 0; i < width; ++i) {

        if (!present[i]) {
            // silent column
            for (int j = 0; j < pack.templateNoteCount; ++j) {
                m_postFilter[j]->push(0.0);
            }
        } else {

            postProcess(localPitches[i], localBestShifts[i],
                        wantShifts, shiftCount);

            FeatureList noteFeatures = obtainNotes();

            for (FeatureList::const_iterator fi = noteFeatures.begin();
                 fi != noteFeatures.end(); ++fi) {
                fs[m_notesOutputNo].push_back(*fi);
            }
        }

        ++m_columnCountOut;
    }

    return fs;
}

// Reduce the incoming constant-Q columns to m_colsPerSec columns per
// second, discarding the initial columns that correspond to the CQ
// latency. Each selected column is reordered to run from low to high
// frequency, fitted to the template height, and has an estimated
// noise level subtracted.
Silvet::Grid
Silvet::preProcess(const Grid &in)
{
    int width = in.size();

    int spacing = processingSampleRate / m_colsPerSec;

    // need to be careful that col spacing is an integer number of samples!
    assert(spacing * m_colsPerSec == processingSampleRate);

    Grid out;

    // We count the CQ latency in terms of processing hops, but
    // actually it probably isn't an exact number of hops so this
    // isn't quite accurate. But the small constant offset is
    // practically irrelevant compared to the jitter from the frame
    // size we reduce to in a moment
    int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();

    const InstrumentPack &pack = m_instruments[m_instrument];

    for (int i = 0; i < width; ++i) {

        if (m_columnCountIn < latentColumns) {
            ++m_columnCountIn;
            continue;
        }

        int prevSampleNo = (m_columnCountIn - 1) * m_cq->getColumnHop();
        int sampleNo = m_columnCountIn * m_cq->getColumnHop();

        // Select this column if it is the first to fall within a new
        // spacing-sized interval
        bool select = (sampleNo / spacing != prevSampleNo / spacing);

        if (select) {

            // The input column is only read, so refer to it in place
            // rather than copying it
            const vector<double> &inCol = in[i];
            vector<double> outCol(pack.templateHeight);

            // In HQ mode, the CQ returns 600 bins and we ignore the
            // lowest 55 of them.
            //
            // In draft mode the CQ is an octave shorter, returning
            // 540 bins, so we instead pad them with an additional 5
            // zeros.
            //
            // We also need to reverse the column as we go, since the
            // raw CQ has the high frequencies first and we need it
            // the other way around.
            //
            // NOTE(review): assuming 600 input bins and a template
            // height of 545 as described above, the HQ indexing below
            // reads input bins 545 down to 1, i.e. it omits the 54
            // lowest bins and the single highest one rather than the
            // 55 lowest. Left as it is here -- confirm whether this
            // is intended.

            if (m_hqMode) {
                for (int j = 0; j < pack.templateHeight; ++j) {
                    int ix = inCol.size() - j - 55;
                    outCol[j] = inCol[ix];
                }
            } else {
                for (int j = 0; j < 5; ++j) {
                    outCol[j] = 0.0;
                }
                for (int j = 5; j < pack.templateHeight; ++j) {
                    int ix = inCol.size() - j + 4;
                    outCol[j] = inCol[ix];
                }
            }

            // Noise estimate: median-filter the column, clip the
            // result so it does not exceed the column itself, then
            // median-filter again; subtract and floor at zero

            vector<double> noiseLevel1 =
                MedianFilter<double>::filter(40, outCol);
            for (int j = 0; j < pack.templateHeight; ++j) {
                noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
            }

            vector<double> noiseLevel2 =
                MedianFilter<double>::filter(40, noiseLevel1);
            for (int j = 0; j < pack.templateHeight; ++j) {
                outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
            }

            out.push_back(outCol);
        }

        ++m_columnCountIn;
    }

    return out;
}

// Push one column of pitch activations through the per-note post
// filters, and feed an observation to the agent feeder for each note
// whose filtered strength reaches the threshold.
//
// pitches: one activation value per template note
// bestShifts: best shift index per template note; only read if
//     wantShifts is true
// wantShifts: whether to take the shifts into account when
//     calculating the observed frequency
// shiftCount: the number of possible shift values
void
Silvet::postProcess(const vector<double> &pitches,
                    const vector<int> &bestShifts,
                    bool wantShifts,
                    int shiftCount)
{
    const InstrumentPack &pack = m_instruments[m_instrument];

    vector<double> filtered;
    filtered.reserve(pitches.size());

    for (int j = 0; j < pack.templateNoteCount; ++j) {
        m_postFilter[j]->push(pitches[j]);
        filtered.push_back(m_postFilter[j]->get());
    }

    double threshold = 1; //!!! pack.levelThreshold

    double columnDuration = 1.0 / m_colsPerSec;
    int postFilterLatency = int(m_postFilter[0]->getSize() / 2);

    RealTime t = RealTime::fromSeconds
        (columnDuration * (m_columnCountOut - postFilterLatency) + 0.02);

    for (int j = 0; j < pack.templateNoteCount; ++j) {

        double strength = filtered[j];
        if (strength < threshold) {
            continue;
        }

        double freq;
        if (wantShifts) {
            freq = noteFrequency(j, bestShifts[j], shiftCount);
        } else {
            freq = noteFrequency(j, 0, shiftCount);
        }

        double confidence = strength / 50.0; //!!!???
        if (confidence > 1.0) confidence = 1.0;

        AgentHypothesis::Observation obs(freq, t, confidence);
        m_agentFeeder->feed(obs);
    }
}

// Retrieve the hypotheses currently accepted by the agent feeder and
// convert each one to a note feature with timestamp, duration,
// frequency and velocity. The frequency is rounded to the nearest
// semitone unless fine tuning is switched on.
Vamp::Plugin::FeatureList
Silvet::obtainNotes()
{
    FeatureList noteFeatures;

    std::set<NoteHypothesis> hh;

    // retrieveAcceptedHypotheses() is called through the concrete
    // feeder type, so find out which kind of feeder we have

    AgentFeederPoly<NoteHypothesis> *polyFeeder =
        dynamic_cast<AgentFeederPoly<NoteHypothesis> *>(m_agentFeeder);

    AgentFeederMono<NoteHypothesis> *monoFeeder =
        dynamic_cast<AgentFeederMono<NoteHypothesis> *>(m_agentFeeder);

    if (polyFeeder) {

        hh = polyFeeder->retrieveAcceptedHypotheses();

    } else if (monoFeeder) {

        hh = monoFeeder->retrieveAcceptedHypotheses();

    } else {

        cerr << "INTERNAL ERROR: Feeder is neither poly- nor "
             << "mono-note-hypothesis-feeder!" << endl;
        return noteFeatures;
    }

    for (std::set<NoteHypothesis>::const_iterator hi = hh.begin();
         hi != hh.end(); ++hi) {

        NoteHypothesis h(*hi);

        NoteHypothesis::Note n = h.getAveragedNote();

        // Scale the confidence to a velocity no greater than 127
        int velocity = n.confidence * 127;
        if (velocity > 127) velocity = 127;

        float freq = n.freq;
        if (!m_fineTuning) {
            freq = roundToMidiFrequency(freq);
        }

        Feature f;
        f.hasTimestamp = true;
        f.hasDuration = true;
        f.timestamp = n.time;
        f.duration = n.duration;
        f.values.clear();
        f.values.push_back(freq);
        f.values.push_back(velocity);
//        f.label = noteName(note, partShift, shiftCount);
        noteFeatures.push_back(f);
    }

    return noteFeatures;
}