Mercurial > hg > nnls-chroma
view NNLSBase.cpp @ 172:d95c4cdef8af
Merge
author | Chris Cannam |
---|---|
date | Mon, 02 Nov 2015 11:32:30 +0000 |
parents | ba9310bcc3fc a85e745c0721 |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ /* NNLS-Chroma / Chordino Audio feature extraction plugins for chromagram and chord estimation. Centre for Digital Music, Queen Mary University of London. This file copyright 2008-2010 Matthias Mauch and QMUL. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. See the file COPYING included with this distribution for more information. */ #include "NNLSBase.h" #include "chromamethods.h" #include <cstdlib> #include <fstream> #include <cmath> #include <algorithm> static bool debug_on = false; NNLSBase::NNLSBase(float inputSampleRate) : Plugin(inputSampleRate), m_frameCount(0), m_logSpectrum(0), m_blockSize(0), m_stepSize(0), m_lengthOfNoteIndex(0), m_meanTunings(0), m_localTunings(0), m_whitening(1.0), m_preset(0.0), m_useNNLS(1.0), m_localTuning(0.0), m_kernelValue(0), m_kernelFftIndex(0), m_kernelNoteIndex(0), m_dict(0), m_tuneLocal(0.0), m_doNormalizeChroma(0), m_rollon(0.0), m_boostN(0.1), m_s(0.7), m_harte_syntax(0), sinvalues(0), cosvalues(0) { if (debug_on) cerr << "--> NNLSBase" << endl; // make the *note* dictionary matrix m_dict = new float[nNote * 84]; for (int i = 0; i < nNote * 84; ++i) m_dict[i] = 0.0; } NNLSBase::~NNLSBase() { if (debug_on) cerr << "--> ~NNLSBase" << endl; delete [] m_dict; } string NNLSBase::getMaker() const { if (debug_on) cerr << "--> getMaker" << endl; // Your name here return "Matthias Mauch"; } int NNLSBase::getPluginVersion() const { if (debug_on) cerr << "--> getPluginVersion" << endl; // Increment this each time you release a version that behaves // differently from the previous one return 5; } string NNLSBase::getCopyright() const { if (debug_on) cerr << "--> getCopyright" << endl; // This function is not ideally named. It does not necessarily // need to say who made the plugin -- getMaker does that -- but it // should indicate the terms under which it is distributed. For // example, "Copyright (year). All Rights Reserved", or "GPL" return "GPL"; } NNLSBase::InputDomain NNLSBase::getInputDomain() const { if (debug_on) cerr << "--> getInputDomain" << endl; return FrequencyDomain; } size_t NNLSBase::getPreferredBlockSize() const { if (debug_on) cerr << "--> getPreferredBlockSize" << endl; return 16384; // 0 means "I can handle any block size" } size_t NNLSBase::getPreferredStepSize() const { if (debug_on) cerr << "--> getPreferredStepSize" << endl; return 2048; // 0 means "anything sensible"; in practice this // means the same as the block size for TimeDomain // plugins, or half of it for FrequencyDomain plugins } size_t NNLSBase::getMinChannelCount() const { if (debug_on) cerr << "--> getMinChannelCount" << endl; return 1; } size_t NNLSBase::getMaxChannelCount() const { if (debug_on) cerr << "--> getMaxChannelCount" << endl; return 1; } NNLSBase::ParameterList NNLSBase::getParameterDescriptors() const { if (debug_on) cerr << "--> getParameterDescriptors" << endl; ParameterList list; ParameterDescriptor d; d.identifier = "useNNLS"; d.name = "use approximate transcription (NNLS)"; d.description = "Toggles approximate transcription (NNLS)."; d.unit = ""; d.minValue = 0.0; d.maxValue = 1.0; d.defaultValue = 1.0; d.isQuantized = true; d.quantizeStep = 1.0; list.push_back(d); ParameterDescriptor d0; d0.identifier = "rollon"; d0.name = "bass noise threshold"; d0.description = "Consider the cumulative energy spectrum (from low to high frequencies). All bins below the first bin whose cumulative energy exceeds the quantile [bass noise threshold] x [total energy] will be set to 0. A threshold value of 0 means that no bins will be changed."; d0.unit = "%"; d0.minValue = 0; d0.maxValue = 5; d0.defaultValue = 0; d0.isQuantized = true; d0.quantizeStep = 0.5; list.push_back(d0); ParameterDescriptor d1; d1.identifier = "tuningmode"; d1.name = "tuning mode"; d1.description = "Tuning can be performed locally or on the whole extraction segment. Local tuning is only advisable when the tuning is likely to change over the audio, for example in podcasts, or in a cappella singing."; d1.unit = ""; d1.minValue = 0; d1.maxValue = 1; d1.defaultValue = 0; d1.isQuantized = true; d1.valueNames.push_back("global tuning"); d1.valueNames.push_back("local tuning"); d1.quantizeStep = 1.0; list.push_back(d1); ParameterDescriptor d2; d2.identifier = "whitening"; d2.name = "spectral whitening"; d2.description = "Spectral whitening: no whitening - 0; whitening - 1."; d2.unit = ""; d2.isQuantized = true; d2.minValue = 0.0; d2.maxValue = 1.0; d2.defaultValue = 1.0; d2.isQuantized = false; list.push_back(d2); ParameterDescriptor d3; d3.identifier = "s"; d3.name = "spectral shape"; d3.description = "Determines how individual notes in the note dictionary look: higher values mean more dominant higher harmonics."; d3.unit = ""; d3.minValue = 0.5; d3.maxValue = 0.9; d3.defaultValue = 0.7; d3.isQuantized = false; list.push_back(d3); ParameterDescriptor d4; d4.identifier = "chromanormalize"; d4.name = "chroma normalization"; d4.description = "How shall the chroma vector be normalized?"; d4.unit = ""; d4.minValue = 0; d4.maxValue = 3; d4.defaultValue = 0; d4.isQuantized = true; d4.valueNames.push_back("none"); d4.valueNames.push_back("maximum norm"); d4.valueNames.push_back("L1 norm"); d4.valueNames.push_back("L2 norm"); d4.quantizeStep = 1.0; list.push_back(d4); return list; } float NNLSBase::getParameter(string identifier) const { if (debug_on) cerr << "--> getParameter" << endl; if (identifier == "useNNLS") { return m_useNNLS; } if (identifier == "whitening") { return m_whitening; } if (identifier == "s") { return m_s; } if (identifier == "rollon") { return m_rollon; } if (identifier == "boostn") { return m_boostN; } if (identifier == "tuningmode") { if (m_tuneLocal) { return 1.0; } else { return 0.0; } } if (identifier == "preset") { return m_preset; } if (identifier == "chromanormalize") { return m_doNormalizeChroma; } if (identifier == "usehartesyntax") { return m_harte_syntax; } return 0; } void NNLSBase::setParameter(string identifier, float value) { // cerr << "setParameter (" << identifier << ") -> " << value << endl; if (debug_on) cerr << "--> setParameter" << endl; if (identifier == "useNNLS") { m_useNNLS = (int) value; } if (identifier == "whitening") { m_whitening = value; } if (identifier == "s") { m_s = value; } if (identifier == "boostn") { m_boostN = value; } if (identifier == "tuningmode") { // m_tuneLocal = (value > 0) ? true : false; m_tuneLocal = value; // cerr << "m_tuneLocal :" << m_tuneLocal << endl; } // if (identifier == "preset") { // m_preset = value; // if (m_preset == 0.0) { // m_tuneLocal = false; // m_whitening = 1.0; // m_dictID = 0.0; // } // if (m_preset == 1.0) { // m_tuneLocal = false; // m_whitening = 1.0; // m_dictID = 1.0; // } // if (m_preset == 2.0) { // m_tuneLocal = false; // m_whitening = 0.7; // m_dictID = 0.0; // } // } if (identifier == "chromanormalize") { m_doNormalizeChroma = value; } if (identifier == "rollon") { m_rollon = value; } if (identifier == "usehartesyntax") { m_harte_syntax = value; } } NNLSBase::ProgramList NNLSBase::getPrograms() const { if (debug_on) cerr << "--> getPrograms" << endl; ProgramList list; // If you have no programs, return an empty list (or simply don't // implement this function or getCurrentProgram/selectProgram) return list; } string NNLSBase::getCurrentProgram() const { if (debug_on) cerr << "--> getCurrentProgram" << endl; return ""; // no programs } void NNLSBase::selectProgram(string name) { if (debug_on) cerr << "--> selectProgram" << endl; } bool NNLSBase::initialise(size_t channels, size_t stepSize, size_t blockSize) { if (debug_on) { cerr << "--> initialise"; } dictionaryMatrix(m_dict, m_s); // make things for tuning estimation for (int iBPS = 0; iBPS < nBPS; ++iBPS) { sinvalues.push_back(sin(2*M_PI*(iBPS*1.0/nBPS))); cosvalues.push_back(cos(2*M_PI*(iBPS*1.0/nBPS))); } // make hamming window of length 1/2 octave int hamwinlength = nBPS * 6 + 1; float hamwinsum = 0; for (int i = 0; i < hamwinlength; ++i) { hw.push_back(0.54 - 0.46 * cos((2*M_PI*i)/(hamwinlength-1))); hamwinsum += 0.54 - 0.46 * cos((2*M_PI*i)/(hamwinlength-1)); } for (int i = 0; i < hamwinlength; ++i) hw[i] = hw[i] / hamwinsum; // initialise the tuning for (int iBPS = 0; iBPS < nBPS; ++iBPS) { m_meanTunings.push_back(0); m_localTunings.push_back(0); } if (channels < getMinChannelCount() || channels > getMaxChannelCount()) return false; m_blockSize = blockSize; m_stepSize = stepSize; m_frameCount = 0; int tempn = nNote * m_blockSize/2; // cerr << "length of tempkernel : " << tempn << endl; float *tempkernel; tempkernel = new float[tempn]; logFreqMatrix(m_inputSampleRate, m_blockSize, tempkernel); m_kernelValue.clear(); m_kernelFftIndex.clear(); m_kernelNoteIndex.clear(); int countNonzero = 0; for (int iNote = 0; iNote < nNote; ++iNote) { // I don't know if this is wise: manually making a sparse matrix for (int iFFT = 0; iFFT < static_cast<int>(blockSize/2); ++iFFT) { if (tempkernel[iFFT + blockSize/2 * iNote] > 0) { m_kernelValue.push_back(tempkernel[iFFT + blockSize/2 * iNote]); if (tempkernel[iFFT + blockSize/2 * iNote] > 0) { countNonzero++; } m_kernelFftIndex.push_back(iFFT); m_kernelNoteIndex.push_back(iNote); } } } // cerr << "nonzero count : " << countNonzero << endl; delete [] tempkernel; /* ofstream myfile; myfile.open ("matrix.txt"); // myfile << "Writing this to a file.\n"; for (int i = 0; i < nNote * 84; ++i) { myfile << m_dict[i] << endl; } myfile.close(); */ return true; } void NNLSBase::reset() { if (debug_on) cerr << "--> reset"; // Clear buffers, reset stored values, etc m_frameCount = 0; // m_dictID = 0; m_logSpectrum.clear(); for (int iBPS = 0; iBPS < nBPS; ++iBPS) { m_meanTunings[iBPS] = 0; m_localTunings[iBPS] = 0; } m_localTuning.clear(); } void NNLSBase::baseProcess(const float *const *inputBuffers, Vamp::RealTime timestamp) { m_frameCount++; float *magnitude = new float[m_blockSize/2]; const float *fbuf = inputBuffers[0]; float energysum = 0; // make magnitude float maxmag = -10000; for (int iBin = 0; iBin < static_cast<int>(m_blockSize/2); iBin++) { magnitude[iBin] = sqrt(fbuf[2 * iBin] * fbuf[2 * iBin] + fbuf[2 * iBin + 1] * fbuf[2 * iBin + 1]); if (magnitude[iBin]>m_blockSize*1.0) magnitude[iBin] = m_blockSize; // a valid audio signal (between -1 and 1) should not be limited here. if (maxmag < magnitude[iBin]) maxmag = magnitude[iBin]; if (m_rollon > 0) { energysum += pow(magnitude[iBin],2); } } float cumenergy = 0; if (m_rollon > 0) { for (int iBin = 2; iBin < static_cast<int>(m_blockSize/2); iBin++) { cumenergy += pow(magnitude[iBin],2); if (cumenergy < energysum * m_rollon / 100) magnitude[iBin-2] = 0; else break; } } if (maxmag < m_blockSize * 2.0 / 16384.0) { // this is not quite right, I think // cerr << "timestamp " << timestamp << ": very low magnitude, setting magnitude to all zeros" << endl; for (int iBin = 0; iBin < static_cast<int>(m_blockSize/2); iBin++) { magnitude[iBin] = 0; } } // note magnitude mapping using pre-calculated matrix float *nm = new float[nNote]; // note magnitude for (int iNote = 0; iNote < nNote; iNote++) { nm[iNote] = 0; // initialise as 0 } int binCount = 0; for (vector<float>::iterator it = m_kernelValue.begin(); it != m_kernelValue.end(); ++it) { // cerr << "."; nm[m_kernelNoteIndex[binCount]] += magnitude[m_kernelFftIndex[binCount]] * m_kernelValue[binCount]; // cerr << m_kernelFftIndex[binCount] << " -- " << magnitude[m_kernelFftIndex[binCount]] << " -- "<< m_kernelValue[binCount] << endl; binCount++; } // cerr << nm[20]; // cerr << endl; float one_over_N = 1.0/m_frameCount; // update means of complex tuning variables for (int iBPS = 0; iBPS < nBPS; ++iBPS) m_meanTunings[iBPS] *= float(m_frameCount-1)*one_over_N; for (int iTone = 0; iTone < round(nNote*0.62/nBPS)*nBPS+1; iTone = iTone + nBPS) { for (int iBPS = 0; iBPS < nBPS; ++iBPS) m_meanTunings[iBPS] += nm[iTone + iBPS]*one_over_N; float ratioOld = 0.997; for (int iBPS = 0; iBPS < nBPS; ++iBPS) { m_localTunings[iBPS] *= ratioOld; m_localTunings[iBPS] += nm[iTone + iBPS] * (1 - ratioOld); } } // if (m_tuneLocal) { // local tuning // float localTuningImag = sinvalue * m_localTunings[1] - sinvalue * m_localTunings[2]; // float localTuningReal = m_localTunings[0] + cosvalue * m_localTunings[1] + cosvalue * m_localTunings[2]; float localTuningImag = 0; float localTuningReal = 0; for (int iBPS = 0; iBPS < nBPS; ++iBPS) { localTuningReal += m_localTunings[iBPS] * cosvalues[iBPS]; localTuningImag += m_localTunings[iBPS] * sinvalues[iBPS]; } float normalisedtuning = atan2(localTuningImag, localTuningReal)/(2*M_PI); m_localTuning.push_back(normalisedtuning); Feature f1; // logfreqspec f1.hasTimestamp = true; f1.timestamp = timestamp; for (int iNote = 0; iNote < nNote; iNote++) { f1.values.push_back(nm[iNote]); } // deletes delete[] magnitude; delete[] nm; m_logSpectrum.push_back(f1); // remember note magnitude }