Mercurial > hg > nnls-chroma
view NNLSChroma.cpp @ 184:82d5d11b68d7 tip
Update library URI so it's not document-local
author | Chris Cannam |
---|---|
date | Wed, 22 Apr 2020 14:21:25 +0100 |
parents | 3c731acad404 |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  NNLS-Chroma / Chordino

  Audio feature extraction plugins for chromagram and chord
  estimation.

  Centre for Digital Music, Queen Mary University of London.
  This file copyright 2008-2010 Matthias Mauch and QMUL.

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#include "NNLSChroma.h"

#include "chromamethods.h"

#include <cstdlib>
#include <fstream>
#include <cmath>
#include <algorithm>

// Compile-time switch for the "--> function" trace messages on stderr.
const bool debug_on = false;

/**
 * Build an output descriptor carrying the settings that every output of
 * this plugin shares: empty unit, fixed bin count, no known extents, not
 * quantized, fixed sample rate, no duration.
 *
 * @param identifier  machine-readable output id
 * @param name        human-readable output name
 * @param description human-readable description
 * @param binCount    number of values per feature
 * @param sampleRate  feature rate to advertise for the output
 * @return the populated descriptor (binNames left at its default)
 */
static Vamp::Plugin::OutputDescriptor
makeFixedRateDescriptor(const char *identifier,
                        const char *name,
                        const char *description,
                        size_t binCount,
                        float sampleRate)
{
    Vamp::Plugin::OutputDescriptor d;
    d.identifier = identifier;
    d.name = name;
    d.description = description;
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = binCount;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = Vamp::Plugin::OutputDescriptor::FixedSampleRate;
    d.hasDuration = false;
    d.sampleRate = sampleRate;
    return d;
}

/** Construct the plugin; all state lives in NNLSBase. */
NNLSChroma::NNLSChroma(float inputSampleRate) :
    NNLSBase(inputSampleRate)
{
    if (debug_on) cerr << "--> NNLSChroma" << endl;
}

NNLSChroma::~NNLSChroma()
{
    if (debug_on) cerr << "--> ~NNLSChroma" << endl;
}

/** @return the plugin identifier, "nnls-chroma". */
string
NNLSChroma::getIdentifier() const
{
    if (debug_on) cerr << "--> getIdentifier" << endl;
    return "nnls-chroma";
}

/** @return the human-readable plugin name. */
string
NNLSChroma::getName() const
{
    if (debug_on) cerr << "--> getName" << endl;
    return "NNLS Chroma";
}

/** @return the human-readable plugin description. */
string
NNLSChroma::getDescription() const
{
    if (debug_on) cerr << "--> getDescription" << endl;
    return "This plugin provides a number of features derived from a DFT-based log-frequency amplitude spectrum: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; and based on this semitone spectrum, different chroma features.";
}

/**
 * Describe the six outputs of the plugin, in this order: log-frequency
 * spectrum, tuned log-frequency spectrum, semitone spectrum, chromagram,
 * bass chromagram, and the stacked bass+treble chromagram.
 *
 * Side effect: the position of each descriptor in the returned list is
 * stored in the corresponding m_output* member, which process() and
 * getRemainingFeatures() later use as FeatureSet keys.
 *
 * @return the list of output descriptors
 */
NNLSChroma::OutputList
NNLSChroma::getOutputDescriptors() const
{
    if (debug_on) cerr << "--> getOutputDescriptors" << endl;

    // Chroma names for the binNames property: 12 bass names followed by
    // 12 plain ("treble") names.
    const char *const notenames[24] = {
        "A (bass)", "Bb (bass)", "B (bass)", "C (bass)", "C# (bass)", "D (bass)",
        "Eb (bass)", "E (bass)", "F (bass)", "F# (bass)", "G (bass)", "Ab (bass)",
        "A", "Bb", "B", "C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab"
    };
    const vector<string> bothchromanames(notenames, notenames + 24);
    const vector<string> chromanames(notenames + 12, notenames + 24);

    // Before initialise() the step size is still 0; fall back to a step
    // of 2048 samples so that a finite rate is reported.
    const float featureRate = (m_stepSize == 0)
        ? m_inputSampleRate / 2048
        : m_inputSampleRate / m_stepSize;

    OutputList list;
    int index = 0;

    list.push_back(makeFixedRateDescriptor(
        "logfreqspec",
        "Log-Frequency Spectrum",
        "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping.",
        nNote, featureRate));
    m_outputLogfreqspec = index++;

    list.push_back(makeFixedRateDescriptor(
        "tunedlogfreqspec",
        "Tuned Log-Frequency Spectrum",
        "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping, then its tuned using the estimated tuning frequency.",
        nNote, featureRate));
    m_outputTunedlogfreqspec = index++;

    list.push_back(makeFixedRateDescriptor(
        "semitonespectrum",
        "Semitone Spectrum",
        "A semitone-spaced log-frequency spectrum derived from the third-of-a-semitone-spaced tuned log-frequency spectrum.",
        84, featureRate));
    m_outputSemitonespectrum = index++;

    OutputDescriptor chromaOutput = makeFixedRateDescriptor(
        "chroma",
        "Chromagram",
        "Tuning-adjusted chromagram from NNLS approximate transcription, with an emphasis on the medium note range.",
        12, featureRate);
    chromaOutput.binNames = chromanames;
    list.push_back(chromaOutput);
    m_outputChroma = index++;

    // The bass chromagram deliberately reuses the plain note names, as in
    // the original implementation.
    OutputDescriptor basschromaOutput = makeFixedRateDescriptor(
        "basschroma",
        "Bass Chromagram",
        "Tuning-adjusted bass chromagram from NNLS approximate transcription, with an emphasis on the bass note range.",
        12, featureRate);
    basschromaOutput.binNames = chromanames;
    list.push_back(basschromaOutput);
    m_outputBasschroma = index++;

    OutputDescriptor bothchromaOutput = makeFixedRateDescriptor(
        "bothchroma",
        "Chromagram and Bass Chromagram",
        "Tuning-adjusted chromagram and bass chromagram (stacked on top of each other) from NNLS approximate transcription.",
        24, featureRate);
    bothchromaOutput.binNames = bothchromanames;
    list.push_back(bothchromaOutput);
    m_outputBothchroma = index++;

    return list;
}

/**
 * Initialise the plugin by delegating to NNLSBase::initialise().
 *
 * @return whatever the base class reports
 */
bool
NNLSChroma::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (debug_on) cerr << "--> initialise" << endl;
    return NNLSBase::initialise(channels, stepSize, blockSize);
}

/** Reset the plugin by delegating to NNLSBase::reset(). */
void
NNLSChroma::reset()
{
    if (debug_on) cerr << "--> reset" << endl;
    NNLSBase::reset();
}

/**
 * Process one block of input.  The work is done by
 * NNLSBase::baseProcess(); the most recent entry of m_logSpectrum is
 * returned on the log-frequency spectrum output.
 *
 * @param inputBuffers input data, forwarded unchanged to the base class
 * @param timestamp    timestamp of the block, forwarded unchanged
 * @return a feature set holding at most one log-frequency spectrum frame
 */
NNLSChroma::FeatureSet
NNLSChroma::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
{
    if (debug_on) cerr << "--> process" << endl;

    NNLSBase::baseProcess(inputBuffers, timestamp);

    FeatureSet fs;
    // Guard against an empty spectrum list: indexing size()-1 on an empty
    // vector would be undefined behaviour.
    if (!m_logSpectrum.empty()) {
        fs[m_outputLogfreqspec].push_back(m_logSpectrum.back());
    }
    return fs;
}

/**
 * Produce every output that needs the whole signal: the tuned
 * log-frequency spectrum, the semitone spectrum and the three
 * chromagrams.
 *
 * Steps:
 *  1. estimate the global tuning from m_meanTunings;
 *  2. for each stored log-frequency frame, shift it by the tuning using
 *     linear interpolation and whiten it with a running mean / running
 *     standard deviation;
 *  3. for each tuned frame, derive an 84-bin semitone spectrum (either by
 *     a weighted sum over neighbouring bins or by NNLS, depending on
 *     m_useNNLS), fold it into treble and bass chroma, and optionally
 *     normalise the chroma vectors.
 *
 * @return the feature set; empty if no frames were collected
 */
NNLSChroma::FeatureSet
NNLSChroma::getRemainingFeatures()
{
    if (debug_on) cerr << "--> getRemainingFeatures" << endl;

    FeatureSet fsOut;
    if (m_logSpectrum.size() == 0) return fsOut;

    /** Calculate Tuning
        calculate tuning from (using the angle of the complex number
        defined by the cumulative mean real and imag values)
    **/
    float meanTuningImag = 0;
    float meanTuningReal = 0;
    for (int iBPS = 0; iBPS < nBPS; ++iBPS) {
        meanTuningReal += m_meanTunings[iBPS] * cosvalues[iBPS];
        meanTuningImag += m_meanTunings[iBPS] * sinvalues[iBPS];
    }
    // atan2 yields [-pi, pi], so normalisedtuning lies in [-0.5, 0.5] and
    // the integer shift below is one of -2, -1, 0, 1.
    const float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
    int intShift = floor(normalisedtuning * 3);
    float floatShift = normalisedtuning * 3 - intShift; // fractional part of the shift

    /** Tune Log-Frequency Spectrogram
        calculate a tuned log-frequency spectrogram (f2): use the tuning
        estimated above (kinda f0) to perform linear interpolation on the
        existing log-frequency spectrogram (kinda f1).
    **/
    if (debug_on) cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... ";

    int frame = 0;
    for (FeatureList::iterator spec = m_logSpectrum.begin();
         spec != m_logSpectrum.end(); ++spec, ++frame) {

        const Feature &f1 = *spec;  // untuned log-frequency frame
        Feature f2;                 // tuned log-frequency frame
        f2.hasTimestamp = true;
        f2.timestamp = f1.timestamp;

        // set lower edge to zero
        f2.values.push_back(0.0);
        f2.values.push_back(0.0);

        if (m_tuneLocal) {
            // NOTE(review): assumes m_localTuning has one entry per frame
            // of m_logSpectrum and a range comparable to the global
            // estimate -- confirm in NNLSBase.
            intShift = floor(m_localTuning[frame] * 3);
            floatShift = m_localTuning[frame] * 3 - intShift;
        }

        // interpolate all inner bins
        for (int k = 2; k < (int)f1.values.size() - 3; ++k) {
            const float interpolated =
                f1.values[k + intShift] * (1-floatShift)
                + f1.values[k+intShift+1] * floatShift;
            f2.values.push_back(interpolated);
        }

        // upper edge
        f2.values.push_back(0.0);
        f2.values.push_back(0.0);
        f2.values.push_back(0.0);

        // Running standard deviation in two steps: squared deviations
        // from the running mean, then a second convolution and a square
        // root.
        vector<float> runningmean = SpecialConvolution(f2.values, hw);
        vector<float> runningstd;
        for (int n = 0; n < nNote; n++) {
            const float deviation = f2.values[n] - runningmean[n];
            runningstd.push_back(deviation * deviation);
        }
        runningstd = SpecialConvolution(runningstd, hw);

        for (int n = 0; n < nNote; n++) {
            runningstd[n] = sqrt(runningstd[n]);
            if (runningstd[n] > 0) {
                // keep only what exceeds the running mean, scaled by the
                // running std raised to m_whitening
                f2.values[n] = (f2.values[n] - runningmean[n]) > 0 ?
                    (f2.values[n] - runningmean[n]) / pow(runningstd[n], m_whitening) : 0;
            }
            if (f2.values[n] < 0) {
                cerr << "ERROR: negative value in logfreq spectrum" << endl;
            }
        }
        fsOut[m_outputTunedlogfreqspec].push_back(f2);
    }
    if (debug_on) cerr << "done." << endl;

    /** Semitone spectrum and chromagrams
        Semitone-spaced log-frequency spectrum derived from the tuned
        log-freq spectrum above. the spectrum is inferred using a
        non-negative least squares algorithm.
        Three different kinds of chromagram are calculated, "treble",
        "bass", and "both" (which means bass and treble stacked onto each
        other).
    **/
    if (m_useNNLS == 0) {
        if (debug_on) cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... ";
    } else {
        if (debug_on) cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... ";
    }

    const int nSemitones = 84;            // bins in the semitone spectrum
    const int nnlsWorkSize = 84 + 1000;   // size of the NNLS work arrays

    for (FeatureList::iterator it = fsOut[m_outputTunedlogfreqspec].begin();
         it != fsOut[m_outputTunedlogfreqspec].end(); ++it) {

        const Feature &tuned = *it; // tuned logfreq spectrum
        Feature f3; // semitone spectrum
        Feature f4; // treble chromagram
        Feature f5; // bass chromagram
        Feature f6; // treble and bass chromagram

        f3.hasTimestamp = true;
        f3.timestamp = tuned.timestamp;
        f4.hasTimestamp = true;
        f4.timestamp = tuned.timestamp;
        f5.hasTimestamp = true;
        f5.timestamp = tuned.timestamp;
        f6.hasTimestamp = true;
        f6.timestamp = tuned.timestamp;

        // Copy the frame into a plain array (nnls() takes raw pointers)
        // and note whether there is any energy at all.
        float b[nNote];
        bool some_b_greater_zero = false;
        for (int n = 0; n < nNote; n++) {
            b[n] = tuned.values[n];
            if (b[n] > 0) {
                some_b_greater_zero = true;
            }
        }

        vector<float> chroma = vector<float>(12, 0);
        vector<float> basschroma = vector<float>(12, 0);

        if (!some_b_greater_zero) {
            // silent frame: all-zero semitone spectrum and chroma
            f3.values.assign(nSemitones, 0);
        } else if (m_useNNLS == 0) {
            // Plain mapping: triangular-weighted sum over the bins around
            // each semitone centre.
            int iSemitone = 0;
            for (int iNote = nBPS/2 + 2; iNote < nNote - nBPS/2; iNote += nBPS) {
                float currval = 0;
                for (int iBPS = -nBPS/2; iBPS < nBPS/2+1; ++iBPS) {
                    // std::fabs, not abs: an unqualified abs() may bind
                    // to the integer overload and truncate the weight.
                    currval += b[iNote + iBPS] * (1-std::fabs(iBPS*1.0/(nBPS/2+1)));
                }
                f3.values.push_back(currval);
                chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
                basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
                iSemitone++;
            }
        } else {
            // here's where the non-negative least squares algorithm
            // calculates the note activation x
            float x[nnlsWorkSize];
            for (int n = 0; n < nnlsWorkSize; ++n) x[n] = 1.0;

            // Only semitones with some energy in their neighbourhood
            // take part in the NNLS problem.
            vector<int> signifIndex;
            int semitone = 0;
            for (int iNote = nBPS/2 + 2; iNote < nNote - nBPS/2; iNote += nBPS) {
                float energy = 0;
                for (int iBPS = -nBPS/2; iBPS < nBPS/2+1; ++iBPS) {
                    energy += b[iNote + iBPS];
                }
                if (energy > 0) signifIndex.push_back(semitone);
                f3.values.push_back(0); // fill the values, change later
                semitone++;
            }

            // With no significant semitone there is nothing to solve.
            // NOTE(review): skipping assumes nnls() has no effect when
            // given zero columns -- confirm against nnls.c.
            if (!signifIndex.empty()) {
                float rnorm;
                float w[nnlsWorkSize];
                float zz[nnlsWorkSize];
                int indx[nnlsWorkSize];
                int mode;

                // Reduced dictionary: the nNote-long profile of every
                // significant semitone, stored back to back.
                vector<float> curr_dict(nNote * signifIndex.size());
                for (int iNote = 0; iNote < (int)signifIndex.size(); ++iNote) {
                    for (int iBin = 0; iBin < nNote; iBin++) {
                        curr_dict[iNote * nNote + iBin] =
                            m_dict[signifIndex[iNote] * nNote + iBin];
                    }
                }

                nnls(&curr_dict[0], nNote, nNote, signifIndex.size(),
                     b, x, &rnorm, w, zz, indx, &mode);

                for (int iNote = 0; iNote < (int)signifIndex.size(); ++iNote) {
                    const int target = signifIndex[iNote];
                    f3.values[target] = x[iNote];
                    chroma[target % 12] += x[iNote] * treblewindow[target];
                    basschroma[target % 12] += x[iNote] * basswindow[target];
                }
            }
        }

        f4.values = chroma;
        f5.values = basschroma;
        // just stack the both chromas: bass first, then treble
        f6.values = basschroma;
        f6.values.insert(f6.values.end(), chroma.begin(), chroma.end());

        if (m_doNormalizeChroma > 0) {
            // One norm per chroma feature: [0] treble, [1] bass, [2] both.
            // A norm left at zero means "do not scale".
            float chromanorm[3] = { 0, 0, 0 };
            vector<float>::iterator v;
            switch (int(m_doNormalizeChroma)) {
            case 0: // should never end up here
                break;
            case 1: // maximum norm
                chromanorm[0] = *max_element(f4.values.begin(), f4.values.end());
                chromanorm[1] = *max_element(f5.values.begin(), f5.values.end());
                chromanorm[2] = max(chromanorm[0], chromanorm[1]);
                break;
            case 2: // sum of the values
                for (v = f4.values.begin(); v != f4.values.end(); ++v) {
                    chromanorm[0] += *v;
                }
                for (v = f5.values.begin(); v != f5.values.end(); ++v) {
                    chromanorm[1] += *v;
                }
                for (v = f6.values.begin(); v != f6.values.end(); ++v) {
                    chromanorm[2] += *v;
                }
                break;
            case 3: // Euclidean norm
                for (v = f4.values.begin(); v != f4.values.end(); ++v) {
                    chromanorm[0] += pow(*v,2);
                }
                chromanorm[0] = sqrt(chromanorm[0]);
                for (v = f5.values.begin(); v != f5.values.end(); ++v) {
                    chromanorm[1] += pow(*v,2);
                }
                chromanorm[1] = sqrt(chromanorm[1]);
                for (v = f6.values.begin(); v != f6.values.end(); ++v) {
                    chromanorm[2] += pow(*v,2);
                }
                chromanorm[2] = sqrt(chromanorm[2]);
                break;
            }
            if (chromanorm[0] > 0) {
                for (int n = 0; n < (int)f4.values.size(); n++) {
                    f4.values[n] /= chromanorm[0];
                }
            }
            if (chromanorm[1] > 0) {
                for (int n = 0; n < (int)f5.values.size(); n++) {
                    f5.values[n] /= chromanorm[1];
                }
            }
            if (chromanorm[2] > 0) {
                for (int n = 0; n < (int)f6.values.size(); n++) {
                    f6.values[n] /= chromanorm[2];
                }
            }
        }

        fsOut[m_outputSemitonespectrum].push_back(f3);
        fsOut[m_outputChroma].push_back(f4);
        fsOut[m_outputBasschroma].push_back(f5);
        fsOut[m_outputBothchroma].push_back(f6);
    }
    if (debug_on) cerr << "done." << endl;

    return fsOut;
}