Chris@23: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ matthiasm@0: Chris@35: /* Chris@35: NNLS-Chroma / Chordino Chris@35: Chris@35: Audio feature extraction plugins for chromagram and chord Chris@35: estimation. Chris@35: Chris@35: Centre for Digital Music, Queen Mary University of London. Chris@35: This file copyright 2008-2010 Matthias Mauch and QMUL. Chris@35: Chris@35: This program is free software; you can redistribute it and/or Chris@35: modify it under the terms of the GNU General Public License as Chris@35: published by the Free Software Foundation; either version 2 of the Chris@35: License, or (at your option) any later version. See the file Chris@35: COPYING included with this distribution for more information. Chris@35: */ Chris@35: matthiasm@0: #include "NNLSChroma.h" Chris@27: Chris@27: #include "chromamethods.h" Chris@27: Chris@27: #include Chris@27: #include matthiasm@0: #include matthiasm@9: Chris@27: #include matthiasm@0: matthiasm@0: const bool debug_on = false; matthiasm@0: Chris@27: const vector hw(hammingwind, hammingwind+19); matthiasm@0: matthiasm@0: NNLSChroma::NNLSChroma(float inputSampleRate) : Chris@35: NNLSBase(inputSampleRate) matthiasm@0: { Chris@23: if (debug_on) cerr << "--> NNLSChroma" << endl; matthiasm@0: } matthiasm@0: matthiasm@0: NNLSChroma::~NNLSChroma() matthiasm@0: { Chris@23: if (debug_on) cerr << "--> ~NNLSChroma" << endl; matthiasm@0: } matthiasm@0: matthiasm@0: string matthiasm@0: NNLSChroma::getIdentifier() const matthiasm@0: { Chris@23: if (debug_on) cerr << "--> getIdentifier" << endl; matthiasm@0: return "nnls_chroma"; matthiasm@0: } matthiasm@0: matthiasm@0: string matthiasm@0: NNLSChroma::getName() const matthiasm@0: { Chris@23: if (debug_on) cerr << "--> getName" << endl; matthiasm@0: return "NNLS Chroma"; matthiasm@0: } matthiasm@0: matthiasm@0: string matthiasm@0: NNLSChroma::getDescription() const matthiasm@0: { Chris@23: if (debug_on) cerr << "--> getDescription" << endl; matthiasm@13: return "This plugin provides a number of features derived from a log-frequency amplitude spectrum of the DFT: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; based on this semitone spectrum, chroma features and a simple chord estimate."; matthiasm@0: } matthiasm@0: matthiasm@0: NNLSChroma::OutputList matthiasm@0: NNLSChroma::getOutputDescriptors() const matthiasm@0: { Chris@23: if (debug_on) cerr << "--> getOutputDescriptors" << endl; matthiasm@0: OutputList list; matthiasm@0: matthiasm@0: // Make chroma names for the binNames property matthiasm@0: vector chromanames; matthiasm@0: vector bothchromanames; matthiasm@0: for (int iNote = 0; iNote < 24; iNote++) { matthiasm@0: bothchromanames.push_back(notenames[iNote]); matthiasm@0: if (iNote < 12) { matthiasm@0: chromanames.push_back(notenames[iNote]); matthiasm@0: } matthiasm@0: } matthiasm@0: Chris@35: int index = 0; matthiasm@0: Chris@23: OutputDescriptor d1; matthiasm@0: d1.identifier = "logfreqspec"; matthiasm@0: d1.name = "Log-Frequency Spectrum"; matthiasm@0: d1.description = "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping."; matthiasm@0: d1.unit = ""; matthiasm@0: d1.hasFixedBinCount = true; matthiasm@0: d1.binCount = nNote; matthiasm@0: d1.hasKnownExtents = false; matthiasm@0: d1.isQuantized = false; matthiasm@0: d1.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d1.hasDuration = false; matthiasm@0: d1.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d1); Chris@35: m_outputLogSpec = index++; matthiasm@0: Chris@23: OutputDescriptor d2; matthiasm@0: d2.identifier = "tunedlogfreqspec"; matthiasm@0: d2.name = "Tuned Log-Frequency Spectrum"; matthiasm@0: d2.description = "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping, then its tuned using the estimated tuning frequency."; matthiasm@0: d2.unit = ""; matthiasm@0: d2.hasFixedBinCount = true; matthiasm@0: d2.binCount = 256; matthiasm@0: d2.hasKnownExtents = false; matthiasm@0: d2.isQuantized = false; matthiasm@0: d2.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d2.hasDuration = false; matthiasm@0: d2.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d2); Chris@35: m_outputTunedSpec = index++; matthiasm@0: matthiasm@0: OutputDescriptor d3; matthiasm@0: d3.identifier = "semitonespectrum"; matthiasm@0: d3.name = "Semitone Spectrum"; matthiasm@0: d3.description = "A semitone-spaced log-frequency spectrum derived from the third-of-a-semitone-spaced tuned log-frequency spectrum."; matthiasm@0: d3.unit = ""; matthiasm@0: d3.hasFixedBinCount = true; matthiasm@0: d3.binCount = 84; matthiasm@0: d3.hasKnownExtents = false; matthiasm@0: d3.isQuantized = false; matthiasm@0: d3.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d3.hasDuration = false; matthiasm@0: d3.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d3); Chris@35: m_outputSemiSpec = index++; matthiasm@0: matthiasm@0: OutputDescriptor d4; matthiasm@0: d4.identifier = "chroma"; matthiasm@0: d4.name = "Chromagram"; matthiasm@0: d4.description = "Tuning-adjusted chromagram from NNLS soft transcription, with an emphasis on the medium note range."; matthiasm@0: d4.unit = ""; matthiasm@0: d4.hasFixedBinCount = true; matthiasm@0: d4.binCount = 12; matthiasm@0: d4.binNames = chromanames; matthiasm@0: d4.hasKnownExtents = false; matthiasm@0: d4.isQuantized = false; matthiasm@0: d4.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d4.hasDuration = false; matthiasm@0: d4.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d4); Chris@35: m_outputChroma = index++; matthiasm@0: matthiasm@0: OutputDescriptor d5; matthiasm@0: d5.identifier = "basschroma"; matthiasm@0: d5.name = "Bass Chromagram"; matthiasm@0: d5.description = "Tuning-adjusted bass chromagram from NNLS soft transcription, with an emphasis on the bass note range."; matthiasm@0: d5.unit = ""; matthiasm@0: d5.hasFixedBinCount = true; matthiasm@0: d5.binCount = 12; matthiasm@0: d5.binNames = chromanames; matthiasm@0: d5.hasKnownExtents = false; matthiasm@0: d5.isQuantized = false; matthiasm@0: d5.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d5.hasDuration = false; matthiasm@0: d5.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d5); Chris@35: m_outputBassChroma = index++; matthiasm@0: matthiasm@0: OutputDescriptor d6; matthiasm@0: d6.identifier = "bothchroma"; matthiasm@0: d6.name = "Chromagram and Bass Chromagram"; matthiasm@0: d6.description = "Tuning-adjusted chromagram and bass chromagram (stacked on top of each other) from NNLS soft transcription."; matthiasm@0: d6.unit = ""; matthiasm@0: d6.hasFixedBinCount = true; matthiasm@0: d6.binCount = 24; matthiasm@0: d6.binNames = bothchromanames; matthiasm@0: d6.hasKnownExtents = false; matthiasm@0: d6.isQuantized = false; matthiasm@0: d6.sampleType = OutputDescriptor::FixedSampleRate; matthiasm@0: d6.hasDuration = false; matthiasm@0: d6.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; matthiasm@0: list.push_back(d6); Chris@35: m_outputBothChroma = index++; matthiasm@1: matthiasm@0: return list; matthiasm@0: } matthiasm@0: matthiasm@0: matthiasm@0: bool matthiasm@0: NNLSChroma::initialise(size_t channels, size_t stepSize, size_t blockSize) matthiasm@0: { Chris@23: if (debug_on) { Chris@23: cerr << "--> initialise"; Chris@23: } matthiasm@1: Chris@35: if (!NNLSBase::initialise(channels, stepSize, blockSize)) { Chris@35: return false; Chris@35: } matthiasm@1: matthiasm@0: return true; matthiasm@0: } matthiasm@0: matthiasm@0: void matthiasm@0: NNLSChroma::reset() matthiasm@0: { Chris@23: if (debug_on) cerr << "--> reset"; Chris@35: NNLSBase::reset(); matthiasm@0: } matthiasm@0: matthiasm@0: NNLSChroma::FeatureSet matthiasm@0: NNLSChroma::process(const float *const *inputBuffers, Vamp::RealTime timestamp) matthiasm@0: { Chris@23: if (debug_on) cerr << "--> process" << endl; Chris@35: Chris@35: NNLSBase::baseProcess(inputBuffers, timestamp); matthiasm@0: Chris@23: FeatureSet fs; Chris@35: fs[m_outputLogSpec].push_back(m_logSpectrum[m_logSpectrum.size()-1]); Chris@23: return fs; matthiasm@0: } matthiasm@0: matthiasm@0: NNLSChroma::FeatureSet matthiasm@0: NNLSChroma::getRemainingFeatures() matthiasm@0: { Chris@23: if (debug_on) cerr << "--> getRemainingFeatures" << endl; Chris@23: FeatureSet fsOut; Chris@35: if (m_logSpectrum.size() == 0) return fsOut; Chris@23: // Chris@23: /** Calculate Tuning Chris@23: calculate tuning from (using the angle of the complex number defined by the Chris@23: cumulative mean real and imag values) Chris@23: **/ Chris@23: float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2; Chris@23: float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2; Chris@23: float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI)); Chris@23: float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI); Chris@23: int intShift = floor(normalisedtuning * 3); Chris@23: float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this matthiasm@1: Chris@23: char buffer0 [50]; matthiasm@1: Chris@23: sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning); matthiasm@1: Chris@23: // cerr << "normalisedtuning: " << normalisedtuning << '\n'; matthiasm@1: Chris@23: /** Tune Log-Frequency Spectrogram Chris@23: calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to Chris@23: perform linear interpolation on the existing log-frequency spectrogram (kinda f1). Chris@23: **/ Chris@23: cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... "; matthiasm@13: Chris@23: float tempValue = 0; Chris@23: float dbThreshold = 0; // relative to the background spectrum Chris@23: float thresh = pow(10,dbThreshold/20); Chris@23: // cerr << "tune local ? " << m_tuneLocal << endl; Chris@23: int count = 0; matthiasm@1: Chris@35: for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) { Chris@23: Feature f1 = *i; Chris@23: Feature f2; // tuned log-frequency spectrum Chris@23: f2.hasTimestamp = true; Chris@23: f2.timestamp = f1.timestamp; Chris@23: f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero matthiasm@1: Chris@23: if (m_tuneLocal) { Chris@23: intShift = floor(m_localTuning[count] * 3); Chris@23: intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this Chris@23: } matthiasm@1: Chris@23: // cerr << intShift << " " << intFactor << endl; matthiasm@1: Chris@23: for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins Chris@23: tempValue = f1.values[k + intShift] * (1-intFactor) + f1.values[k+intShift+1] * intFactor; Chris@23: f2.values.push_back(tempValue); Chris@23: } matthiasm@1: Chris@23: f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge Chris@23: vector runningmean = SpecialConvolution(f2.values,hw); Chris@23: vector runningstd; Chris@23: for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance) Chris@23: runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i])); Chris@23: } Chris@23: runningstd = SpecialConvolution(runningstd,hw); // second step convolve Chris@23: for (int i = 0; i < 256; i++) { Chris@23: runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std Chris@23: if (runningstd[i] > 0) { Chris@23: // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ? mail@41: // (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0; Chris@23: f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ? mail@41: (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0; Chris@23: } Chris@23: if (f2.values[i] < 0) { Chris@23: cerr << "ERROR: negative value in logfreq spectrum" << endl; Chris@23: } Chris@23: } Chris@35: fsOut[m_outputTunedSpec].push_back(f2); Chris@23: count++; Chris@23: } Chris@23: cerr << "done." << endl; matthiasm@1: Chris@23: /** Semitone spectrum and chromagrams Chris@23: Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum Chris@23: is inferred using a non-negative least squares algorithm. Chris@23: Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means Chris@23: bass and treble stacked onto each other). Chris@23: **/ Chris@23: if (m_dictID == 1) { Chris@23: cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... "; Chris@23: } else { Chris@23: cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... "; Chris@23: } matthiasm@13: matthiasm@1: Chris@23: vector oldchroma = vector(12,0); Chris@23: vector oldbasschroma = vector(12,0); Chris@23: count = 0; matthiasm@9: Chris@38: for (FeatureList::iterator it = fsOut[m_outputTunedSpec].begin(); it != fsOut[m_outputTunedSpec].end(); ++it) { Chris@23: Feature f2 = *it; // logfreq spectrum Chris@23: Feature f3; // semitone spectrum Chris@23: Feature f4; // treble chromagram Chris@23: Feature f5; // bass chromagram Chris@23: Feature f6; // treble and bass chromagram matthiasm@1: Chris@23: f3.hasTimestamp = true; Chris@23: f3.timestamp = f2.timestamp; matthiasm@1: Chris@23: f4.hasTimestamp = true; Chris@23: f4.timestamp = f2.timestamp; matthiasm@1: Chris@23: f5.hasTimestamp = true; Chris@23: f5.timestamp = f2.timestamp; matthiasm@1: Chris@23: f6.hasTimestamp = true; Chris@23: f6.timestamp = f2.timestamp; matthiasm@1: Chris@35: float b[256]; matthiasm@1: Chris@23: bool some_b_greater_zero = false; Chris@23: float sumb = 0; Chris@23: for (int i = 0; i < 256; i++) { Chris@23: // b[i] = m_dict[(256 * count + i) % (256 * 84)]; Chris@23: b[i] = f2.values[i]; Chris@23: sumb += b[i]; Chris@23: if (b[i] > 0) { Chris@23: some_b_greater_zero = true; Chris@23: } Chris@23: } matthiasm@1: Chris@23: // here's where the non-negative least squares algorithm calculates the note activation x matthiasm@1: Chris@23: vector chroma = vector(12, 0); Chris@23: vector basschroma = vector(12, 0); Chris@23: float currval; Chris@23: unsigned iSemitone = 0; matthiasm@1: Chris@23: if (some_b_greater_zero) { Chris@23: if (m_dictID == 1) { Chris@23: for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) { Chris@23: currval = 0; Chris@23: currval += b[iNote + 1 + -1] * 0.5; Chris@23: currval += b[iNote + 1 + 0] * 1.0; Chris@23: currval += b[iNote + 1 + 1] * 0.5; Chris@23: f3.values.push_back(currval); Chris@23: chroma[iSemitone % 12] += currval * treblewindow[iSemitone]; Chris@23: basschroma[iSemitone % 12] += currval * basswindow[iSemitone]; Chris@23: iSemitone++; Chris@23: } matthiasm@1: Chris@23: } else { Chris@35: float x[84+1000]; Chris@23: for (int i = 1; i < 1084; ++i) x[i] = 1.0; Chris@23: vector signifIndex; Chris@23: int index=0; Chris@23: sumb /= 84.0; Chris@23: for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) { Chris@23: float currval = 0; Chris@35: currval += b[iNote + 1 + -1]; Chris@35: currval += b[iNote + 1 + 0]; Chris@23: currval += b[iNote + 1 + 1]; Chris@23: if (currval > 0) signifIndex.push_back(index); Chris@23: f3.values.push_back(0); // fill the values, change later Chris@23: index++; Chris@23: } Chris@35: float rnorm; Chris@35: float w[84+1000]; Chris@35: float zz[84+1000]; Chris@23: int indx[84+1000]; Chris@23: int mode; Chris@23: int dictsize = 256*signifIndex.size(); Chris@23: // cerr << "dictsize is " << dictsize << "and values size" << f3.values.size()<< endl; Chris@35: float *curr_dict = new float[dictsize]; Chris@23: for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) { Chris@23: for (unsigned iBin = 0; iBin < 256; iBin++) { Chris@23: curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin]; Chris@23: } Chris@23: } Chris@35: nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode); Chris@23: delete [] curr_dict; Chris@23: for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) { Chris@23: f3.values[signifIndex[iNote]] = x[iNote]; Chris@23: // cerr << mode << endl; Chris@23: chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]]; Chris@23: basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]]; Chris@23: } Chris@23: } Chris@23: } matthiasm@13: Chris@23: f4.values = chroma; Chris@23: f5.values = basschroma; Chris@23: chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas Chris@23: f6.values = chroma; matthiasm@1: Chris@23: if (m_doNormalizeChroma > 0) { Chris@23: vector chromanorm = vector(3,0); Chris@23: switch (int(m_doNormalizeChroma)) { Chris@23: case 0: // should never end up here Chris@23: break; Chris@23: case 1: Chris@23: chromanorm[0] = *max_element(f4.values.begin(), f4.values.end()); Chris@23: chromanorm[1] = *max_element(f5.values.begin(), f5.values.end()); Chris@23: chromanorm[2] = max(chromanorm[0], chromanorm[1]); Chris@23: break; Chris@23: case 2: Chris@23: for (vector::iterator it = f4.values.begin(); it != f4.values.end(); ++it) { Chris@23: chromanorm[0] += *it; Chris@23: } Chris@23: for (vector::iterator it = f5.values.begin(); it != f5.values.end(); ++it) { Chris@23: chromanorm[1] += *it; Chris@23: } Chris@23: for (vector::iterator it = f6.values.begin(); it != f6.values.end(); ++it) { Chris@23: chromanorm[2] += *it; Chris@23: } Chris@23: break; Chris@23: case 3: Chris@23: for (vector::iterator it = f4.values.begin(); it != f4.values.end(); ++it) { Chris@23: chromanorm[0] += pow(*it,2); Chris@23: } Chris@23: chromanorm[0] = sqrt(chromanorm[0]); Chris@23: for (vector::iterator it = f5.values.begin(); it != f5.values.end(); ++it) { Chris@23: chromanorm[1] += pow(*it,2); Chris@23: } Chris@23: chromanorm[1] = sqrt(chromanorm[1]); Chris@23: for (vector::iterator it = f6.values.begin(); it != f6.values.end(); ++it) { Chris@23: chromanorm[2] += pow(*it,2); Chris@23: } Chris@23: chromanorm[2] = sqrt(chromanorm[2]); Chris@23: break; Chris@23: } Chris@23: if (chromanorm[0] > 0) { Chris@23: for (int i = 0; i < f4.values.size(); i++) { Chris@23: f4.values[i] /= chromanorm[0]; Chris@23: } Chris@23: } Chris@23: if (chromanorm[1] > 0) { Chris@23: for (int i = 0; i < f5.values.size(); i++) { Chris@23: f5.values[i] /= chromanorm[1]; Chris@23: } Chris@23: } Chris@23: if (chromanorm[2] > 0) { Chris@23: for (int i = 0; i < f6.values.size(); i++) { Chris@23: f6.values[i] /= chromanorm[2]; Chris@23: } Chris@23: } Chris@23: } matthiasm@13: Chris@35: fsOut[m_outputSemiSpec].push_back(f3); Chris@35: fsOut[m_outputChroma].push_back(f4); Chris@35: fsOut[m_outputBassChroma].push_back(f5); Chris@35: fsOut[m_outputBothChroma].push_back(f6); Chris@23: count++; Chris@23: } Chris@23: cerr << "done." << endl; matthiasm@10: Chris@23: return fsOut; matthiasm@0: matthiasm@0: } matthiasm@0: