diff Chordino.cpp @ 35:cf8898a0174c matthiasm-plugin

* Split out NNLSChroma plugin into three plugins (chroma, chordino, tuning) with a common base class. There's still quite a lot of duplication between the getRemainingFeatures functions. Also add copyright / copying headers, etc.
author Chris Cannam
date Fri, 22 Oct 2010 11:30:21 +0100
parents NNLSChroma.cpp@da3195577172
children 7409ab74c63b
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Chordino.cpp	Fri Oct 22 11:30:21 2010 +0100
@@ -0,0 +1,541 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+  NNLS-Chroma / Chordino
+
+  Audio feature extraction plugins for chromagram and chord
+  estimation.
+
+  Centre for Digital Music, Queen Mary University of London.
+  This file copyright 2008-2010 Matthias Mauch and QMUL.
+    
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of the
+  License, or (at your option) any later version.  See the file
+  COPYING included with this distribution for more information.
+*/
+
+#include "Chordino.h"
+
+#include "chromamethods.h"
+
+#include <cstdlib>
+#include <fstream>
+#include <cmath>
+
+#include <algorithm>
+
+const bool debug_on = false;
+
+const vector<float> hw(hammingwind, hammingwind+19);
+
+Chordino::Chordino(float inputSampleRate) :
+    NNLSBase(inputSampleRate)
+{
+    if (debug_on) cerr << "--> Chordino" << endl;
+}
+
+Chordino::~Chordino()
+{
+    if (debug_on) cerr << "--> ~Chordino" << endl;
+}
+
+string
+Chordino::getIdentifier() const
+{
+    if (debug_on) cerr << "--> getIdentifier" << endl;
+    return "chordino";
+}
+
+string
+Chordino::getName() const
+{
+    if (debug_on) cerr << "--> getName" << endl;
+    return "Chordino";
+}
+
+string
+Chordino::getDescription() const
+{
+    if (debug_on) cerr << "--> getDescription" << endl;
+    return "This plugin provides a number of features derived from a log-frequency amplitude spectrum of the DFT: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; based on this semitone spectrum, chroma features and a simple chord estimate.";
+}
+
+Chordino::OutputList
+Chordino::getOutputDescriptors() const
+{
+    if (debug_on) cerr << "--> getOutputDescriptors" << endl;
+    OutputList list;
+    
+    int index = 0;
+
+    OutputDescriptor d7;
+    d7.identifier = "simplechord";
+    d7.name = "Simple Chord Estimate";
+    d7.description = "A simple chord estimate based on the inner product of chord templates with the smoothed chroma.";
+    d7.unit = "";
+    d7.hasFixedBinCount = true;
+    d7.binCount = 0;
+    d7.hasKnownExtents = false;
+    d7.isQuantized = false;
+    d7.sampleType = OutputDescriptor::VariableSampleRate;
+    d7.hasDuration = false;
+    d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
+    list.push_back(d7);
+    m_outputChords = index++;
+    
+    OutputDescriptor d8;
+    d8.identifier = "harmonicchange";
+    d8.name = "Harmonic change value";
+    d8.description = "Harmonic change.";
+    d8.unit = "";
+    d8.hasFixedBinCount = true;
+    d8.binCount = 1;
+    d8.hasKnownExtents = true;
+    d8.minValue = 0.0;
+    d8.maxValue = 0.999;
+    d8.isQuantized = false;
+    d8.sampleType = OutputDescriptor::FixedSampleRate;
+    d8.hasDuration = false;
+    // d8.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
+    list.push_back(d8);
+    m_outputHarmonicChange = index++;
+  
+    return list;
+}
+
+bool
+Chordino::initialise(size_t channels, size_t stepSize, size_t blockSize)
+{	
+    if (debug_on) {
+        cerr << "--> initialise";
+    }
+
+    if (!NNLSBase::initialise(channels, stepSize, blockSize)) {
+        return false;
+    }
+
+    return true;
+}
+
+void
+Chordino::reset()
+{
+    if (debug_on) cerr << "--> reset";
+    NNLSBase::reset();
+}
+
+Chordino::FeatureSet
+Chordino::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
+{   
+    if (debug_on) cerr << "--> process" << endl;
+
+    NNLSBase::baseProcess(inputBuffers, timestamp);
+
+    return FeatureSet();
+}
+
+Chordino::FeatureSet
+Chordino::getRemainingFeatures()
+{
+    if (debug_on) cerr << "--> getRemainingFeatures" << endl;
+    FeatureSet fsOut;
+    if (m_logSpectrum.size() == 0) return fsOut;
+    int nChord = m_chordnames.size();
+    // 
+    /**  Calculate Tuning
+         calculate tuning from (using the angle of the complex number defined by the 
+         cumulative mean real and imag values)
+    **/
+    float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2;
+    float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2;
+    float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
+    float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
+    int intShift = floor(normalisedtuning * 3);
+    float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this
+		    
+    char buffer0 [50];
+		
+    sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
+		    
+		    
+    /** Tune Log-Frequency Spectrogram
+        calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to 
+        perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
+    **/
+    cerr << endl << "[Chordino Plugin] Tuning Log-Frequency Spectrogram ... ";
+					
+    float tempValue = 0;
+    float dbThreshold = 0; // relative to the background spectrum
+    float thresh = pow(10,dbThreshold/20);
+    // cerr << "tune local ? " << m_tuneLocal << endl;
+    int count = 0;
+		
+    FeatureList tunedSpec;
+
+    for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
+        Feature f1 = *i;
+        Feature f2; // tuned log-frequency spectrum
+        f2.hasTimestamp = true;
+        f2.timestamp = f1.timestamp;
+        f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero
+		
+        if (m_tuneLocal) {
+            intShift = floor(m_localTuning[count] * 3);
+            intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this
+        }
+		        
+        // cerr << intShift << " " << intFactor << endl;
+		        
+        for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
+            tempValue = f1.values[k + intShift] * (1-intFactor) + f1.values[k+intShift+1] * intFactor;
+            f2.values.push_back(tempValue);
+        }
+		        
+        f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
+        vector<float> runningmean = SpecialConvolution(f2.values,hw);
+        vector<float> runningstd;
+        for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance)
+            runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
+        }
+        runningstd = SpecialConvolution(runningstd,hw); // second step convolve
+        for (int i = 0; i < 256; i++) { 
+            runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
+            if (runningstd[i] > 0) {
+                // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ? 
+                // 		                    (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
+                f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
+                    (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_paling) : 0;
+            }
+            if (f2.values[i] < 0) {
+                cerr << "ERROR: negative value in logfreq spectrum" << endl;
+            }
+        }
+        tunedSpec.push_back(f2);
+        count++;
+    }
+    cerr << "done." << endl;
+	    
+    /** Semitone spectrum and chromagrams
+        Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
+        is inferred using a non-negative least squares algorithm.
+        Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means 
+        bass and treble stacked onto each other).
+    **/
+    if (m_dictID == 1) {
+        cerr << "[Chordino Plugin] Mapping to semitone spectrum and chroma ... ";
+    } else {
+        cerr << "[Chordino Plugin] Performing NNLS and mapping to chroma ... ";
+    }
+
+	    
+    vector<vector<float> > chordogram;
+    vector<vector<int> > scoreChordogram;
+    vector<float> chordchange = vector<float>(tunedSpec.size(),0);
+    count = 0;
+
+    FeatureList chromaList;
+
+    for (FeatureList::iterator it = tunedSpec.begin(); it != tunedSpec.end(); ++it) {
+        Feature f2 = *it; // logfreq spectrum
+        Feature f6; // treble and bass chromagram
+
+        f6.hasTimestamp = true;
+        f6.timestamp = f2.timestamp;
+
+        float b[256];
+	
+        bool some_b_greater_zero = false;
+        float sumb = 0;
+        for (int i = 0; i < 256; i++) {
+            // b[i] = m_dict[(256 * count + i) % (256 * 84)];
+            b[i] = f2.values[i];
+            sumb += b[i];
+            if (b[i] > 0) {
+                some_b_greater_zero = true;
+            }            
+        }
+	    
+        // here's where the non-negative least squares algorithm calculates the note activation x
+	
+        vector<float> chroma = vector<float>(12, 0);
+        vector<float> basschroma = vector<float>(12, 0);
+        float currval;
+        unsigned iSemitone = 0;
+			
+        if (some_b_greater_zero) {
+            if (m_dictID == 1) {
+                for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
+                    currval = 0;
+                    currval += b[iNote + 1 + -1] * 0.5;
+                    currval += b[iNote + 1 +  0] * 1.0; 
+                    currval += b[iNote + 1 +  1] * 0.5; 
+                    chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
+                    basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
+                    iSemitone++;
+                }
+		        
+            } else {
+                float x[84+1000];
+                for (int i = 1; i < 1084; ++i) x[i] = 1.0;
+                vector<int> signifIndex;
+                int index=0;
+                sumb /= 84.0;
+                for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
+                    float currval = 0;
+                    currval += b[iNote + 1 + -1];						
+                    currval += b[iNote + 1 +  0];						
+                    currval += b[iNote + 1 +  1];
+                    if (currval > 0) signifIndex.push_back(index);
+                    index++;
+                }
+                float rnorm;
+                float w[84+1000];
+                float zz[84+1000];
+                int indx[84+1000];
+                int mode;
+                int dictsize = 256*signifIndex.size();
+                float *curr_dict = new float[dictsize];
+                for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
+                    for (unsigned iBin = 0; iBin < 256; iBin++) {
+                        curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin];
+                    }
+                }
+                nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
+                delete [] curr_dict;
+                for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
+                    // cerr << mode << endl;
+                    chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
+                    basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
+                }
+            }	
+        }
+
+        vector<float> origchroma = chroma;
+        chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas 
+        f6.values = chroma;
+ 
+        if (m_doNormalizeChroma > 0) {
+            vector<float> chromanorm = vector<float>(3,0);			
+            switch (int(m_doNormalizeChroma)) {
+            case 0: // should never end up here
+                break;
+            case 1:
+                chromanorm[0] = *max_element(origchroma.begin(), origchroma.end());
+                chromanorm[1] = *max_element(basschroma.begin(), basschroma.end());
+                chromanorm[2] = max(chromanorm[0], chromanorm[1]);
+                break;
+            case 2:
+                for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
+                    chromanorm[2] += *it; 						
+                }
+                break;
+            case 3:
+                for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
+                    chromanorm[2] += pow(*it,2); 						
+                }
+                chromanorm[2] = sqrt(chromanorm[2]);
+                break;
+            }
+            if (chromanorm[2] > 0) {
+                for (int i = 0; i < chroma.size(); i++) {
+                    f6.values[i] /= chromanorm[2];
+                }
+            }
+        }
+
+        chromaList.push_back(f6);
+
+        // local chord estimation
+        vector<float> currentChordSalience;
+        float tempchordvalue = 0;
+        float sumchordvalue = 0;
+	        
+        for (int iChord = 0; iChord < nChord; iChord++) {
+            tempchordvalue = 0;
+            for (int iBin = 0; iBin < 12; iBin++) {
+                tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
+            }
+            for (int iBin = 12; iBin < 24; iBin++) {
+                tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
+            }
+            sumchordvalue+=tempchordvalue;
+            currentChordSalience.push_back(tempchordvalue);
+        }
+        if (sumchordvalue > 0) {
+            for (int iChord = 0; iChord < nChord; iChord++) {
+                currentChordSalience[iChord] /= sumchordvalue;
+            }
+        } else {
+            currentChordSalience[nChord-1] = 1.0;
+        }
+        chordogram.push_back(currentChordSalience);
+	        
+        count++;
+    }
+    cerr << "done." << endl;
+		
+
+    /* Simple chord estimation
+       I just take the local chord estimates ("currentChordSalience") and average them over time, then
+       take the maximum. Very simple, don't do this at home...
+    */
+    cerr << "[Chordino Plugin] Chord Estimation ... ";
+    count = 0; 
+    int halfwindowlength = m_inputSampleRate / m_stepSize;
+    vector<int> chordSequence;
+
+    for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) { // initialise the score chordogram
+        vector<int> temp = vector<int>(nChord,0);
+        scoreChordogram.push_back(temp);
+    }
+
+    for (FeatureList::iterator it = chromaList.begin(); it < chromaList.end()-2*halfwindowlength-1; ++it) {		
+        int startIndex = count + 1;
+        int endIndex = count + 2 * halfwindowlength;
+			
+        float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);
+            
+        vector<int> chordCandidates;
+        for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
+            // float currsum = 0;
+            // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
+            //  currsum += chordogram[iFrame][iChord];
+            // }
+            //                 if (currsum > chordThreshold) chordCandidates.push_back(iChord);
+            for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
+                if (chordogram[iFrame][iChord] > chordThreshold) {
+                    chordCandidates.push_back(iChord);
+                    break;
+                }                    
+            }
+        }
+        chordCandidates.push_back(nChord-1);
+//        cerr << chordCandidates.size() << endl;          
+	        
+        float maxval = 0; // will be the value of the most salient *chord change* in this frame
+        float maxindex = 0; //... and the index thereof
+        unsigned bestchordL = nChord-1; // index of the best "left" chord
+        unsigned bestchordR = nChord-1; // index of the best "right" chord
+ 	 		
+        for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
+            // now find the max values on both sides of iWF
+            // left side:
+            float maxL = 0;
+            unsigned maxindL = nChord-1;
+            for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
+                unsigned iChord = chordCandidates[kChord];
+                float currsum = 0;
+                for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
+                    currsum += chordogram[count+iFrame][iChord];
+                }
+                if (iChord == nChord-1) currsum *= 0.8;
+                if (currsum > maxL) {
+                    maxL = currsum;
+                    maxindL = iChord;
+                }
+            }				
+            // right side:
+            float maxR = 0;
+            unsigned maxindR = nChord-1;
+            for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
+                unsigned iChord = chordCandidates[kChord];
+                float currsum = 0;
+                for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
+                    currsum += chordogram[count+iFrame][iChord];
+                }
+                if (iChord == nChord-1) currsum *= 0.8;
+                if (currsum > maxR) {
+                    maxR = currsum;
+                    maxindR = iChord;
+                }
+            }
+            if (maxL+maxR > maxval) {					
+                maxval = maxL+maxR;
+                maxindex = iWF;
+                bestchordL = maxindL;
+                bestchordR = maxindR;
+            }
+				
+        }
+//        cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
+        // add a score to every chord-frame-point that was part of a maximum 
+        for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
+            scoreChordogram[iFrame+count][bestchordL]++;
+        }
+        for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
+            scoreChordogram[iFrame+count][bestchordR]++;
+        }
+        if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
+        count++;	
+    }
+//    cerr << "*******  agent finished   *******" << endl;
+    count = 0;
+    for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) { 
+        float maxval = 0; // will be the value of the most salient chord in this frame
+        float maxindex = 0; //... and the index thereof
+        for (unsigned iChord = 0; iChord < nChord; iChord++) {
+            if (scoreChordogram[count][iChord] > maxval) {
+                maxval = scoreChordogram[count][iChord];
+                maxindex = iChord;
+                // cerr << iChord << endl;
+            }
+        }
+        chordSequence.push_back(maxindex);
+        // cerr << "before modefilter, maxindex: " << maxindex << endl;
+        count++;
+    }
+//    cerr << "*******  mode filter done *******" << endl;
+
+	
+    // mode filter on chordSequence
+    count = 0;
+    string oldChord = "";
+    for (FeatureList::iterator it = chromaList.begin(); it != chromaList.end(); ++it) {
+        Feature f6 = *it;
+        Feature f7; // chord estimate
+        f7.hasTimestamp = true;
+        f7.timestamp = f6.timestamp;
+        Feature f8; // chord estimate
+        f8.hasTimestamp = true;
+        f8.timestamp = f6.timestamp;
+			
+        vector<int> chordCount = vector<int>(nChord,0);
+        int maxChordCount = 0;
+        int maxChordIndex = nChord-1;
+        string maxChord;
+        int startIndex = max(count - halfwindowlength/2,0);
+        int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
+        for (int i = startIndex; i < endIndex; i++) {				
+            chordCount[chordSequence[i]]++;
+            if (chordCount[chordSequence[i]] > maxChordCount) {
+                // cerr << "start index " << startIndex << endl;
+                maxChordCount++;
+                maxChordIndex = chordSequence[i];
+                maxChord = m_chordnames[maxChordIndex];
+            }
+        }
+        // chordSequence[count] = maxChordIndex;
+        // cerr << maxChordIndex << endl;
+        f8.values.push_back(chordchange[count]/(halfwindowlength*2));
+        // cerr << chordchange[count] << endl;
+        fsOut[m_outputHarmonicChange].push_back(f8);
+        if (oldChord != maxChord) {
+            oldChord = maxChord;
+            f7.label = m_chordnames[maxChordIndex];
+            fsOut[m_outputChords].push_back(f7);
+        }
+        count++;
+    }
+    Feature f7; // last chord estimate
+    f7.hasTimestamp = true;
+    f7.timestamp = chromaList[chromaList.size()-1].timestamp;
+    f7.label = "N";
+    fsOut[m_outputChords].push_back(f7);
+    cerr << "done." << endl;
+
+    return fsOut;     
+
+}
+