Mercurial > hg > nnls-chroma
changeset 50:b6cddb109482 matthiasm-plugin
added chord change value, some other tweaks
author | matthiasm |
---|---|
date | Mon, 25 Oct 2010 18:34:44 +0900 |
parents | 6e76c7710fa1 |
children | 9ea1b92082fc |
files | Chordino.cpp Chordino.h NNLSBase.cpp NNLSBase.h README viterbi.cpp viterbi.h viterbi.o |
diffstat | 8 files changed, 164 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
--- a/Chordino.cpp Mon Oct 25 16:58:32 2010 +0900 +++ b/Chordino.cpp Mon Oct 25 18:34:44 2010 +0900 @@ -63,6 +63,104 @@ return "This plugin provides a number of features derived from a log-frequency amplitude spectrum of the DFT: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; based on this semitone spectrum, chroma features and a simple chord estimate."; } +Chordino::ParameterList +Chordino::getParameterDescriptors() const +{ + if (debug_on) cerr << "--> getParameterDescriptors" << endl; + ParameterList list; + + ParameterDescriptor d; + d.identifier = "useNNLS"; + d.name = "use approximate transcription (NNLS)"; + d.description = "Toggles approximate transcription (NNLS)."; + d.unit = ""; + d.minValue = 0.0; + d.maxValue = 1.0; + d.defaultValue = 1.0; + d.isQuantized = true; + d.quantizeStep = 1.0; + list.push_back(d); + + ParameterDescriptor d4; + d4.identifier = "useHMM"; + d4.name = "Viterbi decoding"; + d4.description = "Turns on Viterbi decoding (when off, the simple chord estimator is used)."; + d4.unit = ""; + d4.minValue = 0.0; + d4.maxValue = 1.0; + d4.defaultValue = 1.0; + d4.isQuantized = true; + d4.quantizeStep = 1.0; + list.push_back(d4); + + ParameterDescriptor d0; + d0.identifier = "rollon"; + d0.name = "spectral roll-on"; + d0.description = "The bins below the spectral roll-on quantile will be set to 0."; + d0.unit = ""; + d0.minValue = 0; + d0.maxValue = 0.05; + d0.defaultValue = 0; + d0.isQuantized = true; + d0.quantizeStep = 0.005; + list.push_back(d0); + + ParameterDescriptor d1; + d1.identifier = "tuningmode"; + d1.name = "tuning mode"; + d1.description = "Tuning can be performed locally or on the whole extraction segment. 
Local tuning is only advisable when the tuning is likely to change over the audio, for example in podcasts, or in a cappella singing."; + d1.unit = ""; + d1.minValue = 0; + d1.maxValue = 1; + d1.defaultValue = 0; + d1.isQuantized = true; + d1.valueNames.push_back("global tuning"); + d1.valueNames.push_back("local tuning"); + d1.quantizeStep = 1.0; + list.push_back(d1); + + ParameterDescriptor d2; + d2.identifier = "whitening"; + d2.name = "spectral whitening"; + d2.description = "Spectral whitening: no whitening - 0; whitening - 1."; + d2.unit = ""; + d2.isQuantized = true; + d2.minValue = 0.0; + d2.maxValue = 1.0; + d2.defaultValue = 1.0; + d2.isQuantized = false; + list.push_back(d2); + + ParameterDescriptor d3; + d3.identifier = "s"; + d3.name = "spectral shape"; + d3.description = "Determines how individual notes in the note dictionary look: higher values mean more dominant higher harmonics."; + d3.unit = ""; + d3.minValue = 0.5; + d3.maxValue = 0.9; + d3.defaultValue = 0.7; + d3.isQuantized = false; + list.push_back(d3); + + // ParameterDescriptor d4; + // d4.identifier = "chromanormalize"; + // d4.name = "chroma normalization"; + // d4.description = "How shall the chroma vector be normalized?"; + // d4.unit = ""; + // d4.minValue = 0; + // d4.maxValue = 3; + // d4.defaultValue = 0; + // d4.isQuantized = true; + // d4.valueNames.push_back("none"); + // d4.valueNames.push_back("maximum norm"); + // d4.valueNames.push_back("L1 norm"); + // d4.valueNames.push_back("L2 norm"); + // d4.quantizeStep = 1.0; + // list.push_back(d4); + + return list; +} + Chordino::OutputList Chordino::getOutputDescriptors() const { @@ -369,7 +467,7 @@ } if (iChord == nChord-1) tempchordvalue *= .7; if (tempchordvalue < 0) tempchordvalue = 0.0; - tempchordvalue = pow(1.5,tempchordvalue); + tempchordvalue = pow(1.3,tempchordvalue); sumchordvalue+=tempchordvalue; currentChordSalience.push_back(tempchordvalue); } @@ -387,8 +485,8 @@ cerr << "done." 
<< endl; - bool m_useHMM = true; // this will go into the chordino header file. - if (m_useHMM) { + // bool m_useHMM = true; // this will go into the chordino header file. + if (m_useHMM == 1.0) { cerr << "[Chordino Plugin] HMM Chord Estimation ... "; int oldchord = nChord-1; double selftransprob = 0.99; @@ -396,13 +494,16 @@ // vector<double> init = vector<double>(nChord,1.0/nChord); vector<double> init = vector<double>(nChord,0); init[nChord-1] = 1; + double *delta; + delta = (double *)malloc(sizeof(double)*nFrame*nChord); + vector<vector<double> > trans; for (int iChord = 0; iChord < nChord; iChord++) { vector<double> temp = vector<double>(nChord,(1-selftransprob)/(nChord-1)); temp[iChord] = selftransprob; trans.push_back(temp); } - vector<int> chordpath = ViterbiPath(init,trans,chordogram); + vector<int> chordpath = ViterbiPath(init, trans, chordogram, delta); Feature chord_feature; // chord estimate @@ -411,7 +512,7 @@ chord_feature.label = m_chordnames[chordpath[0]]; fsOut[0].push_back(chord_feature); - for (int iFrame = 0; iFrame < chordpath.size(); ++iFrame) { + for (int iFrame = 1; iFrame < chordpath.size(); ++iFrame) { // cerr << chordpath[iFrame] << endl; if (chordpath[iFrame] != oldchord ) { Feature chord_feature; // chord estimate @@ -421,6 +522,10 @@ fsOut[0].push_back(chord_feature); oldchord = chordpath[iFrame]; } + /* calculating simple chord change prob */ + for (int iChord = 0; iChord < nChord; iChord++) { + chordchange[iFrame-1] += delta[(iFrame-1)*nChord + iChord] * log(delta[(iFrame-1)*nChord + iChord]/delta[iFrame*nChord + iChord]); + } } // cerr << chordpath[0] << endl; @@ -513,7 +618,9 @@ for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) { scoreChordogram[iFrame+count][bestchordR]++; } - if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength; + if (bestchordL != bestchordR) { + chordchange[maxindex+count] += (halfwindowlength - 
abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength; + } count++; } // cerr << "******* agent finished *******" << endl; @@ -561,8 +668,7 @@ } // chordSequence[count] = maxChordIndex; // cerr << maxChordIndex << endl; - // cerr << chordchange[count] << endl; - // fsOut[9].push_back(currentChord); + // cerr << chordchange[count] << endl; if (oldChord != maxChord) { oldChord = maxChord; chord_feature.label = m_chordnames[maxChordIndex]; @@ -577,5 +683,17 @@ chord_feature.label = "N"; fsOut[0].push_back(chord_feature); cerr << "done." << endl; + + for (int iFrame = 0; iFrame < nFrame; iFrame++) { + Feature chordchange_feature; + chordchange_feature.hasTimestamp = true; + chordchange_feature.timestamp = timestamps[iFrame]; + chordchange_feature.values.push_back(chordchange[iFrame]); + fsOut[1].push_back(chordchange_feature); + } + + + + return fsOut; }
--- a/Chordino.h Mon Oct 25 16:58:32 2010 +0900 +++ b/Chordino.h Mon Oct 25 18:34:44 2010 +0900 @@ -34,6 +34,7 @@ string getName() const; string getDescription() const; + ParameterList getParameterDescriptors() const; OutputList getOutputDescriptors() const; FeatureSet process(const float *const *inputBuffers,
--- a/NNLSBase.cpp Mon Oct 25 16:58:32 2010 +0900 +++ b/NNLSBase.cpp Mon Oct 25 18:34:44 2010 +0900 @@ -55,7 +55,8 @@ m_doNormalizeChroma(0), m_rollon(0.0), m_s(0.7), - m_useNNLS(1) + m_useNNLS(1), + m_useHMM(1) { if (debug_on) cerr << "--> NNLSBase" << endl; @@ -259,6 +260,11 @@ if (identifier == "chromanormalize") { return m_doNormalizeChroma; } + + if (identifier == "useHMM") { + return m_useHMM; + } + return 0; } @@ -279,6 +285,10 @@ m_s = value; } + if (identifier == "useHMM") { + m_useHMM = value; + } + if (identifier == "tuningmode") { m_tuneLocal = (value > 0) ? true : false; // cerr << "m_tuneLocal :" << m_tuneLocal << endl;
--- a/NNLSBase.h Mon Oct 25 16:58:32 2010 +0900 +++ b/NNLSBase.h Mon Oct 25 18:34:44 2010 +0900 @@ -70,6 +70,7 @@ float m_preset; float m_s; float m_useNNLS; + float m_useHMM; vector<float> m_localTuning; vector<float> m_kernelValue; vector<int> m_kernelFftIndex;
--- a/README Mon Oct 25 16:58:32 2010 +0900 +++ b/README Mon Oct 25 18:34:44 2010 +0900 @@ -31,8 +31,6 @@ * Chromagram: a 12-dimensional chromagram, restricted with mid-range emphasis. At each frame the Semitone Spectrum is multiplied by a mid-range pattern and then mapped to the 12 chroma bins. * Chromagram and Bass Chromagram: a 24-dimensional chromagram, consisting of the both Bass Chromgram and Chromagram, see above. When normalisation is used, this representation will however be scaled differently, and hence be different from the individual chromagrams. -### References and Credits ### - ## Chordino ## System identifier – vamp:nnls-chroma:chordino @@ -40,13 +38,14 @@ ### General Description ### -Chordino provides a simple chord transcription based on NNLS Chroma (described above). +Chordino provides a simple chord transcription based on NNLS Chroma (described above). Chord profiles given by the user in the file "chord.dict" are used to calculate frame-wise chord similarities. Two simple (non-state-of-the-art!) algorithms are available that smooth these to provide a chord transcription: a simple chord change method, and a standard HMM/Viterbi approach. ### Parameters ### The default settings (in brackets, below) are those used for Matthias Mauch's 2010 MIREX submissions. * use approximate transcription (NNLS) (on or off; default: on): toggle between NNLS approximate transcription and linear spectral mapping. +* Viterbi decoding (on or off; default: on): uses HMM/Viterbi smoothing. Otherwise: heuristic chord change smoothing. * spectral roll on (0.00 -- 0.05; default: 0.0): consider the cumulative energy spectrum (from low to high frequencies). All bins below the first bin whose cumulative energy exceeds [spectral roll on] x [total energy] will be set to 0. A value of 0 means that no bins will be changed. * tuning mode (global or local; default: global): local uses a local average for tuning, global uses ... exactly. 
* spectral whitening (0.0 -- 1.0; default: 1.0): determines how much the log-frequency spectrum is whitened. A value of 0.0 means no whitening. For values other than 0.0 the log-freq spectral bins are divided by [standard deviation of their neighbours]^[spectral whitening], where "^" means "to the power of". @@ -55,5 +54,8 @@ ### Outputs ### -### References and Credits ### +* Chord Estimate: estimated chord times and labels. +* Harmonic Change Value: an indication of the likelihood of harmonic change. Depends on the chord dictionary. Calculation is different depending on whether the Viterbi algorithm is used for chord estimation, or the simple chord estimate. +## References and Credits ## +
--- a/viterbi.cpp Mon Oct 25 16:58:32 2010 +0900 +++ b/viterbi.cpp Mon Oct 25 18:34:44 2010 +0900 @@ -2,7 +2,7 @@ #include "viterbi.h" #include <iostream> -std::vector<int> ViterbiPath(std::vector<double> init, std::vector<vector<double> > trans, std::vector<vector<double> > obs) { +std::vector<int> ViterbiPath(std::vector<double> init, std::vector<vector<double> > trans, std::vector<vector<double> > obs, double *delta) { int nState = init.size(); int nFrame = obs.size(); @@ -12,8 +12,13 @@ if (trans[0].size() != nState || trans.size() != nState || obs[0].size() != nState) { cerr << "ERROR: matrix sizes inconsistent." << endl; } + + for (int iState = 0; iState < nState; ++iState) delta[iState] = init[iState]; + for (int iFrame = 1; iFrame < nFrame; ++iFrame) { + for (int iState = 0; iState < nState; ++iState) delta[iFrame*nState + iState]; + } - vector<vector<double> > delta; // "matrix" of conditional probabilities + // vector<vector<double> > delta; // "matrix" of conditional probabilities vector<vector<int> > psi; // "matrix" of remembered indices of the best transitions vector<int> path = vector<int>(nFrame, nState-1); // the final output path (current assignment arbitrary, makes sense only for Chordino, where nChord-1 is the "no chord" label) vector<double> scale = vector<double>(nFrame, 0); // remembers by how much the vectors in delta are scaled. 
@@ -21,18 +26,18 @@ double deltasum = 0; /* initialise first frame */ - delta.push_back(init); + // delta.push_back(init); for (int iState = 0; iState < nState; ++iState) { - delta[0][iState] *= obs[0][iState]; - deltasum += delta[0][iState]; + delta[iState] *= obs[0][iState]; + deltasum += delta[iState]; } - for (int iState = 0; iState < nState; ++iState) delta[0][iState] /= deltasum; // normalise (scale) + for (int iState = 0; iState < nState; ++iState) delta[iState] /= deltasum; // normalise (scale) scale.push_back(1.0/deltasum); psi.push_back(vector<int>(nState,0)); /* rest of the forward step */ for (int iFrame = 1; iFrame < nFrame; ++iFrame) { - delta.push_back(vector<double>(nState,0)); + // delta.push_back(vector<double>(nState,0)); deltasum = 0; psi.push_back(vector<int>(nState,0)); /* every state wants to know which previous state suits him best */ @@ -41,7 +46,7 @@ double bestValue = 0; if (obs[iFrame][jState] > 0) { for (int iState = 0; iState < nState; ++iState) { - double currentValue = delta[iFrame-1][iState] * trans[iState][jState]; + double currentValue = delta[(iFrame-1) * nState + iState] * trans[iState][jState]; if (currentValue > bestValue) { bestValue = currentValue; bestState = iState; @@ -49,18 +54,18 @@ } } // cerr << bestState <<" ::: " << bestValue << endl ; - delta[iFrame][jState] = bestValue * obs[iFrame][jState]; - deltasum += delta[iFrame][jState]; + delta[iFrame * nState + jState] = bestValue * obs[iFrame][jState]; + deltasum += delta[iFrame * nState + jState]; psi[iFrame][jState] = bestState; } if (deltasum > 0) { for (int iState = 0; iState < nState; ++iState) { - delta[iFrame][iState] /= deltasum; // normalise (scale) + delta[iFrame * nState + iState] /= deltasum; // normalise (scale) } scale.push_back(1.0/deltasum); } else { for (int iState = 0; iState < nState; ++iState) { - delta[iFrame][iState] = 1.0/nState; + delta[iFrame * nState + iState] = 1.0/nState; } scale.push_back(1.0); } @@ -70,7 +75,7 @@ /* initialise backward step 
*/ int bestValue = 0; for (int iState = 0; iState < nState; ++iState) { - double currentValue = delta[nFrame-1][iState]; + double currentValue = delta[(nFrame-1) * nState + iState]; if (currentValue > path[nFrame-1]) { bestValue = currentValue; path[nFrame-1] = iState;
--- a/viterbi.h Mon Oct 25 16:58:32 2010 +0900 +++ b/viterbi.h Mon Oct 25 18:34:44 2010 +0900 @@ -23,6 +23,6 @@ #include <string> using namespace std; -extern std::vector<int> ViterbiPath(std::vector<double> init, std::vector<vector<double> > trans, std::vector<vector<double> > obs); +extern std::vector<int> ViterbiPath(std::vector<double> init, std::vector<vector<double> > trans, std::vector<vector<double> > obs, double *delta); #endif \ No newline at end of file