Chris@23
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
matthiasm@0
|
2
|
Chris@35
|
3 /*
|
Chris@35
|
4 NNLS-Chroma / Chordino
|
Chris@35
|
5
|
Chris@35
|
6 Audio feature extraction plugins for chromagram and chord
|
Chris@35
|
7 estimation.
|
Chris@35
|
8
|
Chris@35
|
9 Centre for Digital Music, Queen Mary University of London.
|
Chris@35
|
10 This file copyright 2008-2010 Matthias Mauch and QMUL.
|
Chris@35
|
11
|
Chris@35
|
12 This program is free software; you can redistribute it and/or
|
Chris@35
|
13 modify it under the terms of the GNU General Public License as
|
Chris@35
|
14 published by the Free Software Foundation; either version 2 of the
|
Chris@35
|
15 License, or (at your option) any later version. See the file
|
Chris@35
|
16 COPYING included with this distribution for more information.
|
Chris@35
|
17 */
|
Chris@35
|
18
|
Chris@35
|
19 #include "Chordino.h"
|
Chris@27
|
20
|
Chris@27
|
21 #include "chromamethods.h"
|
matthiasm@43
|
22 #include "viterbi.h"
|
Chris@27
|
23
|
Chris@27
|
24 #include <cstdlib>
|
Chris@27
|
25 #include <fstream>
|
matthiasm@0
|
26 #include <cmath>
|
matthiasm@9
|
27
|
Chris@27
|
28 #include <algorithm>
|
matthiasm@0
|
29
|
matthiasm@0
|
30 const bool debug_on = false;
|
matthiasm@0
|
31
|
Chris@27
|
32 const vector<float> hw(hammingwind, hammingwind+19);
|
matthiasm@0
|
33
|
Chris@35
|
34 Chordino::Chordino(float inputSampleRate) :
|
Chris@35
|
35 NNLSBase(inputSampleRate)
|
matthiasm@0
|
36 {
|
Chris@35
|
37 if (debug_on) cerr << "--> Chordino" << endl;
|
matthiasm@0
|
38 }
|
matthiasm@0
|
39
|
Chris@35
|
40 Chordino::~Chordino()
|
matthiasm@0
|
41 {
|
Chris@35
|
42 if (debug_on) cerr << "--> ~Chordino" << endl;
|
matthiasm@0
|
43 }
|
matthiasm@0
|
44
|
matthiasm@0
|
45 string
|
Chris@35
|
46 Chordino::getIdentifier() const
|
matthiasm@0
|
47 {
|
Chris@23
|
48 if (debug_on) cerr << "--> getIdentifier" << endl;
|
Chris@35
|
49 return "chordino";
|
matthiasm@0
|
50 }
|
matthiasm@0
|
51
|
matthiasm@0
|
52 string
|
Chris@35
|
53 Chordino::getName() const
|
matthiasm@0
|
54 {
|
Chris@23
|
55 if (debug_on) cerr << "--> getName" << endl;
|
Chris@35
|
56 return "Chordino";
|
matthiasm@0
|
57 }
|
matthiasm@0
|
58
|
matthiasm@0
|
59 string
|
Chris@35
|
60 Chordino::getDescription() const
|
matthiasm@0
|
61 {
|
Chris@23
|
62 if (debug_on) cerr << "--> getDescription" << endl;
|
matthiasm@58
|
63 return "Chordino provides a simple chord transcription based on NNLS Chroma (as in the NNLS Chroma plugin). Chord profiles given by the user in the file chord.dict are used to calculate frame-wise chord similarities. Two simple (non-state-of-the-art!) algorithms are available that smooth these to provide a chord transcription: a simple chord change method, and a standard HMM/Viterbi approach.";
|
matthiasm@0
|
64 }
|
matthiasm@0
|
65
|
matthiasm@50
|
66 Chordino::ParameterList
|
matthiasm@50
|
67 Chordino::getParameterDescriptors() const
|
matthiasm@50
|
68 {
|
matthiasm@50
|
69 if (debug_on) cerr << "--> getParameterDescriptors" << endl;
|
matthiasm@50
|
70 ParameterList list;
|
matthiasm@50
|
71
|
matthiasm@50
|
72 ParameterDescriptor d;
|
matthiasm@50
|
73 d.identifier = "useNNLS";
|
matthiasm@50
|
74 d.name = "use approximate transcription (NNLS)";
|
matthiasm@50
|
75 d.description = "Toggles approximate transcription (NNLS).";
|
matthiasm@50
|
76 d.unit = "";
|
matthiasm@50
|
77 d.minValue = 0.0;
|
matthiasm@50
|
78 d.maxValue = 1.0;
|
matthiasm@50
|
79 d.defaultValue = 1.0;
|
matthiasm@50
|
80 d.isQuantized = true;
|
matthiasm@50
|
81 d.quantizeStep = 1.0;
|
matthiasm@50
|
82 list.push_back(d);
|
matthiasm@50
|
83
|
matthiasm@50
|
84 ParameterDescriptor d4;
|
matthiasm@50
|
85 d4.identifier = "useHMM";
|
matthiasm@53
|
86 d4.name = "HMM (Viterbi decoding)";
|
matthiasm@50
|
87 d4.description = "Turns on Viterbi decoding (when off, the simple chord estimator is used).";
|
matthiasm@50
|
88 d4.unit = "";
|
matthiasm@50
|
89 d4.minValue = 0.0;
|
matthiasm@50
|
90 d4.maxValue = 1.0;
|
matthiasm@50
|
91 d4.defaultValue = 1.0;
|
matthiasm@50
|
92 d4.isQuantized = true;
|
matthiasm@50
|
93 d4.quantizeStep = 1.0;
|
matthiasm@50
|
94 list.push_back(d4);
|
matthiasm@50
|
95
|
matthiasm@50
|
96 ParameterDescriptor d0;
|
matthiasm@50
|
97 d0.identifier = "rollon";
|
matthiasm@50
|
98 d0.name = "spectral roll-on";
|
matthiasm@58
|
99 d0.description = "Consider the cumulative energy spectrum (from low to high frequencies). All bins below the first bin whose cumulative energy exceeds the quantile [spectral roll on] x [total energy] will be set to 0. A value of 0 means that no bins will be changed.";
|
matthiasm@59
|
100 d0.unit = "%";
|
matthiasm@50
|
101 d0.minValue = 0;
|
matthiasm@50
|
102 d0.maxValue = 0.05;
|
matthiasm@50
|
103 d0.defaultValue = 0;
|
matthiasm@50
|
104 d0.isQuantized = true;
|
matthiasm@50
|
105 d0.quantizeStep = 0.005;
|
matthiasm@50
|
106 list.push_back(d0);
|
matthiasm@50
|
107
|
matthiasm@50
|
108 ParameterDescriptor d1;
|
matthiasm@50
|
109 d1.identifier = "tuningmode";
|
matthiasm@50
|
110 d1.name = "tuning mode";
|
matthiasm@50
|
111 d1.description = "Tuning can be performed locally or on the whole extraction segment. Local tuning is only advisable when the tuning is likely to change over the audio, for example in podcasts, or in a cappella singing.";
|
matthiasm@50
|
112 d1.unit = "";
|
matthiasm@50
|
113 d1.minValue = 0;
|
matthiasm@50
|
114 d1.maxValue = 1;
|
matthiasm@50
|
115 d1.defaultValue = 0;
|
matthiasm@50
|
116 d1.isQuantized = true;
|
matthiasm@50
|
117 d1.valueNames.push_back("global tuning");
|
matthiasm@50
|
118 d1.valueNames.push_back("local tuning");
|
matthiasm@50
|
119 d1.quantizeStep = 1.0;
|
matthiasm@50
|
120 list.push_back(d1);
|
matthiasm@50
|
121
|
matthiasm@50
|
122 ParameterDescriptor d2;
|
matthiasm@50
|
123 d2.identifier = "whitening";
|
matthiasm@50
|
124 d2.name = "spectral whitening";
|
matthiasm@50
|
125 d2.description = "Spectral whitening: no whitening - 0; whitening - 1.";
|
matthiasm@50
|
126 d2.unit = "";
|
matthiasm@50
|
127 d2.isQuantized = true;
|
matthiasm@50
|
128 d2.minValue = 0.0;
|
matthiasm@50
|
129 d2.maxValue = 1.0;
|
matthiasm@50
|
130 d2.defaultValue = 1.0;
|
matthiasm@50
|
131 d2.isQuantized = false;
|
matthiasm@50
|
132 list.push_back(d2);
|
matthiasm@50
|
133
|
matthiasm@50
|
134 ParameterDescriptor d3;
|
matthiasm@50
|
135 d3.identifier = "s";
|
matthiasm@50
|
136 d3.name = "spectral shape";
|
matthiasm@50
|
137 d3.description = "Determines how individual notes in the note dictionary look: higher values mean more dominant higher harmonics.";
|
matthiasm@50
|
138 d3.unit = "";
|
matthiasm@50
|
139 d3.minValue = 0.5;
|
matthiasm@50
|
140 d3.maxValue = 0.9;
|
matthiasm@50
|
141 d3.defaultValue = 0.7;
|
matthiasm@50
|
142 d3.isQuantized = false;
|
matthiasm@50
|
143 list.push_back(d3);
|
matthiasm@50
|
144
|
matthiasm@50
|
145 // ParameterDescriptor d4;
|
matthiasm@50
|
146 // d4.identifier = "chromanormalize";
|
matthiasm@50
|
147 // d4.name = "chroma normalization";
|
matthiasm@50
|
148 // d4.description = "How shall the chroma vector be normalized?";
|
matthiasm@50
|
149 // d4.unit = "";
|
matthiasm@50
|
150 // d4.minValue = 0;
|
matthiasm@50
|
151 // d4.maxValue = 3;
|
matthiasm@50
|
152 // d4.defaultValue = 0;
|
matthiasm@50
|
153 // d4.isQuantized = true;
|
matthiasm@50
|
154 // d4.valueNames.push_back("none");
|
matthiasm@50
|
155 // d4.valueNames.push_back("maximum norm");
|
matthiasm@50
|
156 // d4.valueNames.push_back("L1 norm");
|
matthiasm@50
|
157 // d4.valueNames.push_back("L2 norm");
|
matthiasm@50
|
158 // d4.quantizeStep = 1.0;
|
matthiasm@50
|
159 // list.push_back(d4);
|
matthiasm@50
|
160
|
matthiasm@50
|
161 return list;
|
matthiasm@50
|
162 }
|
matthiasm@50
|
163
|
Chris@35
|
164 Chordino::OutputList
|
Chris@35
|
165 Chordino::getOutputDescriptors() const
|
matthiasm@0
|
166 {
|
Chris@23
|
167 if (debug_on) cerr << "--> getOutputDescriptors" << endl;
|
matthiasm@0
|
168 OutputList list;
|
matthiasm@0
|
169
|
Chris@35
|
170 int index = 0;
|
matthiasm@0
|
171
|
matthiasm@0
|
172 OutputDescriptor d7;
|
matthiasm@0
|
173 d7.identifier = "simplechord";
|
Chris@36
|
174 d7.name = "Chord Estimate";
|
matthiasm@58
|
175 d7.description = "Estimated chord times and labels. Two simple (non-state-of-the-art!) algorithms are available that smooth these to provide a chord transcription: a simple chord change method, and a standard HMM/Viterbi approach.";
|
matthiasm@0
|
176 d7.unit = "";
|
matthiasm@0
|
177 d7.hasFixedBinCount = true;
|
matthiasm@0
|
178 d7.binCount = 0;
|
matthiasm@0
|
179 d7.hasKnownExtents = false;
|
matthiasm@0
|
180 d7.isQuantized = false;
|
matthiasm@0
|
181 d7.sampleType = OutputDescriptor::VariableSampleRate;
|
matthiasm@0
|
182 d7.hasDuration = false;
|
matthiasm@0
|
183 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
184 list.push_back(d7);
|
Chris@35
|
185 m_outputChords = index++;
|
matthiasm@0
|
186
|
Chris@23
|
187 OutputDescriptor d8;
|
mail@60
|
188 d8.identifier = "harmonicchange";
|
Chris@36
|
189 d8.name = "Harmonic Change Value";
|
matthiasm@58
|
190 d8.description = "An indication of the likelihood of harmonic change. Depends on the chord dictionary. Calculation is different depending on whether the Viterbi algorithm is used for chord estimation, or the simple chord estimate.";
|
matthiasm@17
|
191 d8.unit = "";
|
matthiasm@17
|
192 d8.hasFixedBinCount = true;
|
matthiasm@17
|
193 d8.binCount = 1;
|
mail@60
|
194 d8.hasKnownExtents = false;
|
mail@60
|
195 // d8.minValue = 0.0;
|
mail@60
|
196 // d8.maxValue = 0.999;
|
matthiasm@17
|
197 d8.isQuantized = false;
|
matthiasm@17
|
198 d8.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@17
|
199 d8.hasDuration = false;
|
matthiasm@17
|
200 // d8.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@17
|
201 list.push_back(d8);
|
Chris@35
|
202 m_outputHarmonicChange = index++;
|
matthiasm@1
|
203
|
matthiasm@0
|
204 return list;
|
matthiasm@0
|
205 }
|
matthiasm@0
|
206
|
matthiasm@0
|
207 bool
|
Chris@35
|
208 Chordino::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
matthiasm@0
|
209 {
|
Chris@23
|
210 if (debug_on) {
|
Chris@23
|
211 cerr << "--> initialise";
|
Chris@23
|
212 }
|
matthiasm@1
|
213
|
Chris@35
|
214 if (!NNLSBase::initialise(channels, stepSize, blockSize)) {
|
Chris@35
|
215 return false;
|
Chris@35
|
216 }
|
matthiasm@1
|
217
|
matthiasm@0
|
218 return true;
|
matthiasm@0
|
219 }
|
matthiasm@0
|
220
|
matthiasm@0
|
221 void
|
Chris@35
|
222 Chordino::reset()
|
matthiasm@0
|
223 {
|
Chris@23
|
224 if (debug_on) cerr << "--> reset";
|
Chris@35
|
225 NNLSBase::reset();
|
matthiasm@0
|
226 }
|
matthiasm@0
|
227
|
Chris@35
|
228 Chordino::FeatureSet
|
Chris@35
|
229 Chordino::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
matthiasm@0
|
230 {
|
Chris@23
|
231 if (debug_on) cerr << "--> process" << endl;
|
matthiasm@0
|
232
|
Chris@35
|
233 NNLSBase::baseProcess(inputBuffers, timestamp);
|
matthiasm@0
|
234
|
Chris@35
|
235 return FeatureSet();
|
matthiasm@0
|
236 }
|
matthiasm@0
|
237
|
Chris@35
|
238 Chordino::FeatureSet
|
Chris@35
|
239 Chordino::getRemainingFeatures()
|
matthiasm@0
|
240 {
|
Chris@23
|
241 if (debug_on) cerr << "--> getRemainingFeatures" << endl;
|
Chris@23
|
242 FeatureSet fsOut;
|
Chris@35
|
243 if (m_logSpectrum.size() == 0) return fsOut;
|
Chris@23
|
244 int nChord = m_chordnames.size();
|
Chris@23
|
245 //
|
Chris@23
|
246 /** Calculate Tuning
|
Chris@23
|
247 calculate tuning from (using the angle of the complex number defined by the
|
Chris@23
|
248 cumulative mean real and imag values)
|
Chris@23
|
249 **/
|
Chris@23
|
250 float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2;
|
Chris@23
|
251 float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2;
|
Chris@23
|
252 float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
|
Chris@23
|
253 float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
|
Chris@23
|
254 int intShift = floor(normalisedtuning * 3);
|
Chris@23
|
255 float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this
|
matthiasm@1
|
256
|
Chris@23
|
257 char buffer0 [50];
|
matthiasm@1
|
258
|
Chris@23
|
259 sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
|
matthiasm@1
|
260
|
matthiasm@1
|
261
|
Chris@23
|
262 /** Tune Log-Frequency Spectrogram
|
matthiasm@43
|
263 calculate a tuned log-frequency spectrogram (currentTunedSpec): use the tuning estimated above (kinda f0) to
|
matthiasm@43
|
264 perform linear interpolation on the existing log-frequency spectrogram (kinda currentLogSpectum).
|
Chris@23
|
265 **/
|
Chris@35
|
266 cerr << endl << "[Chordino Plugin] Tuning Log-Frequency Spectrogram ... ";
|
matthiasm@13
|
267
|
Chris@23
|
268 float tempValue = 0;
|
Chris@23
|
269 float dbThreshold = 0; // relative to the background spectrum
|
Chris@23
|
270 float thresh = pow(10,dbThreshold/20);
|
Chris@23
|
271 // cerr << "tune local ? " << m_tuneLocal << endl;
|
Chris@23
|
272 int count = 0;
|
matthiasm@1
|
273
|
Chris@35
|
274 FeatureList tunedSpec;
|
matthiasm@43
|
275 int nFrame = m_logSpectrum.size();
|
matthiasm@43
|
276
|
matthiasm@43
|
277 vector<Vamp::RealTime> timestamps;
|
Chris@35
|
278
|
Chris@35
|
279 for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
|
matthiasm@43
|
280 Feature currentLogSpectum = *i;
|
matthiasm@43
|
281 Feature currentTunedSpec; // tuned log-frequency spectrum
|
matthiasm@43
|
282 currentTunedSpec.hasTimestamp = true;
|
matthiasm@43
|
283 currentTunedSpec.timestamp = currentLogSpectum.timestamp;
|
matthiasm@43
|
284 timestamps.push_back(currentLogSpectum.timestamp);
|
matthiasm@43
|
285 currentTunedSpec.values.push_back(0.0); currentTunedSpec.values.push_back(0.0); // set lower edge to zero
|
matthiasm@1
|
286
|
Chris@23
|
287 if (m_tuneLocal) {
|
Chris@23
|
288 intShift = floor(m_localTuning[count] * 3);
|
Chris@23
|
289 intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this
|
Chris@23
|
290 }
|
matthiasm@1
|
291
|
Chris@23
|
292 // cerr << intShift << " " << intFactor << endl;
|
matthiasm@1
|
293
|
matthiasm@43
|
294 for (unsigned k = 2; k < currentLogSpectum.values.size() - 3; ++k) { // interpolate all inner bins
|
matthiasm@43
|
295 tempValue = currentLogSpectum.values[k + intShift] * (1-intFactor) + currentLogSpectum.values[k+intShift+1] * intFactor;
|
matthiasm@43
|
296 currentTunedSpec.values.push_back(tempValue);
|
Chris@23
|
297 }
|
matthiasm@1
|
298
|
matthiasm@43
|
299 currentTunedSpec.values.push_back(0.0); currentTunedSpec.values.push_back(0.0); currentTunedSpec.values.push_back(0.0); // upper edge
|
matthiasm@43
|
300 vector<float> runningmean = SpecialConvolution(currentTunedSpec.values,hw);
|
Chris@23
|
301 vector<float> runningstd;
|
Chris@23
|
302 for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance)
|
matthiasm@43
|
303 runningstd.push_back((currentTunedSpec.values[i] - runningmean[i]) * (currentTunedSpec.values[i] - runningmean[i]));
|
Chris@23
|
304 }
|
Chris@23
|
305 runningstd = SpecialConvolution(runningstd,hw); // second step convolve
|
Chris@23
|
306 for (int i = 0; i < 256; i++) {
|
Chris@23
|
307 runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
|
Chris@23
|
308 if (runningstd[i] > 0) {
|
matthiasm@43
|
309 // currentTunedSpec.values[i] = (currentTunedSpec.values[i] / runningmean[i]) > thresh ?
|
matthiasm@43
|
310 // (currentTunedSpec.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
matthiasm@43
|
311 currentTunedSpec.values[i] = (currentTunedSpec.values[i] - runningmean[i]) > 0 ?
|
matthiasm@43
|
312 (currentTunedSpec.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
Chris@23
|
313 }
|
matthiasm@43
|
314 if (currentTunedSpec.values[i] < 0) {
|
Chris@23
|
315 cerr << "ERROR: negative value in logfreq spectrum" << endl;
|
Chris@23
|
316 }
|
Chris@23
|
317 }
|
matthiasm@43
|
318 tunedSpec.push_back(currentTunedSpec);
|
Chris@23
|
319 count++;
|
Chris@23
|
320 }
|
Chris@23
|
321 cerr << "done." << endl;
|
matthiasm@1
|
322
|
Chris@23
|
323 /** Semitone spectrum and chromagrams
|
Chris@23
|
324 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
|
Chris@23
|
325 is inferred using a non-negative least squares algorithm.
|
Chris@23
|
326 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
|
Chris@23
|
327 bass and treble stacked onto each other).
|
Chris@23
|
328 **/
|
matthiasm@42
|
329 if (m_useNNLS == 0) {
|
Chris@35
|
330 cerr << "[Chordino Plugin] Mapping to semitone spectrum and chroma ... ";
|
Chris@23
|
331 } else {
|
Chris@35
|
332 cerr << "[Chordino Plugin] Performing NNLS and mapping to chroma ... ";
|
Chris@23
|
333 }
|
matthiasm@13
|
334
|
matthiasm@1
|
335
|
matthiasm@43
|
336 vector<vector<double> > chordogram;
|
Chris@23
|
337 vector<vector<int> > scoreChordogram;
|
Chris@35
|
338 vector<float> chordchange = vector<float>(tunedSpec.size(),0);
|
Chris@23
|
339 count = 0;
|
matthiasm@9
|
340
|
Chris@35
|
341 FeatureList chromaList;
|
matthiasm@43
|
342
|
matthiasm@43
|
343
|
Chris@35
|
344
|
Chris@35
|
345 for (FeatureList::iterator it = tunedSpec.begin(); it != tunedSpec.end(); ++it) {
|
matthiasm@43
|
346 Feature currentTunedSpec = *it; // logfreq spectrum
|
matthiasm@43
|
347 Feature currentChromas; // treble and bass chromagram
|
Chris@35
|
348
|
matthiasm@43
|
349 currentChromas.hasTimestamp = true;
|
matthiasm@43
|
350 currentChromas.timestamp = currentTunedSpec.timestamp;
|
Chris@35
|
351
|
Chris@35
|
352 float b[256];
|
matthiasm@1
|
353
|
Chris@23
|
354 bool some_b_greater_zero = false;
|
Chris@23
|
355 float sumb = 0;
|
Chris@23
|
356 for (int i = 0; i < 256; i++) {
|
Chris@23
|
357 // b[i] = m_dict[(256 * count + i) % (256 * 84)];
|
matthiasm@43
|
358 b[i] = currentTunedSpec.values[i];
|
Chris@23
|
359 sumb += b[i];
|
Chris@23
|
360 if (b[i] > 0) {
|
Chris@23
|
361 some_b_greater_zero = true;
|
Chris@23
|
362 }
|
Chris@23
|
363 }
|
matthiasm@1
|
364
|
Chris@23
|
365 // here's where the non-negative least squares algorithm calculates the note activation x
|
matthiasm@1
|
366
|
Chris@23
|
367 vector<float> chroma = vector<float>(12, 0);
|
Chris@23
|
368 vector<float> basschroma = vector<float>(12, 0);
|
Chris@23
|
369 float currval;
|
Chris@23
|
370 unsigned iSemitone = 0;
|
matthiasm@1
|
371
|
Chris@23
|
372 if (some_b_greater_zero) {
|
matthiasm@42
|
373 if (m_useNNLS == 0) {
|
Chris@23
|
374 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
|
Chris@23
|
375 currval = 0;
|
Chris@35
|
376 currval += b[iNote + 1 + -1] * 0.5;
|
Chris@35
|
377 currval += b[iNote + 1 + 0] * 1.0;
|
Chris@35
|
378 currval += b[iNote + 1 + 1] * 0.5;
|
Chris@23
|
379 chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
|
Chris@23
|
380 basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
|
Chris@23
|
381 iSemitone++;
|
Chris@23
|
382 }
|
matthiasm@1
|
383
|
Chris@23
|
384 } else {
|
Chris@35
|
385 float x[84+1000];
|
Chris@23
|
386 for (int i = 1; i < 1084; ++i) x[i] = 1.0;
|
Chris@23
|
387 vector<int> signifIndex;
|
Chris@23
|
388 int index=0;
|
Chris@23
|
389 sumb /= 84.0;
|
Chris@23
|
390 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
|
Chris@23
|
391 float currval = 0;
|
Chris@23
|
392 currval += b[iNote + 1 + -1];
|
Chris@23
|
393 currval += b[iNote + 1 + 0];
|
Chris@23
|
394 currval += b[iNote + 1 + 1];
|
Chris@23
|
395 if (currval > 0) signifIndex.push_back(index);
|
Chris@23
|
396 index++;
|
Chris@23
|
397 }
|
Chris@35
|
398 float rnorm;
|
Chris@35
|
399 float w[84+1000];
|
Chris@35
|
400 float zz[84+1000];
|
Chris@23
|
401 int indx[84+1000];
|
Chris@23
|
402 int mode;
|
Chris@23
|
403 int dictsize = 256*signifIndex.size();
|
Chris@35
|
404 float *curr_dict = new float[dictsize];
|
Chris@23
|
405 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
|
Chris@23
|
406 for (unsigned iBin = 0; iBin < 256; iBin++) {
|
Chris@23
|
407 curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin];
|
Chris@23
|
408 }
|
Chris@23
|
409 }
|
Chris@35
|
410 nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
|
Chris@23
|
411 delete [] curr_dict;
|
Chris@23
|
412 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
|
Chris@23
|
413 // cerr << mode << endl;
|
Chris@23
|
414 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
|
Chris@23
|
415 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
|
Chris@23
|
416 }
|
Chris@23
|
417 }
|
Chris@23
|
418 }
|
Chris@35
|
419
|
Chris@35
|
420 vector<float> origchroma = chroma;
|
Chris@23
|
421 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
|
matthiasm@43
|
422 currentChromas.values = chroma;
|
Chris@35
|
423
|
Chris@23
|
424 if (m_doNormalizeChroma > 0) {
|
Chris@23
|
425 vector<float> chromanorm = vector<float>(3,0);
|
Chris@23
|
426 switch (int(m_doNormalizeChroma)) {
|
Chris@23
|
427 case 0: // should never end up here
|
Chris@23
|
428 break;
|
Chris@23
|
429 case 1:
|
Chris@35
|
430 chromanorm[0] = *max_element(origchroma.begin(), origchroma.end());
|
Chris@35
|
431 chromanorm[1] = *max_element(basschroma.begin(), basschroma.end());
|
Chris@23
|
432 chromanorm[2] = max(chromanorm[0], chromanorm[1]);
|
Chris@23
|
433 break;
|
Chris@23
|
434 case 2:
|
Chris@35
|
435 for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
|
Chris@23
|
436 chromanorm[2] += *it;
|
Chris@23
|
437 }
|
Chris@23
|
438 break;
|
Chris@23
|
439 case 3:
|
Chris@35
|
440 for (vector<float>::iterator it = chroma.begin(); it != chroma.end(); ++it) {
|
Chris@23
|
441 chromanorm[2] += pow(*it,2);
|
Chris@23
|
442 }
|
Chris@23
|
443 chromanorm[2] = sqrt(chromanorm[2]);
|
Chris@23
|
444 break;
|
Chris@23
|
445 }
|
Chris@23
|
446 if (chromanorm[2] > 0) {
|
Chris@35
|
447 for (int i = 0; i < chroma.size(); i++) {
|
matthiasm@43
|
448 currentChromas.values[i] /= chromanorm[2];
|
Chris@23
|
449 }
|
Chris@23
|
450 }
|
Chris@23
|
451 }
|
Chris@35
|
452
|
matthiasm@43
|
453 chromaList.push_back(currentChromas);
|
Chris@35
|
454
|
Chris@23
|
455 // local chord estimation
|
matthiasm@43
|
456 vector<double> currentChordSalience;
|
matthiasm@43
|
457 double tempchordvalue = 0;
|
matthiasm@43
|
458 double sumchordvalue = 0;
|
matthiasm@9
|
459
|
Chris@23
|
460 for (int iChord = 0; iChord < nChord; iChord++) {
|
Chris@23
|
461 tempchordvalue = 0;
|
Chris@23
|
462 for (int iBin = 0; iBin < 12; iBin++) {
|
matthiasm@44
|
463 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
|
Chris@23
|
464 }
|
Chris@23
|
465 for (int iBin = 12; iBin < 24; iBin++) {
|
Chris@23
|
466 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
|
Chris@23
|
467 }
|
matthiasm@48
|
468 if (iChord == nChord-1) tempchordvalue *= .7;
|
matthiasm@48
|
469 if (tempchordvalue < 0) tempchordvalue = 0.0;
|
matthiasm@50
|
470 tempchordvalue = pow(1.3,tempchordvalue);
|
Chris@23
|
471 sumchordvalue+=tempchordvalue;
|
Chris@23
|
472 currentChordSalience.push_back(tempchordvalue);
|
Chris@23
|
473 }
|
Chris@23
|
474 if (sumchordvalue > 0) {
|
Chris@23
|
475 for (int iChord = 0; iChord < nChord; iChord++) {
|
Chris@23
|
476 currentChordSalience[iChord] /= sumchordvalue;
|
Chris@23
|
477 }
|
Chris@23
|
478 } else {
|
Chris@23
|
479 currentChordSalience[nChord-1] = 1.0;
|
Chris@23
|
480 }
|
Chris@23
|
481 chordogram.push_back(currentChordSalience);
|
matthiasm@1
|
482
|
Chris@23
|
483 count++;
|
Chris@23
|
484 }
|
Chris@23
|
485 cerr << "done." << endl;
|
matthiasm@13
|
486
|
matthiasm@10
|
487
|
matthiasm@50
|
488 // bool m_useHMM = true; // this will go into the chordino header file.
|
matthiasm@50
|
489 if (m_useHMM == 1.0) {
|
matthiasm@44
|
490 cerr << "[Chordino Plugin] HMM Chord Estimation ... ";
|
matthiasm@43
|
491 int oldchord = nChord-1;
|
matthiasm@48
|
492 double selftransprob = 0.99;
|
matthiasm@43
|
493
|
matthiasm@48
|
494 // vector<double> init = vector<double>(nChord,1.0/nChord);
|
matthiasm@48
|
495 vector<double> init = vector<double>(nChord,0); init[nChord-1] = 1;
|
matthiasm@48
|
496
|
matthiasm@50
|
497 double *delta;
|
matthiasm@50
|
498 delta = (double *)malloc(sizeof(double)*nFrame*nChord);
|
matthiasm@50
|
499
|
matthiasm@43
|
500 vector<vector<double> > trans;
|
matthiasm@43
|
501 for (int iChord = 0; iChord < nChord; iChord++) {
|
matthiasm@43
|
502 vector<double> temp = vector<double>(nChord,(1-selftransprob)/(nChord-1));
|
matthiasm@43
|
503 temp[iChord] = selftransprob;
|
matthiasm@43
|
504 trans.push_back(temp);
|
matthiasm@43
|
505 }
|
matthiasm@50
|
506 vector<int> chordpath = ViterbiPath(init, trans, chordogram, delta);
|
matthiasm@48
|
507
|
matthiasm@48
|
508
|
matthiasm@48
|
509 Feature chord_feature; // chord estimate
|
matthiasm@48
|
510 chord_feature.hasTimestamp = true;
|
matthiasm@48
|
511 chord_feature.timestamp = timestamps[0];
|
matthiasm@48
|
512 chord_feature.label = m_chordnames[chordpath[0]];
|
mail@60
|
513 fsOut[m_outputChords].push_back(chord_feature);
|
matthiasm@43
|
514
|
mail@60
|
515 chordchange[0] = 0;
|
matthiasm@50
|
516 for (int iFrame = 1; iFrame < chordpath.size(); ++iFrame) {
|
matthiasm@43
|
517 // cerr << chordpath[iFrame] << endl;
|
matthiasm@48
|
518 if (chordpath[iFrame] != oldchord ) {
|
matthiasm@43
|
519 Feature chord_feature; // chord estimate
|
matthiasm@43
|
520 chord_feature.hasTimestamp = true;
|
matthiasm@43
|
521 chord_feature.timestamp = timestamps[iFrame];
|
matthiasm@43
|
522 chord_feature.label = m_chordnames[chordpath[iFrame]];
|
mail@60
|
523 fsOut[m_outputChords].push_back(chord_feature);
|
matthiasm@43
|
524 oldchord = chordpath[iFrame];
|
Chris@23
|
525 }
|
matthiasm@50
|
526 /* calculating simple chord change prob */
|
matthiasm@50
|
527 for (int iChord = 0; iChord < nChord; iChord++) {
|
matthiasm@50
|
528 chordchange[iFrame-1] += delta[(iFrame-1)*nChord + iChord] * log(delta[(iFrame-1)*nChord + iChord]/delta[iFrame*nChord + iChord]);
|
matthiasm@50
|
529 }
|
Chris@23
|
530 }
|
matthiasm@43
|
531
|
matthiasm@43
|
532 // cerr << chordpath[0] << endl;
|
matthiasm@43
|
533 } else {
|
matthiasm@43
|
534 /* Simple chord estimation
|
matthiasm@43
|
535 I just take the local chord estimates ("currentChordSalience") and average them over time, then
|
matthiasm@43
|
536 take the maximum. Very simple, don't do this at home...
|
matthiasm@43
|
537 */
|
matthiasm@44
|
538 cerr << "[Chordino Plugin] Simple Chord Estimation ... ";
|
matthiasm@43
|
539 count = 0;
|
matthiasm@43
|
540 int halfwindowlength = m_inputSampleRate / m_stepSize;
|
matthiasm@43
|
541 vector<int> chordSequence;
|
matthiasm@43
|
542 for (vector<Vamp::RealTime>::iterator it = timestamps.begin(); it != timestamps.end(); ++it) { // initialise the score chordogram
|
matthiasm@43
|
543 vector<int> temp = vector<int>(nChord,0);
|
matthiasm@43
|
544 scoreChordogram.push_back(temp);
|
matthiasm@43
|
545 }
|
matthiasm@43
|
546 for (vector<Vamp::RealTime>::iterator it = timestamps.begin(); it < timestamps.end()-2*halfwindowlength-1; ++it) {
|
matthiasm@43
|
547 int startIndex = count + 1;
|
matthiasm@43
|
548 int endIndex = count + 2 * halfwindowlength;
|
matthiasm@43
|
549
|
matthiasm@43
|
550 float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);
|
matthiasm@43
|
551
|
matthiasm@43
|
552 vector<int> chordCandidates;
|
matthiasm@43
|
553 for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
|
matthiasm@43
|
554 // float currsum = 0;
|
matthiasm@43
|
555 // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
|
matthiasm@43
|
556 // currsum += chordogram[iFrame][iChord];
|
matthiasm@43
|
557 // }
|
matthiasm@43
|
558 // if (currsum > chordThreshold) chordCandidates.push_back(iChord);
|
matthiasm@43
|
559 for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
|
matthiasm@43
|
560 if (chordogram[iFrame][iChord] > chordThreshold) {
|
matthiasm@43
|
561 chordCandidates.push_back(iChord);
|
matthiasm@43
|
562 break;
|
matthiasm@43
|
563 }
|
Chris@23
|
564 }
|
Chris@23
|
565 }
|
matthiasm@43
|
566 chordCandidates.push_back(nChord-1);
|
matthiasm@43
|
567 // cerr << chordCandidates.size() << endl;
|
matthiasm@43
|
568
|
matthiasm@43
|
569 float maxval = 0; // will be the value of the most salient *chord change* in this frame
|
matthiasm@43
|
570 float maxindex = 0; //... and the index thereof
|
matthiasm@43
|
571 unsigned bestchordL = nChord-1; // index of the best "left" chord
|
matthiasm@43
|
572 unsigned bestchordR = nChord-1; // index of the best "right" chord
|
matthiasm@43
|
573
|
matthiasm@43
|
574 for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
|
matthiasm@43
|
575 // now find the max values on both sides of iWF
|
matthiasm@43
|
576 // left side:
|
matthiasm@43
|
577 float maxL = 0;
|
matthiasm@43
|
578 unsigned maxindL = nChord-1;
|
matthiasm@43
|
579 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
|
matthiasm@43
|
580 unsigned iChord = chordCandidates[kChord];
|
matthiasm@43
|
581 float currsum = 0;
|
matthiasm@43
|
582 for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
|
matthiasm@43
|
583 currsum += chordogram[count+iFrame][iChord];
|
matthiasm@43
|
584 }
|
matthiasm@43
|
585 if (iChord == nChord-1) currsum *= 0.8;
|
matthiasm@43
|
586 if (currsum > maxL) {
|
matthiasm@43
|
587 maxL = currsum;
|
matthiasm@43
|
588 maxindL = iChord;
|
matthiasm@43
|
589 }
|
matthiasm@43
|
590 }
|
matthiasm@43
|
591 // right side:
|
matthiasm@43
|
592 float maxR = 0;
|
matthiasm@43
|
593 unsigned maxindR = nChord-1;
|
matthiasm@43
|
594 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
|
matthiasm@43
|
595 unsigned iChord = chordCandidates[kChord];
|
matthiasm@43
|
596 float currsum = 0;
|
matthiasm@43
|
597 for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
|
matthiasm@43
|
598 currsum += chordogram[count+iFrame][iChord];
|
matthiasm@43
|
599 }
|
matthiasm@43
|
600 if (iChord == nChord-1) currsum *= 0.8;
|
matthiasm@43
|
601 if (currsum > maxR) {
|
matthiasm@43
|
602 maxR = currsum;
|
matthiasm@43
|
603 maxindR = iChord;
|
matthiasm@43
|
604 }
|
matthiasm@43
|
605 }
|
matthiasm@43
|
606 if (maxL+maxR > maxval) {
|
matthiasm@43
|
607 maxval = maxL+maxR;
|
matthiasm@43
|
608 maxindex = iWF;
|
matthiasm@43
|
609 bestchordL = maxindL;
|
matthiasm@43
|
610 bestchordR = maxindR;
|
matthiasm@43
|
611 }
|
matthiasm@43
|
612
|
Chris@23
|
613 }
|
matthiasm@43
|
614 // cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
|
matthiasm@43
|
615 // add a score to every chord-frame-point that was part of a maximum
|
matthiasm@43
|
616 for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
|
matthiasm@43
|
617 scoreChordogram[iFrame+count][bestchordL]++;
|
matthiasm@43
|
618 }
|
matthiasm@43
|
619 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
|
matthiasm@43
|
620 scoreChordogram[iFrame+count][bestchordR]++;
|
matthiasm@43
|
621 }
|
matthiasm@50
|
622 if (bestchordL != bestchordR) {
|
matthiasm@50
|
623 chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
|
matthiasm@50
|
624 }
|
matthiasm@43
|
625 count++;
|
Chris@23
|
626 }
|
matthiasm@43
|
627 // cerr << "******* agent finished *******" << endl;
|
matthiasm@43
|
628 count = 0;
|
matthiasm@43
|
629 for (vector<Vamp::RealTime>::iterator it = timestamps.begin(); it != timestamps.end(); ++it) {
|
matthiasm@43
|
630 float maxval = 0; // will be the value of the most salient chord in this frame
|
matthiasm@43
|
631 float maxindex = 0; //... and the index thereof
|
matthiasm@43
|
632 for (unsigned iChord = 0; iChord < nChord; iChord++) {
|
matthiasm@43
|
633 if (scoreChordogram[count][iChord] > maxval) {
|
matthiasm@43
|
634 maxval = scoreChordogram[count][iChord];
|
matthiasm@43
|
635 maxindex = iChord;
|
matthiasm@43
|
636 // cerr << iChord << endl;
|
matthiasm@43
|
637 }
|
matthiasm@43
|
638 }
|
matthiasm@43
|
639 chordSequence.push_back(maxindex);
|
matthiasm@43
|
640 count++;
|
Chris@23
|
641 }
|
matthiasm@43
|
642
|
matthiasm@43
|
643
|
matthiasm@43
|
644 // mode filter on chordSequence
|
matthiasm@43
|
645 count = 0;
|
matthiasm@43
|
646 string oldChord = "";
|
matthiasm@43
|
647 for (vector<Vamp::RealTime>::iterator it = timestamps.begin(); it != timestamps.end(); ++it) {
|
matthiasm@43
|
648 Feature chord_feature; // chord estimate
|
matthiasm@43
|
649 chord_feature.hasTimestamp = true;
|
matthiasm@43
|
650 chord_feature.timestamp = *it;
|
matthiasm@43
|
651 // Feature currentChord; // chord estimate
|
matthiasm@43
|
652 // currentChord.hasTimestamp = true;
|
matthiasm@43
|
653 // currentChord.timestamp = currentChromas.timestamp;
|
matthiasm@43
|
654
|
matthiasm@43
|
655 vector<int> chordCount = vector<int>(nChord,0);
|
matthiasm@43
|
656 int maxChordCount = 0;
|
matthiasm@43
|
657 int maxChordIndex = nChord-1;
|
matthiasm@43
|
658 string maxChord;
|
matthiasm@43
|
659 int startIndex = max(count - halfwindowlength/2,0);
|
matthiasm@43
|
660 int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
|
matthiasm@43
|
661 for (int i = startIndex; i < endIndex; i++) {
|
matthiasm@43
|
662 chordCount[chordSequence[i]]++;
|
matthiasm@43
|
663 if (chordCount[chordSequence[i]] > maxChordCount) {
|
matthiasm@43
|
664 // cerr << "start index " << startIndex << endl;
|
matthiasm@43
|
665 maxChordCount++;
|
matthiasm@43
|
666 maxChordIndex = chordSequence[i];
|
matthiasm@43
|
667 maxChord = m_chordnames[maxChordIndex];
|
matthiasm@43
|
668 }
|
matthiasm@43
|
669 }
|
matthiasm@43
|
670 // chordSequence[count] = maxChordIndex;
|
matthiasm@43
|
671 // cerr << maxChordIndex << endl;
|
matthiasm@50
|
672 // cerr << chordchange[count] << endl;
|
matthiasm@43
|
673 if (oldChord != maxChord) {
|
matthiasm@43
|
674 oldChord = maxChord;
|
matthiasm@43
|
675 chord_feature.label = m_chordnames[maxChordIndex];
|
mail@60
|
676 fsOut[m_outputChords].push_back(chord_feature);
|
matthiasm@43
|
677 }
|
matthiasm@43
|
678 count++;
|
Chris@23
|
679 }
|
Chris@23
|
680 }
|
matthiasm@43
|
681 Feature chord_feature; // last chord estimate
|
matthiasm@43
|
682 chord_feature.hasTimestamp = true;
|
matthiasm@43
|
683 chord_feature.timestamp = timestamps[timestamps.size()-1];
|
matthiasm@43
|
684 chord_feature.label = "N";
|
mail@60
|
685 fsOut[m_outputChords].push_back(chord_feature);
|
Chris@23
|
686 cerr << "done." << endl;
|
matthiasm@50
|
687
|
matthiasm@50
|
688 for (int iFrame = 0; iFrame < nFrame; iFrame++) {
|
matthiasm@50
|
689 Feature chordchange_feature;
|
matthiasm@50
|
690 chordchange_feature.hasTimestamp = true;
|
matthiasm@50
|
691 chordchange_feature.timestamp = timestamps[iFrame];
|
matthiasm@50
|
692 chordchange_feature.values.push_back(chordchange[iFrame]);
|
mail@60
|
693 // cerr << chordchange[iFrame] << endl;
|
mail@60
|
694 fsOut[m_outputHarmonicChange].push_back(chordchange_feature);
|
matthiasm@50
|
695 }
|
matthiasm@50
|
696
|
mail@60
|
697 // for (int iFrame = 0; iFrame < nFrame; iFrame++) cerr << fsOut[m_outputHarmonicChange][iFrame].values[0] << endl;
|
matthiasm@50
|
698
|
matthiasm@50
|
699
|
Chris@23
|
700 return fsOut;
|
matthiasm@0
|
701 }
|