Chris@23
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
matthiasm@0
|
2
|
Chris@35
|
3 /*
|
Chris@35
|
4 NNLS-Chroma / Chordino
|
Chris@35
|
5
|
Chris@35
|
6 Audio feature extraction plugins for chromagram and chord
|
Chris@35
|
7 estimation.
|
Chris@35
|
8
|
Chris@35
|
9 Centre for Digital Music, Queen Mary University of London.
|
Chris@35
|
10 This file copyright 2008-2010 Matthias Mauch and QMUL.
|
Chris@35
|
11
|
Chris@35
|
12 This program is free software; you can redistribute it and/or
|
Chris@35
|
13 modify it under the terms of the GNU General Public License as
|
Chris@35
|
14 published by the Free Software Foundation; either version 2 of the
|
Chris@35
|
15 License, or (at your option) any later version. See the file
|
Chris@35
|
16 COPYING included with this distribution for more information.
|
Chris@35
|
17 */
|
Chris@35
|
18
|
Chris@35
|
19 #include "NNLSBase.h"
|
Chris@27
|
20
|
Chris@27
|
21 #include "chromamethods.h"
|
Chris@27
|
22
|
Chris@27
|
23 #include <cstdlib>
|
Chris@27
|
24 #include <fstream>
|
matthiasm@0
|
25 #include <cmath>
|
matthiasm@9
|
26
|
Chris@27
|
27 #include <algorithm>
|
matthiasm@0
|
28
|
matthiasm@0
|
29 const bool debug_on = false;
|
matthiasm@0
|
30
|
Chris@27
|
31 const vector<float> hw(hammingwind, hammingwind+19);
|
matthiasm@0
|
32
|
Chris@35
|
33 NNLSBase::NNLSBase(float inputSampleRate) :
|
Chris@23
|
34 Plugin(inputSampleRate),
|
Chris@35
|
35 m_logSpectrum(0),
|
Chris@23
|
36 m_blockSize(0),
|
Chris@23
|
37 m_stepSize(0),
|
Chris@23
|
38 m_lengthOfNoteIndex(0),
|
Chris@23
|
39 m_meanTuning0(0),
|
Chris@23
|
40 m_meanTuning1(0),
|
Chris@23
|
41 m_meanTuning2(0),
|
Chris@23
|
42 m_localTuning0(0),
|
Chris@23
|
43 m_localTuning1(0),
|
Chris@23
|
44 m_localTuning2(0),
|
mail@41
|
45 m_whitening(1.0),
|
Chris@23
|
46 m_preset(0.0),
|
Chris@23
|
47 m_localTuning(0),
|
Chris@23
|
48 m_kernelValue(0),
|
Chris@23
|
49 m_kernelFftIndex(0),
|
Chris@23
|
50 m_kernelNoteIndex(0),
|
Chris@23
|
51 m_dict(0),
|
Chris@23
|
52 m_tuneLocal(false),
|
Chris@23
|
53 m_chorddict(0),
|
Chris@23
|
54 m_chordnames(0),
|
Chris@23
|
55 m_doNormalizeChroma(0),
|
mail@41
|
56 m_rollon(0.0),
|
matthiasm@42
|
57 m_s(0.7),
|
matthiasm@50
|
58 m_useNNLS(1),
|
matthiasm@50
|
59 m_useHMM(1)
|
matthiasm@0
|
60 {
|
Chris@35
|
61 if (debug_on) cerr << "--> NNLSBase" << endl;
|
matthiasm@7
|
62
|
Chris@23
|
63 // make the *note* dictionary matrix
|
Chris@23
|
64 m_dict = new float[nNote * 84];
|
Chris@23
|
65 for (unsigned i = 0; i < nNote * 84; ++i) m_dict[i] = 0.0;
|
mail@41
|
66 dictionaryMatrix(m_dict, 0.7);
|
matthiasm@7
|
67
|
Chris@23
|
68 // get the *chord* dictionary from file (if the file exists)
|
Chris@23
|
69 m_chordnames = chordDictionary(&m_chorddict);
|
matthiasm@0
|
70 }
|
matthiasm@0
|
71
|
matthiasm@0
|
72
|
Chris@35
|
73 NNLSBase::~NNLSBase()
|
matthiasm@0
|
74 {
|
Chris@35
|
75 if (debug_on) cerr << "--> ~NNLSBase" << endl;
|
Chris@23
|
76 delete [] m_dict;
|
matthiasm@0
|
77 }
|
matthiasm@0
|
78
|
matthiasm@0
|
79 string
|
Chris@35
|
80 NNLSBase::getMaker() const
|
matthiasm@0
|
81 {
|
Chris@23
|
82 if (debug_on) cerr << "--> getMaker" << endl;
|
matthiasm@0
|
83 // Your name here
|
matthiasm@0
|
84 return "Matthias Mauch";
|
matthiasm@0
|
85 }
|
matthiasm@0
|
86
|
matthiasm@0
|
87 int
|
Chris@35
|
88 NNLSBase::getPluginVersion() const
|
matthiasm@0
|
89 {
|
Chris@23
|
90 if (debug_on) cerr << "--> getPluginVersion" << endl;
|
matthiasm@0
|
91 // Increment this each time you release a version that behaves
|
matthiasm@0
|
92 // differently from the previous one
|
matthiasm@0
|
93 return 1;
|
matthiasm@0
|
94 }
|
matthiasm@0
|
95
|
matthiasm@0
|
96 string
|
Chris@35
|
97 NNLSBase::getCopyright() const
|
matthiasm@0
|
98 {
|
Chris@23
|
99 if (debug_on) cerr << "--> getCopyright" << endl;
|
matthiasm@0
|
100 // This function is not ideally named. It does not necessarily
|
matthiasm@0
|
101 // need to say who made the plugin -- getMaker does that -- but it
|
matthiasm@0
|
102 // should indicate the terms under which it is distributed. For
|
matthiasm@0
|
103 // example, "Copyright (year). All Rights Reserved", or "GPL"
|
Chris@35
|
104 return "GPL";
|
matthiasm@0
|
105 }
|
matthiasm@0
|
106
|
Chris@35
|
107 NNLSBase::InputDomain
|
Chris@35
|
108 NNLSBase::getInputDomain() const
|
matthiasm@0
|
109 {
|
Chris@23
|
110 if (debug_on) cerr << "--> getInputDomain" << endl;
|
matthiasm@0
|
111 return FrequencyDomain;
|
matthiasm@0
|
112 }
|
matthiasm@0
|
113
|
matthiasm@0
|
114 size_t
|
Chris@35
|
115 NNLSBase::getPreferredBlockSize() const
|
matthiasm@0
|
116 {
|
Chris@23
|
117 if (debug_on) cerr << "--> getPreferredBlockSize" << endl;
|
matthiasm@0
|
118 return 16384; // 0 means "I can handle any block size"
|
matthiasm@0
|
119 }
|
matthiasm@0
|
120
|
matthiasm@0
|
121 size_t
|
Chris@35
|
122 NNLSBase::getPreferredStepSize() const
|
matthiasm@0
|
123 {
|
Chris@23
|
124 if (debug_on) cerr << "--> getPreferredStepSize" << endl;
|
matthiasm@0
|
125 return 2048; // 0 means "anything sensible"; in practice this
|
Chris@23
|
126 // means the same as the block size for TimeDomain
|
Chris@23
|
127 // plugins, or half of it for FrequencyDomain plugins
|
matthiasm@0
|
128 }
|
matthiasm@0
|
129
|
matthiasm@0
|
130 size_t
|
Chris@35
|
131 NNLSBase::getMinChannelCount() const
|
matthiasm@0
|
132 {
|
Chris@23
|
133 if (debug_on) cerr << "--> getMinChannelCount" << endl;
|
matthiasm@0
|
134 return 1;
|
matthiasm@0
|
135 }
|
matthiasm@0
|
136
|
matthiasm@0
|
137 size_t
|
Chris@35
|
138 NNLSBase::getMaxChannelCount() const
|
matthiasm@0
|
139 {
|
Chris@23
|
140 if (debug_on) cerr << "--> getMaxChannelCount" << endl;
|
matthiasm@0
|
141 return 1;
|
matthiasm@0
|
142 }
|
matthiasm@0
|
143
|
Chris@35
|
144 NNLSBase::ParameterList
|
Chris@35
|
145 NNLSBase::getParameterDescriptors() const
|
matthiasm@0
|
146 {
|
Chris@23
|
147 if (debug_on) cerr << "--> getParameterDescriptors" << endl;
|
matthiasm@0
|
148 ParameterList list;
|
matthiasm@0
|
149
|
matthiasm@42
|
150 ParameterDescriptor d;
|
matthiasm@42
|
151 d.identifier = "useNNLS";
|
matthiasm@42
|
152 d.name = "use approximate transcription (NNLS)";
|
matthiasm@42
|
153 d.description = "Toggles approximate transcription (NNLS).";
|
matthiasm@42
|
154 d.unit = "";
|
matthiasm@42
|
155 d.minValue = 0.0;
|
matthiasm@42
|
156 d.maxValue = 1.0;
|
matthiasm@42
|
157 d.defaultValue = 1.0;
|
matthiasm@42
|
158 d.isQuantized = true;
|
matthiasm@42
|
159 d.quantizeStep = 1.0;
|
matthiasm@42
|
160 list.push_back(d);
|
matthiasm@42
|
161
|
mail@41
|
162 ParameterDescriptor d0;
|
mail@41
|
163 d0.identifier = "rollon";
|
mail@41
|
164 d0.name = "spectral roll-on";
|
matthiasm@58
|
165 d0.description = "Consider the cumulative energy spectrum (from low to high frequencies). All bins below the first bin whose cumulative energy exceeds the quantile [spectral roll on] x [total energy] will be set to 0. A value of 0 means that no bins will be changed.";
|
matthiasm@59
|
166 d0.unit = "%";
|
mail@41
|
167 d0.minValue = 0;
|
matthiasm@59
|
168 d0.maxValue = 5;
|
mail@41
|
169 d0.defaultValue = 0;
|
matthiasm@48
|
170 d0.isQuantized = true;
|
matthiasm@59
|
171 d0.quantizeStep = 0.5;
|
mail@41
|
172 list.push_back(d0);
|
matthiasm@4
|
173
|
matthiasm@4
|
174 ParameterDescriptor d1;
|
matthiasm@4
|
175 d1.identifier = "tuningmode";
|
matthiasm@4
|
176 d1.name = "tuning mode";
|
matthiasm@4
|
177 d1.description = "Tuning can be performed locally or on the whole extraction segment. Local tuning is only advisable when the tuning is likely to change over the audio, for example in podcasts, or in a cappella singing.";
|
matthiasm@4
|
178 d1.unit = "";
|
matthiasm@4
|
179 d1.minValue = 0;
|
matthiasm@4
|
180 d1.maxValue = 1;
|
matthiasm@4
|
181 d1.defaultValue = 0;
|
matthiasm@4
|
182 d1.isQuantized = true;
|
matthiasm@4
|
183 d1.valueNames.push_back("global tuning");
|
matthiasm@4
|
184 d1.valueNames.push_back("local tuning");
|
matthiasm@4
|
185 d1.quantizeStep = 1.0;
|
matthiasm@4
|
186 list.push_back(d1);
|
matthiasm@4
|
187
|
mail@41
|
188 ParameterDescriptor d2;
|
mail@41
|
189 d2.identifier = "whitening";
|
mail@41
|
190 d2.name = "spectral whitening";
|
mail@41
|
191 d2.description = "Spectral whitening: no whitening - 0; whitening - 1.";
|
mail@41
|
192 d2.unit = "";
|
mail@41
|
193 d2.isQuantized = true;
|
mail@41
|
194 d2.minValue = 0.0;
|
mail@41
|
195 d2.maxValue = 1.0;
|
mail@41
|
196 d2.defaultValue = 1.0;
|
mail@41
|
197 d2.isQuantized = false;
|
mail@41
|
198 list.push_back(d2);
|
mail@41
|
199
|
mail@41
|
200 ParameterDescriptor d3;
|
mail@41
|
201 d3.identifier = "s";
|
mail@41
|
202 d3.name = "spectral shape";
|
mail@41
|
203 d3.description = "Determines how individual notes in the note dictionary look: higher values mean more dominant higher harmonics.";
|
mail@41
|
204 d3.unit = "";
|
mail@41
|
205 d3.minValue = 0.5;
|
mail@41
|
206 d3.maxValue = 0.9;
|
mail@41
|
207 d3.defaultValue = 0.7;
|
mail@41
|
208 d3.isQuantized = false;
|
mail@41
|
209 list.push_back(d3);
|
mail@41
|
210
|
Chris@23
|
211 ParameterDescriptor d4;
|
matthiasm@12
|
212 d4.identifier = "chromanormalize";
|
matthiasm@12
|
213 d4.name = "chroma normalization";
|
matthiasm@12
|
214 d4.description = "How shall the chroma vector be normalized?";
|
matthiasm@12
|
215 d4.unit = "";
|
matthiasm@12
|
216 d4.minValue = 0;
|
matthiasm@13
|
217 d4.maxValue = 3;
|
matthiasm@12
|
218 d4.defaultValue = 0;
|
matthiasm@12
|
219 d4.isQuantized = true;
|
matthiasm@13
|
220 d4.valueNames.push_back("none");
|
matthiasm@13
|
221 d4.valueNames.push_back("maximum norm");
|
Chris@23
|
222 d4.valueNames.push_back("L1 norm");
|
Chris@23
|
223 d4.valueNames.push_back("L2 norm");
|
matthiasm@12
|
224 d4.quantizeStep = 1.0;
|
matthiasm@12
|
225 list.push_back(d4);
|
matthiasm@4
|
226
|
matthiasm@0
|
227 return list;
|
matthiasm@0
|
228 }
|
matthiasm@0
|
229
|
matthiasm@0
|
230 float
|
Chris@35
|
231 NNLSBase::getParameter(string identifier) const
|
matthiasm@0
|
232 {
|
Chris@23
|
233 if (debug_on) cerr << "--> getParameter" << endl;
|
matthiasm@42
|
234 if (identifier == "useNNLS") {
|
matthiasm@42
|
235 return m_useNNLS;
|
matthiasm@0
|
236 }
|
matthiasm@0
|
237
|
mail@41
|
238 if (identifier == "whitening") {
|
mail@41
|
239 return m_whitening;
|
mail@41
|
240 }
|
mail@41
|
241
|
mail@41
|
242 if (identifier == "s") {
|
mail@41
|
243 return m_s;
|
matthiasm@0
|
244 }
|
matthiasm@17
|
245
|
Chris@23
|
246 if (identifier == "rollon") {
|
matthiasm@17
|
247 return m_rollon;
|
matthiasm@17
|
248 }
|
matthiasm@0
|
249
|
matthiasm@0
|
250 if (identifier == "tuningmode") {
|
matthiasm@0
|
251 if (m_tuneLocal) {
|
matthiasm@0
|
252 return 1.0;
|
matthiasm@0
|
253 } else {
|
matthiasm@0
|
254 return 0.0;
|
matthiasm@0
|
255 }
|
matthiasm@0
|
256 }
|
Chris@23
|
257 if (identifier == "preset") {
|
Chris@23
|
258 return m_preset;
|
matthiasm@3
|
259 }
|
Chris@23
|
260 if (identifier == "chromanormalize") {
|
Chris@23
|
261 return m_doNormalizeChroma;
|
matthiasm@12
|
262 }
|
matthiasm@50
|
263
|
matthiasm@50
|
264 if (identifier == "useHMM") {
|
matthiasm@50
|
265 return m_useHMM;
|
matthiasm@50
|
266 }
|
matthiasm@50
|
267
|
matthiasm@0
|
268 return 0;
|
matthiasm@0
|
269
|
matthiasm@0
|
270 }
|
matthiasm@0
|
271
|
matthiasm@0
|
272 void
|
Chris@35
|
273 NNLSBase::setParameter(string identifier, float value)
|
matthiasm@0
|
274 {
|
Chris@23
|
275 if (debug_on) cerr << "--> setParameter" << endl;
|
matthiasm@42
|
276 if (identifier == "useNNLS") {
|
matthiasm@42
|
277 m_useNNLS = (int) value;
|
matthiasm@0
|
278 }
|
matthiasm@0
|
279
|
mail@41
|
280 if (identifier == "whitening") {
|
mail@41
|
281 m_whitening = value;
|
matthiasm@0
|
282 }
|
matthiasm@0
|
283
|
mail@41
|
284 if (identifier == "s") {
|
mail@41
|
285 m_s = value;
|
mail@41
|
286 }
|
mail@41
|
287
|
matthiasm@50
|
288 if (identifier == "useHMM") {
|
matthiasm@50
|
289 m_useHMM = value;
|
matthiasm@50
|
290 }
|
matthiasm@50
|
291
|
matthiasm@0
|
292 if (identifier == "tuningmode") {
|
matthiasm@0
|
293 m_tuneLocal = (value > 0) ? true : false;
|
matthiasm@0
|
294 // cerr << "m_tuneLocal :" << m_tuneLocal << endl;
|
matthiasm@0
|
295 }
|
matthiasm@42
|
296 // if (identifier == "preset") {
|
matthiasm@42
|
297 // m_preset = value;
|
matthiasm@42
|
298 // if (m_preset == 0.0) {
|
matthiasm@42
|
299 // m_tuneLocal = false;
|
matthiasm@42
|
300 // m_whitening = 1.0;
|
matthiasm@42
|
301 // m_dictID = 0.0;
|
matthiasm@42
|
302 // }
|
matthiasm@42
|
303 // if (m_preset == 1.0) {
|
matthiasm@42
|
304 // m_tuneLocal = false;
|
matthiasm@42
|
305 // m_whitening = 1.0;
|
matthiasm@42
|
306 // m_dictID = 1.0;
|
matthiasm@42
|
307 // }
|
matthiasm@42
|
308 // if (m_preset == 2.0) {
|
matthiasm@42
|
309 // m_tuneLocal = false;
|
matthiasm@42
|
310 // m_whitening = 0.7;
|
matthiasm@42
|
311 // m_dictID = 0.0;
|
matthiasm@42
|
312 // }
|
matthiasm@42
|
313 // }
|
Chris@23
|
314 if (identifier == "chromanormalize") {
|
Chris@23
|
315 m_doNormalizeChroma = value;
|
Chris@23
|
316 }
|
matthiasm@17
|
317
|
Chris@23
|
318 if (identifier == "rollon") {
|
Chris@23
|
319 m_rollon = value;
|
Chris@23
|
320 }
|
matthiasm@0
|
321 }
|
matthiasm@0
|
322
|
Chris@35
|
323 NNLSBase::ProgramList
|
Chris@35
|
324 NNLSBase::getPrograms() const
|
matthiasm@0
|
325 {
|
Chris@23
|
326 if (debug_on) cerr << "--> getPrograms" << endl;
|
matthiasm@0
|
327 ProgramList list;
|
matthiasm@0
|
328
|
matthiasm@0
|
329 // If you have no programs, return an empty list (or simply don't
|
matthiasm@0
|
330 // implement this function or getCurrentProgram/selectProgram)
|
matthiasm@0
|
331
|
matthiasm@0
|
332 return list;
|
matthiasm@0
|
333 }
|
matthiasm@0
|
334
|
matthiasm@0
|
335 string
|
Chris@35
|
336 NNLSBase::getCurrentProgram() const
|
matthiasm@0
|
337 {
|
Chris@23
|
338 if (debug_on) cerr << "--> getCurrentProgram" << endl;
|
matthiasm@0
|
339 return ""; // no programs
|
matthiasm@0
|
340 }
|
matthiasm@0
|
341
|
matthiasm@0
|
342 void
|
Chris@35
|
343 NNLSBase::selectProgram(string name)
|
matthiasm@0
|
344 {
|
Chris@23
|
345 if (debug_on) cerr << "--> selectProgram" << endl;
|
matthiasm@0
|
346 }
|
matthiasm@0
|
347
|
matthiasm@0
|
348
|
matthiasm@0
|
349 bool
|
Chris@35
|
350 NNLSBase::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
matthiasm@0
|
351 {
|
Chris@23
|
352 if (debug_on) {
|
Chris@23
|
353 cerr << "--> initialise";
|
Chris@23
|
354 }
|
matthiasm@1
|
355
|
matthiasm@0
|
356 if (channels < getMinChannelCount() ||
|
matthiasm@0
|
357 channels > getMaxChannelCount()) return false;
|
matthiasm@0
|
358 m_blockSize = blockSize;
|
matthiasm@0
|
359 m_stepSize = stepSize;
|
Chris@35
|
360 m_frameCount = 0;
|
Chris@23
|
361 int tempn = 256 * m_blockSize/2;
|
Chris@23
|
362 // cerr << "length of tempkernel : " << tempn << endl;
|
Chris@23
|
363 float *tempkernel;
|
matthiasm@1
|
364
|
Chris@23
|
365 tempkernel = new float[tempn];
|
matthiasm@1
|
366
|
Chris@23
|
367 logFreqMatrix(m_inputSampleRate, m_blockSize, tempkernel);
|
Chris@23
|
368 m_kernelValue.clear();
|
Chris@23
|
369 m_kernelFftIndex.clear();
|
Chris@23
|
370 m_kernelNoteIndex.clear();
|
Chris@23
|
371 int countNonzero = 0;
|
Chris@23
|
372 for (unsigned iNote = 0; iNote < nNote; ++iNote) { // I don't know if this is wise: manually making a sparse matrix
|
Chris@23
|
373 for (unsigned iFFT = 0; iFFT < blockSize/2; ++iFFT) {
|
Chris@23
|
374 if (tempkernel[iFFT + blockSize/2 * iNote] > 0) {
|
Chris@23
|
375 m_kernelValue.push_back(tempkernel[iFFT + blockSize/2 * iNote]);
|
Chris@23
|
376 if (tempkernel[iFFT + blockSize/2 * iNote] > 0) {
|
Chris@23
|
377 countNonzero++;
|
Chris@23
|
378 }
|
Chris@23
|
379 m_kernelFftIndex.push_back(iFFT);
|
Chris@23
|
380 m_kernelNoteIndex.push_back(iNote);
|
Chris@23
|
381 }
|
Chris@23
|
382 }
|
Chris@23
|
383 }
|
Chris@23
|
384 // cerr << "nonzero count : " << countNonzero << endl;
|
Chris@23
|
385 delete [] tempkernel;
|
Chris@35
|
386 /*
|
Chris@23
|
387 ofstream myfile;
|
Chris@23
|
388 myfile.open ("matrix.txt");
|
matthiasm@3
|
389 // myfile << "Writing this to a file.\n";
|
Chris@23
|
390 for (int i = 0; i < nNote * 84; ++i) {
|
Chris@23
|
391 myfile << m_dict[i] << endl;
|
Chris@23
|
392 }
|
matthiasm@3
|
393 myfile.close();
|
Chris@35
|
394 */
|
matthiasm@0
|
395 return true;
|
matthiasm@0
|
396 }
|
matthiasm@0
|
397
|
matthiasm@0
|
398 void
|
Chris@35
|
399 NNLSBase::reset()
|
matthiasm@0
|
400 {
|
Chris@23
|
401 if (debug_on) cerr << "--> reset";
|
matthiasm@4
|
402
|
matthiasm@0
|
403 // Clear buffers, reset stored values, etc
|
Chris@35
|
404 m_frameCount = 0;
|
matthiasm@42
|
405 // m_dictID = 0;
|
Chris@35
|
406 m_logSpectrum.clear();
|
Chris@23
|
407 m_meanTuning0 = 0;
|
Chris@23
|
408 m_meanTuning1 = 0;
|
Chris@23
|
409 m_meanTuning2 = 0;
|
Chris@23
|
410 m_localTuning0 = 0;
|
Chris@23
|
411 m_localTuning1 = 0;
|
Chris@23
|
412 m_localTuning2 = 0;
|
Chris@23
|
413 m_localTuning.clear();
|
matthiasm@0
|
414 }
|
matthiasm@0
|
415
|
Chris@35
|
416 void
|
Chris@35
|
417 NNLSBase::baseProcess(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
matthiasm@0
|
418 {
|
Chris@35
|
419 m_frameCount++;
|
Chris@23
|
420 float *magnitude = new float[m_blockSize/2];
|
matthiasm@0
|
421
|
Chris@23
|
422 const float *fbuf = inputBuffers[0];
|
Chris@23
|
423 float energysum = 0;
|
Chris@23
|
424 // make magnitude
|
Chris@23
|
425 float maxmag = -10000;
|
Chris@23
|
426 for (size_t iBin = 0; iBin < m_blockSize/2; iBin++) {
|
Chris@23
|
427 magnitude[iBin] = sqrt(fbuf[2 * iBin] * fbuf[2 * iBin] +
|
Chris@23
|
428 fbuf[2 * iBin + 1] * fbuf[2 * iBin + 1]);
|
Chris@23
|
429 if (maxmag < magnitude[iBin]) maxmag = magnitude[iBin];
|
Chris@23
|
430 if (m_rollon > 0) {
|
Chris@23
|
431 energysum += pow(magnitude[iBin],2);
|
Chris@23
|
432 }
|
Chris@23
|
433 }
|
matthiasm@14
|
434
|
Chris@23
|
435 float cumenergy = 0;
|
Chris@23
|
436 if (m_rollon > 0) {
|
Chris@23
|
437 for (size_t iBin = 2; iBin < m_blockSize/2; iBin++) {
|
Chris@23
|
438 cumenergy += pow(magnitude[iBin],2);
|
matthiasm@59
|
439 if (cumenergy < energysum * m_rollon / 100) magnitude[iBin-2] = 0;
|
Chris@23
|
440 else break;
|
Chris@23
|
441 }
|
Chris@23
|
442 }
|
matthiasm@17
|
443
|
Chris@23
|
444 if (maxmag < 2) {
|
Chris@23
|
445 // cerr << "timestamp " << timestamp << ": very low magnitude, setting magnitude to all zeros" << endl;
|
Chris@23
|
446 for (size_t iBin = 0; iBin < m_blockSize/2; iBin++) {
|
Chris@23
|
447 magnitude[iBin] = 0;
|
Chris@23
|
448 }
|
Chris@23
|
449 }
|
matthiasm@4
|
450
|
Chris@23
|
451 // note magnitude mapping using pre-calculated matrix
|
Chris@23
|
452 float *nm = new float[nNote]; // note magnitude
|
Chris@23
|
453 for (size_t iNote = 0; iNote < nNote; iNote++) {
|
Chris@23
|
454 nm[iNote] = 0; // initialise as 0
|
Chris@23
|
455 }
|
Chris@23
|
456 int binCount = 0;
|
Chris@23
|
457 for (vector<float>::iterator it = m_kernelValue.begin(); it != m_kernelValue.end(); ++it) {
|
Chris@23
|
458 // cerr << ".";
|
Chris@23
|
459 nm[m_kernelNoteIndex[binCount]] += magnitude[m_kernelFftIndex[binCount]] * m_kernelValue[binCount];
|
Chris@23
|
460 // cerr << m_kernelFftIndex[binCount] << " -- " << magnitude[m_kernelFftIndex[binCount]] << " -- "<< m_kernelValue[binCount] << endl;
|
Chris@23
|
461 binCount++;
|
Chris@23
|
462 }
|
Chris@23
|
463 // cerr << nm[20];
|
Chris@23
|
464 // cerr << endl;
|
matthiasm@0
|
465
|
matthiasm@0
|
466
|
Chris@35
|
467 float one_over_N = 1.0/m_frameCount;
|
matthiasm@0
|
468 // update means of complex tuning variables
|
Chris@35
|
469 m_meanTuning0 *= float(m_frameCount-1)*one_over_N;
|
Chris@35
|
470 m_meanTuning1 *= float(m_frameCount-1)*one_over_N;
|
Chris@35
|
471 m_meanTuning2 *= float(m_frameCount-1)*one_over_N;
|
matthiasm@0
|
472
|
matthiasm@0
|
473 for (int iTone = 0; iTone < 160; iTone = iTone + 3) {
|
matthiasm@0
|
474 m_meanTuning0 += nm[iTone + 0]*one_over_N;
|
matthiasm@0
|
475 m_meanTuning1 += nm[iTone + 1]*one_over_N;
|
matthiasm@0
|
476 m_meanTuning2 += nm[iTone + 2]*one_over_N;
|
Chris@23
|
477 float ratioOld = 0.997;
|
matthiasm@3
|
478 m_localTuning0 *= ratioOld; m_localTuning0 += nm[iTone + 0] * (1 - ratioOld);
|
matthiasm@3
|
479 m_localTuning1 *= ratioOld; m_localTuning1 += nm[iTone + 1] * (1 - ratioOld);
|
matthiasm@3
|
480 m_localTuning2 *= ratioOld; m_localTuning2 += nm[iTone + 2] * (1 - ratioOld);
|
matthiasm@0
|
481 }
|
matthiasm@0
|
482
|
matthiasm@0
|
483 // if (m_tuneLocal) {
|
Chris@23
|
484 // local tuning
|
Chris@23
|
485 float localTuningImag = sinvalue * m_localTuning1 - sinvalue * m_localTuning2;
|
Chris@23
|
486 float localTuningReal = m_localTuning0 + cosvalue * m_localTuning1 + cosvalue * m_localTuning2;
|
Chris@23
|
487 float normalisedtuning = atan2(localTuningImag, localTuningReal)/(2*M_PI);
|
Chris@23
|
488 m_localTuning.push_back(normalisedtuning);
|
matthiasm@0
|
489
|
Chris@23
|
490 Feature f1; // logfreqspec
|
Chris@23
|
491 f1.hasTimestamp = true;
|
matthiasm@0
|
492 f1.timestamp = timestamp;
|
Chris@23
|
493 for (size_t iNote = 0; iNote < nNote; iNote++) {
|
Chris@23
|
494 f1.values.push_back(nm[iNote]);
|
Chris@23
|
495 }
|
matthiasm@0
|
496
|
matthiasm@0
|
497 // deletes
|
matthiasm@0
|
498 delete[] magnitude;
|
matthiasm@0
|
499 delete[] nm;
|
matthiasm@0
|
500
|
Chris@35
|
501 m_logSpectrum.push_back(f1); // remember note magnitude
|
matthiasm@0
|
502 }
|
matthiasm@0
|
503
|
Chris@35
|
504
|
Chris@35
|
505 #ifdef NOT_DEFINED
|
Chris@35
|
506
|
Chris@35
|
507 NNLSBase::FeatureSet
|
Chris@35
|
508 NNLSBase::getRemainingFeatures()
|
matthiasm@0
|
509 {
|
Chris@23
|
510 if (debug_on) cerr << "--> getRemainingFeatures" << endl;
|
Chris@23
|
511 FeatureSet fsOut;
|
Chris@35
|
512 if (m_logSpectrum.size() == 0) return fsOut;
|
Chris@23
|
513 int nChord = m_chordnames.size();
|
Chris@23
|
514 //
|
Chris@23
|
515 /** Calculate Tuning
|
Chris@23
|
516 calculate tuning from (using the angle of the complex number defined by the
|
Chris@23
|
517 cumulative mean real and imag values)
|
Chris@23
|
518 **/
|
Chris@23
|
519 float meanTuningImag = sinvalue * m_meanTuning1 - sinvalue * m_meanTuning2;
|
Chris@23
|
520 float meanTuningReal = m_meanTuning0 + cosvalue * m_meanTuning1 + cosvalue * m_meanTuning2;
|
Chris@23
|
521 float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
|
Chris@23
|
522 float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
|
Chris@23
|
523 int intShift = floor(normalisedtuning * 3);
|
Chris@23
|
524 float intFactor = normalisedtuning * 3 - intShift; // intFactor is a really bad name for this
|
matthiasm@1
|
525
|
Chris@23
|
526 char buffer0 [50];
|
matthiasm@1
|
527
|
Chris@23
|
528 sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
|
matthiasm@1
|
529
|
Chris@23
|
530 // cerr << "normalisedtuning: " << normalisedtuning << '\n';
|
matthiasm@1
|
531
|
Chris@23
|
532 // push tuning to FeatureSet fsOut
|
Chris@23
|
533 Feature f0; // tuning
|
Chris@23
|
534 f0.hasTimestamp = true;
|
Chris@23
|
535 f0.timestamp = Vamp::RealTime::frame2RealTime(0, lrintf(m_inputSampleRate));;
|
Chris@23
|
536 f0.label = buffer0;
|
Chris@23
|
537 fsOut[0].push_back(f0);
|
matthiasm@1
|
538
|
Chris@23
|
539 /** Tune Log-Frequency Spectrogram
|
Chris@23
|
540 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to
|
Chris@23
|
541 perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
|
Chris@23
|
542 **/
|
Chris@23
|
543 cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... ";
|
matthiasm@13
|
544
|
Chris@23
|
545 float tempValue = 0;
|
Chris@23
|
546 float dbThreshold = 0; // relative to the background spectrum
|
Chris@23
|
547 float thresh = pow(10,dbThreshold/20);
|
Chris@23
|
548 // cerr << "tune local ? " << m_tuneLocal << endl;
|
Chris@23
|
549 int count = 0;
|
matthiasm@1
|
550
|
Chris@35
|
551 for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
|
Chris@23
|
552 Feature f1 = *i;
|
Chris@23
|
553 Feature f2; // tuned log-frequency spectrum
|
Chris@23
|
554 f2.hasTimestamp = true;
|
Chris@23
|
555 f2.timestamp = f1.timestamp;
|
Chris@23
|
556 f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero
|
matthiasm@1
|
557
|
Chris@23
|
558 if (m_tuneLocal) {
|
Chris@23
|
559 intShift = floor(m_localTuning[count] * 3);
|
Chris@23
|
560 intFactor = m_localTuning[count] * 3 - intShift; // intFactor is a really bad name for this
|
Chris@23
|
561 }
|
matthiasm@1
|
562
|
Chris@23
|
563 // cerr << intShift << " " << intFactor << endl;
|
matthiasm@1
|
564
|
Chris@23
|
565 for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
|
Chris@23
|
566 tempValue = f1.values[k + intShift] * (1-intFactor) + f1.values[k+intShift+1] * intFactor;
|
Chris@23
|
567 f2.values.push_back(tempValue);
|
Chris@23
|
568 }
|
matthiasm@1
|
569
|
Chris@23
|
570 f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
|
Chris@23
|
571 vector<float> runningmean = SpecialConvolution(f2.values,hw);
|
Chris@23
|
572 vector<float> runningstd;
|
Chris@23
|
573 for (int i = 0; i < 256; i++) { // first step: squared values into vector (variance)
|
Chris@23
|
574 runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
|
Chris@23
|
575 }
|
Chris@23
|
576 runningstd = SpecialConvolution(runningstd,hw); // second step convolve
|
Chris@23
|
577 for (int i = 0; i < 256; i++) {
|
Chris@23
|
578 runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
|
Chris@23
|
579 if (runningstd[i] > 0) {
|
Chris@23
|
580 // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ?
|
mail@41
|
581 // (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
Chris@23
|
582 f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
|
mail@41
|
583 (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
Chris@23
|
584 }
|
Chris@23
|
585 if (f2.values[i] < 0) {
|
Chris@23
|
586 cerr << "ERROR: negative value in logfreq spectrum" << endl;
|
Chris@23
|
587 }
|
Chris@23
|
588 }
|
Chris@23
|
589 fsOut[2].push_back(f2);
|
Chris@23
|
590 count++;
|
Chris@23
|
591 }
|
Chris@23
|
592 cerr << "done." << endl;
|
matthiasm@1
|
593
|
Chris@23
|
594 /** Semitone spectrum and chromagrams
|
Chris@23
|
595 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
|
Chris@23
|
596 is inferred using a non-negative least squares algorithm.
|
Chris@23
|
597 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
|
Chris@23
|
598 bass and treble stacked onto each other).
|
Chris@23
|
599 **/
|
matthiasm@42
|
600 if (m_useNNLS == 0) {
|
Chris@23
|
601 cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... ";
|
Chris@23
|
602 } else {
|
Chris@23
|
603 cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... ";
|
Chris@23
|
604 }
|
matthiasm@13
|
605
|
matthiasm@1
|
606
|
Chris@23
|
607 vector<vector<float> > chordogram;
|
Chris@23
|
608 vector<vector<int> > scoreChordogram;
|
Chris@23
|
609 vector<float> chordchange = vector<float>(fsOut[2].size(),0);
|
Chris@23
|
610 vector<float> oldchroma = vector<float>(12,0);
|
Chris@23
|
611 vector<float> oldbasschroma = vector<float>(12,0);
|
Chris@23
|
612 count = 0;
|
matthiasm@9
|
613
|
Chris@23
|
614 for (FeatureList::iterator it = fsOut[2].begin(); it != fsOut[2].end(); ++it) {
|
Chris@23
|
615 Feature f2 = *it; // logfreq spectrum
|
Chris@23
|
616 Feature f3; // semitone spectrum
|
Chris@23
|
617 Feature f4; // treble chromagram
|
Chris@23
|
618 Feature f5; // bass chromagram
|
Chris@23
|
619 Feature f6; // treble and bass chromagram
|
matthiasm@1
|
620
|
Chris@23
|
621 f3.hasTimestamp = true;
|
Chris@23
|
622 f3.timestamp = f2.timestamp;
|
matthiasm@1
|
623
|
Chris@23
|
624 f4.hasTimestamp = true;
|
Chris@23
|
625 f4.timestamp = f2.timestamp;
|
matthiasm@1
|
626
|
Chris@23
|
627 f5.hasTimestamp = true;
|
Chris@23
|
628 f5.timestamp = f2.timestamp;
|
matthiasm@1
|
629
|
Chris@23
|
630 f6.hasTimestamp = true;
|
Chris@23
|
631 f6.timestamp = f2.timestamp;
|
matthiasm@1
|
632
|
Chris@29
|
633 float b[256];
|
matthiasm@1
|
634
|
Chris@23
|
635 bool some_b_greater_zero = false;
|
Chris@23
|
636 float sumb = 0;
|
Chris@23
|
637 for (int i = 0; i < 256; i++) {
|
Chris@23
|
638 // b[i] = m_dict[(256 * count + i) % (256 * 84)];
|
Chris@23
|
639 b[i] = f2.values[i];
|
Chris@23
|
640 sumb += b[i];
|
Chris@23
|
641 if (b[i] > 0) {
|
Chris@23
|
642 some_b_greater_zero = true;
|
Chris@23
|
643 }
|
Chris@23
|
644 }
|
matthiasm@1
|
645
|
Chris@23
|
646 // here's where the non-negative least squares algorithm calculates the note activation x
|
matthiasm@1
|
647
|
Chris@23
|
648 vector<float> chroma = vector<float>(12, 0);
|
Chris@23
|
649 vector<float> basschroma = vector<float>(12, 0);
|
Chris@23
|
650 float currval;
|
Chris@23
|
651 unsigned iSemitone = 0;
|
matthiasm@1
|
652
|
Chris@23
|
653 if (some_b_greater_zero) {
|
matthiasm@42
|
654 if (m_useNNLS == 0) {
|
Chris@23
|
655 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
|
Chris@23
|
656 currval = 0;
|
Chris@23
|
657 currval += b[iNote + 1 + -1] * 0.5;
|
Chris@23
|
658 currval += b[iNote + 1 + 0] * 1.0;
|
Chris@23
|
659 currval += b[iNote + 1 + 1] * 0.5;
|
Chris@23
|
660 f3.values.push_back(currval);
|
Chris@23
|
661 chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
|
Chris@23
|
662 basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
|
Chris@23
|
663 iSemitone++;
|
Chris@23
|
664 }
|
matthiasm@1
|
665
|
Chris@23
|
666 } else {
|
Chris@29
|
667 float x[84+1000];
|
Chris@23
|
668 for (int i = 1; i < 1084; ++i) x[i] = 1.0;
|
Chris@23
|
669 vector<int> signifIndex;
|
Chris@23
|
670 int index=0;
|
Chris@23
|
671 sumb /= 84.0;
|
Chris@23
|
672 for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
|
Chris@23
|
673 float currval = 0;
|
Chris@23
|
674 currval += b[iNote + 1 + -1];
|
Chris@23
|
675 currval += b[iNote + 1 + 0];
|
Chris@23
|
676 currval += b[iNote + 1 + 1];
|
Chris@23
|
677 if (currval > 0) signifIndex.push_back(index);
|
Chris@23
|
678 f3.values.push_back(0); // fill the values, change later
|
Chris@23
|
679 index++;
|
Chris@23
|
680 }
|
Chris@29
|
681 float rnorm;
|
Chris@29
|
682 float w[84+1000];
|
Chris@29
|
683 float zz[84+1000];
|
Chris@23
|
684 int indx[84+1000];
|
Chris@23
|
685 int mode;
|
Chris@23
|
686 int dictsize = 256*signifIndex.size();
|
Chris@23
|
687 // cerr << "dictsize is " << dictsize << "and values size" << f3.values.size()<< endl;
|
Chris@29
|
688 float *curr_dict = new float[dictsize];
|
Chris@23
|
689 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
|
Chris@23
|
690 for (unsigned iBin = 0; iBin < 256; iBin++) {
|
Chris@23
|
691 curr_dict[iNote * 256 + iBin] = 1.0 * m_dict[signifIndex[iNote] * 256 + iBin];
|
Chris@23
|
692 }
|
Chris@23
|
693 }
|
Chris@29
|
694 nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
|
Chris@23
|
695 delete [] curr_dict;
|
Chris@23
|
696 for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
|
Chris@23
|
697 f3.values[signifIndex[iNote]] = x[iNote];
|
Chris@23
|
698 // cerr << mode << endl;
|
Chris@23
|
699 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
|
Chris@23
|
700 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
|
Chris@23
|
701 }
|
Chris@23
|
702 }
|
Chris@23
|
703 }
|
matthiasm@13
|
704
|
matthiasm@10
|
705
|
matthiasm@12
|
706
|
matthiasm@13
|
707
|
Chris@23
|
708 f4.values = chroma;
|
Chris@23
|
709 f5.values = basschroma;
|
Chris@23
|
710 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
|
Chris@23
|
711 f6.values = chroma;
|
matthiasm@1
|
712
|
Chris@23
|
713 if (m_doNormalizeChroma > 0) {
|
Chris@23
|
714 vector<float> chromanorm = vector<float>(3,0);
|
Chris@23
|
715 switch (int(m_doNormalizeChroma)) {
|
Chris@23
|
716 case 0: // should never end up here
|
Chris@23
|
717 break;
|
Chris@23
|
718 case 1:
|
Chris@23
|
719 chromanorm[0] = *max_element(f4.values.begin(), f4.values.end());
|
Chris@23
|
720 chromanorm[1] = *max_element(f5.values.begin(), f5.values.end());
|
Chris@23
|
721 chromanorm[2] = max(chromanorm[0], chromanorm[1]);
|
Chris@23
|
722 break;
|
Chris@23
|
723 case 2:
|
Chris@23
|
724 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
|
Chris@23
|
725 chromanorm[0] += *it;
|
Chris@23
|
726 }
|
Chris@23
|
727 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
|
Chris@23
|
728 chromanorm[1] += *it;
|
Chris@23
|
729 }
|
Chris@23
|
730 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
|
Chris@23
|
731 chromanorm[2] += *it;
|
Chris@23
|
732 }
|
Chris@23
|
733 break;
|
Chris@23
|
734 case 3:
|
Chris@23
|
735 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
|
Chris@23
|
736 chromanorm[0] += pow(*it,2);
|
Chris@23
|
737 }
|
Chris@23
|
738 chromanorm[0] = sqrt(chromanorm[0]);
|
Chris@23
|
739 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
|
Chris@23
|
740 chromanorm[1] += pow(*it,2);
|
Chris@23
|
741 }
|
Chris@23
|
742 chromanorm[1] = sqrt(chromanorm[1]);
|
Chris@23
|
743 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
|
Chris@23
|
744 chromanorm[2] += pow(*it,2);
|
Chris@23
|
745 }
|
Chris@23
|
746 chromanorm[2] = sqrt(chromanorm[2]);
|
Chris@23
|
747 break;
|
Chris@23
|
748 }
|
Chris@23
|
749 if (chromanorm[0] > 0) {
|
Chris@23
|
750 for (int i = 0; i < f4.values.size(); i++) {
|
Chris@23
|
751 f4.values[i] /= chromanorm[0];
|
Chris@23
|
752 }
|
Chris@23
|
753 }
|
Chris@23
|
754 if (chromanorm[1] > 0) {
|
Chris@23
|
755 for (int i = 0; i < f5.values.size(); i++) {
|
Chris@23
|
756 f5.values[i] /= chromanorm[1];
|
Chris@23
|
757 }
|
Chris@23
|
758 }
|
Chris@23
|
759 if (chromanorm[2] > 0) {
|
Chris@23
|
760 for (int i = 0; i < f6.values.size(); i++) {
|
Chris@23
|
761 f6.values[i] /= chromanorm[2];
|
Chris@23
|
762 }
|
Chris@23
|
763 }
|
matthiasm@13
|
764
|
Chris@23
|
765 }
|
matthiasm@13
|
766
|
Chris@23
|
767 // local chord estimation
|
Chris@23
|
768 vector<float> currentChordSalience;
|
Chris@23
|
769 float tempchordvalue = 0;
|
Chris@23
|
770 float sumchordvalue = 0;
|
matthiasm@9
|
771
|
Chris@23
|
772 for (int iChord = 0; iChord < nChord; iChord++) {
|
Chris@23
|
773 tempchordvalue = 0;
|
Chris@23
|
774 for (int iBin = 0; iBin < 12; iBin++) {
|
Chris@23
|
775 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
|
Chris@23
|
776 }
|
Chris@23
|
777 for (int iBin = 12; iBin < 24; iBin++) {
|
Chris@23
|
778 tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
|
Chris@23
|
779 }
|
Chris@23
|
780 sumchordvalue+=tempchordvalue;
|
Chris@23
|
781 currentChordSalience.push_back(tempchordvalue);
|
Chris@23
|
782 }
|
Chris@23
|
783 if (sumchordvalue > 0) {
|
Chris@23
|
784 for (int iChord = 0; iChord < nChord; iChord++) {
|
Chris@23
|
785 currentChordSalience[iChord] /= sumchordvalue;
|
Chris@23
|
786 }
|
Chris@23
|
787 } else {
|
Chris@23
|
788 currentChordSalience[nChord-1] = 1.0;
|
Chris@23
|
789 }
|
Chris@23
|
790 chordogram.push_back(currentChordSalience);
|
matthiasm@1
|
791
|
Chris@23
|
792 fsOut[3].push_back(f3);
|
Chris@23
|
793 fsOut[4].push_back(f4);
|
Chris@23
|
794 fsOut[5].push_back(f5);
|
Chris@23
|
795 fsOut[6].push_back(f6);
|
Chris@23
|
796 count++;
|
Chris@23
|
797 }
|
Chris@23
|
798 cerr << "done." << endl;
|
matthiasm@13
|
799
|
matthiasm@10
|
800
|
Chris@23
|
801 /* Simple chord estimation
|
Chris@23
|
802 I just take the local chord estimates ("currentChordSalience") and average them over time, then
|
Chris@23
|
803 take the maximum. Very simple, don't do this at home...
|
Chris@23
|
804 */
|
Chris@23
|
805 cerr << "[NNLS Chroma Plugin] Chord Estimation ... ";
|
Chris@23
|
806 count = 0;
|
Chris@23
|
807 int halfwindowlength = m_inputSampleRate / m_stepSize;
|
Chris@23
|
808 vector<int> chordSequence;
|
Chris@23
|
809 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { // initialise the score chordogram
|
Chris@23
|
810 vector<int> temp = vector<int>(nChord,0);
|
Chris@23
|
811 scoreChordogram.push_back(temp);
|
Chris@23
|
812 }
|
Chris@23
|
813 for (FeatureList::iterator it = fsOut[6].begin(); it < fsOut[6].end()-2*halfwindowlength-1; ++it) {
|
Chris@23
|
814 int startIndex = count + 1;
|
Chris@23
|
815 int endIndex = count + 2 * halfwindowlength;
|
matthiasm@10
|
816
|
Chris@23
|
817 float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);
|
matthiasm@10
|
818
|
Chris@23
|
819 vector<int> chordCandidates;
|
Chris@23
|
820 for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
|
Chris@23
|
821 // float currsum = 0;
|
Chris@23
|
822 // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
|
Chris@23
|
823 // currsum += chordogram[iFrame][iChord];
|
Chris@23
|
824 // }
|
Chris@23
|
825 // if (currsum > chordThreshold) chordCandidates.push_back(iChord);
|
Chris@23
|
826 for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
|
Chris@23
|
827 if (chordogram[iFrame][iChord] > chordThreshold) {
|
Chris@23
|
828 chordCandidates.push_back(iChord);
|
Chris@23
|
829 break;
|
Chris@23
|
830 }
|
Chris@23
|
831 }
|
Chris@23
|
832 }
|
Chris@23
|
833 chordCandidates.push_back(nChord-1);
|
Chris@23
|
834 // cerr << chordCandidates.size() << endl;
|
Chris@23
|
835
|
Chris@23
|
836 float maxval = 0; // will be the value of the most salient *chord change* in this frame
|
Chris@23
|
837 float maxindex = 0; //... and the index thereof
|
Chris@23
|
838 unsigned bestchordL = nChord-1; // index of the best "left" chord
|
Chris@23
|
839 unsigned bestchordR = nChord-1; // index of the best "right" chord
|
Chris@23
|
840
|
Chris@23
|
841 for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
|
Chris@23
|
842 // now find the max values on both sides of iWF
|
Chris@23
|
843 // left side:
|
Chris@23
|
844 float maxL = 0;
|
Chris@23
|
845 unsigned maxindL = nChord-1;
|
Chris@23
|
846 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
|
Chris@23
|
847 unsigned iChord = chordCandidates[kChord];
|
Chris@23
|
848 float currsum = 0;
|
Chris@23
|
849 for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
|
Chris@23
|
850 currsum += chordogram[count+iFrame][iChord];
|
matthiasm@10
|
851 }
|
Chris@23
|
852 if (iChord == nChord-1) currsum *= 0.8;
|
Chris@23
|
853 if (currsum > maxL) {
|
Chris@23
|
854 maxL = currsum;
|
Chris@23
|
855 maxindL = iChord;
|
Chris@23
|
856 }
|
Chris@23
|
857 }
|
Chris@23
|
858 // right side:
|
Chris@23
|
859 float maxR = 0;
|
Chris@23
|
860 unsigned maxindR = nChord-1;
|
Chris@23
|
861 for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
|
Chris@23
|
862 unsigned iChord = chordCandidates[kChord];
|
Chris@23
|
863 float currsum = 0;
|
Chris@23
|
864 for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
|
Chris@23
|
865 currsum += chordogram[count+iFrame][iChord];
|
Chris@23
|
866 }
|
Chris@23
|
867 if (iChord == nChord-1) currsum *= 0.8;
|
Chris@23
|
868 if (currsum > maxR) {
|
Chris@23
|
869 maxR = currsum;
|
Chris@23
|
870 maxindR = iChord;
|
Chris@23
|
871 }
|
Chris@23
|
872 }
|
Chris@23
|
873 if (maxL+maxR > maxval) {
|
Chris@23
|
874 maxval = maxL+maxR;
|
Chris@23
|
875 maxindex = iWF;
|
Chris@23
|
876 bestchordL = maxindL;
|
Chris@23
|
877 bestchordR = maxindR;
|
Chris@23
|
878 }
|
matthiasm@3
|
879
|
Chris@23
|
880 }
|
Chris@23
|
881 // cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
|
Chris@23
|
882 // add a score to every chord-frame-point that was part of a maximum
|
Chris@23
|
883 for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
|
Chris@23
|
884 scoreChordogram[iFrame+count][bestchordL]++;
|
Chris@23
|
885 }
|
Chris@23
|
886 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
|
Chris@23
|
887 scoreChordogram[iFrame+count][bestchordR]++;
|
Chris@23
|
888 }
|
Chris@23
|
889 if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
|
Chris@23
|
890 count++;
|
Chris@23
|
891 }
|
Chris@23
|
892 // cerr << "******* agent finished *******" << endl;
|
Chris@23
|
893 count = 0;
|
Chris@23
|
894 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
|
Chris@23
|
895 float maxval = 0; // will be the value of the most salient chord in this frame
|
Chris@23
|
896 float maxindex = 0; //... and the index thereof
|
Chris@23
|
897 for (unsigned iChord = 0; iChord < nChord; iChord++) {
|
Chris@23
|
898 if (scoreChordogram[count][iChord] > maxval) {
|
Chris@23
|
899 maxval = scoreChordogram[count][iChord];
|
Chris@23
|
900 maxindex = iChord;
|
Chris@23
|
901 // cerr << iChord << endl;
|
Chris@23
|
902 }
|
Chris@23
|
903 }
|
Chris@23
|
904 chordSequence.push_back(maxindex);
|
Chris@23
|
905 // cerr << "before modefilter, maxindex: " << maxindex << endl;
|
Chris@23
|
906 count++;
|
Chris@23
|
907 }
|
Chris@23
|
908 // cerr << "******* mode filter done *******" << endl;
|
matthiasm@10
|
909
|
matthiasm@3
|
910
|
Chris@23
|
911 // mode filter on chordSequence
|
Chris@23
|
912 count = 0;
|
Chris@23
|
913 string oldChord = "";
|
Chris@23
|
914 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
|
Chris@23
|
915 Feature f6 = *it;
|
Chris@23
|
916 Feature f7; // chord estimate
|
Chris@23
|
917 f7.hasTimestamp = true;
|
Chris@23
|
918 f7.timestamp = f6.timestamp;
|
Chris@23
|
919 Feature f8; // chord estimate
|
Chris@23
|
920 f8.hasTimestamp = true;
|
Chris@23
|
921 f8.timestamp = f6.timestamp;
|
matthiasm@17
|
922
|
Chris@23
|
923 vector<int> chordCount = vector<int>(nChord,0);
|
Chris@23
|
924 int maxChordCount = 0;
|
Chris@23
|
925 int maxChordIndex = nChord-1;
|
Chris@23
|
926 string maxChord;
|
Chris@23
|
927 int startIndex = max(count - halfwindowlength/2,0);
|
Chris@23
|
928 int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
|
Chris@23
|
929 for (int i = startIndex; i < endIndex; i++) {
|
Chris@23
|
930 chordCount[chordSequence[i]]++;
|
Chris@23
|
931 if (chordCount[chordSequence[i]] > maxChordCount) {
|
Chris@23
|
932 // cerr << "start index " << startIndex << endl;
|
Chris@23
|
933 maxChordCount++;
|
Chris@23
|
934 maxChordIndex = chordSequence[i];
|
Chris@23
|
935 maxChord = m_chordnames[maxChordIndex];
|
Chris@23
|
936 }
|
Chris@23
|
937 }
|
Chris@23
|
938 // chordSequence[count] = maxChordIndex;
|
Chris@23
|
939 // cerr << maxChordIndex << endl;
|
Chris@23
|
940 f8.values.push_back(chordchange[count]/(halfwindowlength*2));
|
Chris@23
|
941 // cerr << chordchange[count] << endl;
|
Chris@23
|
942 fsOut[9].push_back(f8);
|
Chris@23
|
943 if (oldChord != maxChord) {
|
Chris@23
|
944 oldChord = maxChord;
|
matthiasm@3
|
945
|
Chris@23
|
946 // char buffer1 [50];
|
Chris@23
|
947 // if (maxChordIndex < nChord - 1) {
|
Chris@23
|
948 // sprintf(buffer1, "%s%s", notenames[maxChordIndex % 12 + 12], chordtypes[maxChordIndex]);
|
Chris@23
|
949 // } else {
|
Chris@23
|
950 // sprintf(buffer1, "N");
|
Chris@23
|
951 // }
|
Chris@23
|
952 // f7.label = buffer1;
|
Chris@23
|
953 f7.label = m_chordnames[maxChordIndex];
|
Chris@23
|
954 fsOut[7].push_back(f7);
|
Chris@23
|
955 }
|
Chris@23
|
956 count++;
|
Chris@23
|
957 }
|
Chris@23
|
958 Feature f7; // last chord estimate
|
Chris@23
|
959 f7.hasTimestamp = true;
|
Chris@23
|
960 f7.timestamp = fsOut[6][fsOut[6].size()-1].timestamp;
|
Chris@23
|
961 f7.label = "N";
|
Chris@23
|
962 fsOut[7].push_back(f7);
|
Chris@23
|
963 cerr << "done." << endl;
|
Chris@23
|
964 // // musicity
|
Chris@23
|
965 // count = 0;
|
Chris@23
|
966 // int oldlabeltype = 0; // start value is 0, music is 1, speech is 2
|
Chris@23
|
967 // vector<float> musicityValue;
|
Chris@23
|
968 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
|
Chris@23
|
969 // Feature f4 = *it;
|
Chris@23
|
970 //
|
Chris@23
|
971 // int startIndex = max(count - musicitykernelwidth/2,0);
|
Chris@23
|
972 // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
|
Chris@23
|
973 // float chromasum = 0;
|
Chris@23
|
974 // float diffsum = 0;
|
Chris@23
|
975 // for (int k = 0; k < 12; k++) {
|
Chris@23
|
976 // for (int i = startIndex + 1; i < endIndex; i++) {
|
Chris@23
|
977 // chromasum += pow(fsOut[4][i].values[k],2);
|
Chris@23
|
978 // diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]);
|
Chris@23
|
979 // }
|
Chris@23
|
980 // }
|
Chris@23
|
981 // diffsum /= chromasum;
|
Chris@23
|
982 // musicityValue.push_back(diffsum);
|
Chris@23
|
983 // count++;
|
Chris@23
|
984 // }
|
Chris@23
|
985 //
|
Chris@23
|
986 // float musicityThreshold = 0.44;
|
Chris@23
|
987 // if (m_stepSize == 4096) {
|
Chris@23
|
988 // musicityThreshold = 0.74;
|
Chris@23
|
989 // }
|
Chris@23
|
990 // if (m_stepSize == 4410) {
|
Chris@23
|
991 // musicityThreshold = 0.77;
|
Chris@23
|
992 // }
|
Chris@23
|
993 //
|
Chris@23
|
994 // count = 0;
|
Chris@23
|
995 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
|
Chris@23
|
996 // Feature f4 = *it;
|
Chris@23
|
997 // Feature f8; // musicity
|
Chris@23
|
998 // Feature f9; // musicity segmenter
|
Chris@23
|
999 //
|
Chris@23
|
1000 // f8.hasTimestamp = true;
|
Chris@23
|
1001 // f8.timestamp = f4.timestamp;
|
Chris@23
|
1002 // f9.hasTimestamp = true;
|
Chris@23
|
1003 // f9.timestamp = f4.timestamp;
|
Chris@23
|
1004 //
|
Chris@23
|
1005 // int startIndex = max(count - musicitykernelwidth/2,0);
|
Chris@23
|
1006 // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
|
Chris@23
|
1007 // int musicityCount = 0;
|
Chris@23
|
1008 // for (int i = startIndex; i <= endIndex; i++) {
|
Chris@23
|
1009 // if (musicityValue[i] > musicityThreshold) musicityCount++;
|
Chris@23
|
1010 // }
|
Chris@23
|
1011 // bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1);
|
Chris@23
|
1012 //
|
Chris@23
|
1013 // if (isSpeech) {
|
Chris@23
|
1014 // if (oldlabeltype != 2) {
|
Chris@23
|
1015 // f9.label = "Speech";
|
Chris@23
|
1016 // fsOut[9].push_back(f9);
|
Chris@23
|
1017 // oldlabeltype = 2;
|
Chris@23
|
1018 // }
|
Chris@23
|
1019 // } else {
|
Chris@23
|
1020 // if (oldlabeltype != 1) {
|
Chris@23
|
1021 // f9.label = "Music";
|
Chris@23
|
1022 // fsOut[9].push_back(f9);
|
Chris@23
|
1023 // oldlabeltype = 1;
|
Chris@23
|
1024 // }
|
Chris@23
|
1025 // }
|
Chris@23
|
1026 // f8.values.push_back(musicityValue[count]);
|
Chris@23
|
1027 // fsOut[8].push_back(f8);
|
Chris@23
|
1028 // count++;
|
Chris@23
|
1029 // }
|
Chris@23
|
1030 return fsOut;
|
matthiasm@0
|
1031
|
matthiasm@0
|
1032 }
|
matthiasm@0
|
1033
|
Chris@35
|
1034 #endif
|