Chris@23
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
matthiasm@0
|
2
|
Chris@35
|
3 /*
|
Chris@35
|
4 NNLS-Chroma / Chordino
|
Chris@35
|
5
|
Chris@35
|
6 Audio feature extraction plugins for chromagram and chord
|
Chris@35
|
7 estimation.
|
Chris@35
|
8
|
Chris@35
|
9 Centre for Digital Music, Queen Mary University of London.
|
Chris@35
|
10 This file copyright 2008-2010 Matthias Mauch and QMUL.
|
Chris@35
|
11
|
Chris@35
|
12 This program is free software; you can redistribute it and/or
|
Chris@35
|
13 modify it under the terms of the GNU General Public License as
|
Chris@35
|
14 published by the Free Software Foundation; either version 2 of the
|
Chris@35
|
15 License, or (at your option) any later version. See the file
|
Chris@35
|
16 COPYING included with this distribution for more information.
|
Chris@35
|
17 */
|
Chris@35
|
18
|
matthiasm@0
|
19 #include "NNLSChroma.h"
|
Chris@27
|
20
|
Chris@27
|
21 #include "chromamethods.h"
|
Chris@27
|
22
|
Chris@27
|
23 #include <cstdlib>
|
Chris@27
|
24 #include <fstream>
|
matthiasm@0
|
25 #include <cmath>
|
matthiasm@9
|
26
|
Chris@27
|
27 #include <algorithm>
|
matthiasm@0
|
28
|
matthiasm@0
|
// File-local switch for the "--> method" trace messages written to stderr.
static const bool debug_on = false;
|
matthiasm@0
|
30
|
matthiasm@0
|
31 NNLSChroma::NNLSChroma(float inputSampleRate) :
|
Chris@35
|
32 NNLSBase(inputSampleRate)
|
matthiasm@0
|
33 {
|
Chris@23
|
34 if (debug_on) cerr << "--> NNLSChroma" << endl;
|
matthiasm@0
|
35 }
|
matthiasm@0
|
36
|
matthiasm@0
|
37 NNLSChroma::~NNLSChroma()
|
matthiasm@0
|
38 {
|
Chris@23
|
39 if (debug_on) cerr << "--> ~NNLSChroma" << endl;
|
matthiasm@0
|
40 }
|
matthiasm@0
|
41
|
matthiasm@0
|
42 string
|
matthiasm@0
|
43 NNLSChroma::getIdentifier() const
|
matthiasm@0
|
44 {
|
Chris@23
|
45 if (debug_on) cerr << "--> getIdentifier" << endl;
|
matthiasm@46
|
46 return "nnls-chroma";
|
matthiasm@0
|
47 }
|
matthiasm@0
|
48
|
matthiasm@0
|
49 string
|
matthiasm@0
|
50 NNLSChroma::getName() const
|
matthiasm@0
|
51 {
|
Chris@23
|
52 if (debug_on) cerr << "--> getName" << endl;
|
matthiasm@0
|
53 return "NNLS Chroma";
|
matthiasm@0
|
54 }
|
matthiasm@0
|
55
|
matthiasm@0
|
56 string
|
matthiasm@0
|
57 NNLSChroma::getDescription() const
|
matthiasm@0
|
58 {
|
Chris@23
|
59 if (debug_on) cerr << "--> getDescription" << endl;
|
matthiasm@58
|
60 return "This plugin provides a number of features derived from a DFT-based log-frequency amplitude spectrum: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; and based on this semitone spectrum, different chroma features.";
|
matthiasm@0
|
61 }
|
matthiasm@0
|
62
|
matthiasm@0
|
63 NNLSChroma::OutputList
|
matthiasm@0
|
64 NNLSChroma::getOutputDescriptors() const
|
matthiasm@0
|
65 {
|
Chris@23
|
66 if (debug_on) cerr << "--> getOutputDescriptors" << endl;
|
matthiasm@0
|
67 OutputList list;
|
matthiasm@0
|
68
|
matthiasm@0
|
69 // Make chroma names for the binNames property
|
matthiasm@0
|
70 vector<string> chromanames;
|
matthiasm@0
|
71 vector<string> bothchromanames;
|
matthiasm@0
|
72 for (int iNote = 0; iNote < 24; iNote++) {
|
matthiasm@0
|
73 bothchromanames.push_back(notenames[iNote]);
|
matthiasm@0
|
74 if (iNote < 12) {
|
matthiasm@43
|
75 chromanames.push_back(notenames[iNote+12]);
|
matthiasm@0
|
76 }
|
matthiasm@0
|
77 }
|
matthiasm@0
|
78
|
Chris@35
|
79 int index = 0;
|
matthiasm@0
|
80
|
Chris@23
|
81 OutputDescriptor d1;
|
matthiasm@0
|
82 d1.identifier = "logfreqspec";
|
matthiasm@0
|
83 d1.name = "Log-Frequency Spectrum";
|
matthiasm@0
|
84 d1.description = "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping.";
|
matthiasm@0
|
85 d1.unit = "";
|
matthiasm@0
|
86 d1.hasFixedBinCount = true;
|
matthiasm@0
|
87 d1.binCount = nNote;
|
matthiasm@0
|
88 d1.hasKnownExtents = false;
|
matthiasm@0
|
89 d1.isQuantized = false;
|
matthiasm@0
|
90 d1.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
91 d1.hasDuration = false;
|
matthiasm@0
|
92 d1.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
93 list.push_back(d1);
|
Chris@35
|
94 m_outputLogSpec = index++;
|
matthiasm@0
|
95
|
Chris@23
|
96 OutputDescriptor d2;
|
matthiasm@0
|
97 d2.identifier = "tunedlogfreqspec";
|
matthiasm@0
|
98 d2.name = "Tuned Log-Frequency Spectrum";
|
matthiasm@0
|
99 d2.description = "A Log-Frequency Spectrum (constant Q) that is obtained by cosine filter mapping, then its tuned using the estimated tuning frequency.";
|
matthiasm@0
|
100 d2.unit = "";
|
matthiasm@0
|
101 d2.hasFixedBinCount = true;
|
mail@77
|
102 d2.binCount = nNote;
|
matthiasm@0
|
103 d2.hasKnownExtents = false;
|
matthiasm@0
|
104 d2.isQuantized = false;
|
matthiasm@0
|
105 d2.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
106 d2.hasDuration = false;
|
matthiasm@0
|
107 d2.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
108 list.push_back(d2);
|
Chris@35
|
109 m_outputTunedSpec = index++;
|
matthiasm@0
|
110
|
matthiasm@0
|
111 OutputDescriptor d3;
|
matthiasm@0
|
112 d3.identifier = "semitonespectrum";
|
matthiasm@0
|
113 d3.name = "Semitone Spectrum";
|
matthiasm@0
|
114 d3.description = "A semitone-spaced log-frequency spectrum derived from the third-of-a-semitone-spaced tuned log-frequency spectrum.";
|
matthiasm@0
|
115 d3.unit = "";
|
matthiasm@0
|
116 d3.hasFixedBinCount = true;
|
matthiasm@0
|
117 d3.binCount = 84;
|
matthiasm@0
|
118 d3.hasKnownExtents = false;
|
matthiasm@0
|
119 d3.isQuantized = false;
|
matthiasm@0
|
120 d3.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
121 d3.hasDuration = false;
|
matthiasm@0
|
122 d3.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
123 list.push_back(d3);
|
Chris@35
|
124 m_outputSemiSpec = index++;
|
matthiasm@0
|
125
|
matthiasm@0
|
126 OutputDescriptor d4;
|
matthiasm@0
|
127 d4.identifier = "chroma";
|
matthiasm@0
|
128 d4.name = "Chromagram";
|
matthiasm@58
|
129 d4.description = "Tuning-adjusted chromagram from NNLS approximate transcription, with an emphasis on the medium note range.";
|
matthiasm@0
|
130 d4.unit = "";
|
matthiasm@0
|
131 d4.hasFixedBinCount = true;
|
matthiasm@0
|
132 d4.binCount = 12;
|
matthiasm@0
|
133 d4.binNames = chromanames;
|
matthiasm@0
|
134 d4.hasKnownExtents = false;
|
matthiasm@0
|
135 d4.isQuantized = false;
|
matthiasm@0
|
136 d4.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
137 d4.hasDuration = false;
|
matthiasm@0
|
138 d4.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
139 list.push_back(d4);
|
Chris@35
|
140 m_outputChroma = index++;
|
matthiasm@0
|
141
|
matthiasm@0
|
142 OutputDescriptor d5;
|
matthiasm@0
|
143 d5.identifier = "basschroma";
|
matthiasm@0
|
144 d5.name = "Bass Chromagram";
|
matthiasm@58
|
145 d5.description = "Tuning-adjusted bass chromagram from NNLS approximate transcription, with an emphasis on the bass note range.";
|
matthiasm@0
|
146 d5.unit = "";
|
matthiasm@0
|
147 d5.hasFixedBinCount = true;
|
matthiasm@0
|
148 d5.binCount = 12;
|
matthiasm@0
|
149 d5.binNames = chromanames;
|
matthiasm@0
|
150 d5.hasKnownExtents = false;
|
matthiasm@0
|
151 d5.isQuantized = false;
|
matthiasm@0
|
152 d5.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
153 d5.hasDuration = false;
|
matthiasm@0
|
154 d5.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
155 list.push_back(d5);
|
Chris@35
|
156 m_outputBassChroma = index++;
|
matthiasm@0
|
157
|
matthiasm@0
|
158 OutputDescriptor d6;
|
matthiasm@0
|
159 d6.identifier = "bothchroma";
|
matthiasm@0
|
160 d6.name = "Chromagram and Bass Chromagram";
|
matthiasm@58
|
161 d6.description = "Tuning-adjusted chromagram and bass chromagram (stacked on top of each other) from NNLS approximate transcription.";
|
matthiasm@0
|
162 d6.unit = "";
|
matthiasm@0
|
163 d6.hasFixedBinCount = true;
|
matthiasm@0
|
164 d6.binCount = 24;
|
matthiasm@0
|
165 d6.binNames = bothchromanames;
|
matthiasm@0
|
166 d6.hasKnownExtents = false;
|
matthiasm@0
|
167 d6.isQuantized = false;
|
matthiasm@0
|
168 d6.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
169 d6.hasDuration = false;
|
matthiasm@0
|
170 d6.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
matthiasm@0
|
171 list.push_back(d6);
|
Chris@35
|
172 m_outputBothChroma = index++;
|
matthiasm@1
|
173
|
mail@83
|
174 OutputDescriptor d7;
|
mail@83
|
175 d7.identifier = "consonance";
|
mail@83
|
176 d7.name = "Consonance estimate.";
|
mail@83
|
177 d7.description = "A simple consonance value based on the convolution of a consonance profile with the semitone spectrum.";
|
mail@83
|
178 d7.unit = "";
|
mail@83
|
179 d7.hasFixedBinCount = true;
|
mail@83
|
180 d7.binCount = 1;
|
mail@83
|
181 d7.hasKnownExtents = false;
|
mail@83
|
182 d7.isQuantized = false;
|
mail@83
|
183 d7.sampleType = OutputDescriptor::FixedSampleRate;
|
mail@83
|
184 d7.hasDuration = false;
|
mail@83
|
185 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
mail@83
|
186 list.push_back(d7);
|
mail@83
|
187 m_outputConsonance = index++;
|
mail@110
|
188
|
mail@110
|
189 OutputDescriptor speechity;
|
mail@110
|
190 speechity.identifier = "speechity";
|
mail@110
|
191 speechity.name = "Speech vs music segmenter.";
|
mail@110
|
192 speechity.description = ".";
|
mail@110
|
193 speechity.unit = "";
|
mail@110
|
194 speechity.hasFixedBinCount = true;
|
mail@110
|
195 speechity.binCount = 1;
|
mail@110
|
196 speechity.hasKnownExtents = false;
|
mail@110
|
197 speechity.isQuantized = false;
|
mail@110
|
198 speechity.sampleType = OutputDescriptor::FixedSampleRate;
|
mail@110
|
199 speechity.hasDuration = false;
|
mail@110
|
200 speechity.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
mail@110
|
201 list.push_back(speechity);
|
mail@110
|
202 m_outputSpeechity = index++;
|
mail@110
|
203
|
mail@110
|
204 OutputDescriptor mssegment;
|
mail@110
|
205 mssegment.identifier = "mssegment";
|
mail@110
|
206 mssegment.name = "Speech vs music segmenter.";
|
mail@110
|
207 mssegment.description = ".";
|
mail@110
|
208 mssegment.unit = "";
|
mail@110
|
209 mssegment.hasFixedBinCount = true;
|
mail@110
|
210 mssegment.binCount = 1;
|
mail@110
|
211 mssegment.hasKnownExtents = false;
|
mail@110
|
212 mssegment.isQuantized = false;
|
mail@110
|
213 mssegment.sampleType = OutputDescriptor::FixedSampleRate;
|
mail@110
|
214 mssegment.hasDuration = false;
|
mail@110
|
215 mssegment.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
|
mail@110
|
216 list.push_back(mssegment);
|
mail@110
|
217 m_outputMssegment = index++;
|
mail@83
|
218
|
matthiasm@0
|
219 return list;
|
matthiasm@0
|
220 }
|
matthiasm@0
|
221
|
matthiasm@0
|
222
|
matthiasm@0
|
223 bool
|
matthiasm@0
|
224 NNLSChroma::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
matthiasm@0
|
225 {
|
Chris@23
|
226 if (debug_on) {
|
Chris@23
|
227 cerr << "--> initialise";
|
Chris@23
|
228 }
|
matthiasm@1
|
229
|
Chris@35
|
230 if (!NNLSBase::initialise(channels, stepSize, blockSize)) {
|
Chris@35
|
231 return false;
|
Chris@35
|
232 }
|
matthiasm@1
|
233
|
matthiasm@0
|
234 return true;
|
matthiasm@0
|
235 }
|
matthiasm@0
|
236
|
matthiasm@0
|
237 void
|
matthiasm@0
|
238 NNLSChroma::reset()
|
matthiasm@0
|
239 {
|
Chris@23
|
240 if (debug_on) cerr << "--> reset";
|
Chris@35
|
241 NNLSBase::reset();
|
matthiasm@0
|
242 }
|
matthiasm@0
|
243
|
matthiasm@0
|
244 NNLSChroma::FeatureSet
|
matthiasm@0
|
245 NNLSChroma::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
matthiasm@0
|
246 {
|
Chris@23
|
247 if (debug_on) cerr << "--> process" << endl;
|
Chris@35
|
248
|
Chris@35
|
249 NNLSBase::baseProcess(inputBuffers, timestamp);
|
matthiasm@0
|
250
|
Chris@23
|
251 FeatureSet fs;
|
Chris@35
|
252 fs[m_outputLogSpec].push_back(m_logSpectrum[m_logSpectrum.size()-1]);
|
Chris@23
|
253 return fs;
|
matthiasm@0
|
254 }
|
matthiasm@0
|
255
|
matthiasm@0
|
256 NNLSChroma::FeatureSet
|
matthiasm@0
|
257 NNLSChroma::getRemainingFeatures()
|
matthiasm@0
|
258 {
|
mail@100
|
259 static const int nConsonance = 24;
|
mail@100
|
260 float consonancepattern[nConsonance] = {0,-1,-1,1,1,1,-1,1,1,1,-1,-1,1,-1,-1,1,1,1,-1,1,1,1,-1,-1};
|
mail@100
|
261 float consonancemean = 0;
|
mail@100
|
262 for (int i = 0; i< nConsonance; ++i) {
|
mail@100
|
263 consonancemean += consonancepattern[i]/nConsonance;
|
mail@100
|
264 }
|
mail@84
|
265
|
mail@100
|
266 cerr << "consonancemean = " << consonancemean << endl;
|
mail@100
|
267
|
mail@100
|
268 for (int i = 0; i< nConsonance; ++i) {
|
mail@100
|
269 consonancepattern[i] -= consonancemean;
|
mail@100
|
270 }
|
Chris@23
|
271 if (debug_on) cerr << "--> getRemainingFeatures" << endl;
|
Chris@23
|
272 FeatureSet fsOut;
|
Chris@35
|
273 if (m_logSpectrum.size() == 0) return fsOut;
|
Chris@23
|
274 //
|
Chris@23
|
275 /** Calculate Tuning
|
Chris@23
|
276 calculate tuning from (using the angle of the complex number defined by the
|
Chris@23
|
277 cumulative mean real and imag values)
|
Chris@23
|
278 **/
|
mail@80
|
279 float meanTuningImag = 0;
|
mail@80
|
280 float meanTuningReal = 0;
|
mail@80
|
281 for (int iBPS = 0; iBPS < nBPS; ++iBPS) {
|
mail@80
|
282 meanTuningReal += m_meanTunings[iBPS] * cosvalues[iBPS];
|
mail@80
|
283 meanTuningImag += m_meanTunings[iBPS] * sinvalues[iBPS];
|
mail@80
|
284 }
|
Chris@23
|
285 float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
|
Chris@23
|
286 float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
|
Chris@23
|
287 int intShift = floor(normalisedtuning * 3);
|
mail@80
|
288 float floatShift = normalisedtuning * 3 - intShift; // floatShift is a really bad name for this
|
matthiasm@1
|
289
|
Chris@23
|
290 char buffer0 [50];
|
matthiasm@1
|
291
|
Chris@23
|
292 sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);
|
matthiasm@1
|
293
|
Chris@23
|
294 // cerr << "normalisedtuning: " << normalisedtuning << '\n';
|
matthiasm@1
|
295
|
Chris@23
|
296 /** Tune Log-Frequency Spectrogram
|
Chris@23
|
297 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to
|
Chris@23
|
298 perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
|
Chris@23
|
299 **/
|
Chris@23
|
300 cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... ";
|
matthiasm@13
|
301
|
Chris@23
|
302 float tempValue = 0;
|
Chris@23
|
303 float dbThreshold = 0; // relative to the background spectrum
|
Chris@23
|
304 float thresh = pow(10,dbThreshold/20);
|
Chris@23
|
305 // cerr << "tune local ? " << m_tuneLocal << endl;
|
Chris@23
|
306 int count = 0;
|
mail@77
|
307
|
matthiasm@1
|
308
|
Chris@35
|
309 for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
|
Chris@23
|
310 Feature f1 = *i;
|
Chris@23
|
311 Feature f2; // tuned log-frequency spectrum
|
Chris@23
|
312 f2.hasTimestamp = true;
|
Chris@23
|
313 f2.timestamp = f1.timestamp;
|
Chris@23
|
314 f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero
|
matthiasm@1
|
315
|
matthiasm@85
|
316
|
Chris@23
|
317 if (m_tuneLocal) {
|
Chris@23
|
318 intShift = floor(m_localTuning[count] * 3);
|
mail@80
|
319 floatShift = m_localTuning[count] * 3 - intShift; // floatShift is a really bad name for this
|
Chris@23
|
320 }
|
matthiasm@1
|
321
|
mail@80
|
322 // cerr << intShift << " " << floatShift << endl;
|
matthiasm@1
|
323
|
Chris@23
|
324 for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
|
mail@80
|
325 tempValue = f1.values[k + intShift] * (1-floatShift) + f1.values[k+intShift+1] * floatShift;
|
Chris@23
|
326 f2.values.push_back(tempValue);
|
Chris@23
|
327 }
|
matthiasm@1
|
328
|
Chris@23
|
329 f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
|
mail@77
|
330
|
Chris@23
|
331 vector<float> runningmean = SpecialConvolution(f2.values,hw);
|
Chris@23
|
332 vector<float> runningstd;
|
mail@77
|
333 for (int i = 0; i < nNote; i++) { // first step: squared values into vector (variance)
|
Chris@23
|
334 runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
|
Chris@23
|
335 }
|
Chris@23
|
336 runningstd = SpecialConvolution(runningstd,hw); // second step convolve
|
mail@77
|
337 for (int i = 0; i < nNote; i++) {
|
Chris@23
|
338 runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
|
Chris@23
|
339 if (runningstd[i] > 0) {
|
Chris@23
|
340 // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ?
|
mail@41
|
341 // (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
Chris@23
|
342 f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
|
mail@41
|
343 (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
|
Chris@23
|
344 }
|
Chris@23
|
345 if (f2.values[i] < 0) {
|
Chris@23
|
346 cerr << "ERROR: negative value in logfreq spectrum" << endl;
|
Chris@23
|
347 }
|
Chris@23
|
348 }
|
Chris@35
|
349 fsOut[m_outputTunedSpec].push_back(f2);
|
Chris@23
|
350 count++;
|
Chris@23
|
351 }
|
Chris@23
|
352 cerr << "done." << endl;
|
matthiasm@1
|
353
|
Chris@23
|
354 /** Semitone spectrum and chromagrams
|
Chris@23
|
355 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
|
Chris@23
|
356 is inferred using a non-negative least squares algorithm.
|
Chris@23
|
357 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
|
Chris@23
|
358 bass and treble stacked onto each other).
|
Chris@23
|
359 **/
|
matthiasm@42
|
360 if (m_useNNLS == 0) {
|
Chris@23
|
361 cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... ";
|
Chris@23
|
362 } else {
|
Chris@23
|
363 cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... ";
|
Chris@23
|
364 }
|
matthiasm@13
|
365
|
matthiasm@1
|
366
|
Chris@23
|
367 vector<float> oldchroma = vector<float>(12,0);
|
Chris@23
|
368 vector<float> oldbasschroma = vector<float>(12,0);
|
Chris@23
|
369 count = 0;
|
matthiasm@9
|
370
|
Chris@38
|
371 for (FeatureList::iterator it = fsOut[m_outputTunedSpec].begin(); it != fsOut[m_outputTunedSpec].end(); ++it) {
|
Chris@23
|
372 Feature f2 = *it; // logfreq spectrum
|
Chris@23
|
373 Feature f3; // semitone spectrum
|
Chris@23
|
374 Feature f4; // treble chromagram
|
Chris@23
|
375 Feature f5; // bass chromagram
|
Chris@23
|
376 Feature f6; // treble and bass chromagram
|
matthiasm@85
|
377 Feature consonance;
|
mail@110
|
378 Feature speechity;
|
matthiasm@85
|
379
|
Chris@23
|
380 f3.hasTimestamp = true;
|
Chris@23
|
381 f3.timestamp = f2.timestamp;
|
matthiasm@1
|
382
|
Chris@23
|
383 f4.hasTimestamp = true;
|
Chris@23
|
384 f4.timestamp = f2.timestamp;
|
matthiasm@1
|
385
|
Chris@23
|
386 f5.hasTimestamp = true;
|
Chris@23
|
387 f5.timestamp = f2.timestamp;
|
matthiasm@1
|
388
|
Chris@23
|
389 f6.hasTimestamp = true;
|
Chris@23
|
390 f6.timestamp = f2.timestamp;
|
matthiasm@1
|
391
|
matthiasm@85
|
392 consonance.hasTimestamp = true;
|
matthiasm@85
|
393 consonance.timestamp = f2.timestamp;
|
mail@110
|
394 speechity.hasTimestamp = true;
|
mail@110
|
395 speechity.timestamp = f2.timestamp;
|
matthiasm@85
|
396
|
mail@77
|
397 float b[nNote];
|
matthiasm@1
|
398
|
Chris@23
|
399 bool some_b_greater_zero = false;
|
Chris@23
|
400 float sumb = 0;
|
mail@77
|
401 for (int i = 0; i < nNote; i++) {
|
mail@77
|
402 // b[i] = m_dict[(nNote * count + i) % (nNote * 84)];
|
Chris@23
|
403 b[i] = f2.values[i];
|
Chris@23
|
404 sumb += b[i];
|
Chris@23
|
405 if (b[i] > 0) {
|
Chris@23
|
406 some_b_greater_zero = true;
|
Chris@23
|
407 }
|
Chris@23
|
408 }
|
matthiasm@1
|
409
|
Chris@23
|
410 // here's where the non-negative least squares algorithm calculates the note activation x
|
matthiasm@1
|
411
|
Chris@23
|
412 vector<float> chroma = vector<float>(12, 0);
|
Chris@23
|
413 vector<float> basschroma = vector<float>(12, 0);
|
Chris@23
|
414 float currval;
|
Chris@23
|
415 unsigned iSemitone = 0;
|
matthiasm@1
|
416
|
Chris@23
|
417 if (some_b_greater_zero) {
|
matthiasm@42
|
418 if (m_useNNLS == 0) {
|
mail@80
|
419 for (unsigned iNote = nBPS/2 + 2; iNote < nNote - nBPS/2; iNote += nBPS) {
|
Chris@23
|
420 currval = 0;
|
mail@80
|
421 for (int iBPS = -nBPS/2; iBPS < nBPS/2+1; ++iBPS) {
|
mail@80
|
422 currval += b[iNote + iBPS] * (1-abs(iBPS*1.0/(nBPS/2+1)));
|
mail@80
|
423 }
|
Chris@23
|
424 f3.values.push_back(currval);
|
Chris@23
|
425 chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
|
Chris@23
|
426 basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
|
Chris@23
|
427 iSemitone++;
|
Chris@23
|
428 }
|
matthiasm@1
|
429
|
Chris@23
|
430 } else {
|
Chris@35
|
431 float x[84+1000];
|
Chris@23
|
432 for (int i = 1; i < 1084; ++i) x[i] = 1.0;
|
Chris@23
|
433 vector<int> signifIndex;
|
Chris@23
|
434 int index=0;
|
Chris@23
|
435 sumb /= 84.0;
|
mail@80
|
436 for (unsigned iNote = nBPS/2 + 2; iNote < nNote - nBPS/2; iNote += nBPS) {
|
Chris@23
|
437 float currval = 0;
|
mail@80
|
438 for (int iBPS = -nBPS/2; iBPS < nBPS/2+1; ++iBPS) {
|
mail@80
|
439 currval += b[iNote + iBPS];
|
mail@80
|
440 }
|
Chris@23
|
441 if (currval > 0) signifIndex.push_back(index);
|
Chris@23
|
442 f3.values.push_back(0); // fill the values, change later
|
Chris@23
|
443 index++;
|
Chris@23
|
444 }
|
Chris@35
|
445 float rnorm;
|
Chris@35
|
446 float w[84+1000];
|
Chris@35
|
447 float zz[84+1000];
|
Chris@23
|
448 int indx[84+1000];
|
Chris@23
|
449 int mode;
|
mail@77
|
450 int dictsize = nNote*signifIndex.size();
|
Chris@23
|
451 // cerr << "dictsize is " << dictsize << "and values size" << f3.values.size()<< endl;
|
Chris@35
|
452 float *curr_dict = new float[dictsize];
|
Chris@91
|
453 for (int iNote = 0; iNote < (int)signifIndex.size(); ++iNote) {
|
Chris@91
|
454 for (int iBin = 0; iBin < nNote; iBin++) {
|
mail@77
|
455 curr_dict[iNote * nNote + iBin] = 1.0 * m_dict[signifIndex[iNote] * nNote + iBin];
|
Chris@23
|
456 }
|
Chris@23
|
457 }
|
Chris@35
|
458 nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
|
Chris@23
|
459 delete [] curr_dict;
|
Chris@91
|
460 for (int iNote = 0; iNote < (int)signifIndex.size(); ++iNote) {
|
Chris@23
|
461 f3.values[signifIndex[iNote]] = x[iNote];
|
Chris@23
|
462 // cerr << mode << endl;
|
Chris@23
|
463 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
|
Chris@23
|
464 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
|
Chris@23
|
465 }
|
Chris@23
|
466 }
|
matthiasm@79
|
467 } else {
|
matthiasm@79
|
468 for (int i = 0; i < 84; ++i) f3.values.push_back(0);
|
Chris@23
|
469 }
|
matthiasm@85
|
470
|
matthiasm@85
|
471 float notesum = 0;
|
matthiasm@85
|
472
|
matthiasm@85
|
473 consonance.values.push_back(0);
|
matthiasm@85
|
474 for (int iSemitone = 0; iSemitone < 84-24; ++iSemitone) {
|
mail@100
|
475 notesum += f3.values[iSemitone] * f3.values[iSemitone] * treblewindow[iSemitone] * treblewindow[iSemitone];
|
matthiasm@85
|
476 float tempconsonance = 0;
|
matthiasm@85
|
477 for (int jSemitone = 1; jSemitone < 24; ++jSemitone) {
|
mail@100
|
478 tempconsonance += f3.values[iSemitone+jSemitone] * (consonancepattern[jSemitone]) * treblewindow[iSemitone+jSemitone];
|
matthiasm@85
|
479 }
|
mail@100
|
480 consonance.values[0] += (f3.values[iSemitone] * tempconsonance * treblewindow[iSemitone]);
|
matthiasm@85
|
481 }
|
matthiasm@86
|
482 if (notesum > 0) consonance.values[0] /= notesum;
|
matthiasm@85
|
483
|
Chris@23
|
484 f4.values = chroma;
|
Chris@23
|
485 f5.values = basschroma;
|
Chris@23
|
486 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
|
Chris@23
|
487 f6.values = chroma;
|
matthiasm@1
|
488
|
Chris@23
|
489 if (m_doNormalizeChroma > 0) {
|
Chris@23
|
490 vector<float> chromanorm = vector<float>(3,0);
|
Chris@23
|
491 switch (int(m_doNormalizeChroma)) {
|
Chris@23
|
492 case 0: // should never end up here
|
Chris@23
|
493 break;
|
Chris@23
|
494 case 1:
|
Chris@23
|
495 chromanorm[0] = *max_element(f4.values.begin(), f4.values.end());
|
Chris@23
|
496 chromanorm[1] = *max_element(f5.values.begin(), f5.values.end());
|
Chris@23
|
497 chromanorm[2] = max(chromanorm[0], chromanorm[1]);
|
Chris@23
|
498 break;
|
Chris@23
|
499 case 2:
|
Chris@23
|
500 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
|
Chris@23
|
501 chromanorm[0] += *it;
|
Chris@23
|
502 }
|
Chris@23
|
503 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
|
Chris@23
|
504 chromanorm[1] += *it;
|
Chris@23
|
505 }
|
Chris@23
|
506 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
|
Chris@23
|
507 chromanorm[2] += *it;
|
Chris@23
|
508 }
|
Chris@23
|
509 break;
|
Chris@23
|
510 case 3:
|
Chris@23
|
511 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
|
Chris@23
|
512 chromanorm[0] += pow(*it,2);
|
Chris@23
|
513 }
|
Chris@23
|
514 chromanorm[0] = sqrt(chromanorm[0]);
|
Chris@23
|
515 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
|
Chris@23
|
516 chromanorm[1] += pow(*it,2);
|
Chris@23
|
517 }
|
Chris@23
|
518 chromanorm[1] = sqrt(chromanorm[1]);
|
Chris@23
|
519 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
|
Chris@23
|
520 chromanorm[2] += pow(*it,2);
|
Chris@23
|
521 }
|
Chris@23
|
522 chromanorm[2] = sqrt(chromanorm[2]);
|
Chris@23
|
523 break;
|
Chris@23
|
524 }
|
Chris@23
|
525 if (chromanorm[0] > 0) {
|
Chris@91
|
526 for (size_t i = 0; i < f4.values.size(); i++) {
|
Chris@23
|
527 f4.values[i] /= chromanorm[0];
|
Chris@23
|
528 }
|
Chris@23
|
529 }
|
Chris@23
|
530 if (chromanorm[1] > 0) {
|
Chris@91
|
531 for (size_t i = 0; i < f5.values.size(); i++) {
|
Chris@23
|
532 f5.values[i] /= chromanorm[1];
|
Chris@23
|
533 }
|
Chris@23
|
534 }
|
Chris@23
|
535 if (chromanorm[2] > 0) {
|
Chris@91
|
536 for (size_t i = 0; i < f6.values.size(); i++) {
|
Chris@23
|
537 f6.values[i] /= chromanorm[2];
|
Chris@23
|
538 }
|
Chris@23
|
539 }
|
Chris@23
|
540 }
|
mail@110
|
541 // float speechityvalue = 0;
|
mail@110
|
542 // for (int iPC = 0; iPC < 12; ++iPC) {
|
mail@110
|
543 // speechityvalue += abs(f3.values[iPC] - oldchroma[iPC]);
|
mail@110
|
544 // oldchroma[iPC] = f3.values[iPC];
|
mail@110
|
545 // }
|
mail@110
|
546 // speechity.values.push_back(speechityvalue);
|
mail@110
|
547
|
Chris@35
|
548 fsOut[m_outputSemiSpec].push_back(f3);
|
Chris@35
|
549 fsOut[m_outputChroma].push_back(f4);
|
Chris@35
|
550 fsOut[m_outputBassChroma].push_back(f5);
|
Chris@35
|
551 fsOut[m_outputBothChroma].push_back(f6);
|
matthiasm@85
|
552 fsOut[m_outputConsonance].push_back(consonance);
|
mail@110
|
553 // fsOut[m_outputSpeechity].push_back(speechity);
|
Chris@23
|
554 count++;
|
Chris@23
|
555 }
|
Chris@23
|
556 cerr << "done." << endl;
|
mail@110
|
557
|
mail@110
|
558
|
mail@110
|
559 // musicity
|
mail@110
|
560 count = 0;
|
mail@110
|
561 int oldlabeltype = 0; // start value is 0, music is 1, speech is 2
|
mail@110
|
562 vector<float> musicityValue;
|
mail@110
|
563 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
|
mail@110
|
564 Feature f4 = *it;
|
mail@110
|
565
|
mail@110
|
566 int startIndex = max(count - musicitykernelwidth/2,0);
|
mail@110
|
567 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
|
mail@110
|
568 float chromasum = 0;
|
mail@110
|
569 float diffsum = 0;
|
mail@110
|
570 for (int k = 0; k < 12; k++) {
|
mail@110
|
571 for (int i = startIndex + 1; i < endIndex; i++) {
|
mail@110
|
572 chromasum += pow(fsOut[4][i].values[k],2);
|
mail@110
|
573 diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]);
|
mail@110
|
574 }
|
mail@110
|
575 }
|
mail@110
|
576 diffsum /= chromasum;
|
mail@110
|
577 musicityValue.push_back(diffsum);
|
mail@110
|
578 count++;
|
mail@110
|
579 }
|
mail@110
|
580
|
mail@110
|
581 float musicityThreshold = 0.44;
|
mail@110
|
582 if (m_stepSize == 4096) {
|
mail@110
|
583 musicityThreshold = 0.74;
|
mail@110
|
584 }
|
mail@110
|
585 if (m_stepSize == 4410) {
|
mail@110
|
586 musicityThreshold = 0.77;
|
mail@110
|
587 }
|
mail@110
|
588
|
mail@110
|
589 count = 0;
|
mail@110
|
590 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
|
mail@110
|
591 Feature f4 = *it;
|
mail@110
|
592 Feature speechity; // musicity
|
mail@110
|
593 Feature f9; // musicity segmenter
|
mail@110
|
594
|
mail@110
|
595 speechity.hasTimestamp = true;
|
mail@110
|
596 speechity.timestamp = f4.timestamp;
|
mail@110
|
597 mssegment.hasTimestamp = true;
|
mail@110
|
598 mssegment.timestamp = f4.timestamp;
|
mail@110
|
599
|
mail@110
|
600 int startIndex = max(count - musicitykernelwidth/2,0);
|
mail@110
|
601 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
|
mail@110
|
602 int musicityCount = 0;
|
mail@110
|
603 for (int i = startIndex; i <= endIndex; i++) {
|
mail@110
|
604 if (musicityValue[i] > musicityThreshold) musicityCount++;
|
mail@110
|
605 }
|
mail@110
|
606 bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1);
|
mail@110
|
607
|
mail@110
|
608 if (isSpeech) {
|
mail@110
|
609 if (oldlabeltype != 2) {
|
mail@110
|
610 mssegment.label = "Speech";
|
mail@110
|
611 fsOut[m_outputMssegment].push_back(mssegment);
|
mail@110
|
612 oldlabeltype = 2;
|
mail@110
|
613 }
|
mail@110
|
614 } else {
|
mail@110
|
615 if (oldlabeltype != 1) {
|
mail@110
|
616 mssegment.label = "Music";
|
mail@110
|
617 fsOut[m_outputMssegment].push_back(mssegment);
|
mail@110
|
618 oldlabeltype = 1;
|
mail@110
|
619 }
|
mail@110
|
620 }
|
mail@110
|
621 speechity.values.push_back(musicityValue[count]);
|
mail@110
|
622 fsOut[m_outputSpeechity].push_back(speechity);
|
mail@110
|
623 count++;
|
mail@110
|
624 }
|
mail@110
|
625
|
mail@110
|
626
|
Chris@23
|
627 return fsOut;
|
matthiasm@0
|
628
|
matthiasm@0
|
629 }
|
matthiasm@0
|
630
|