Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@31
|
24
|
Chris@31
|
25 #include <vector>
|
Chris@31
|
26
|
Chris@32
|
27 #include <cstdio>
|
Chris@32
|
28
|
Chris@31
|
29 using std::vector;
|
Chris@48
|
30 using std::cout;
|
Chris@31
|
31 using std::cerr;
|
Chris@31
|
32 using std::endl;
|
Chris@40
|
33 using Vamp::RealTime;
|
Chris@31
|
34
|
Chris@31
|
// Sample rate at which analysis is carried out. reset() creates a
// Resampler whenever the plugin's input rate differs from this value.
// Never modified anywhere in this file, hence const.
static const int processingSampleRate = 44100;

// Bins-per-octave value handed to CQParameters in reset(). Also
// never modified, hence const.
static const int processingBPO = 60;
|
Chris@170
|
37
|
Chris@31
|
38 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
39 Plugin(inputSampleRate),
|
Chris@161
|
40 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@31
|
41 m_resampler(0),
|
Chris@246
|
42 m_flattener(0),
|
Chris@110
|
43 m_cq(0),
|
Chris@162
|
44 m_hqMode(true),
|
Chris@166
|
45 m_fineTuning(false),
|
Chris@178
|
46 m_instrument(0),
|
Chris@178
|
47 m_colsPerSec(50)
|
Chris@31
|
48 {
|
Chris@31
|
49 }
|
Chris@31
|
50
|
Chris@31
|
51 Silvet::~Silvet()
|
Chris@31
|
52 {
|
Chris@31
|
53 delete m_resampler;
|
Chris@246
|
54 delete m_flattener;
|
Chris@31
|
55 delete m_cq;
|
Chris@41
|
56 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
57 delete m_postFilter[i];
|
Chris@41
|
58 }
|
Chris@31
|
59 }
|
Chris@31
|
60
|
Chris@31
|
61 string
|
Chris@31
|
62 Silvet::getIdentifier() const
|
Chris@31
|
63 {
|
Chris@31
|
64 return "silvet";
|
Chris@31
|
65 }
|
Chris@31
|
66
|
Chris@31
|
67 string
|
Chris@31
|
68 Silvet::getName() const
|
Chris@31
|
69 {
|
Chris@31
|
70 return "Silvet Note Transcription";
|
Chris@31
|
71 }
|
Chris@31
|
72
|
Chris@31
|
73 string
|
Chris@31
|
74 Silvet::getDescription() const
|
Chris@31
|
75 {
|
Chris@191
|
76 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
77 }
|
Chris@31
|
78
|
Chris@31
|
79 string
|
Chris@31
|
80 Silvet::getMaker() const
|
Chris@31
|
81 {
|
Chris@191
|
82 return "Queen Mary, University of London";
|
Chris@31
|
83 }
|
Chris@31
|
84
|
Chris@31
|
85 int
|
Chris@31
|
86 Silvet::getPluginVersion() const
|
Chris@31
|
87 {
|
Chris@31
|
88 return 1;
|
Chris@31
|
89 }
|
Chris@31
|
90
|
Chris@31
|
91 string
|
Chris@31
|
92 Silvet::getCopyright() const
|
Chris@31
|
93 {
|
Chris@191
|
94 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
95 }
|
Chris@31
|
96
|
Chris@31
|
97 Silvet::InputDomain
|
Chris@31
|
98 Silvet::getInputDomain() const
|
Chris@31
|
99 {
|
Chris@31
|
100 return TimeDomain;
|
Chris@31
|
101 }
|
Chris@31
|
102
|
Chris@31
|
103 size_t
|
Chris@31
|
104 Silvet::getPreferredBlockSize() const
|
Chris@31
|
105 {
|
Chris@31
|
106 return 0;
|
Chris@31
|
107 }
|
Chris@31
|
108
|
Chris@31
|
109 size_t
|
Chris@31
|
110 Silvet::getPreferredStepSize() const
|
Chris@31
|
111 {
|
Chris@31
|
112 return 0;
|
Chris@31
|
113 }
|
Chris@31
|
114
|
Chris@31
|
115 size_t
|
Chris@31
|
116 Silvet::getMinChannelCount() const
|
Chris@31
|
117 {
|
Chris@31
|
118 return 1;
|
Chris@31
|
119 }
|
Chris@31
|
120
|
Chris@31
|
121 size_t
|
Chris@31
|
122 Silvet::getMaxChannelCount() const
|
Chris@31
|
123 {
|
Chris@31
|
124 return 1;
|
Chris@31
|
125 }
|
Chris@31
|
126
|
Chris@31
|
127 Silvet::ParameterList
|
Chris@31
|
128 Silvet::getParameterDescriptors() const
|
Chris@31
|
129 {
|
Chris@31
|
130 ParameterList list;
|
Chris@110
|
131
|
Chris@110
|
132 ParameterDescriptor desc;
|
Chris@110
|
133 desc.identifier = "mode";
|
Chris@110
|
134 desc.name = "Processing mode";
|
Chris@110
|
135 desc.unit = "";
|
Chris@110
|
136 desc.description = "Determines the tradeoff of processing speed against transcription quality";
|
Chris@110
|
137 desc.minValue = 0;
|
Chris@110
|
138 desc.maxValue = 1;
|
Chris@113
|
139 desc.defaultValue = 1;
|
Chris@110
|
140 desc.isQuantized = true;
|
Chris@110
|
141 desc.quantizeStep = 1;
|
Chris@166
|
142 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
143 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@161
|
144 list.push_back(desc);
|
Chris@161
|
145
|
Chris@176
|
146 desc.identifier = "instrument";
|
Chris@176
|
147 desc.name = "Instrument";
|
Chris@161
|
148 desc.unit = "";
|
Chris@162
|
149 desc.description = "The instrument known to be present in the recording, if there is only one";
|
Chris@161
|
150 desc.minValue = 0;
|
Chris@162
|
151 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
152 desc.defaultValue = 0;
|
Chris@161
|
153 desc.isQuantized = true;
|
Chris@161
|
154 desc.quantizeStep = 1;
|
Chris@161
|
155 desc.valueNames.clear();
|
Chris@162
|
156 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
157 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
158 }
|
Chris@166
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@166
|
161 desc.identifier = "finetune";
|
Chris@166
|
162 desc.name = "Return fine pitch estimates";
|
Chris@166
|
163 desc.unit = "";
|
Chris@166
|
164 desc.description = "Return pitch estimates at finer than semitone resolution (works only in Intensive mode)";
|
Chris@166
|
165 desc.minValue = 0;
|
Chris@166
|
166 desc.maxValue = 1;
|
Chris@166
|
167 desc.defaultValue = 0;
|
Chris@166
|
168 desc.isQuantized = true;
|
Chris@166
|
169 desc.quantizeStep = 1;
|
Chris@166
|
170 desc.valueNames.clear();
|
Chris@110
|
171 list.push_back(desc);
|
Chris@110
|
172
|
Chris@31
|
173 return list;
|
Chris@31
|
174 }
|
Chris@31
|
175
|
Chris@31
|
176 float
|
Chris@31
|
177 Silvet::getParameter(string identifier) const
|
Chris@31
|
178 {
|
Chris@110
|
179 if (identifier == "mode") {
|
Chris@110
|
180 return m_hqMode ? 1.f : 0.f;
|
Chris@166
|
181 } else if (identifier == "finetune") {
|
Chris@166
|
182 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
183 } else if (identifier == "instrument") {
|
Chris@162
|
184 return m_instrument;
|
Chris@110
|
185 }
|
Chris@31
|
186 return 0;
|
Chris@31
|
187 }
|
Chris@31
|
188
|
Chris@31
|
189 void
|
Chris@31
|
190 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
191 {
|
Chris@110
|
192 if (identifier == "mode") {
|
Chris@110
|
193 m_hqMode = (value > 0.5);
|
Chris@166
|
194 } else if (identifier == "finetune") {
|
Chris@166
|
195 m_fineTuning = (value > 0.5);
|
Chris@176
|
196 } else if (identifier == "instrument") {
|
Chris@162
|
197 m_instrument = lrintf(value);
|
Chris@110
|
198 }
|
Chris@31
|
199 }
|
Chris@31
|
200
|
Chris@31
|
201 Silvet::ProgramList
|
Chris@31
|
202 Silvet::getPrograms() const
|
Chris@31
|
203 {
|
Chris@31
|
204 ProgramList list;
|
Chris@31
|
205 return list;
|
Chris@31
|
206 }
|
Chris@31
|
207
|
Chris@31
|
208 string
|
Chris@31
|
209 Silvet::getCurrentProgram() const
|
Chris@31
|
210 {
|
Chris@31
|
211 return "";
|
Chris@31
|
212 }
|
Chris@31
|
213
|
Chris@31
|
214 void
|
Chris@31
|
215 Silvet::selectProgram(string name)
|
Chris@31
|
216 {
|
Chris@31
|
217 }
|
Chris@31
|
218
|
Chris@31
|
219 Silvet::OutputList
|
Chris@31
|
220 Silvet::getOutputDescriptors() const
|
Chris@31
|
221 {
|
Chris@31
|
222 OutputList list;
|
Chris@31
|
223
|
Chris@31
|
224 OutputDescriptor d;
|
Chris@51
|
225 d.identifier = "notes";
|
Chris@51
|
226 d.name = "Note transcription";
|
Chris@162
|
227 d.description = "Overall note transcription across selected instruments";
|
Chris@41
|
228 d.unit = "Hz";
|
Chris@31
|
229 d.hasFixedBinCount = true;
|
Chris@31
|
230 d.binCount = 2;
|
Chris@41
|
231 d.binNames.push_back("Frequency");
|
Chris@31
|
232 d.binNames.push_back("Velocity");
|
Chris@31
|
233 d.hasKnownExtents = false;
|
Chris@31
|
234 d.isQuantized = false;
|
Chris@31
|
235 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
236 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
237 d.hasDuration = true;
|
Chris@32
|
238 m_notesOutputNo = list.size();
|
Chris@32
|
239 list.push_back(d);
|
Chris@32
|
240
|
Chris@178
|
241 d.identifier = "timefreq";
|
Chris@178
|
242 d.name = "Time-frequency distribution";
|
Chris@178
|
243 d.description = "Filtered constant-Q time-frequency distribution used as input to the expectation-maximisation algorithm";
|
Chris@178
|
244 d.unit = "";
|
Chris@178
|
245 d.hasFixedBinCount = true;
|
Chris@178
|
246 d.binCount = m_instruments[0].templateHeight;
|
Chris@178
|
247 d.binNames.clear();
|
Chris@178
|
248 if (m_cq) {
|
Chris@178
|
249 char name[20];
|
Chris@178
|
250 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
|
Chris@178
|
251 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
252 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
253 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
254 // frequency though, so these are still the first 545 bins
|
Chris@178
|
255 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
256 float freq = m_cq->getBinFrequency
|
Chris@178
|
257 (m_instruments[0].templateHeight - i - 1);
|
Chris@178
|
258 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
259 d.binNames.push_back(name);
|
Chris@178
|
260 }
|
Chris@178
|
261 }
|
Chris@178
|
262 d.hasKnownExtents = false;
|
Chris@178
|
263 d.isQuantized = false;
|
Chris@178
|
264 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
265 d.sampleRate = m_colsPerSec;
|
Chris@178
|
266 d.hasDuration = false;
|
Chris@178
|
267 m_fcqOutputNo = list.size();
|
Chris@178
|
268 list.push_back(d);
|
Chris@178
|
269
|
Chris@31
|
270 return list;
|
Chris@31
|
271 }
|
Chris@31
|
272
|
Chris@38
|
273 std::string
|
Chris@175
|
274 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@38
|
275 {
|
Chris@38
|
276 static const char *names[] = {
|
Chris@38
|
277 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
278 };
|
Chris@38
|
279
|
Chris@175
|
280 const char *n = names[note % 12];
|
Chris@38
|
281
|
Chris@175
|
282 int oct = (note + 9) / 12;
|
Chris@38
|
283
|
Chris@175
|
284 char buf[30];
|
Chris@175
|
285
|
Chris@175
|
286 float pshift = 0.f;
|
Chris@175
|
287 if (shiftCount > 1) {
|
Chris@175
|
288 // see noteFrequency below
|
Chris@175
|
289 pshift =
|
Chris@175
|
290 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
291 }
|
Chris@175
|
292
|
Chris@175
|
293 if (pshift > 0.f) {
|
Chris@175
|
294 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
|
Chris@175
|
295 } else if (pshift < 0.f) {
|
Chris@175
|
296 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
|
Chris@175
|
297 } else {
|
Chris@175
|
298 sprintf(buf, "%s%d", n, oct);
|
Chris@175
|
299 }
|
Chris@38
|
300
|
Chris@38
|
301 return buf;
|
Chris@38
|
302 }
|
Chris@38
|
303
|
Chris@41
|
304 float
|
Chris@168
|
305 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
306 {
|
Chris@169
|
307 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
308 // is an offset into the template array, which starts with some
|
Chris@169
|
309 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
310 //
|
Chris@169
|
311 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
312 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
313 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
314 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
315 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
316 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
317 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
318 // down in pitch, for a negative pitch shift.
|
Chris@169
|
319
|
Chris@175
|
320 float pshift = 0.f;
|
Chris@175
|
321 if (shiftCount > 1) {
|
Chris@175
|
322 pshift =
|
Chris@175
|
323 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
324 }
|
Chris@169
|
325
|
Chris@169
|
326 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@41
|
327 }
|
Chris@41
|
328
|
Chris@31
|
329 bool
|
Chris@31
|
330 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
331 {
|
Chris@31
|
332 if (channels < getMinChannelCount() ||
|
Chris@31
|
333 channels > getMaxChannelCount()) return false;
|
Chris@31
|
334
|
Chris@31
|
335 if (stepSize != blockSize) {
|
Chris@31
|
336 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
337 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
338 return false;
|
Chris@31
|
339 }
|
Chris@31
|
340
|
Chris@31
|
341 m_blockSize = blockSize;
|
Chris@31
|
342
|
Chris@31
|
343 reset();
|
Chris@31
|
344
|
Chris@31
|
345 return true;
|
Chris@31
|
346 }
|
Chris@31
|
347
|
Chris@31
|
348 void
|
Chris@31
|
349 Silvet::reset()
|
Chris@31
|
350 {
|
Chris@31
|
351 delete m_resampler;
|
Chris@246
|
352 delete m_flattener;
|
Chris@31
|
353 delete m_cq;
|
Chris@31
|
354
|
Chris@31
|
355 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
356 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
357 } else {
|
Chris@31
|
358 m_resampler = 0;
|
Chris@31
|
359 }
|
Chris@31
|
360
|
Chris@246
|
361 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
362 m_flattener->reset();
|
Chris@246
|
363
|
Chris@173
|
364 double minFreq = 27.5;
|
Chris@173
|
365
|
Chris@173
|
366 if (!m_hqMode) {
|
Chris@173
|
367 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
368 // so we can just pad with zeros
|
Chris@173
|
369 minFreq *= 2;
|
Chris@173
|
370 }
|
Chris@173
|
371
|
Chris@154
|
372 CQParameters params(processingSampleRate,
|
Chris@173
|
373 minFreq,
|
Chris@154
|
374 processingSampleRate / 3,
|
Chris@154
|
375 processingBPO);
|
Chris@154
|
376
|
Chris@155
|
377 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
|
Chris@155
|
378 // drops the FFT size to 512 from 1024 and alters
|
Chris@155
|
379 // some other processing parameters, making
|
Chris@155
|
380 // everything much, much slower. Could be a flaw
|
Chris@155
|
381 // in the CQ parameter calculations, must check
|
Chris@154
|
382 params.atomHopFactor = 0.3;
|
Chris@154
|
383 params.threshold = 0.0005;
|
Chris@172
|
384 params.window = CQParameters::Hann;
|
Chris@154
|
385
|
Chris@154
|
386 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
387
|
Chris@165
|
388 m_colsPerSec = m_hqMode ? 50 : 25;
|
Chris@165
|
389
|
Chris@41
|
390 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
391 delete m_postFilter[i];
|
Chris@41
|
392 }
|
Chris@41
|
393 m_postFilter.clear();
|
Chris@176
|
394 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
|
Chris@41
|
395 m_postFilter.push_back(new MedianFilter<double>(3));
|
Chris@41
|
396 }
|
Chris@41
|
397 m_pianoRoll.clear();
|
Chris@246
|
398 m_inputGains.clear();
|
Chris@32
|
399 m_columnCount = 0;
|
Chris@40
|
400 m_startTime = RealTime::zeroTime;
|
Chris@31
|
401 }
|
Chris@31
|
402
|
Chris@31
|
403 Silvet::FeatureSet
|
Chris@31
|
404 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
405 {
|
Chris@40
|
406 if (m_columnCount == 0) {
|
Chris@40
|
407 m_startTime = timestamp;
|
Chris@40
|
408 }
|
Chris@246
|
409
|
Chris@246
|
410 vector<float> flattened(m_blockSize);
|
Chris@246
|
411 float gain = 1.f;
|
Chris@246
|
412 m_flattener->connectInputPort
|
Chris@246
|
413 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
414 m_flattener->connectOutputPort
|
Chris@246
|
415 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
416 m_flattener->connectOutputPort
|
Chris@246
|
417 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
418 m_flattener->process(m_blockSize);
|
Chris@246
|
419
|
Chris@252
|
420 m_inputGains[timestamp] = gain;
|
Chris@40
|
421
|
Chris@31
|
422 vector<double> data;
|
Chris@40
|
423 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
424 double d = flattened[i];
|
Chris@235
|
425 data.push_back(d);
|
Chris@40
|
426 }
|
Chris@31
|
427
|
Chris@31
|
428 if (m_resampler) {
|
Chris@31
|
429 data = m_resampler->process(data.data(), data.size());
|
Chris@31
|
430 }
|
Chris@246
|
431
|
Chris@32
|
432 Grid cqout = m_cq->process(data);
|
Chris@51
|
433 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
434 return fs;
|
Chris@34
|
435 }
|
Chris@34
|
436
|
Chris@34
|
437 Silvet::FeatureSet
|
Chris@34
|
438 Silvet::getRemainingFeatures()
|
Chris@34
|
439 {
|
Chris@145
|
440 Grid cqout = m_cq->getRemainingOutput();
|
Chris@51
|
441 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
442 return fs;
|
Chris@34
|
443 }
|
Chris@34
|
444
|
Chris@34
|
445 Silvet::FeatureSet
|
Chris@34
|
446 Silvet::transcribe(const Grid &cqout)
|
Chris@34
|
447 {
|
Chris@32
|
448 Grid filtered = preProcess(cqout);
|
Chris@31
|
449
|
Chris@32
|
450 FeatureSet fs;
|
Chris@32
|
451
|
Chris@104
|
452 if (filtered.empty()) return fs;
|
Chris@170
|
453
|
Chris@170
|
454 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@104
|
455
|
Chris@178
|
456 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
457 Feature f;
|
Chris@178
|
458 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
459 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
460 }
|
Chris@178
|
461 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
462 }
|
Chris@178
|
463
|
Chris@34
|
464 int width = filtered.size();
|
Chris@34
|
465
|
Chris@164
|
466 int iterations = m_hqMode ? 20 : 10;
|
Chris@34
|
467
|
Chris@170
|
468 //!!! pitches or notes? [terminology]
|
Chris@176
|
469 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
|
Chris@170
|
470
|
Chris@170
|
471 bool wantShifts = m_hqMode && m_fineTuning;
|
Chris@170
|
472 int shiftCount = 1;
|
Chris@170
|
473 if (wantShifts) {
|
Chris@170
|
474 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
475 }
|
Chris@170
|
476
|
Chris@170
|
477 vector<vector<int> > localBestShifts;
|
Chris@170
|
478 if (wantShifts) {
|
Chris@170
|
479 localBestShifts =
|
Chris@176
|
480 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
|
Chris@170
|
481 }
|
Chris@170
|
482
|
Chris@170
|
483 vector<bool> present(width, false);
|
Chris@37
|
484
|
Chris@123
|
485 #pragma omp parallel for
|
Chris@123
|
486 for (int i = 0; i < width; ++i) {
|
Chris@104
|
487
|
Chris@170
|
488 double sum = 0.0;
|
Chris@176
|
489 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@170
|
490 sum += filtered.at(i).at(j);
|
Chris@170
|
491 }
|
Chris@170
|
492 if (sum < 1e-5) continue;
|
Chris@170
|
493
|
Chris@170
|
494 present[i] = true;
|
Chris@170
|
495
|
Chris@170
|
496 EM em(&pack, m_hqMode);
|
Chris@170
|
497
|
Chris@183
|
498 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@213
|
499 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@183
|
500
|
Chris@170
|
501 for (int j = 0; j < iterations; ++j) {
|
Chris@170
|
502 em.iterate(filtered.at(i).data());
|
Chris@37
|
503 }
|
Chris@37
|
504
|
Chris@170
|
505 const float *pitchDist = em.getPitchDistribution();
|
Chris@170
|
506 const float *const *shiftDist = em.getShifts();
|
Chris@37
|
507
|
Chris@176
|
508 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@104
|
509
|
Chris@170
|
510 localPitches[i][j] = pitchDist[j] * sum;
|
Chris@170
|
511
|
Chris@170
|
512 int bestShift = 0;
|
Chris@179
|
513 float bestShiftValue = 0.0;
|
Chris@170
|
514 if (wantShifts) {
|
Chris@170
|
515 for (int k = 0; k < shiftCount; ++k) {
|
Chris@179
|
516 float value = shiftDist[k][j];
|
Chris@179
|
517 if (k == 0 || value > bestShiftValue) {
|
Chris@179
|
518 bestShiftValue = value;
|
Chris@170
|
519 bestShift = k;
|
Chris@170
|
520 }
|
Chris@170
|
521 }
|
Chris@170
|
522 localBestShifts[i][j] = bestShift;
|
Chris@170
|
523 }
|
Chris@123
|
524 }
|
Chris@123
|
525 }
|
Chris@166
|
526
|
Chris@166
|
527 for (int i = 0; i < width; ++i) {
|
Chris@37
|
528
|
Chris@170
|
529 if (!present[i]) {
|
Chris@170
|
530 // silent column
|
Chris@176
|
531 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
532 m_postFilter[j]->push(0.0);
|
Chris@170
|
533 }
|
Chris@168
|
534 m_pianoRoll.push_back(map<int, double>());
|
Chris@170
|
535 if (wantShifts) {
|
Chris@168
|
536 m_pianoRollShifts.push_back(map<int, int>());
|
Chris@168
|
537 }
|
Chris@166
|
538 continue;
|
Chris@166
|
539 }
|
Chris@166
|
540
|
Chris@170
|
541 postProcess(localPitches[i], localBestShifts[i], wantShifts);
|
Chris@166
|
542
|
Chris@168
|
543 FeatureList noteFeatures = noteTrack(shiftCount);
|
Chris@38
|
544
|
Chris@123
|
545 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
546 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
547 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
548 }
|
Chris@34
|
549 }
|
Chris@34
|
550
|
Chris@32
|
551 return fs;
|
Chris@31
|
552 }
|
Chris@31
|
553
|
Chris@32
|
554 Silvet::Grid
|
Chris@32
|
555 Silvet::preProcess(const Grid &in)
|
Chris@32
|
556 {
|
Chris@32
|
557 int width = in.size();
|
Chris@32
|
558
|
Chris@165
|
559 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
560
|
Chris@165
|
561 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
562 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
563
|
Chris@32
|
564 Grid out;
|
Chris@32
|
565
|
Chris@58
|
566 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
567 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
568 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
569 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
570 // size we reduce to in a moment
|
Chris@33
|
571 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
572
|
Chris@176
|
573 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
574
|
Chris@32
|
575 for (int i = 0; i < width; ++i) {
|
Chris@32
|
576
|
Chris@33
|
577 if (m_columnCount < latentColumns) {
|
Chris@33
|
578 ++m_columnCount;
|
Chris@33
|
579 continue;
|
Chris@33
|
580 }
|
Chris@33
|
581
|
Chris@32
|
582 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
583 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
584
|
Chris@32
|
585 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
586
|
Chris@32
|
587 if (select) {
|
Chris@32
|
588 vector<double> inCol = in[i];
|
Chris@176
|
589 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
590
|
Chris@178
|
591 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@178
|
592 // lowest 55 of them.
|
Chris@178
|
593 //
|
Chris@178
|
594 // In draft mode the CQ is an octave shorter, returning
|
Chris@178
|
595 // 540 bins, so we instead pad them with an additional 5
|
Chris@178
|
596 // zeros.
|
Chris@178
|
597 //
|
Chris@178
|
598 // We also need to reverse the column as we go, since the
|
Chris@178
|
599 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
600 // the other way around.
|
Chris@32
|
601
|
Chris@178
|
602 if (m_hqMode) {
|
Chris@178
|
603 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
604 int ix = inCol.size() - j - 55;
|
Chris@178
|
605 outCol[j] = inCol[ix];
|
Chris@178
|
606 }
|
Chris@178
|
607 } else {
|
Chris@178
|
608 for (int j = 0; j < 5; ++j) {
|
Chris@178
|
609 outCol[j] = 0.0;
|
Chris@178
|
610 }
|
Chris@178
|
611 for (int j = 5; j < pack.templateHeight; ++j) {
|
Chris@178
|
612 int ix = inCol.size() - j + 4;
|
Chris@178
|
613 outCol[j] = inCol[ix];
|
Chris@178
|
614 }
|
Chris@46
|
615 }
|
Chris@32
|
616
|
Chris@46
|
617 vector<double> noiseLevel1 =
|
Chris@46
|
618 MedianFilter<double>::filter(40, outCol);
|
Chris@176
|
619 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
620 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
621 }
|
Chris@32
|
622
|
Chris@46
|
623 vector<double> noiseLevel2 =
|
Chris@46
|
624 MedianFilter<double>::filter(40, noiseLevel1);
|
Chris@176
|
625 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
626 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
627 }
|
Chris@32
|
628
|
Chris@165
|
629 out.push_back(outCol);
|
Chris@32
|
630 }
|
Chris@32
|
631
|
Chris@32
|
632 ++m_columnCount;
|
Chris@32
|
633 }
|
Chris@32
|
634
|
Chris@32
|
635 return out;
|
Chris@32
|
636 }
|
Chris@32
|
637
|
Chris@168
|
638 void
|
Chris@170
|
639 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
640 const vector<int> &bestShifts,
|
Chris@170
|
641 bool wantShifts)
|
Chris@166
|
642 {
|
Chris@176
|
643 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
644
|
Chris@41
|
645 vector<double> filtered;
|
Chris@41
|
646
|
Chris@176
|
647 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
648 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
649 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
650 }
|
Chris@41
|
651
|
Chris@41
|
652 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
653
|
Chris@41
|
654 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
655
|
Chris@41
|
656 ValueIndexMap strengths;
|
Chris@166
|
657
|
Chris@176
|
658 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
659 double strength = filtered[j];
|
Chris@183
|
660 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
661 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
662 }
|
Chris@166
|
663
|
Chris@168
|
664 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
665
|
Chris@168
|
666 map<int, double> active;
|
Chris@168
|
667 map<int, int> activeShifts;
|
Chris@168
|
668
|
Chris@183
|
669 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
670
|
Chris@168
|
671 --si;
|
Chris@168
|
672
|
Chris@168
|
673 double strength = si->first;
|
Chris@168
|
674 int j = si->second;
|
Chris@168
|
675
|
Chris@168
|
676 active[j] = strength;
|
Chris@168
|
677
|
Chris@170
|
678 if (wantShifts) {
|
Chris@170
|
679 activeShifts[j] = bestShifts[j];
|
Chris@167
|
680 }
|
Chris@41
|
681 }
|
Chris@41
|
682
|
Chris@168
|
683 m_pianoRoll.push_back(active);
|
Chris@170
|
684
|
Chris@170
|
685 if (wantShifts) {
|
Chris@168
|
686 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
687 }
|
Chris@166
|
688 }
|
Chris@166
|
689
|
Chris@166
|
690 Vamp::Plugin::FeatureList
|
Chris@168
|
691 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
692 {
|
Chris@41
|
693 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
694 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
695 // latest active set but present in the prior set in the piano
|
Chris@41
|
696 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
697 // already, and if they haven't ended, we don't know their
|
Chris@41
|
698 // duration.
|
Chris@41
|
699
|
Chris@168
|
700 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
701
|
Chris@168
|
702 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
703
|
Chris@165
|
704 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
705
|
Chris@165
|
706 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
707 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
708 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
709
|
Chris@41
|
710 FeatureList noteFeatures;
|
Chris@41
|
711
|
Chris@41
|
712 if (width < durationThreshold + 1) {
|
Chris@41
|
713 return noteFeatures;
|
Chris@41
|
714 }
|
Chris@41
|
715
|
Chris@150
|
716 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
717
|
Chris@55
|
718 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
719 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
720
|
Chris@55
|
721 int note = ni->first;
|
Chris@41
|
722
|
Chris@41
|
723 if (active.find(note) != active.end()) {
|
Chris@41
|
724 // the note is still playing
|
Chris@41
|
725 continue;
|
Chris@41
|
726 }
|
Chris@41
|
727
|
Chris@41
|
728 // the note was playing but just ended
|
Chris@41
|
729 int end = width;
|
Chris@41
|
730 int start = end-1;
|
Chris@41
|
731
|
Chris@41
|
732 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
733 --start;
|
Chris@41
|
734 }
|
Chris@41
|
735 ++start;
|
Chris@41
|
736
|
Chris@169
|
737 if ((end - start) < durationThreshold) {
|
Chris@41
|
738 continue;
|
Chris@41
|
739 }
|
Chris@41
|
740
|
Chris@169
|
741 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@41
|
742 }
|
Chris@41
|
743
|
Chris@62
|
744 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
745
|
Chris@41
|
746 return noteFeatures;
|
Chris@41
|
747 }
|
Chris@41
|
748
|
Chris@169
|
749 void
|
Chris@169
|
750 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
751 FeatureList ¬eFeatures)
|
Chris@169
|
752 {
|
Chris@169
|
753 int partStart = start;
|
Chris@169
|
754 int partShift = 0;
|
Chris@169
|
755 int partVelocity = 0;
|
Chris@169
|
756
|
Chris@252
|
757 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
758
|
Chris@169
|
759 for (int i = start; i != end; ++i) {
|
Chris@169
|
760
|
Chris@169
|
761 double strength = m_pianoRoll[i][note];
|
Chris@169
|
762
|
Chris@169
|
763 int shift = 0;
|
Chris@169
|
764
|
Chris@169
|
765 if (shiftCount > 1) {
|
Chris@169
|
766
|
Chris@169
|
767 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
768
|
Chris@169
|
769 if (i == partStart) {
|
Chris@169
|
770 partShift = shift;
|
Chris@169
|
771 }
|
Chris@169
|
772
|
Chris@169
|
773 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
774
|
Chris@169
|
775 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
776
|
Chris@169
|
777 // pitch has changed, emit an intermediate note
|
Chris@252
|
778 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
779 i,
|
Chris@252
|
780 note,
|
Chris@252
|
781 partShift,
|
Chris@252
|
782 shiftCount,
|
Chris@252
|
783 partVelocity));
|
Chris@169
|
784 partStart = i;
|
Chris@169
|
785 partShift = shift;
|
Chris@169
|
786 partVelocity = 0;
|
Chris@169
|
787 }
|
Chris@169
|
788 }
|
Chris@169
|
789
|
Chris@246
|
790 int v = round(strength * 2);
|
Chris@169
|
791 if (v > partVelocity) {
|
Chris@169
|
792 partVelocity = v;
|
Chris@169
|
793 }
|
Chris@169
|
794 }
|
Chris@169
|
795
|
Chris@169
|
796 if (end >= partStart + partThreshold) {
|
Chris@252
|
797 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
798 end,
|
Chris@252
|
799 note,
|
Chris@252
|
800 partShift,
|
Chris@252
|
801 shiftCount,
|
Chris@252
|
802 partVelocity));
|
Chris@169
|
803 }
|
Chris@169
|
804 }
|
Chris@252
|
805
|
Chris@252
|
806 Silvet::Feature
|
Chris@252
|
807 Silvet::makeNoteFeature(int start,
|
Chris@252
|
808 int end,
|
Chris@252
|
809 int note,
|
Chris@252
|
810 int shift,
|
Chris@252
|
811 int shiftCount,
|
Chris@252
|
812 int velocity)
|
Chris@252
|
813 {
|
Chris@252
|
814 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@252
|
815 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@252
|
816
|
Chris@252
|
817 Feature f;
|
Chris@252
|
818
|
Chris@252
|
819 f.hasTimestamp = true;
|
Chris@252
|
820 f.timestamp = RealTime::fromSeconds
|
Chris@252
|
821 (columnDuration * (start - postFilterLatency) + 0.02);
|
Chris@252
|
822
|
Chris@252
|
823 f.hasDuration = true;
|
Chris@252
|
824 f.duration = RealTime::fromSeconds
|
Chris@252
|
825 (columnDuration * (end - start));
|
Chris@252
|
826
|
Chris@252
|
827 f.values.clear();
|
Chris@252
|
828
|
Chris@252
|
829 f.values.push_back
|
Chris@252
|
830 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
831
|
Chris@252
|
832 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
833 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
834 velocity = round(velocity / inputGain);
|
Chris@252
|
835 if (velocity > 127) velocity = 127;
|
Chris@252
|
836 if (velocity < 1) velocity = 1;
|
Chris@252
|
837 f.values.push_back(velocity);
|
Chris@252
|
838
|
Chris@252
|
839 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
840
|
Chris@252
|
841 return f;
|
Chris@252
|
842 }
|
Chris@252
|
843
|
Chris@252
|
844 float
|
Chris@252
|
845 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
846 {
|
Chris@252
|
847 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
848
|
Chris@252
|
849 if (i == m_inputGains.end()) {
|
Chris@252
|
850 if (i != m_inputGains.begin()) {
|
Chris@252
|
851 --i;
|
Chris@252
|
852 } else {
|
Chris@252
|
853 return 1.f; // no data
|
Chris@252
|
854 }
|
Chris@252
|
855 }
|
Chris@252
|
856
|
Chris@252
|
857 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
858
|
Chris@252
|
859 return i->second;
|
Chris@252
|
860 }
|
Chris@252
|
861
|