Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@31
|
24
|
Chris@31
|
25 #include <vector>
|
Chris@31
|
26
|
Chris@32
|
27 #include <cstdio>
|
Chris@32
|
28
|
Chris@31
|
29 using std::vector;
|
Chris@48
|
30 using std::cout;
|
Chris@31
|
31 using std::cerr;
|
Chris@31
|
32 using std::endl;
|
Chris@40
|
33 using Vamp::RealTime;
|
Chris@31
|
34
|
Chris@31
|
// Sample rate, in Hz, at which the constant-Q analysis is run. Input
// at any other rate is resampled to this rate before analysis.
static const int processingSampleRate = 44100;

// Bins-per-octave value handed to the constant-Q parameters.
static const int processingBPO = 60;
|
Chris@170
|
37
|
Chris@31
|
38 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
39 Plugin(inputSampleRate),
|
Chris@161
|
40 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@31
|
41 m_resampler(0),
|
Chris@246
|
42 m_flattener(0),
|
Chris@110
|
43 m_cq(0),
|
Chris@162
|
44 m_hqMode(true),
|
Chris@166
|
45 m_fineTuning(false),
|
Chris@178
|
46 m_instrument(0),
|
Chris@178
|
47 m_colsPerSec(50)
|
Chris@31
|
48 {
|
Chris@31
|
49 }
|
Chris@31
|
50
|
Chris@31
|
51 Silvet::~Silvet()
|
Chris@31
|
52 {
|
Chris@31
|
53 delete m_resampler;
|
Chris@246
|
54 delete m_flattener;
|
Chris@31
|
55 delete m_cq;
|
Chris@41
|
56 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
57 delete m_postFilter[i];
|
Chris@41
|
58 }
|
Chris@31
|
59 }
|
Chris@31
|
60
|
Chris@31
|
61 string
|
Chris@31
|
62 Silvet::getIdentifier() const
|
Chris@31
|
63 {
|
Chris@31
|
64 return "silvet";
|
Chris@31
|
65 }
|
Chris@31
|
66
|
Chris@31
|
67 string
|
Chris@31
|
68 Silvet::getName() const
|
Chris@31
|
69 {
|
Chris@31
|
70 return "Silvet Note Transcription";
|
Chris@31
|
71 }
|
Chris@31
|
72
|
Chris@31
|
73 string
|
Chris@31
|
74 Silvet::getDescription() const
|
Chris@31
|
75 {
|
Chris@191
|
76 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
77 }
|
Chris@31
|
78
|
Chris@31
|
79 string
|
Chris@31
|
80 Silvet::getMaker() const
|
Chris@31
|
81 {
|
Chris@191
|
82 return "Queen Mary, University of London";
|
Chris@31
|
83 }
|
Chris@31
|
84
|
Chris@31
|
85 int
|
Chris@31
|
86 Silvet::getPluginVersion() const
|
Chris@31
|
87 {
|
Chris@31
|
88 return 1;
|
Chris@31
|
89 }
|
Chris@31
|
90
|
Chris@31
|
91 string
|
Chris@31
|
92 Silvet::getCopyright() const
|
Chris@31
|
93 {
|
Chris@191
|
94 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
95 }
|
Chris@31
|
96
|
Chris@31
|
97 Silvet::InputDomain
|
Chris@31
|
98 Silvet::getInputDomain() const
|
Chris@31
|
99 {
|
Chris@31
|
100 return TimeDomain;
|
Chris@31
|
101 }
|
Chris@31
|
102
|
Chris@31
|
103 size_t
|
Chris@31
|
104 Silvet::getPreferredBlockSize() const
|
Chris@31
|
105 {
|
Chris@31
|
106 return 0;
|
Chris@31
|
107 }
|
Chris@31
|
108
|
Chris@31
|
109 size_t
|
Chris@31
|
110 Silvet::getPreferredStepSize() const
|
Chris@31
|
111 {
|
Chris@31
|
112 return 0;
|
Chris@31
|
113 }
|
Chris@31
|
114
|
Chris@31
|
115 size_t
|
Chris@31
|
116 Silvet::getMinChannelCount() const
|
Chris@31
|
117 {
|
Chris@31
|
118 return 1;
|
Chris@31
|
119 }
|
Chris@31
|
120
|
Chris@31
|
121 size_t
|
Chris@31
|
122 Silvet::getMaxChannelCount() const
|
Chris@31
|
123 {
|
Chris@31
|
124 return 1;
|
Chris@31
|
125 }
|
Chris@31
|
126
|
Chris@31
|
127 Silvet::ParameterList
|
Chris@31
|
128 Silvet::getParameterDescriptors() const
|
Chris@31
|
129 {
|
Chris@31
|
130 ParameterList list;
|
Chris@110
|
131
|
Chris@110
|
132 ParameterDescriptor desc;
|
Chris@110
|
133 desc.identifier = "mode";
|
Chris@110
|
134 desc.name = "Processing mode";
|
Chris@110
|
135 desc.unit = "";
|
Chris@271
|
136 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
|
Chris@110
|
137 desc.minValue = 0;
|
Chris@110
|
138 desc.maxValue = 1;
|
Chris@113
|
139 desc.defaultValue = 1;
|
Chris@110
|
140 desc.isQuantized = true;
|
Chris@110
|
141 desc.quantizeStep = 1;
|
Chris@166
|
142 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
143 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@161
|
144 list.push_back(desc);
|
Chris@161
|
145
|
Chris@176
|
146 desc.identifier = "instrument";
|
Chris@176
|
147 desc.name = "Instrument";
|
Chris@161
|
148 desc.unit = "";
|
Chris@271
|
149 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
150 desc.minValue = 0;
|
Chris@162
|
151 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
152 desc.defaultValue = 0;
|
Chris@161
|
153 desc.isQuantized = true;
|
Chris@161
|
154 desc.quantizeStep = 1;
|
Chris@161
|
155 desc.valueNames.clear();
|
Chris@162
|
156 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
157 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
158 }
|
Chris@166
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@166
|
161 desc.identifier = "finetune";
|
Chris@166
|
162 desc.name = "Return fine pitch estimates";
|
Chris@166
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
165 desc.minValue = 0;
|
Chris@166
|
166 desc.maxValue = 1;
|
Chris@166
|
167 desc.defaultValue = 0;
|
Chris@166
|
168 desc.isQuantized = true;
|
Chris@166
|
169 desc.quantizeStep = 1;
|
Chris@166
|
170 desc.valueNames.clear();
|
Chris@110
|
171 list.push_back(desc);
|
Chris@110
|
172
|
Chris@31
|
173 return list;
|
Chris@31
|
174 }
|
Chris@31
|
175
|
Chris@31
|
176 float
|
Chris@31
|
177 Silvet::getParameter(string identifier) const
|
Chris@31
|
178 {
|
Chris@110
|
179 if (identifier == "mode") {
|
Chris@110
|
180 return m_hqMode ? 1.f : 0.f;
|
Chris@166
|
181 } else if (identifier == "finetune") {
|
Chris@166
|
182 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
183 } else if (identifier == "instrument") {
|
Chris@162
|
184 return m_instrument;
|
Chris@110
|
185 }
|
Chris@31
|
186 return 0;
|
Chris@31
|
187 }
|
Chris@31
|
188
|
Chris@31
|
189 void
|
Chris@31
|
190 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
191 {
|
Chris@110
|
192 if (identifier == "mode") {
|
Chris@110
|
193 m_hqMode = (value > 0.5);
|
Chris@166
|
194 } else if (identifier == "finetune") {
|
Chris@166
|
195 m_fineTuning = (value > 0.5);
|
Chris@176
|
196 } else if (identifier == "instrument") {
|
Chris@162
|
197 m_instrument = lrintf(value);
|
Chris@110
|
198 }
|
Chris@31
|
199 }
|
Chris@31
|
200
|
Chris@31
|
201 Silvet::ProgramList
|
Chris@31
|
202 Silvet::getPrograms() const
|
Chris@31
|
203 {
|
Chris@31
|
204 ProgramList list;
|
Chris@31
|
205 return list;
|
Chris@31
|
206 }
|
Chris@31
|
207
|
Chris@31
|
208 string
|
Chris@31
|
209 Silvet::getCurrentProgram() const
|
Chris@31
|
210 {
|
Chris@31
|
211 return "";
|
Chris@31
|
212 }
|
Chris@31
|
213
|
Chris@31
|
214 void
|
Chris@31
|
215 Silvet::selectProgram(string name)
|
Chris@31
|
216 {
|
Chris@31
|
217 }
|
Chris@31
|
218
|
Chris@31
|
219 Silvet::OutputList
|
Chris@31
|
220 Silvet::getOutputDescriptors() const
|
Chris@31
|
221 {
|
Chris@31
|
222 OutputList list;
|
Chris@31
|
223
|
Chris@31
|
224 OutputDescriptor d;
|
Chris@51
|
225 d.identifier = "notes";
|
Chris@51
|
226 d.name = "Note transcription";
|
Chris@271
|
227 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
228 d.unit = "Hz";
|
Chris@31
|
229 d.hasFixedBinCount = true;
|
Chris@31
|
230 d.binCount = 2;
|
Chris@41
|
231 d.binNames.push_back("Frequency");
|
Chris@31
|
232 d.binNames.push_back("Velocity");
|
Chris@31
|
233 d.hasKnownExtents = false;
|
Chris@31
|
234 d.isQuantized = false;
|
Chris@31
|
235 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
236 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
237 d.hasDuration = true;
|
Chris@32
|
238 m_notesOutputNo = list.size();
|
Chris@32
|
239 list.push_back(d);
|
Chris@32
|
240
|
Chris@178
|
241 d.identifier = "timefreq";
|
Chris@178
|
242 d.name = "Time-frequency distribution";
|
Chris@271
|
243 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
244 d.unit = "";
|
Chris@178
|
245 d.hasFixedBinCount = true;
|
Chris@178
|
246 d.binCount = m_instruments[0].templateHeight;
|
Chris@178
|
247 d.binNames.clear();
|
Chris@178
|
248 if (m_cq) {
|
Chris@178
|
249 char name[20];
|
Chris@178
|
250 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
|
Chris@178
|
251 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
252 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
253 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
254 // frequency though, so these are still the first 545 bins
|
Chris@178
|
255 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
256 float freq = m_cq->getBinFrequency
|
Chris@178
|
257 (m_instruments[0].templateHeight - i - 1);
|
Chris@178
|
258 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
259 d.binNames.push_back(name);
|
Chris@178
|
260 }
|
Chris@178
|
261 }
|
Chris@178
|
262 d.hasKnownExtents = false;
|
Chris@178
|
263 d.isQuantized = false;
|
Chris@178
|
264 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
265 d.sampleRate = m_colsPerSec;
|
Chris@178
|
266 d.hasDuration = false;
|
Chris@178
|
267 m_fcqOutputNo = list.size();
|
Chris@178
|
268 list.push_back(d);
|
Chris@178
|
269
|
Chris@31
|
270 return list;
|
Chris@31
|
271 }
|
Chris@31
|
272
|
Chris@38
|
273 std::string
|
Chris@175
|
274 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@38
|
275 {
|
Chris@38
|
276 static const char *names[] = {
|
Chris@38
|
277 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
278 };
|
Chris@38
|
279
|
Chris@175
|
280 const char *n = names[note % 12];
|
Chris@38
|
281
|
Chris@175
|
282 int oct = (note + 9) / 12;
|
Chris@38
|
283
|
Chris@175
|
284 char buf[30];
|
Chris@175
|
285
|
Chris@175
|
286 float pshift = 0.f;
|
Chris@175
|
287 if (shiftCount > 1) {
|
Chris@175
|
288 // see noteFrequency below
|
Chris@175
|
289 pshift =
|
Chris@175
|
290 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
291 }
|
Chris@175
|
292
|
Chris@175
|
293 if (pshift > 0.f) {
|
Chris@175
|
294 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
|
Chris@175
|
295 } else if (pshift < 0.f) {
|
Chris@175
|
296 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
|
Chris@175
|
297 } else {
|
Chris@175
|
298 sprintf(buf, "%s%d", n, oct);
|
Chris@175
|
299 }
|
Chris@38
|
300
|
Chris@38
|
301 return buf;
|
Chris@38
|
302 }
|
Chris@38
|
303
|
Chris@41
|
304 float
|
Chris@168
|
305 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
306 {
|
Chris@169
|
307 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
308 // is an offset into the template array, which starts with some
|
Chris@169
|
309 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
310 //
|
Chris@169
|
311 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
312 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
313 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
314 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
315 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
316 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
317 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
318 // down in pitch, for a negative pitch shift.
|
Chris@169
|
319
|
Chris@175
|
320 float pshift = 0.f;
|
Chris@175
|
321 if (shiftCount > 1) {
|
Chris@175
|
322 pshift =
|
Chris@175
|
323 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
324 }
|
Chris@169
|
325
|
Chris@169
|
326 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@41
|
327 }
|
Chris@41
|
328
|
Chris@31
|
329 bool
|
Chris@31
|
330 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
331 {
|
Chris@31
|
332 if (channels < getMinChannelCount() ||
|
Chris@31
|
333 channels > getMaxChannelCount()) return false;
|
Chris@31
|
334
|
Chris@31
|
335 if (stepSize != blockSize) {
|
Chris@31
|
336 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
337 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
338 return false;
|
Chris@31
|
339 }
|
Chris@31
|
340
|
Chris@31
|
341 m_blockSize = blockSize;
|
Chris@31
|
342
|
Chris@31
|
343 reset();
|
Chris@31
|
344
|
Chris@31
|
345 return true;
|
Chris@31
|
346 }
|
Chris@31
|
347
|
Chris@31
|
348 void
|
Chris@31
|
349 Silvet::reset()
|
Chris@31
|
350 {
|
Chris@31
|
351 delete m_resampler;
|
Chris@246
|
352 delete m_flattener;
|
Chris@31
|
353 delete m_cq;
|
Chris@31
|
354
|
Chris@31
|
355 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
356 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
357 } else {
|
Chris@31
|
358 m_resampler = 0;
|
Chris@31
|
359 }
|
Chris@31
|
360
|
Chris@246
|
361 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
362 m_flattener->reset();
|
Chris@246
|
363
|
Chris@173
|
364 double minFreq = 27.5;
|
Chris@173
|
365
|
Chris@173
|
366 if (!m_hqMode) {
|
Chris@173
|
367 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
368 // so we can just pad with zeros
|
Chris@173
|
369 minFreq *= 2;
|
Chris@173
|
370 }
|
Chris@173
|
371
|
Chris@154
|
372 CQParameters params(processingSampleRate,
|
Chris@173
|
373 minFreq,
|
Chris@154
|
374 processingSampleRate / 3,
|
Chris@154
|
375 processingBPO);
|
Chris@154
|
376
|
Chris@155
|
377 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
|
Chris@155
|
378 // drops the FFT size to 512 from 1024 and alters
|
Chris@155
|
379 // some other processing parameters, making
|
Chris@155
|
380 // everything much, much slower. Could be a flaw
|
Chris@155
|
381 // in the CQ parameter calculations, must check
|
Chris@154
|
382 params.atomHopFactor = 0.3;
|
Chris@154
|
383 params.threshold = 0.0005;
|
Chris@172
|
384 params.window = CQParameters::Hann;
|
Chris@154
|
385
|
Chris@154
|
386 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
387
|
Chris@165
|
388 m_colsPerSec = m_hqMode ? 50 : 25;
|
Chris@165
|
389
|
Chris@41
|
390 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
391 delete m_postFilter[i];
|
Chris@41
|
392 }
|
Chris@41
|
393 m_postFilter.clear();
|
Chris@176
|
394 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
|
Chris@41
|
395 m_postFilter.push_back(new MedianFilter<double>(3));
|
Chris@41
|
396 }
|
Chris@41
|
397 m_pianoRoll.clear();
|
Chris@246
|
398 m_inputGains.clear();
|
Chris@32
|
399 m_columnCount = 0;
|
Chris@40
|
400 m_startTime = RealTime::zeroTime;
|
Chris@31
|
401 }
|
Chris@31
|
402
|
Chris@31
|
403 Silvet::FeatureSet
|
Chris@31
|
404 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
405 {
|
Chris@40
|
406 if (m_columnCount == 0) {
|
Chris@40
|
407 m_startTime = timestamp;
|
Chris@40
|
408 }
|
Chris@246
|
409
|
Chris@246
|
410 vector<float> flattened(m_blockSize);
|
Chris@246
|
411 float gain = 1.f;
|
Chris@246
|
412 m_flattener->connectInputPort
|
Chris@246
|
413 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
414 m_flattener->connectOutputPort
|
Chris@246
|
415 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
416 m_flattener->connectOutputPort
|
Chris@246
|
417 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
418 m_flattener->process(m_blockSize);
|
Chris@246
|
419
|
Chris@252
|
420 m_inputGains[timestamp] = gain;
|
Chris@40
|
421
|
Chris@31
|
422 vector<double> data;
|
Chris@40
|
423 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
424 double d = flattened[i];
|
Chris@235
|
425 data.push_back(d);
|
Chris@40
|
426 }
|
Chris@31
|
427
|
Chris@31
|
428 if (m_resampler) {
|
Chris@31
|
429 data = m_resampler->process(data.data(), data.size());
|
Chris@31
|
430 }
|
Chris@246
|
431
|
Chris@32
|
432 Grid cqout = m_cq->process(data);
|
Chris@51
|
433 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
434 return fs;
|
Chris@34
|
435 }
|
Chris@34
|
436
|
Chris@34
|
437 Silvet::FeatureSet
|
Chris@34
|
438 Silvet::getRemainingFeatures()
|
Chris@34
|
439 {
|
Chris@145
|
440 Grid cqout = m_cq->getRemainingOutput();
|
Chris@51
|
441 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
442 return fs;
|
Chris@34
|
443 }
|
Chris@34
|
444
|
Chris@34
|
445 Silvet::FeatureSet
|
Chris@34
|
446 Silvet::transcribe(const Grid &cqout)
|
Chris@34
|
447 {
|
Chris@32
|
448 Grid filtered = preProcess(cqout);
|
Chris@31
|
449
|
Chris@32
|
450 FeatureSet fs;
|
Chris@32
|
451
|
Chris@104
|
452 if (filtered.empty()) return fs;
|
Chris@170
|
453
|
Chris@170
|
454 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@104
|
455
|
Chris@178
|
456 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
457 Feature f;
|
Chris@178
|
458 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
459 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
460 }
|
Chris@178
|
461 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
462 }
|
Chris@178
|
463
|
Chris@34
|
464 int width = filtered.size();
|
Chris@34
|
465
|
Chris@164
|
466 int iterations = m_hqMode ? 20 : 10;
|
Chris@34
|
467
|
Chris@170
|
468 //!!! pitches or notes? [terminology]
|
Chris@176
|
469 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
|
Chris@170
|
470
|
Chris@170
|
471 bool wantShifts = m_hqMode && m_fineTuning;
|
Chris@170
|
472 int shiftCount = 1;
|
Chris@170
|
473 if (wantShifts) {
|
Chris@170
|
474 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
475 }
|
Chris@170
|
476
|
Chris@170
|
477 vector<vector<int> > localBestShifts;
|
Chris@170
|
478 if (wantShifts) {
|
Chris@170
|
479 localBestShifts =
|
Chris@176
|
480 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
|
Chris@170
|
481 }
|
Chris@170
|
482
|
Chris@170
|
483 vector<bool> present(width, false);
|
Chris@37
|
484
|
Chris@123
|
485 #pragma omp parallel for
|
Chris@123
|
486 for (int i = 0; i < width; ++i) {
|
Chris@104
|
487
|
Chris@170
|
488 double sum = 0.0;
|
Chris@176
|
489 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@170
|
490 sum += filtered.at(i).at(j);
|
Chris@170
|
491 }
|
Chris@170
|
492 if (sum < 1e-5) continue;
|
Chris@170
|
493
|
Chris@170
|
494 present[i] = true;
|
Chris@170
|
495
|
Chris@170
|
496 EM em(&pack, m_hqMode);
|
Chris@170
|
497
|
Chris@183
|
498 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@213
|
499 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@183
|
500
|
Chris@170
|
501 for (int j = 0; j < iterations; ++j) {
|
Chris@170
|
502 em.iterate(filtered.at(i).data());
|
Chris@37
|
503 }
|
Chris@37
|
504
|
Chris@170
|
505 const float *pitchDist = em.getPitchDistribution();
|
Chris@170
|
506 const float *const *shiftDist = em.getShifts();
|
Chris@37
|
507
|
Chris@176
|
508 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@104
|
509
|
Chris@170
|
510 localPitches[i][j] = pitchDist[j] * sum;
|
Chris@170
|
511
|
Chris@170
|
512 int bestShift = 0;
|
Chris@179
|
513 float bestShiftValue = 0.0;
|
Chris@170
|
514 if (wantShifts) {
|
Chris@170
|
515 for (int k = 0; k < shiftCount; ++k) {
|
Chris@179
|
516 float value = shiftDist[k][j];
|
Chris@179
|
517 if (k == 0 || value > bestShiftValue) {
|
Chris@179
|
518 bestShiftValue = value;
|
Chris@170
|
519 bestShift = k;
|
Chris@170
|
520 }
|
Chris@170
|
521 }
|
Chris@170
|
522 localBestShifts[i][j] = bestShift;
|
Chris@170
|
523 }
|
Chris@123
|
524 }
|
Chris@123
|
525 }
|
Chris@166
|
526
|
Chris@166
|
527 for (int i = 0; i < width; ++i) {
|
Chris@37
|
528
|
Chris@170
|
529 if (!present[i]) {
|
Chris@170
|
530 // silent column
|
Chris@176
|
531 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
532 m_postFilter[j]->push(0.0);
|
Chris@170
|
533 }
|
Chris@168
|
534 m_pianoRoll.push_back(map<int, double>());
|
Chris@170
|
535 if (wantShifts) {
|
Chris@168
|
536 m_pianoRollShifts.push_back(map<int, int>());
|
Chris@168
|
537 }
|
Chris@166
|
538 continue;
|
Chris@166
|
539 }
|
Chris@166
|
540
|
Chris@170
|
541 postProcess(localPitches[i], localBestShifts[i], wantShifts);
|
Chris@166
|
542
|
Chris@168
|
543 FeatureList noteFeatures = noteTrack(shiftCount);
|
Chris@38
|
544
|
Chris@123
|
545 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
546 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
547 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
548 }
|
Chris@34
|
549 }
|
Chris@34
|
550
|
Chris@32
|
551 return fs;
|
Chris@31
|
552 }
|
Chris@31
|
553
|
Chris@32
|
554 Silvet::Grid
|
Chris@32
|
555 Silvet::preProcess(const Grid &in)
|
Chris@32
|
556 {
|
Chris@32
|
557 int width = in.size();
|
Chris@32
|
558
|
Chris@165
|
559 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
560
|
Chris@165
|
561 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
562 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
563
|
Chris@32
|
564 Grid out;
|
Chris@32
|
565
|
Chris@58
|
566 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
567 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
568 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
569 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
570 // size we reduce to in a moment
|
Chris@33
|
571 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
572
|
Chris@176
|
573 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
574
|
Chris@32
|
575 for (int i = 0; i < width; ++i) {
|
Chris@32
|
576
|
Chris@33
|
577 if (m_columnCount < latentColumns) {
|
Chris@33
|
578 ++m_columnCount;
|
Chris@33
|
579 continue;
|
Chris@33
|
580 }
|
Chris@33
|
581
|
Chris@32
|
582 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
583 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
584
|
Chris@32
|
585 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
586
|
Chris@32
|
587 if (select) {
|
Chris@32
|
588 vector<double> inCol = in[i];
|
Chris@176
|
589 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
590
|
Chris@178
|
591 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@178
|
592 // lowest 55 of them.
|
Chris@178
|
593 //
|
Chris@178
|
594 // In draft mode the CQ is an octave shorter, returning
|
Chris@178
|
595 // 540 bins, so we instead pad them with an additional 5
|
Chris@178
|
596 // zeros.
|
Chris@178
|
597 //
|
Chris@178
|
598 // We also need to reverse the column as we go, since the
|
Chris@178
|
599 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
600 // the other way around.
|
Chris@32
|
601
|
Chris@178
|
602 if (m_hqMode) {
|
Chris@178
|
603 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
604 int ix = inCol.size() - j - 55;
|
Chris@178
|
605 outCol[j] = inCol[ix];
|
Chris@178
|
606 }
|
Chris@178
|
607 } else {
|
Chris@178
|
608 for (int j = 0; j < 5; ++j) {
|
Chris@178
|
609 outCol[j] = 0.0;
|
Chris@178
|
610 }
|
Chris@178
|
611 for (int j = 5; j < pack.templateHeight; ++j) {
|
Chris@178
|
612 int ix = inCol.size() - j + 4;
|
Chris@178
|
613 outCol[j] = inCol[ix];
|
Chris@178
|
614 }
|
Chris@46
|
615 }
|
Chris@32
|
616
|
Chris@46
|
617 vector<double> noiseLevel1 =
|
Chris@46
|
618 MedianFilter<double>::filter(40, outCol);
|
Chris@176
|
619 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
620 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
621 }
|
Chris@32
|
622
|
Chris@46
|
623 vector<double> noiseLevel2 =
|
Chris@46
|
624 MedianFilter<double>::filter(40, noiseLevel1);
|
Chris@176
|
625 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
626 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
627 }
|
Chris@32
|
628
|
Chris@165
|
629 out.push_back(outCol);
|
Chris@32
|
630 }
|
Chris@32
|
631
|
Chris@32
|
632 ++m_columnCount;
|
Chris@32
|
633 }
|
Chris@32
|
634
|
Chris@32
|
635 return out;
|
Chris@32
|
636 }
|
Chris@32
|
637
|
Chris@168
|
638 void
|
Chris@170
|
639 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
640 const vector<int> &bestShifts,
|
Chris@170
|
641 bool wantShifts)
|
Chris@166
|
642 {
|
Chris@176
|
643 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
644
|
Chris@41
|
645 vector<double> filtered;
|
Chris@41
|
646
|
Chris@176
|
647 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
648 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
649 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
650 }
|
Chris@41
|
651
|
Chris@41
|
652 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
653
|
Chris@41
|
654 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
655
|
Chris@41
|
656 ValueIndexMap strengths;
|
Chris@166
|
657
|
Chris@176
|
658 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
659 double strength = filtered[j];
|
Chris@183
|
660 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
661 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
662 }
|
Chris@166
|
663
|
Chris@168
|
664 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
665
|
Chris@168
|
666 map<int, double> active;
|
Chris@168
|
667 map<int, int> activeShifts;
|
Chris@168
|
668
|
Chris@183
|
669 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
670
|
Chris@168
|
671 --si;
|
Chris@168
|
672
|
Chris@168
|
673 double strength = si->first;
|
Chris@168
|
674 int j = si->second;
|
Chris@168
|
675
|
Chris@168
|
676 active[j] = strength;
|
Chris@168
|
677
|
Chris@170
|
678 if (wantShifts) {
|
Chris@170
|
679 activeShifts[j] = bestShifts[j];
|
Chris@167
|
680 }
|
Chris@41
|
681 }
|
Chris@41
|
682
|
Chris@168
|
683 m_pianoRoll.push_back(active);
|
Chris@170
|
684
|
Chris@170
|
685 if (wantShifts) {
|
Chris@168
|
686 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
687 }
|
Chris@166
|
688 }
|
Chris@166
|
689
|
Chris@166
|
690 Vamp::Plugin::FeatureList
|
Chris@168
|
691 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
692 {
|
Chris@41
|
693 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
694 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
695 // latest active set but present in the prior set in the piano
|
Chris@41
|
696 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
697 // already, and if they haven't ended, we don't know their
|
Chris@41
|
698 // duration.
|
Chris@41
|
699
|
Chris@168
|
700 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
701
|
Chris@168
|
702 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
703
|
Chris@165
|
704 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
705
|
Chris@165
|
706 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
707 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
708 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
709
|
Chris@41
|
710 FeatureList noteFeatures;
|
Chris@41
|
711
|
Chris@41
|
712 if (width < durationThreshold + 1) {
|
Chris@41
|
713 return noteFeatures;
|
Chris@41
|
714 }
|
Chris@41
|
715
|
Chris@150
|
716 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
717
|
Chris@55
|
718 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
719 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
720
|
Chris@55
|
721 int note = ni->first;
|
Chris@41
|
722
|
Chris@41
|
723 if (active.find(note) != active.end()) {
|
Chris@41
|
724 // the note is still playing
|
Chris@41
|
725 continue;
|
Chris@41
|
726 }
|
Chris@41
|
727
|
Chris@41
|
728 // the note was playing but just ended
|
Chris@41
|
729 int end = width;
|
Chris@41
|
730 int start = end-1;
|
Chris@41
|
731
|
Chris@41
|
732 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
733 --start;
|
Chris@41
|
734 }
|
Chris@41
|
735 ++start;
|
Chris@41
|
736
|
Chris@169
|
737 if ((end - start) < durationThreshold) {
|
Chris@41
|
738 continue;
|
Chris@41
|
739 }
|
Chris@41
|
740
|
Chris@169
|
741 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@41
|
742 }
|
Chris@41
|
743
|
Chris@62
|
744 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
745
|
Chris@41
|
746 return noteFeatures;
|
Chris@41
|
747 }
|
Chris@41
|
748
|
Chris@169
|
749 void
|
Chris@169
|
750 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
751 FeatureList ¬eFeatures)
|
Chris@169
|
752 {
|
Chris@169
|
753 int partStart = start;
|
Chris@169
|
754 int partShift = 0;
|
Chris@169
|
755 int partVelocity = 0;
|
Chris@169
|
756
|
Chris@252
|
757 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
758
|
Chris@169
|
759 for (int i = start; i != end; ++i) {
|
Chris@169
|
760
|
Chris@169
|
761 double strength = m_pianoRoll[i][note];
|
Chris@169
|
762
|
Chris@169
|
763 int shift = 0;
|
Chris@169
|
764
|
Chris@169
|
765 if (shiftCount > 1) {
|
Chris@169
|
766
|
Chris@169
|
767 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
768
|
Chris@169
|
769 if (i == partStart) {
|
Chris@169
|
770 partShift = shift;
|
Chris@169
|
771 }
|
Chris@169
|
772
|
Chris@169
|
773 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
774
|
Chris@169
|
775 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
776
|
Chris@169
|
777 // pitch has changed, emit an intermediate note
|
Chris@252
|
778 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
779 i,
|
Chris@252
|
780 note,
|
Chris@252
|
781 partShift,
|
Chris@252
|
782 shiftCount,
|
Chris@252
|
783 partVelocity));
|
Chris@169
|
784 partStart = i;
|
Chris@169
|
785 partShift = shift;
|
Chris@169
|
786 partVelocity = 0;
|
Chris@169
|
787 }
|
Chris@169
|
788 }
|
Chris@169
|
789
|
Chris@246
|
790 int v = round(strength * 2);
|
Chris@169
|
791 if (v > partVelocity) {
|
Chris@169
|
792 partVelocity = v;
|
Chris@169
|
793 }
|
Chris@169
|
794 }
|
Chris@169
|
795
|
Chris@169
|
796 if (end >= partStart + partThreshold) {
|
Chris@252
|
797 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
798 end,
|
Chris@252
|
799 note,
|
Chris@252
|
800 partShift,
|
Chris@252
|
801 shiftCount,
|
Chris@252
|
802 partVelocity));
|
Chris@169
|
803 }
|
Chris@169
|
804 }
|
Chris@252
|
805
|
Chris@252
|
806 Silvet::Feature
|
Chris@252
|
807 Silvet::makeNoteFeature(int start,
|
Chris@252
|
808 int end,
|
Chris@252
|
809 int note,
|
Chris@252
|
810 int shift,
|
Chris@252
|
811 int shiftCount,
|
Chris@252
|
812 int velocity)
|
Chris@252
|
813 {
|
Chris@252
|
814 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@252
|
815 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@252
|
816
|
Chris@252
|
817 Feature f;
|
Chris@252
|
818
|
Chris@252
|
819 f.hasTimestamp = true;
|
Chris@252
|
820 f.timestamp = RealTime::fromSeconds
|
Chris@252
|
821 (columnDuration * (start - postFilterLatency) + 0.02);
|
Chris@252
|
822
|
Chris@252
|
823 f.hasDuration = true;
|
Chris@252
|
824 f.duration = RealTime::fromSeconds
|
Chris@252
|
825 (columnDuration * (end - start));
|
Chris@252
|
826
|
Chris@252
|
827 f.values.clear();
|
Chris@252
|
828
|
Chris@252
|
829 f.values.push_back
|
Chris@252
|
830 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
831
|
Chris@252
|
832 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
833 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
834 velocity = round(velocity / inputGain);
|
Chris@252
|
835 if (velocity > 127) velocity = 127;
|
Chris@252
|
836 if (velocity < 1) velocity = 1;
|
Chris@252
|
837 f.values.push_back(velocity);
|
Chris@252
|
838
|
Chris@252
|
839 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
840
|
Chris@252
|
841 return f;
|
Chris@252
|
842 }
|
Chris@252
|
843
|
Chris@252
|
844 float
|
Chris@252
|
845 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
846 {
|
Chris@252
|
847 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
848
|
Chris@252
|
849 if (i == m_inputGains.end()) {
|
Chris@252
|
850 if (i != m_inputGains.begin()) {
|
Chris@252
|
851 --i;
|
Chris@252
|
852 } else {
|
Chris@252
|
853 return 1.f; // no data
|
Chris@252
|
854 }
|
Chris@252
|
855 }
|
Chris@252
|
856
|
Chris@252
|
857 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
858
|
Chris@252
|
859 return i->second;
|
Chris@252
|
860 }
|
Chris@252
|
861
|