Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@31
|
24
|
Chris@31
|
25 #include <vector>
|
Chris@31
|
26
|
Chris@32
|
27 #include <cstdio>
|
Chris@32
|
28
|
Chris@31
|
29 using std::vector;
|
Chris@48
|
30 using std::cout;
|
Chris@31
|
31 using std::cerr;
|
Chris@31
|
32 using std::endl;
|
Chris@40
|
33 using Vamp::RealTime;
|
Chris@31
|
34
|
Chris@31
|
// Sample rate (Hz) at which the constant-Q analysis runs; input at any
// other rate is resampled to this in reset()/process().
static const int processingSampleRate = 44100;

// Bins-per-octave value handed to the constant-Q parameters in reset().
static const int processingBPO = 60;

// Inclusive range of input sample rates accepted by initialise().
static const int minInputSampleRate = 100;
static const int maxInputSampleRate = 192000;
|
Chris@272
|
40
|
Chris@31
|
41 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
42 Plugin(inputSampleRate),
|
Chris@161
|
43 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@31
|
44 m_resampler(0),
|
Chris@246
|
45 m_flattener(0),
|
Chris@110
|
46 m_cq(0),
|
Chris@162
|
47 m_hqMode(true),
|
Chris@166
|
48 m_fineTuning(false),
|
Chris@178
|
49 m_instrument(0),
|
Chris@178
|
50 m_colsPerSec(50)
|
Chris@31
|
51 {
|
Chris@31
|
52 }
|
Chris@31
|
53
|
Chris@31
|
54 Silvet::~Silvet()
|
Chris@31
|
55 {
|
Chris@31
|
56 delete m_resampler;
|
Chris@246
|
57 delete m_flattener;
|
Chris@31
|
58 delete m_cq;
|
Chris@41
|
59 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
60 delete m_postFilter[i];
|
Chris@41
|
61 }
|
Chris@31
|
62 }
|
Chris@31
|
63
|
Chris@31
|
64 string
|
Chris@31
|
65 Silvet::getIdentifier() const
|
Chris@31
|
66 {
|
Chris@31
|
67 return "silvet";
|
Chris@31
|
68 }
|
Chris@31
|
69
|
Chris@31
|
70 string
|
Chris@31
|
71 Silvet::getName() const
|
Chris@31
|
72 {
|
Chris@31
|
73 return "Silvet Note Transcription";
|
Chris@31
|
74 }
|
Chris@31
|
75
|
Chris@31
|
76 string
|
Chris@31
|
77 Silvet::getDescription() const
|
Chris@31
|
78 {
|
Chris@191
|
79 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
80 }
|
Chris@31
|
81
|
Chris@31
|
82 string
|
Chris@31
|
83 Silvet::getMaker() const
|
Chris@31
|
84 {
|
Chris@191
|
85 return "Queen Mary, University of London";
|
Chris@31
|
86 }
|
Chris@31
|
87
|
Chris@31
|
88 int
|
Chris@31
|
89 Silvet::getPluginVersion() const
|
Chris@31
|
90 {
|
Chris@31
|
91 return 1;
|
Chris@31
|
92 }
|
Chris@31
|
93
|
Chris@31
|
94 string
|
Chris@31
|
95 Silvet::getCopyright() const
|
Chris@31
|
96 {
|
Chris@191
|
97 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
98 }
|
Chris@31
|
99
|
Chris@31
|
100 Silvet::InputDomain
|
Chris@31
|
101 Silvet::getInputDomain() const
|
Chris@31
|
102 {
|
Chris@31
|
103 return TimeDomain;
|
Chris@31
|
104 }
|
Chris@31
|
105
|
Chris@31
|
106 size_t
|
Chris@31
|
107 Silvet::getPreferredBlockSize() const
|
Chris@31
|
108 {
|
Chris@31
|
109 return 0;
|
Chris@31
|
110 }
|
Chris@31
|
111
|
Chris@31
|
112 size_t
|
Chris@31
|
113 Silvet::getPreferredStepSize() const
|
Chris@31
|
114 {
|
Chris@31
|
115 return 0;
|
Chris@31
|
116 }
|
Chris@31
|
117
|
Chris@31
|
118 size_t
|
Chris@31
|
119 Silvet::getMinChannelCount() const
|
Chris@31
|
120 {
|
Chris@31
|
121 return 1;
|
Chris@31
|
122 }
|
Chris@31
|
123
|
Chris@31
|
124 size_t
|
Chris@31
|
125 Silvet::getMaxChannelCount() const
|
Chris@31
|
126 {
|
Chris@31
|
127 return 1;
|
Chris@31
|
128 }
|
Chris@31
|
129
|
Chris@31
|
130 Silvet::ParameterList
|
Chris@31
|
131 Silvet::getParameterDescriptors() const
|
Chris@31
|
132 {
|
Chris@31
|
133 ParameterList list;
|
Chris@110
|
134
|
Chris@110
|
135 ParameterDescriptor desc;
|
Chris@110
|
136 desc.identifier = "mode";
|
Chris@110
|
137 desc.name = "Processing mode";
|
Chris@110
|
138 desc.unit = "";
|
Chris@271
|
139 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode modifies a number of internal parameters in favour of speed. Intensive mode (the default) will almost always produce better results.";
|
Chris@110
|
140 desc.minValue = 0;
|
Chris@110
|
141 desc.maxValue = 1;
|
Chris@113
|
142 desc.defaultValue = 1;
|
Chris@110
|
143 desc.isQuantized = true;
|
Chris@110
|
144 desc.quantizeStep = 1;
|
Chris@166
|
145 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
146 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@161
|
147 list.push_back(desc);
|
Chris@161
|
148
|
Chris@176
|
149 desc.identifier = "instrument";
|
Chris@176
|
150 desc.name = "Instrument";
|
Chris@161
|
151 desc.unit = "";
|
Chris@271
|
152 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
153 desc.minValue = 0;
|
Chris@162
|
154 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
155 desc.defaultValue = 0;
|
Chris@161
|
156 desc.isQuantized = true;
|
Chris@161
|
157 desc.quantizeStep = 1;
|
Chris@161
|
158 desc.valueNames.clear();
|
Chris@162
|
159 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
160 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
161 }
|
Chris@166
|
162 list.push_back(desc);
|
Chris@161
|
163
|
Chris@166
|
164 desc.identifier = "finetune";
|
Chris@166
|
165 desc.name = "Return fine pitch estimates";
|
Chris@166
|
166 desc.unit = "";
|
Chris@271
|
167 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
168 desc.minValue = 0;
|
Chris@166
|
169 desc.maxValue = 1;
|
Chris@166
|
170 desc.defaultValue = 0;
|
Chris@166
|
171 desc.isQuantized = true;
|
Chris@166
|
172 desc.quantizeStep = 1;
|
Chris@166
|
173 desc.valueNames.clear();
|
Chris@110
|
174 list.push_back(desc);
|
Chris@110
|
175
|
Chris@31
|
176 return list;
|
Chris@31
|
177 }
|
Chris@31
|
178
|
Chris@31
|
179 float
|
Chris@31
|
180 Silvet::getParameter(string identifier) const
|
Chris@31
|
181 {
|
Chris@110
|
182 if (identifier == "mode") {
|
Chris@110
|
183 return m_hqMode ? 1.f : 0.f;
|
Chris@166
|
184 } else if (identifier == "finetune") {
|
Chris@166
|
185 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
186 } else if (identifier == "instrument") {
|
Chris@162
|
187 return m_instrument;
|
Chris@110
|
188 }
|
Chris@31
|
189 return 0;
|
Chris@31
|
190 }
|
Chris@31
|
191
|
Chris@31
|
192 void
|
Chris@31
|
193 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
194 {
|
Chris@110
|
195 if (identifier == "mode") {
|
Chris@110
|
196 m_hqMode = (value > 0.5);
|
Chris@166
|
197 } else if (identifier == "finetune") {
|
Chris@166
|
198 m_fineTuning = (value > 0.5);
|
Chris@176
|
199 } else if (identifier == "instrument") {
|
Chris@162
|
200 m_instrument = lrintf(value);
|
Chris@110
|
201 }
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 Silvet::ProgramList
|
Chris@31
|
205 Silvet::getPrograms() const
|
Chris@31
|
206 {
|
Chris@31
|
207 ProgramList list;
|
Chris@31
|
208 return list;
|
Chris@31
|
209 }
|
Chris@31
|
210
|
Chris@31
|
211 string
|
Chris@31
|
212 Silvet::getCurrentProgram() const
|
Chris@31
|
213 {
|
Chris@31
|
214 return "";
|
Chris@31
|
215 }
|
Chris@31
|
216
|
Chris@31
|
217 void
|
Chris@31
|
218 Silvet::selectProgram(string name)
|
Chris@31
|
219 {
|
Chris@31
|
220 }
|
Chris@31
|
221
|
Chris@31
|
222 Silvet::OutputList
|
Chris@31
|
223 Silvet::getOutputDescriptors() const
|
Chris@31
|
224 {
|
Chris@31
|
225 OutputList list;
|
Chris@31
|
226
|
Chris@31
|
227 OutputDescriptor d;
|
Chris@51
|
228 d.identifier = "notes";
|
Chris@51
|
229 d.name = "Note transcription";
|
Chris@271
|
230 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
231 d.unit = "Hz";
|
Chris@31
|
232 d.hasFixedBinCount = true;
|
Chris@31
|
233 d.binCount = 2;
|
Chris@41
|
234 d.binNames.push_back("Frequency");
|
Chris@31
|
235 d.binNames.push_back("Velocity");
|
Chris@31
|
236 d.hasKnownExtents = false;
|
Chris@31
|
237 d.isQuantized = false;
|
Chris@31
|
238 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
239 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
240 d.hasDuration = true;
|
Chris@32
|
241 m_notesOutputNo = list.size();
|
Chris@32
|
242 list.push_back(d);
|
Chris@32
|
243
|
Chris@178
|
244 d.identifier = "timefreq";
|
Chris@178
|
245 d.name = "Time-frequency distribution";
|
Chris@271
|
246 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
247 d.unit = "";
|
Chris@178
|
248 d.hasFixedBinCount = true;
|
Chris@178
|
249 d.binCount = m_instruments[0].templateHeight;
|
Chris@178
|
250 d.binNames.clear();
|
Chris@178
|
251 if (m_cq) {
|
Chris@178
|
252 char name[20];
|
Chris@178
|
253 for (int i = 0; i < m_instruments[0].templateHeight; ++i) {
|
Chris@178
|
254 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
255 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
256 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
257 // frequency though, so these are still the first 545 bins
|
Chris@178
|
258 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
259 float freq = m_cq->getBinFrequency
|
Chris@178
|
260 (m_instruments[0].templateHeight - i - 1);
|
Chris@178
|
261 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
262 d.binNames.push_back(name);
|
Chris@178
|
263 }
|
Chris@178
|
264 }
|
Chris@178
|
265 d.hasKnownExtents = false;
|
Chris@178
|
266 d.isQuantized = false;
|
Chris@178
|
267 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
268 d.sampleRate = m_colsPerSec;
|
Chris@178
|
269 d.hasDuration = false;
|
Chris@178
|
270 m_fcqOutputNo = list.size();
|
Chris@178
|
271 list.push_back(d);
|
Chris@178
|
272
|
Chris@31
|
273 return list;
|
Chris@31
|
274 }
|
Chris@31
|
275
|
Chris@38
|
276 std::string
|
Chris@175
|
277 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@38
|
278 {
|
Chris@38
|
279 static const char *names[] = {
|
Chris@38
|
280 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
281 };
|
Chris@38
|
282
|
Chris@175
|
283 const char *n = names[note % 12];
|
Chris@38
|
284
|
Chris@175
|
285 int oct = (note + 9) / 12;
|
Chris@38
|
286
|
Chris@175
|
287 char buf[30];
|
Chris@175
|
288
|
Chris@175
|
289 float pshift = 0.f;
|
Chris@175
|
290 if (shiftCount > 1) {
|
Chris@175
|
291 // see noteFrequency below
|
Chris@175
|
292 pshift =
|
Chris@175
|
293 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
294 }
|
Chris@175
|
295
|
Chris@175
|
296 if (pshift > 0.f) {
|
Chris@175
|
297 sprintf(buf, "%s%d+%dc", n, oct, int(round(pshift * 100)));
|
Chris@175
|
298 } else if (pshift < 0.f) {
|
Chris@175
|
299 sprintf(buf, "%s%d-%dc", n, oct, int(round((-pshift) * 100)));
|
Chris@175
|
300 } else {
|
Chris@175
|
301 sprintf(buf, "%s%d", n, oct);
|
Chris@175
|
302 }
|
Chris@38
|
303
|
Chris@38
|
304 return buf;
|
Chris@38
|
305 }
|
Chris@38
|
306
|
Chris@41
|
307 float
|
Chris@168
|
308 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
309 {
|
Chris@169
|
310 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
311 // is an offset into the template array, which starts with some
|
Chris@169
|
312 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
313 //
|
Chris@169
|
314 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
315 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
316 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
317 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
318 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
319 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
320 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
321 // down in pitch, for a negative pitch shift.
|
Chris@169
|
322
|
Chris@175
|
323 float pshift = 0.f;
|
Chris@175
|
324 if (shiftCount > 1) {
|
Chris@175
|
325 pshift =
|
Chris@175
|
326 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
327 }
|
Chris@169
|
328
|
Chris@169
|
329 return float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@41
|
330 }
|
Chris@41
|
331
|
Chris@31
|
332 bool
|
Chris@31
|
333 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
334 {
|
Chris@272
|
335 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
336 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
337 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
338 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
339 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
340 return false;
|
Chris@272
|
341 }
|
Chris@272
|
342
|
Chris@31
|
343 if (channels < getMinChannelCount() ||
|
Chris@272
|
344 channels > getMaxChannelCount()) {
|
Chris@272
|
345 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
346 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
347 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
348 return false;
|
Chris@272
|
349 }
|
Chris@31
|
350
|
Chris@31
|
351 if (stepSize != blockSize) {
|
Chris@31
|
352 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
353 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
354 return false;
|
Chris@31
|
355 }
|
Chris@31
|
356
|
Chris@31
|
357 m_blockSize = blockSize;
|
Chris@31
|
358
|
Chris@31
|
359 reset();
|
Chris@31
|
360
|
Chris@31
|
361 return true;
|
Chris@31
|
362 }
|
Chris@31
|
363
|
Chris@31
|
364 void
|
Chris@31
|
365 Silvet::reset()
|
Chris@31
|
366 {
|
Chris@31
|
367 delete m_resampler;
|
Chris@246
|
368 delete m_flattener;
|
Chris@31
|
369 delete m_cq;
|
Chris@31
|
370
|
Chris@31
|
371 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
372 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
373 } else {
|
Chris@31
|
374 m_resampler = 0;
|
Chris@31
|
375 }
|
Chris@31
|
376
|
Chris@246
|
377 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
378 m_flattener->reset();
|
Chris@246
|
379
|
Chris@173
|
380 double minFreq = 27.5;
|
Chris@173
|
381
|
Chris@173
|
382 if (!m_hqMode) {
|
Chris@173
|
383 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
384 // so we can just pad with zeros
|
Chris@173
|
385 minFreq *= 2;
|
Chris@173
|
386 }
|
Chris@173
|
387
|
Chris@154
|
388 CQParameters params(processingSampleRate,
|
Chris@173
|
389 minFreq,
|
Chris@154
|
390 processingSampleRate / 3,
|
Chris@154
|
391 processingBPO);
|
Chris@154
|
392
|
Chris@155
|
393 params.q = 0.95; // MIREX code uses 0.8, but it seems 0.9 or lower
|
Chris@155
|
394 // drops the FFT size to 512 from 1024 and alters
|
Chris@155
|
395 // some other processing parameters, making
|
Chris@155
|
396 // everything much, much slower. Could be a flaw
|
Chris@155
|
397 // in the CQ parameter calculations, must check
|
Chris@154
|
398 params.atomHopFactor = 0.3;
|
Chris@154
|
399 params.threshold = 0.0005;
|
Chris@172
|
400 params.window = CQParameters::Hann;
|
Chris@154
|
401
|
Chris@154
|
402 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
403
|
Chris@165
|
404 m_colsPerSec = m_hqMode ? 50 : 25;
|
Chris@165
|
405
|
Chris@41
|
406 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
407 delete m_postFilter[i];
|
Chris@41
|
408 }
|
Chris@41
|
409 m_postFilter.clear();
|
Chris@176
|
410 for (int i = 0; i < m_instruments[0].templateNoteCount; ++i) {
|
Chris@41
|
411 m_postFilter.push_back(new MedianFilter<double>(3));
|
Chris@41
|
412 }
|
Chris@41
|
413 m_pianoRoll.clear();
|
Chris@246
|
414 m_inputGains.clear();
|
Chris@32
|
415 m_columnCount = 0;
|
Chris@272
|
416 m_resampledCount = 0;
|
Chris@40
|
417 m_startTime = RealTime::zeroTime;
|
Chris@31
|
418 }
|
Chris@31
|
419
|
Chris@31
|
420 Silvet::FeatureSet
|
Chris@31
|
421 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
422 {
|
Chris@40
|
423 if (m_columnCount == 0) {
|
Chris@40
|
424 m_startTime = timestamp;
|
Chris@40
|
425 }
|
Chris@246
|
426
|
Chris@246
|
427 vector<float> flattened(m_blockSize);
|
Chris@246
|
428 float gain = 1.f;
|
Chris@246
|
429 m_flattener->connectInputPort
|
Chris@246
|
430 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
431 m_flattener->connectOutputPort
|
Chris@246
|
432 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
433 m_flattener->connectOutputPort
|
Chris@246
|
434 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
435 m_flattener->process(m_blockSize);
|
Chris@246
|
436
|
Chris@252
|
437 m_inputGains[timestamp] = gain;
|
Chris@40
|
438
|
Chris@31
|
439 vector<double> data;
|
Chris@40
|
440 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
441 double d = flattened[i];
|
Chris@235
|
442 data.push_back(d);
|
Chris@40
|
443 }
|
Chris@31
|
444
|
Chris@31
|
445 if (m_resampler) {
|
Chris@272
|
446
|
Chris@31
|
447 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
448
|
Chris@272
|
449 int hadCount = m_resampledCount;
|
Chris@272
|
450 m_resampledCount += data.size();
|
Chris@272
|
451
|
Chris@272
|
452 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
453
|
Chris@272
|
454 if (hadCount < resamplerLatency) {
|
Chris@272
|
455 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
456 if (stillToDrop >= int(data.size())) {
|
Chris@272
|
457 return FeatureSet();
|
Chris@272
|
458 } else {
|
Chris@272
|
459 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
460 }
|
Chris@272
|
461 }
|
Chris@31
|
462 }
|
Chris@272
|
463
|
Chris@32
|
464 Grid cqout = m_cq->process(data);
|
Chris@51
|
465 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
466 return fs;
|
Chris@34
|
467 }
|
Chris@34
|
468
|
Chris@34
|
469 Silvet::FeatureSet
|
Chris@34
|
470 Silvet::getRemainingFeatures()
|
Chris@34
|
471 {
|
Chris@145
|
472 Grid cqout = m_cq->getRemainingOutput();
|
Chris@51
|
473 FeatureSet fs = transcribe(cqout);
|
Chris@51
|
474 return fs;
|
Chris@34
|
475 }
|
Chris@34
|
476
|
Chris@34
|
477 Silvet::FeatureSet
|
Chris@34
|
478 Silvet::transcribe(const Grid &cqout)
|
Chris@34
|
479 {
|
Chris@32
|
480 Grid filtered = preProcess(cqout);
|
Chris@31
|
481
|
Chris@32
|
482 FeatureSet fs;
|
Chris@32
|
483
|
Chris@104
|
484 if (filtered.empty()) return fs;
|
Chris@170
|
485
|
Chris@170
|
486 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@104
|
487
|
Chris@178
|
488 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
489 Feature f;
|
Chris@178
|
490 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
491 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
492 }
|
Chris@178
|
493 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
494 }
|
Chris@178
|
495
|
Chris@34
|
496 int width = filtered.size();
|
Chris@34
|
497
|
Chris@164
|
498 int iterations = m_hqMode ? 20 : 10;
|
Chris@34
|
499
|
Chris@176
|
500 Grid localPitches(width, vector<double>(pack.templateNoteCount, 0.0));
|
Chris@170
|
501
|
Chris@170
|
502 bool wantShifts = m_hqMode && m_fineTuning;
|
Chris@170
|
503 int shiftCount = 1;
|
Chris@170
|
504 if (wantShifts) {
|
Chris@170
|
505 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
506 }
|
Chris@170
|
507
|
Chris@170
|
508 vector<vector<int> > localBestShifts;
|
Chris@170
|
509 if (wantShifts) {
|
Chris@170
|
510 localBestShifts =
|
Chris@176
|
511 vector<vector<int> >(width, vector<int>(pack.templateNoteCount, 0));
|
Chris@170
|
512 }
|
Chris@170
|
513
|
Chris@170
|
514 vector<bool> present(width, false);
|
Chris@37
|
515
|
Chris@123
|
516 #pragma omp parallel for
|
Chris@123
|
517 for (int i = 0; i < width; ++i) {
|
Chris@104
|
518
|
Chris@170
|
519 double sum = 0.0;
|
Chris@176
|
520 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@170
|
521 sum += filtered.at(i).at(j);
|
Chris@170
|
522 }
|
Chris@170
|
523 if (sum < 1e-5) continue;
|
Chris@170
|
524
|
Chris@170
|
525 present[i] = true;
|
Chris@170
|
526
|
Chris@170
|
527 EM em(&pack, m_hqMode);
|
Chris@170
|
528
|
Chris@183
|
529 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@213
|
530 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@183
|
531
|
Chris@170
|
532 for (int j = 0; j < iterations; ++j) {
|
Chris@170
|
533 em.iterate(filtered.at(i).data());
|
Chris@37
|
534 }
|
Chris@37
|
535
|
Chris@170
|
536 const float *pitchDist = em.getPitchDistribution();
|
Chris@170
|
537 const float *const *shiftDist = em.getShifts();
|
Chris@37
|
538
|
Chris@176
|
539 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@104
|
540
|
Chris@170
|
541 localPitches[i][j] = pitchDist[j] * sum;
|
Chris@170
|
542
|
Chris@170
|
543 int bestShift = 0;
|
Chris@179
|
544 float bestShiftValue = 0.0;
|
Chris@170
|
545 if (wantShifts) {
|
Chris@170
|
546 for (int k = 0; k < shiftCount; ++k) {
|
Chris@179
|
547 float value = shiftDist[k][j];
|
Chris@179
|
548 if (k == 0 || value > bestShiftValue) {
|
Chris@179
|
549 bestShiftValue = value;
|
Chris@170
|
550 bestShift = k;
|
Chris@170
|
551 }
|
Chris@170
|
552 }
|
Chris@170
|
553 localBestShifts[i][j] = bestShift;
|
Chris@170
|
554 }
|
Chris@123
|
555 }
|
Chris@123
|
556 }
|
Chris@166
|
557
|
Chris@166
|
558 for (int i = 0; i < width; ++i) {
|
Chris@37
|
559
|
Chris@170
|
560 if (!present[i]) {
|
Chris@170
|
561 // silent column
|
Chris@176
|
562 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
563 m_postFilter[j]->push(0.0);
|
Chris@170
|
564 }
|
Chris@168
|
565 m_pianoRoll.push_back(map<int, double>());
|
Chris@170
|
566 if (wantShifts) {
|
Chris@168
|
567 m_pianoRollShifts.push_back(map<int, int>());
|
Chris@168
|
568 }
|
Chris@166
|
569 continue;
|
Chris@166
|
570 }
|
Chris@166
|
571
|
Chris@170
|
572 postProcess(localPitches[i], localBestShifts[i], wantShifts);
|
Chris@166
|
573
|
Chris@168
|
574 FeatureList noteFeatures = noteTrack(shiftCount);
|
Chris@38
|
575
|
Chris@123
|
576 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
577 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
578 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
579 }
|
Chris@34
|
580 }
|
Chris@34
|
581
|
Chris@32
|
582 return fs;
|
Chris@31
|
583 }
|
Chris@31
|
584
|
Chris@32
|
585 Silvet::Grid
|
Chris@32
|
586 Silvet::preProcess(const Grid &in)
|
Chris@32
|
587 {
|
Chris@32
|
588 int width = in.size();
|
Chris@32
|
589
|
Chris@165
|
590 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
591
|
Chris@165
|
592 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
593 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
594
|
Chris@32
|
595 Grid out;
|
Chris@32
|
596
|
Chris@58
|
597 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
598 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
599 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
600 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
601 // size we reduce to in a moment
|
Chris@33
|
602 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
603
|
Chris@176
|
604 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
605
|
Chris@32
|
606 for (int i = 0; i < width; ++i) {
|
Chris@32
|
607
|
Chris@33
|
608 if (m_columnCount < latentColumns) {
|
Chris@33
|
609 ++m_columnCount;
|
Chris@33
|
610 continue;
|
Chris@33
|
611 }
|
Chris@33
|
612
|
Chris@32
|
613 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
614 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
615
|
Chris@32
|
616 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
617
|
Chris@32
|
618 if (select) {
|
Chris@32
|
619 vector<double> inCol = in[i];
|
Chris@176
|
620 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
621
|
Chris@178
|
622 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@178
|
623 // lowest 55 of them.
|
Chris@178
|
624 //
|
Chris@178
|
625 // In draft mode the CQ is an octave shorter, returning
|
Chris@178
|
626 // 540 bins, so we instead pad them with an additional 5
|
Chris@178
|
627 // zeros.
|
Chris@178
|
628 //
|
Chris@178
|
629 // We also need to reverse the column as we go, since the
|
Chris@178
|
630 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
631 // the other way around.
|
Chris@32
|
632
|
Chris@178
|
633 if (m_hqMode) {
|
Chris@178
|
634 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
635 int ix = inCol.size() - j - 55;
|
Chris@178
|
636 outCol[j] = inCol[ix];
|
Chris@178
|
637 }
|
Chris@178
|
638 } else {
|
Chris@178
|
639 for (int j = 0; j < 5; ++j) {
|
Chris@178
|
640 outCol[j] = 0.0;
|
Chris@178
|
641 }
|
Chris@178
|
642 for (int j = 5; j < pack.templateHeight; ++j) {
|
Chris@178
|
643 int ix = inCol.size() - j + 4;
|
Chris@178
|
644 outCol[j] = inCol[ix];
|
Chris@178
|
645 }
|
Chris@46
|
646 }
|
Chris@32
|
647
|
Chris@46
|
648 vector<double> noiseLevel1 =
|
Chris@46
|
649 MedianFilter<double>::filter(40, outCol);
|
Chris@176
|
650 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
651 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
652 }
|
Chris@32
|
653
|
Chris@46
|
654 vector<double> noiseLevel2 =
|
Chris@46
|
655 MedianFilter<double>::filter(40, noiseLevel1);
|
Chris@176
|
656 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
657 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
658 }
|
Chris@32
|
659
|
Chris@165
|
660 out.push_back(outCol);
|
Chris@32
|
661 }
|
Chris@32
|
662
|
Chris@32
|
663 ++m_columnCount;
|
Chris@32
|
664 }
|
Chris@32
|
665
|
Chris@32
|
666 return out;
|
Chris@32
|
667 }
|
Chris@32
|
668
|
Chris@168
|
669 void
|
Chris@170
|
670 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
671 const vector<int> &bestShifts,
|
Chris@170
|
672 bool wantShifts)
|
Chris@166
|
673 {
|
Chris@176
|
674 const InstrumentPack &pack = m_instruments[m_instrument];
|
Chris@176
|
675
|
Chris@41
|
676 vector<double> filtered;
|
Chris@41
|
677
|
Chris@176
|
678 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
679 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
680 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
681 }
|
Chris@41
|
682
|
Chris@41
|
683 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
684
|
Chris@41
|
685 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
686
|
Chris@41
|
687 ValueIndexMap strengths;
|
Chris@166
|
688
|
Chris@176
|
689 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
690 double strength = filtered[j];
|
Chris@183
|
691 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
692 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
693 }
|
Chris@166
|
694
|
Chris@168
|
695 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
696
|
Chris@168
|
697 map<int, double> active;
|
Chris@168
|
698 map<int, int> activeShifts;
|
Chris@168
|
699
|
Chris@183
|
700 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
701
|
Chris@168
|
702 --si;
|
Chris@168
|
703
|
Chris@168
|
704 double strength = si->first;
|
Chris@168
|
705 int j = si->second;
|
Chris@168
|
706
|
Chris@168
|
707 active[j] = strength;
|
Chris@168
|
708
|
Chris@170
|
709 if (wantShifts) {
|
Chris@170
|
710 activeShifts[j] = bestShifts[j];
|
Chris@167
|
711 }
|
Chris@41
|
712 }
|
Chris@41
|
713
|
Chris@168
|
714 m_pianoRoll.push_back(active);
|
Chris@170
|
715
|
Chris@170
|
716 if (wantShifts) {
|
Chris@168
|
717 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
718 }
|
Chris@166
|
719 }
|
Chris@166
|
720
|
Chris@166
|
721 Vamp::Plugin::FeatureList
|
Chris@168
|
722 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
723 {
|
Chris@41
|
724 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
725 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
726 // latest active set but present in the prior set in the piano
|
Chris@41
|
727 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
728 // already, and if they haven't ended, we don't know their
|
Chris@41
|
729 // duration.
|
Chris@41
|
730
|
Chris@168
|
731 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
732
|
Chris@168
|
733 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
734
|
Chris@165
|
735 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
736
|
Chris@165
|
737 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
738 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
739 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
740
|
Chris@41
|
741 FeatureList noteFeatures;
|
Chris@41
|
742
|
Chris@41
|
743 if (width < durationThreshold + 1) {
|
Chris@41
|
744 return noteFeatures;
|
Chris@41
|
745 }
|
Chris@41
|
746
|
Chris@150
|
747 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
748
|
Chris@55
|
749 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
750 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
751
|
Chris@55
|
752 int note = ni->first;
|
Chris@41
|
753
|
Chris@41
|
754 if (active.find(note) != active.end()) {
|
Chris@41
|
755 // the note is still playing
|
Chris@41
|
756 continue;
|
Chris@41
|
757 }
|
Chris@41
|
758
|
Chris@41
|
759 // the note was playing but just ended
|
Chris@41
|
760 int end = width;
|
Chris@41
|
761 int start = end-1;
|
Chris@41
|
762
|
Chris@41
|
763 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
764 --start;
|
Chris@41
|
765 }
|
Chris@41
|
766 ++start;
|
Chris@41
|
767
|
Chris@169
|
768 if ((end - start) < durationThreshold) {
|
Chris@41
|
769 continue;
|
Chris@41
|
770 }
|
Chris@41
|
771
|
Chris@169
|
772 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@41
|
773 }
|
Chris@41
|
774
|
Chris@62
|
775 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
776
|
Chris@41
|
777 return noteFeatures;
|
Chris@41
|
778 }
|
Chris@41
|
779
|
Chris@169
|
780 void
|
Chris@169
|
781 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
782 FeatureList ¬eFeatures)
|
Chris@169
|
783 {
|
Chris@169
|
784 int partStart = start;
|
Chris@169
|
785 int partShift = 0;
|
Chris@169
|
786 int partVelocity = 0;
|
Chris@169
|
787
|
Chris@252
|
788 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
789
|
Chris@169
|
790 for (int i = start; i != end; ++i) {
|
Chris@169
|
791
|
Chris@169
|
792 double strength = m_pianoRoll[i][note];
|
Chris@169
|
793
|
Chris@169
|
794 int shift = 0;
|
Chris@169
|
795
|
Chris@169
|
796 if (shiftCount > 1) {
|
Chris@169
|
797
|
Chris@169
|
798 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
799
|
Chris@169
|
800 if (i == partStart) {
|
Chris@169
|
801 partShift = shift;
|
Chris@169
|
802 }
|
Chris@169
|
803
|
Chris@169
|
804 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
805
|
Chris@169
|
806 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
807
|
Chris@169
|
808 // pitch has changed, emit an intermediate note
|
Chris@252
|
809 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
810 i,
|
Chris@252
|
811 note,
|
Chris@252
|
812 partShift,
|
Chris@252
|
813 shiftCount,
|
Chris@252
|
814 partVelocity));
|
Chris@169
|
815 partStart = i;
|
Chris@169
|
816 partShift = shift;
|
Chris@169
|
817 partVelocity = 0;
|
Chris@169
|
818 }
|
Chris@169
|
819 }
|
Chris@169
|
820
|
Chris@246
|
821 int v = round(strength * 2);
|
Chris@169
|
822 if (v > partVelocity) {
|
Chris@169
|
823 partVelocity = v;
|
Chris@169
|
824 }
|
Chris@169
|
825 }
|
Chris@169
|
826
|
Chris@169
|
827 if (end >= partStart + partThreshold) {
|
Chris@252
|
828 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
829 end,
|
Chris@252
|
830 note,
|
Chris@252
|
831 partShift,
|
Chris@252
|
832 shiftCount,
|
Chris@252
|
833 partVelocity));
|
Chris@169
|
834 }
|
Chris@169
|
835 }
|
Chris@252
|
836
|
Chris@252
|
837 Silvet::Feature
|
Chris@252
|
838 Silvet::makeNoteFeature(int start,
|
Chris@252
|
839 int end,
|
Chris@252
|
840 int note,
|
Chris@252
|
841 int shift,
|
Chris@252
|
842 int shiftCount,
|
Chris@252
|
843 int velocity)
|
Chris@252
|
844 {
|
Chris@252
|
845 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@252
|
846 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@252
|
847
|
Chris@252
|
848 Feature f;
|
Chris@252
|
849
|
Chris@252
|
850 f.hasTimestamp = true;
|
Chris@285
|
851 f.timestamp = m_startTime + RealTime::fromSeconds
|
Chris@252
|
852 (columnDuration * (start - postFilterLatency) + 0.02);
|
Chris@252
|
853
|
Chris@252
|
854 f.hasDuration = true;
|
Chris@252
|
855 f.duration = RealTime::fromSeconds
|
Chris@252
|
856 (columnDuration * (end - start));
|
Chris@252
|
857
|
Chris@252
|
858 f.values.clear();
|
Chris@252
|
859
|
Chris@252
|
860 f.values.push_back
|
Chris@252
|
861 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
862
|
Chris@252
|
863 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
864 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
865 velocity = round(velocity / inputGain);
|
Chris@252
|
866 if (velocity > 127) velocity = 127;
|
Chris@252
|
867 if (velocity < 1) velocity = 1;
|
Chris@252
|
868 f.values.push_back(velocity);
|
Chris@252
|
869
|
Chris@252
|
870 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
871
|
Chris@252
|
872 return f;
|
Chris@252
|
873 }
|
Chris@252
|
874
|
Chris@252
|
875 float
|
Chris@252
|
876 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
877 {
|
Chris@252
|
878 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
879
|
Chris@252
|
880 if (i == m_inputGains.end()) {
|
Chris@252
|
881 if (i != m_inputGains.begin()) {
|
Chris@252
|
882 --i;
|
Chris@252
|
883 } else {
|
Chris@252
|
884 return 1.f; // no data
|
Chris@252
|
885 }
|
Chris@252
|
886 }
|
Chris@252
|
887
|
Chris@252
|
888 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
889
|
Chris@252
|
890 return i->second;
|
Chris@252
|
891 }
|
Chris@252
|
892
|