Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
40 static int processingSampleRate = 44100;
|
Chris@298
|
41
|
Chris@298
|
42 static int binsPerSemitoneLive = 1;
|
Chris@298
|
43 static int binsPerSemitoneNormal = 5;
|
Chris@170
|
44
|
Chris@272
|
45 static int minInputSampleRate = 100;
|
Chris@272
|
46 static int maxInputSampleRate = 192000;
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@329
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated fundamental frequency, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@319
|
256 d.identifier = "onsets";
|
Chris@319
|
257 d.name = "Note onsets";
|
Chris@323
|
258 d.description = "Note onsets, without durations. These can be calculated sooner than complete notes, because it isn't necessary to wait for a note to finish before returning its feature. Each event has time, estimated fundamental frequency in Hz, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@319
|
259 d.unit = "Hz";
|
Chris@319
|
260 d.hasFixedBinCount = true;
|
Chris@319
|
261 d.binCount = 2;
|
Chris@319
|
262 d.binNames.push_back("Frequency");
|
Chris@319
|
263 d.binNames.push_back("Velocity");
|
Chris@319
|
264 d.hasKnownExtents = false;
|
Chris@319
|
265 d.isQuantized = false;
|
Chris@319
|
266 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@319
|
267 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@319
|
268 d.hasDuration = false;
|
Chris@319
|
269 m_onsetsOutputNo = list.size();
|
Chris@319
|
270 list.push_back(d);
|
Chris@319
|
271
|
Chris@178
|
272 d.identifier = "timefreq";
|
Chris@178
|
273 d.name = "Time-frequency distribution";
|
Chris@271
|
274 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
275 d.unit = "";
|
Chris@178
|
276 d.hasFixedBinCount = true;
|
Chris@298
|
277 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
278 d.binNames.clear();
|
Chris@178
|
279 if (m_cq) {
|
Chris@294
|
280 char name[50];
|
Chris@298
|
281 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
282 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
283 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
284 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
285 // frequency though, so these are still the first 545 bins
|
Chris@178
|
286 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
287 float freq = m_cq->getBinFrequency
|
Chris@298
|
288 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
289 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
290 d.binNames.push_back(name);
|
Chris@178
|
291 }
|
Chris@178
|
292 }
|
Chris@178
|
293 d.hasKnownExtents = false;
|
Chris@178
|
294 d.isQuantized = false;
|
Chris@178
|
295 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
296 d.sampleRate = m_colsPerSec;
|
Chris@178
|
297 d.hasDuration = false;
|
Chris@178
|
298 m_fcqOutputNo = list.size();
|
Chris@178
|
299 list.push_back(d);
|
Chris@178
|
300
|
Chris@294
|
301 d.identifier = "pitchactivation";
|
Chris@294
|
302 d.name = "Pitch activation distribution";
|
Chris@294
|
303 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
304 d.unit = "";
|
Chris@294
|
305 d.hasFixedBinCount = true;
|
Chris@298
|
306 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
307 d.binNames.clear();
|
Chris@294
|
308 if (m_cq) {
|
Chris@298
|
309 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@320
|
310 d.binNames.push_back(getNoteName(i, 0, 1));
|
Chris@294
|
311 }
|
Chris@294
|
312 }
|
Chris@294
|
313 d.hasKnownExtents = false;
|
Chris@294
|
314 d.isQuantized = false;
|
Chris@294
|
315 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
316 d.sampleRate = m_colsPerSec;
|
Chris@294
|
317 d.hasDuration = false;
|
Chris@294
|
318 m_pitchOutputNo = list.size();
|
Chris@294
|
319 list.push_back(d);
|
Chris@294
|
320
|
Chris@309
|
321 d.identifier = "chroma";
|
Chris@309
|
322 d.name = "Pitch chroma distribution";
|
Chris@309
|
323 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
324 d.unit = "";
|
Chris@309
|
325 d.hasFixedBinCount = true;
|
Chris@309
|
326 d.binCount = 12;
|
Chris@309
|
327 d.binNames.clear();
|
Chris@309
|
328 if (m_cq) {
|
Chris@309
|
329 for (int i = 0; i < 12; ++i) {
|
Chris@320
|
330 d.binNames.push_back(getChromaName(i));
|
Chris@309
|
331 }
|
Chris@309
|
332 }
|
Chris@309
|
333 d.hasKnownExtents = false;
|
Chris@309
|
334 d.isQuantized = false;
|
Chris@309
|
335 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
336 d.sampleRate = m_colsPerSec;
|
Chris@309
|
337 d.hasDuration = false;
|
Chris@309
|
338 m_chromaOutputNo = list.size();
|
Chris@309
|
339 list.push_back(d);
|
Chris@309
|
340
|
Chris@302
|
341 d.identifier = "templates";
|
Chris@302
|
342 d.name = "Templates";
|
Chris@302
|
343 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
344 d.unit = "";
|
Chris@302
|
345 d.hasFixedBinCount = true;
|
Chris@302
|
346 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
347 d.binNames.clear();
|
Chris@302
|
348 if (m_cq) {
|
Chris@302
|
349 char name[50];
|
Chris@302
|
350 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
351 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
352 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
353 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
354 // frequency though, so these are still the first 545 bins
|
Chris@302
|
355 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
356 float freq = m_cq->getBinFrequency
|
Chris@302
|
357 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
358 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
359 d.binNames.push_back(name);
|
Chris@302
|
360 }
|
Chris@302
|
361 }
|
Chris@302
|
362 d.hasKnownExtents = false;
|
Chris@302
|
363 d.isQuantized = false;
|
Chris@302
|
364 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
365 d.sampleRate = m_colsPerSec;
|
Chris@302
|
366 d.hasDuration = false;
|
Chris@302
|
367 m_templateOutputNo = list.size();
|
Chris@302
|
368 list.push_back(d);
|
Chris@302
|
369
|
Chris@31
|
370 return list;
|
Chris@31
|
371 }
|
Chris@31
|
372
|
Chris@38
|
373 std::string
|
Chris@320
|
374 Silvet::getChromaName(int pitch) const
|
Chris@38
|
375 {
|
Chris@38
|
376 static const char *names[] = {
|
Chris@38
|
377 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
378 };
|
Chris@38
|
379
|
Chris@309
|
380 return names[pitch];
|
Chris@309
|
381 }
|
Chris@309
|
382
|
Chris@309
|
383 std::string
|
Chris@320
|
384 Silvet::getNoteName(int note, int shift, int shiftCount) const
|
Chris@309
|
385 {
|
Chris@320
|
386 string n = getChromaName(note % 12);
|
Chris@38
|
387
|
Chris@175
|
388 int oct = (note + 9) / 12;
|
Chris@38
|
389
|
Chris@175
|
390 char buf[30];
|
Chris@175
|
391
|
Chris@175
|
392 float pshift = 0.f;
|
Chris@175
|
393 if (shiftCount > 1) {
|
Chris@320
|
394 // see getNoteFrequency below
|
Chris@175
|
395 pshift =
|
Chris@175
|
396 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
397 }
|
Chris@175
|
398
|
Chris@175
|
399 if (pshift > 0.f) {
|
Chris@309
|
400 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
401 } else if (pshift < 0.f) {
|
Chris@309
|
402 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
403 } else {
|
Chris@309
|
404 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
405 }
|
Chris@38
|
406
|
Chris@38
|
407 return buf;
|
Chris@38
|
408 }
|
Chris@38
|
409
|
Chris@41
|
410 float
|
Chris@320
|
411 Silvet::getNoteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
412 {
|
Chris@169
|
413 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
414 // is an offset into the template array, which starts with some
|
Chris@169
|
415 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
416 //
|
Chris@169
|
417 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
418 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
419 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
420 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
421 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
422 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
423 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
424 // down in pitch, for a negative pitch shift.
|
Chris@169
|
425
|
Chris@175
|
426 float pshift = 0.f;
|
Chris@175
|
427 if (shiftCount > 1) {
|
Chris@175
|
428 pshift =
|
Chris@175
|
429 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
430 }
|
Chris@169
|
431
|
Chris@301
|
432 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
433
|
Chris@303
|
434 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
435 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
436
|
Chris@301
|
437 return freq;
|
Chris@41
|
438 }
|
Chris@41
|
439
|
Chris@31
|
440 bool
|
Chris@31
|
441 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
442 {
|
Chris@272
|
443 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
444 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
445 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
446 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
447 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
448 return false;
|
Chris@272
|
449 }
|
Chris@272
|
450
|
Chris@31
|
451 if (channels < getMinChannelCount() ||
|
Chris@272
|
452 channels > getMaxChannelCount()) {
|
Chris@272
|
453 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
454 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
455 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
456 return false;
|
Chris@272
|
457 }
|
Chris@31
|
458
|
Chris@31
|
459 if (stepSize != blockSize) {
|
Chris@31
|
460 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
461 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
462 return false;
|
Chris@31
|
463 }
|
Chris@31
|
464
|
Chris@31
|
465 m_blockSize = blockSize;
|
Chris@31
|
466
|
Chris@31
|
467 reset();
|
Chris@31
|
468
|
Chris@31
|
469 return true;
|
Chris@31
|
470 }
|
Chris@31
|
471
|
Chris@31
|
472 void
|
Chris@31
|
473 Silvet::reset()
|
Chris@31
|
474 {
|
Chris@31
|
475 delete m_resampler;
|
Chris@246
|
476 delete m_flattener;
|
Chris@31
|
477 delete m_cq;
|
Chris@31
|
478
|
Chris@31
|
479 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
480 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
481 } else {
|
Chris@31
|
482 m_resampler = 0;
|
Chris@31
|
483 }
|
Chris@31
|
484
|
Chris@246
|
485 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
486 m_flattener->reset();
|
Chris@246
|
487
|
Chris@301
|
488 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
489 // freq used for the EM templates:
|
Chris@301
|
490 double maxFreq = 14700;
|
Chris@301
|
491
|
Chris@301
|
492 if (m_mode == LiveMode) {
|
Chris@301
|
493 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
494 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
495 // lower than 14700
|
Chris@301
|
496 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
497 }
|
Chris@301
|
498
|
Chris@173
|
499 double minFreq = 27.5;
|
Chris@173
|
500
|
Chris@297
|
501 if (m_mode != HighQualityMode) {
|
Chris@173
|
502 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
503 // so we can just pad with zeros
|
Chris@173
|
504 minFreq *= 2;
|
Chris@173
|
505 }
|
Chris@173
|
506
|
Chris@298
|
507 int bpo = 12 *
|
Chris@298
|
508 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
509
|
Chris@154
|
510 CQParameters params(processingSampleRate,
|
Chris@173
|
511 minFreq,
|
Chris@303
|
512 maxFreq,
|
Chris@298
|
513 bpo);
|
Chris@154
|
514
|
Chris@325
|
515 params.q = 0.8;
|
Chris@325
|
516 params.atomHopFactor = (m_mode == LiveMode ? 1.0 : 0.3);
|
Chris@154
|
517 params.threshold = 0.0005;
|
Chris@317
|
518 params.decimator =
|
Chris@317
|
519 (m_mode == LiveMode ?
|
Chris@317
|
520 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
|
Chris@172
|
521 params.window = CQParameters::Hann;
|
Chris@154
|
522
|
Chris@154
|
523 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
524
|
Chris@303
|
525 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
526 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
527
|
Chris@297
|
528 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
529
|
Chris@41
|
530 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
531 delete m_postFilter[i];
|
Chris@41
|
532 }
|
Chris@41
|
533 m_postFilter.clear();
|
Chris@303
|
534 int postFilterLength = 3;
|
Chris@298
|
535 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
536 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
537 }
|
Chris@41
|
538 m_pianoRoll.clear();
|
Chris@246
|
539 m_inputGains.clear();
|
Chris@32
|
540 m_columnCount = 0;
|
Chris@272
|
541 m_resampledCount = 0;
|
Chris@40
|
542 m_startTime = RealTime::zeroTime;
|
Chris@313
|
543 m_haveStartTime = false;
|
Chris@31
|
544 }
|
Chris@31
|
545
|
Chris@31
|
546 Silvet::FeatureSet
|
Chris@31
|
547 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
548 {
|
Chris@302
|
549 FeatureSet fs;
|
Chris@302
|
550
|
Chris@313
|
551 if (!m_haveStartTime) {
|
Chris@314
|
552
|
Chris@40
|
553 m_startTime = timestamp;
|
Chris@313
|
554 m_haveStartTime = true;
|
Chris@314
|
555
|
Chris@302
|
556 insertTemplateFeatures(fs);
|
Chris@40
|
557 }
|
Chris@246
|
558
|
Chris@246
|
559 vector<float> flattened(m_blockSize);
|
Chris@246
|
560 float gain = 1.f;
|
Chris@246
|
561 m_flattener->connectInputPort
|
Chris@246
|
562 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
563 m_flattener->connectOutputPort
|
Chris@246
|
564 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
565 m_flattener->connectOutputPort
|
Chris@246
|
566 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
567 m_flattener->process(m_blockSize);
|
Chris@246
|
568
|
Chris@252
|
569 m_inputGains[timestamp] = gain;
|
Chris@40
|
570
|
Chris@31
|
571 vector<double> data;
|
Chris@40
|
572 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
573 double d = flattened[i];
|
Chris@235
|
574 data.push_back(d);
|
Chris@40
|
575 }
|
Chris@31
|
576
|
Chris@31
|
577 if (m_resampler) {
|
Chris@272
|
578
|
Chris@31
|
579 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
580
|
Chris@272
|
581 int hadCount = m_resampledCount;
|
Chris@272
|
582 m_resampledCount += data.size();
|
Chris@272
|
583
|
Chris@272
|
584 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
585
|
Chris@272
|
586 if (hadCount < resamplerLatency) {
|
Chris@272
|
587 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
588 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
589 return fs;
|
Chris@272
|
590 } else {
|
Chris@272
|
591 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
592 }
|
Chris@272
|
593 }
|
Chris@31
|
594 }
|
Chris@272
|
595
|
Chris@32
|
596 Grid cqout = m_cq->process(data);
|
Chris@302
|
597 transcribe(cqout, fs);
|
Chris@51
|
598 return fs;
|
Chris@34
|
599 }
|
Chris@34
|
600
|
Chris@34
|
601 Silvet::FeatureSet
|
Chris@34
|
602 Silvet::getRemainingFeatures()
|
Chris@34
|
603 {
|
Chris@145
|
604 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
605 FeatureSet fs;
|
Chris@302
|
606 if (m_columnCount == 0) {
|
Chris@302
|
607 // process() was never called, but we still want these
|
Chris@302
|
608 insertTemplateFeatures(fs);
|
Chris@302
|
609 } else {
|
Chris@302
|
610 transcribe(cqout, fs);
|
Chris@302
|
611 }
|
Chris@51
|
612 return fs;
|
Chris@34
|
613 }
|
Chris@34
|
614
|
Chris@302
|
615 void
|
Chris@302
|
616 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
617 {
|
Chris@302
|
618 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
619 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
620 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
621 Feature f;
|
Chris@302
|
622 char buffer[50];
|
Chris@302
|
623 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
624 f.label = buffer;
|
Chris@302
|
625 f.hasTimestamp = true;
|
Chris@302
|
626 f.timestamp = timestamp;
|
Chris@302
|
627 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
628 .data[i % pack.templateNoteCount];
|
Chris@302
|
629 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
630 }
|
Chris@302
|
631 }
|
Chris@302
|
632
|
Chris@302
|
633 void
|
Chris@302
|
634 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
635 {
|
Chris@32
|
636 Grid filtered = preProcess(cqout);
|
Chris@31
|
637
|
Chris@302
|
638 if (filtered.empty()) return;
|
Chris@170
|
639
|
Chris@298
|
640 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
641
|
Chris@325
|
642 int width = filtered.size();
|
Chris@325
|
643
|
Chris@325
|
644 double silenceThreshold = 0.01;
|
Chris@325
|
645
|
Chris@325
|
646 for (int i = 0; i < width; ++i) {
|
Chris@325
|
647
|
Chris@325
|
648 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1 + i);
|
Chris@325
|
649 float inputGain = getInputGainAt(timestamp);
|
Chris@325
|
650
|
Chris@178
|
651 Feature f;
|
Chris@325
|
652 double rms = 0.0;
|
Chris@325
|
653
|
Chris@178
|
654 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@325
|
655 double v = filtered[i][j];
|
Chris@325
|
656 rms += v * v;
|
Chris@325
|
657 f.values.push_back(float(v));
|
Chris@178
|
658 }
|
Chris@325
|
659
|
Chris@325
|
660 rms = sqrt(rms / pack.templateHeight);
|
Chris@325
|
661 if (rms / inputGain < silenceThreshold) {
|
Chris@325
|
662 filtered[i].clear();
|
Chris@325
|
663 }
|
Chris@325
|
664
|
Chris@178
|
665 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
666 }
|
Chris@325
|
667
|
Chris@311
|
668 Grid localPitches(width);
|
Chris@170
|
669
|
Chris@297
|
670 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
671 int shiftCount = 1;
|
Chris@170
|
672 if (wantShifts) {
|
Chris@170
|
673 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
674 }
|
Chris@170
|
675
|
Chris@170
|
676 vector<vector<int> > localBestShifts;
|
Chris@170
|
677 if (wantShifts) {
|
Chris@311
|
678 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
679 }
|
Chris@170
|
680
|
Chris@312
|
681 #ifndef MAX_EM_THREADS
|
Chris@312
|
682 #define MAX_EM_THREADS 8
|
Chris@312
|
683 #endif
|
Chris@312
|
684
|
Chris@317
|
685 int emThreadCount = MAX_EM_THREADS;
|
Chris@317
|
686 if (m_mode == LiveMode && pack.templates.size() == 1) {
|
Chris@317
|
687 // The EM step is probably not slow enough to merit it
|
Chris@317
|
688 emThreadCount = 1;
|
Chris@317
|
689 }
|
Chris@317
|
690
|
Chris@312
|
691 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@317
|
692 if (emThreadCount > 1) {
|
Chris@317
|
693 for (int i = 0; i < width; ) {
|
Chris@317
|
694 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@317
|
695 vector<EMFuture> results;
|
Chris@317
|
696 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
697 results.push_back
|
Chris@317
|
698 (async(std::launch::async,
|
Chris@317
|
699 [&](int index) {
|
Chris@325
|
700 return applyEM
|
Chris@325
|
701 (pack, filtered.at(index), wantShifts);
|
Chris@317
|
702 }, i + j));
|
Chris@317
|
703 }
|
Chris@317
|
704 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
705 auto out = results[j].get();
|
Chris@317
|
706 localPitches[i+j] = out.first;
|
Chris@317
|
707 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@317
|
708 }
|
Chris@317
|
709 i += emThreadCount;
|
Chris@312
|
710 }
|
Chris@123
|
711 }
|
Chris@312
|
712 #endif
|
Chris@317
|
713
|
Chris@317
|
714 if (emThreadCount == 1) {
|
Chris@317
|
715 for (int i = 0; i < width; ++i) {
|
Chris@317
|
716 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@317
|
717 localPitches[i] = out.first;
|
Chris@317
|
718 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@317
|
719 }
|
Chris@317
|
720 }
|
Chris@305
|
721
|
Chris@166
|
722 for (int i = 0; i < width; ++i) {
|
Chris@37
|
723
|
Chris@321
|
724 vector<double> filtered;
|
Chris@321
|
725
|
Chris@321
|
726 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
727 m_postFilter[j]->push(localPitches[i][j]);
|
Chris@321
|
728 filtered.push_back(m_postFilter[j]->get());
|
Chris@321
|
729 }
|
Chris@294
|
730
|
Chris@309
|
731 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
732 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
733
|
Chris@294
|
734 Feature f;
|
Chris@294
|
735 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
736 float v = filtered[j];
|
Chris@294
|
737 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
738 f.values.push_back(v / inputGain);
|
Chris@294
|
739 }
|
Chris@294
|
740 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
741
|
Chris@309
|
742 f.values.clear();
|
Chris@309
|
743 f.values.resize(12);
|
Chris@309
|
744 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
745 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
746 }
|
Chris@309
|
747 fs[m_chromaOutputNo].push_back(f);
|
Chris@38
|
748
|
Chris@321
|
749 // This pushes the up-to-max-polyphony activation column to
|
Chris@321
|
750 // m_pianoRoll
|
Chris@323
|
751 postProcess(filtered, localBestShifts[i], wantShifts);
|
Chris@321
|
752
|
Chris@319
|
753 auto events = noteTrack(shiftCount);
|
Chris@319
|
754
|
Chris@319
|
755 FeatureList noteFeatures = events.first;
|
Chris@123
|
756 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
757 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
758 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
759 }
|
Chris@319
|
760
|
Chris@319
|
761 FeatureList onsetFeatures = events.second;
|
Chris@319
|
762 for (FeatureList::const_iterator fi = onsetFeatures.begin();
|
Chris@319
|
763 fi != onsetFeatures.end(); ++fi) {
|
Chris@319
|
764 fs[m_onsetsOutputNo].push_back(*fi);
|
Chris@319
|
765 }
|
Chris@34
|
766 }
|
Chris@31
|
767 }
|
Chris@31
|
768
|
Chris@311
|
769 pair<vector<double>, vector<int> >
|
Chris@311
|
770 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
771 const vector<double> &column,
|
Chris@311
|
772 bool wantShifts)
|
Chris@311
|
773 {
|
Chris@311
|
774 double columnThreshold = 1e-5;
|
Chris@311
|
775
|
Chris@314
|
776 if (m_mode == LiveMode) {
|
Chris@325
|
777 columnThreshold /= 15;
|
Chris@314
|
778 }
|
Chris@314
|
779
|
Chris@311
|
780 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
781 vector<int> bestShifts;
|
Chris@325
|
782
|
Chris@325
|
783 if (column.empty()) return { pitches, bestShifts };
|
Chris@311
|
784
|
Chris@311
|
785 double sum = 0.0;
|
Chris@311
|
786 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
787 sum += column.at(j);
|
Chris@311
|
788 }
|
Chris@311
|
789 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
790
|
Chris@314
|
791 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
792
|
Chris@311
|
793 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
794 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
795
|
Chris@314
|
796 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
797
|
Chris@311
|
798 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
799 em.iterate(column.data());
|
Chris@311
|
800 }
|
Chris@311
|
801
|
Chris@311
|
802 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
803 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
804
|
Chris@311
|
805 int shiftCount = 1;
|
Chris@311
|
806 if (wantShifts) {
|
Chris@311
|
807 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
808 }
|
Chris@311
|
809
|
Chris@311
|
810 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
811
|
Chris@311
|
812 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
813
|
Chris@311
|
814 int bestShift = 0;
|
Chris@311
|
815 float bestShiftValue = 0.0;
|
Chris@311
|
816 if (wantShifts) {
|
Chris@311
|
817 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
818 float value = shiftDist[k][j];
|
Chris@311
|
819 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
820 bestShiftValue = value;
|
Chris@311
|
821 bestShift = k;
|
Chris@311
|
822 }
|
Chris@311
|
823 }
|
Chris@311
|
824 bestShifts.push_back(bestShift);
|
Chris@311
|
825 }
|
Chris@311
|
826 }
|
Chris@311
|
827
|
Chris@311
|
828 return { pitches, bestShifts };
|
Chris@311
|
829 }
|
Chris@311
|
830
|
Chris@32
|
831 Silvet::Grid
|
Chris@32
|
832 Silvet::preProcess(const Grid &in)
|
Chris@32
|
833 {
|
Chris@32
|
834 int width = in.size();
|
Chris@32
|
835
|
Chris@165
|
836 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
837
|
Chris@165
|
838 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
839 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
840
|
Chris@32
|
841 Grid out;
|
Chris@32
|
842
|
Chris@58
|
843 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
844 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
845 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
846 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
847 // size we reduce to in a moment
|
Chris@33
|
848 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
849
|
Chris@298
|
850 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
851
|
Chris@32
|
852 for (int i = 0; i < width; ++i) {
|
Chris@32
|
853
|
Chris@33
|
854 if (m_columnCount < latentColumns) {
|
Chris@33
|
855 ++m_columnCount;
|
Chris@33
|
856 continue;
|
Chris@33
|
857 }
|
Chris@33
|
858
|
Chris@32
|
859 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
860 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
861
|
Chris@32
|
862 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
863
|
Chris@32
|
864 if (select) {
|
Chris@32
|
865 vector<double> inCol = in[i];
|
Chris@176
|
866 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
867
|
Chris@178
|
868 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@298
|
869 // lowest 55 of them (assuming binsPerSemitone == 5).
|
Chris@178
|
870 //
|
Chris@297
|
871 // In draft and live mode the CQ is an octave shorter,
|
Chris@300
|
872 // returning 540 bins or equivalent, so we instead pad
|
Chris@300
|
873 // them with an additional 5 or equivalent zeros.
|
Chris@178
|
874 //
|
Chris@178
|
875 // We also need to reverse the column as we go, since the
|
Chris@178
|
876 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
877 // the other way around.
|
Chris@32
|
878
|
Chris@298
|
879 int bps = (m_mode == LiveMode ?
|
Chris@298
|
880 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
881
|
Chris@297
|
882 if (m_mode == HighQualityMode) {
|
Chris@178
|
883 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
884 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
885 outCol[j] = inCol[ix];
|
Chris@178
|
886 }
|
Chris@178
|
887 } else {
|
Chris@298
|
888 for (int j = 0; j < bps; ++j) {
|
Chris@178
|
889 outCol[j] = 0.0;
|
Chris@178
|
890 }
|
Chris@298
|
891 for (int j = bps; j < pack.templateHeight; ++j) {
|
Chris@298
|
892 int ix = inCol.size() - j + (bps-1);
|
Chris@178
|
893 outCol[j] = inCol[ix];
|
Chris@178
|
894 }
|
Chris@46
|
895 }
|
Chris@32
|
896
|
Chris@46
|
897 vector<double> noiseLevel1 =
|
Chris@298
|
898 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
899 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
900 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
901 }
|
Chris@32
|
902
|
Chris@46
|
903 vector<double> noiseLevel2 =
|
Chris@298
|
904 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
905 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
906 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
907 }
|
Chris@32
|
908
|
Chris@165
|
909 out.push_back(outCol);
|
Chris@32
|
910 }
|
Chris@32
|
911
|
Chris@32
|
912 ++m_columnCount;
|
Chris@32
|
913 }
|
Chris@32
|
914
|
Chris@32
|
915 return out;
|
Chris@32
|
916 }
|
Chris@32
|
917
|
Chris@321
|
918 void
|
Chris@170
|
919 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
920 const vector<int> &bestShifts,
|
Chris@170
|
921 bool wantShifts)
|
Chris@166
|
922 {
|
Chris@298
|
923 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
924
|
Chris@41
|
925 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
926
|
Chris@41
|
927 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
928
|
Chris@41
|
929 ValueIndexMap strengths;
|
Chris@166
|
930
|
Chris@176
|
931 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
932
|
Chris@321
|
933 double strength = pitches[j];
|
Chris@183
|
934 if (strength < pack.levelThreshold) continue;
|
Chris@321
|
935
|
Chris@321
|
936 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@321
|
937 // get clusters of two or three high scores at a time for
|
Chris@321
|
938 // neighbouring semitones. Eliminate these by picking only the
|
Chris@325
|
939 // peaks (except that we never eliminate a note that has
|
Chris@325
|
940 // already been established as currently playing). This means
|
Chris@325
|
941 // we can't recognise actual semitone chords if they ever
|
Chris@325
|
942 // appear, but it's not as if live mode is good enough for
|
Chris@325
|
943 // that to be a big deal anyway.
|
Chris@321
|
944 if (m_mode == LiveMode) {
|
Chris@325
|
945 if (m_current.find(j) == m_current.end() &&
|
Chris@325
|
946 (j == 0 ||
|
Chris@325
|
947 j + 1 == pack.templateNoteCount ||
|
Chris@325
|
948 pitches[j] < pitches[j-1] ||
|
Chris@325
|
949 pitches[j] < pitches[j+1])) {
|
Chris@325
|
950 // not a peak or a currently-playing note: skip it
|
Chris@321
|
951 continue;
|
Chris@321
|
952 }
|
Chris@321
|
953 }
|
Chris@323
|
954
|
Chris@168
|
955 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
956 }
|
Chris@166
|
957
|
Chris@168
|
958 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
959
|
Chris@168
|
960 map<int, double> active;
|
Chris@168
|
961 map<int, int> activeShifts;
|
Chris@168
|
962
|
Chris@183
|
963 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
964
|
Chris@168
|
965 --si;
|
Chris@168
|
966
|
Chris@168
|
967 double strength = si->first;
|
Chris@168
|
968 int j = si->second;
|
Chris@168
|
969
|
Chris@168
|
970 active[j] = strength;
|
Chris@168
|
971
|
Chris@170
|
972 if (wantShifts) {
|
Chris@170
|
973 activeShifts[j] = bestShifts[j];
|
Chris@167
|
974 }
|
Chris@41
|
975 }
|
Chris@41
|
976
|
Chris@168
|
977 m_pianoRoll.push_back(active);
|
Chris@170
|
978
|
Chris@170
|
979 if (wantShifts) {
|
Chris@168
|
980 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
981 }
|
Chris@294
|
982
|
Chris@321
|
983 return;
|
Chris@166
|
984 }
|
Chris@166
|
985
|
Chris@319
|
986 pair<Vamp::Plugin::FeatureList, Vamp::Plugin::FeatureList>
|
Chris@168
|
987 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
988 {
|
Chris@41
|
989 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
990 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
991 // latest active set but present in the prior set in the piano
|
Chris@41
|
992 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
993 // already, and if they haven't ended, we don't know their
|
Chris@41
|
994 // duration.
|
Chris@41
|
995
|
Chris@168
|
996 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
997
|
Chris@168
|
998 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
999
|
Chris@165
|
1000 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
1001
|
Chris@165
|
1002 // only keep notes >= 100ms or thereabouts
|
Chris@323
|
1003 double durationThrSec = 0.1;
|
Chris@323
|
1004 if (m_mode == LiveMode) durationThrSec = 0.07;
|
Chris@323
|
1005 int durationThreshold = floor(durationThrSec / columnDuration); // in cols
|
Chris@165
|
1006 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
1007
|
Chris@319
|
1008 FeatureList noteFeatures, onsetFeatures;
|
Chris@41
|
1009
|
Chris@41
|
1010 if (width < durationThreshold + 1) {
|
Chris@319
|
1011 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1012 }
|
Chris@41
|
1013
|
Chris@55
|
1014 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
1015 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
1016
|
Chris@55
|
1017 int note = ni->first;
|
Chris@41
|
1018
|
Chris@41
|
1019 int end = width;
|
Chris@41
|
1020 int start = end-1;
|
Chris@41
|
1021
|
Chris@41
|
1022 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
1023 --start;
|
Chris@41
|
1024 }
|
Chris@41
|
1025 ++start;
|
Chris@41
|
1026
|
Chris@319
|
1027 int duration = end - start;
|
Chris@319
|
1028
|
Chris@319
|
1029 if (duration < durationThreshold) {
|
Chris@41
|
1030 continue;
|
Chris@41
|
1031 }
|
Chris@41
|
1032
|
Chris@319
|
1033 if (duration == durationThreshold) {
|
Chris@325
|
1034 m_current.insert(note);
|
Chris@319
|
1035 emitOnset(start, note, shiftCount, onsetFeatures);
|
Chris@319
|
1036 }
|
Chris@319
|
1037
|
Chris@319
|
1038 if (active.find(note) == active.end()) {
|
Chris@319
|
1039 // the note was playing but just ended
|
Chris@325
|
1040 m_current.erase(note);
|
Chris@319
|
1041 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@334
|
1042 } else { // still playing
|
Chris@334
|
1043 // repeated note detection: if level is greater than this
|
Chris@334
|
1044 // multiple of its previous value, then we end the note and
|
Chris@334
|
1045 // restart it with the same pitch
|
Chris@334
|
1046 double restartFactor = 1.5;
|
Chris@334
|
1047 if (duration >= durationThreshold * 2 &&
|
Chris@334
|
1048 (active.find(note)->second >
|
Chris@334
|
1049 restartFactor * m_pianoRoll[width-1][note])) {
|
Chris@334
|
1050 m_current.erase(note);
|
Chris@334
|
1051 emitNote(start, end-1, note, shiftCount, noteFeatures);
|
Chris@334
|
1052 // and remove this so that we start counting the new
|
Chris@334
|
1053 // note's duration from the current position
|
Chris@334
|
1054 m_pianoRoll[width-1].erase(note);
|
Chris@334
|
1055 }
|
Chris@319
|
1056 }
|
Chris@41
|
1057 }
|
Chris@41
|
1058
|
Chris@62
|
1059 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
1060
|
Chris@319
|
1061 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1062 }
|
Chris@41
|
1063
|
Chris@169
|
1064 void
|
Chris@169
|
1065 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
1066 FeatureList ¬eFeatures)
|
Chris@169
|
1067 {
|
Chris@169
|
1068 int partStart = start;
|
Chris@169
|
1069 int partShift = 0;
|
Chris@320
|
1070 double partStrength = 0;
|
Chris@169
|
1071
|
Chris@252
|
1072 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1073
|
Chris@169
|
1074 for (int i = start; i != end; ++i) {
|
Chris@169
|
1075
|
Chris@169
|
1076 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1077
|
Chris@169
|
1078 int shift = 0;
|
Chris@169
|
1079
|
Chris@169
|
1080 if (shiftCount > 1) {
|
Chris@169
|
1081
|
Chris@169
|
1082 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1083
|
Chris@169
|
1084 if (i == partStart) {
|
Chris@169
|
1085 partShift = shift;
|
Chris@169
|
1086 }
|
Chris@169
|
1087
|
Chris@169
|
1088 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1089
|
Chris@169
|
1090 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1091
|
Chris@169
|
1092 // pitch has changed, emit an intermediate note
|
Chris@252
|
1093 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1094 i,
|
Chris@252
|
1095 note,
|
Chris@252
|
1096 partShift,
|
Chris@252
|
1097 shiftCount,
|
Chris@320
|
1098 partStrength));
|
Chris@169
|
1099 partStart = i;
|
Chris@169
|
1100 partShift = shift;
|
Chris@320
|
1101 partStrength = 0;
|
Chris@169
|
1102 }
|
Chris@169
|
1103 }
|
Chris@169
|
1104
|
Chris@320
|
1105 if (strength > partStrength) {
|
Chris@320
|
1106 partStrength = strength;
|
Chris@169
|
1107 }
|
Chris@169
|
1108 }
|
Chris@169
|
1109
|
Chris@169
|
1110 if (end >= partStart + partThreshold) {
|
Chris@252
|
1111 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1112 end,
|
Chris@252
|
1113 note,
|
Chris@252
|
1114 partShift,
|
Chris@252
|
1115 shiftCount,
|
Chris@320
|
1116 partStrength));
|
Chris@169
|
1117 }
|
Chris@169
|
1118 }
|
Chris@252
|
1119
|
Chris@319
|
1120 void
|
Chris@319
|
1121 Silvet::emitOnset(int start, int note, int shiftCount,
|
Chris@319
|
1122 FeatureList &onsetFeatures)
|
Chris@319
|
1123 {
|
Chris@319
|
1124 int len = int(m_pianoRoll.size());
|
Chris@320
|
1125
|
Chris@320
|
1126 double onsetStrength = 0;
|
Chris@319
|
1127
|
Chris@319
|
1128 int shift = 0;
|
Chris@319
|
1129 if (shiftCount > 1) {
|
Chris@319
|
1130 shift = m_pianoRollShifts[start][note];
|
Chris@319
|
1131 }
|
Chris@319
|
1132
|
Chris@319
|
1133 for (int i = start; i < len; ++i) {
|
Chris@319
|
1134 double strength = m_pianoRoll[i][note];
|
Chris@320
|
1135 if (strength > onsetStrength) {
|
Chris@320
|
1136 onsetStrength = strength;
|
Chris@319
|
1137 }
|
Chris@319
|
1138 }
|
Chris@319
|
1139
|
Chris@319
|
1140 onsetFeatures.push_back(makeOnsetFeature(start,
|
Chris@319
|
1141 note,
|
Chris@319
|
1142 shift,
|
Chris@319
|
1143 shiftCount,
|
Chris@320
|
1144 onsetStrength));
|
Chris@319
|
1145 }
|
Chris@319
|
1146
|
Chris@309
|
1147 RealTime
|
Chris@309
|
1148 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1149 {
|
Chris@309
|
1150 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1151 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1152
|
Chris@309
|
1153 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1154 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1155 }
|
Chris@309
|
1156
|
Chris@252
|
1157 Silvet::Feature
|
Chris@252
|
1158 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1159 int end,
|
Chris@252
|
1160 int note,
|
Chris@252
|
1161 int shift,
|
Chris@252
|
1162 int shiftCount,
|
Chris@320
|
1163 double strength)
|
Chris@252
|
1164 {
|
Chris@252
|
1165 Feature f;
|
Chris@252
|
1166
|
Chris@252
|
1167 f.hasTimestamp = true;
|
Chris@309
|
1168 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1169
|
Chris@252
|
1170 f.hasDuration = true;
|
Chris@309
|
1171 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1172
|
Chris@252
|
1173 f.values.clear();
|
Chris@320
|
1174 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1175 f.values.push_back(getVelocityFor(strength, start));
|
Chris@252
|
1176
|
Chris@320
|
1177 f.label = getNoteName(note, shift, shiftCount);
|
Chris@252
|
1178
|
Chris@252
|
1179 return f;
|
Chris@252
|
1180 }
|
Chris@252
|
1181
|
Chris@319
|
1182 Silvet::Feature
|
Chris@319
|
1183 Silvet::makeOnsetFeature(int start,
|
Chris@319
|
1184 int note,
|
Chris@319
|
1185 int shift,
|
Chris@319
|
1186 int shiftCount,
|
Chris@320
|
1187 double strength)
|
Chris@319
|
1188 {
|
Chris@319
|
1189 Feature f;
|
Chris@319
|
1190
|
Chris@319
|
1191 f.hasTimestamp = true;
|
Chris@319
|
1192 f.timestamp = getColumnTimestamp(start);
|
Chris@319
|
1193
|
Chris@319
|
1194 f.hasDuration = false;
|
Chris@319
|
1195
|
Chris@319
|
1196 f.values.clear();
|
Chris@320
|
1197 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1198 f.values.push_back(getVelocityFor(strength, start));
|
Chris@319
|
1199
|
Chris@320
|
1200 f.label = getNoteName(note, shift, shiftCount);
|
Chris@319
|
1201
|
Chris@319
|
1202 return f;
|
Chris@319
|
1203 }
|
Chris@319
|
1204
|
Chris@320
|
1205 int
|
Chris@320
|
1206 Silvet::getVelocityFor(double strength, int column)
|
Chris@320
|
1207 {
|
Chris@320
|
1208 RealTime rt = getColumnTimestamp(column + 1);
|
Chris@320
|
1209
|
Chris@320
|
1210 float inputGain = getInputGainAt(rt);
|
Chris@320
|
1211
|
Chris@320
|
1212 double scale = 2.0;
|
Chris@320
|
1213 if (m_mode == LiveMode) scale = 20.0;
|
Chris@320
|
1214
|
Chris@320
|
1215 double velocity = round((strength * scale) / inputGain);
|
Chris@320
|
1216
|
Chris@320
|
1217 if (velocity > 127.0) velocity = 127.0;
|
Chris@320
|
1218 if (velocity < 1.0) velocity = 1.0; // assume surpassed 0 threshold already
|
Chris@320
|
1219
|
Chris@320
|
1220 return int(velocity);
|
Chris@320
|
1221 }
|
Chris@320
|
1222
|
Chris@252
|
1223 float
|
Chris@252
|
1224 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1225 {
|
Chris@252
|
1226 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1227
|
Chris@252
|
1228 if (i == m_inputGains.end()) {
|
Chris@252
|
1229 if (i != m_inputGains.begin()) {
|
Chris@252
|
1230 --i;
|
Chris@252
|
1231 } else {
|
Chris@252
|
1232 return 1.f; // no data
|
Chris@252
|
1233 }
|
Chris@252
|
1234 }
|
Chris@252
|
1235
|
Chris@252
|
1236 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1237
|
Chris@252
|
1238 return i->second;
|
Chris@252
|
1239 }
|
Chris@252
|
1240
|