Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
40 static int processingSampleRate = 44100;
|
Chris@298
|
41
|
Chris@298
|
42 static int binsPerSemitoneLive = 1;
|
Chris@298
|
43 static int binsPerSemitoneNormal = 5;
|
Chris@170
|
44
|
Chris@272
|
45 static int minInputSampleRate = 100;
|
Chris@272
|
46 static int maxInputSampleRate = 192000;
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@271
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@319
|
256 d.identifier = "onsets";
|
Chris@319
|
257 d.name = "Note onsets";
|
Chris@319
|
258 d.description = "Note onsets, without durations. These can be calculated sooner than complete notes as it isn't necessary to wait for the note to finish. Each event has time, estimated fundamental frequency in Hz, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@319
|
259 d.unit = "Hz";
|
Chris@319
|
260 d.hasFixedBinCount = true;
|
Chris@319
|
261 d.binCount = 2;
|
Chris@319
|
262 d.binNames.push_back("Frequency");
|
Chris@319
|
263 d.binNames.push_back("Velocity");
|
Chris@319
|
264 d.hasKnownExtents = false;
|
Chris@319
|
265 d.isQuantized = false;
|
Chris@319
|
266 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@319
|
267 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@319
|
268 d.hasDuration = false;
|
Chris@319
|
269 m_onsetsOutputNo = list.size();
|
Chris@319
|
270 list.push_back(d);
|
Chris@319
|
271
|
Chris@178
|
272 d.identifier = "timefreq";
|
Chris@178
|
273 d.name = "Time-frequency distribution";
|
Chris@271
|
274 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
275 d.unit = "";
|
Chris@178
|
276 d.hasFixedBinCount = true;
|
Chris@298
|
277 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
278 d.binNames.clear();
|
Chris@178
|
279 if (m_cq) {
|
Chris@294
|
280 char name[50];
|
Chris@298
|
281 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
282 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
283 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
284 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
285 // frequency though, so these are still the first 545 bins
|
Chris@178
|
286 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
287 float freq = m_cq->getBinFrequency
|
Chris@298
|
288 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
289 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
290 d.binNames.push_back(name);
|
Chris@178
|
291 }
|
Chris@178
|
292 }
|
Chris@178
|
293 d.hasKnownExtents = false;
|
Chris@178
|
294 d.isQuantized = false;
|
Chris@178
|
295 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
296 d.sampleRate = m_colsPerSec;
|
Chris@178
|
297 d.hasDuration = false;
|
Chris@178
|
298 m_fcqOutputNo = list.size();
|
Chris@178
|
299 list.push_back(d);
|
Chris@178
|
300
|
Chris@294
|
301 d.identifier = "pitchactivation";
|
Chris@294
|
302 d.name = "Pitch activation distribution";
|
Chris@294
|
303 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
304 d.unit = "";
|
Chris@294
|
305 d.hasFixedBinCount = true;
|
Chris@298
|
306 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
307 d.binNames.clear();
|
Chris@294
|
308 if (m_cq) {
|
Chris@298
|
309 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@320
|
310 d.binNames.push_back(getNoteName(i, 0, 1));
|
Chris@294
|
311 }
|
Chris@294
|
312 }
|
Chris@294
|
313 d.hasKnownExtents = false;
|
Chris@294
|
314 d.isQuantized = false;
|
Chris@294
|
315 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
316 d.sampleRate = m_colsPerSec;
|
Chris@294
|
317 d.hasDuration = false;
|
Chris@294
|
318 m_pitchOutputNo = list.size();
|
Chris@294
|
319 list.push_back(d);
|
Chris@294
|
320
|
Chris@309
|
321 d.identifier = "chroma";
|
Chris@309
|
322 d.name = "Pitch chroma distribution";
|
Chris@309
|
323 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
324 d.unit = "";
|
Chris@309
|
325 d.hasFixedBinCount = true;
|
Chris@309
|
326 d.binCount = 12;
|
Chris@309
|
327 d.binNames.clear();
|
Chris@309
|
328 if (m_cq) {
|
Chris@309
|
329 for (int i = 0; i < 12; ++i) {
|
Chris@320
|
330 d.binNames.push_back(getChromaName(i));
|
Chris@309
|
331 }
|
Chris@309
|
332 }
|
Chris@309
|
333 d.hasKnownExtents = false;
|
Chris@309
|
334 d.isQuantized = false;
|
Chris@309
|
335 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
336 d.sampleRate = m_colsPerSec;
|
Chris@309
|
337 d.hasDuration = false;
|
Chris@309
|
338 m_chromaOutputNo = list.size();
|
Chris@309
|
339 list.push_back(d);
|
Chris@309
|
340
|
Chris@302
|
341 d.identifier = "templates";
|
Chris@302
|
342 d.name = "Templates";
|
Chris@302
|
343 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
344 d.unit = "";
|
Chris@302
|
345 d.hasFixedBinCount = true;
|
Chris@302
|
346 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
347 d.binNames.clear();
|
Chris@302
|
348 if (m_cq) {
|
Chris@302
|
349 char name[50];
|
Chris@302
|
350 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
351 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
352 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
353 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
354 // frequency though, so these are still the first 545 bins
|
Chris@302
|
355 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
356 float freq = m_cq->getBinFrequency
|
Chris@302
|
357 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
358 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
359 d.binNames.push_back(name);
|
Chris@302
|
360 }
|
Chris@302
|
361 }
|
Chris@302
|
362 d.hasKnownExtents = false;
|
Chris@302
|
363 d.isQuantized = false;
|
Chris@302
|
364 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
365 d.sampleRate = m_colsPerSec;
|
Chris@302
|
366 d.hasDuration = false;
|
Chris@302
|
367 m_templateOutputNo = list.size();
|
Chris@302
|
368 list.push_back(d);
|
Chris@302
|
369
|
Chris@31
|
370 return list;
|
Chris@31
|
371 }
|
Chris@31
|
372
|
Chris@38
|
373 std::string
|
Chris@320
|
374 Silvet::getChromaName(int pitch) const
|
Chris@38
|
375 {
|
Chris@38
|
376 static const char *names[] = {
|
Chris@38
|
377 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
378 };
|
Chris@38
|
379
|
Chris@309
|
380 return names[pitch];
|
Chris@309
|
381 }
|
Chris@309
|
382
|
Chris@309
|
383 std::string
|
Chris@320
|
384 Silvet::getNoteName(int note, int shift, int shiftCount) const
|
Chris@309
|
385 {
|
Chris@320
|
386 string n = getChromaName(note % 12);
|
Chris@38
|
387
|
Chris@175
|
388 int oct = (note + 9) / 12;
|
Chris@38
|
389
|
Chris@175
|
390 char buf[30];
|
Chris@175
|
391
|
Chris@175
|
392 float pshift = 0.f;
|
Chris@175
|
393 if (shiftCount > 1) {
|
Chris@320
|
394 // see getNoteFrequency below
|
Chris@175
|
395 pshift =
|
Chris@175
|
396 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
397 }
|
Chris@175
|
398
|
Chris@175
|
399 if (pshift > 0.f) {
|
Chris@309
|
400 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
401 } else if (pshift < 0.f) {
|
Chris@309
|
402 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
403 } else {
|
Chris@309
|
404 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
405 }
|
Chris@38
|
406
|
Chris@38
|
407 return buf;
|
Chris@38
|
408 }
|
Chris@38
|
409
|
Chris@41
|
410 float
|
Chris@320
|
411 Silvet::getNoteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
412 {
|
Chris@169
|
413 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
414 // is an offset into the template array, which starts with some
|
Chris@169
|
415 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
416 //
|
Chris@169
|
417 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
418 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
419 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
420 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
421 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
422 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
423 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
424 // down in pitch, for a negative pitch shift.
|
Chris@169
|
425
|
Chris@175
|
426 float pshift = 0.f;
|
Chris@175
|
427 if (shiftCount > 1) {
|
Chris@175
|
428 pshift =
|
Chris@175
|
429 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
430 }
|
Chris@169
|
431
|
Chris@301
|
432 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
433
|
Chris@303
|
434 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
435 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
436
|
Chris@301
|
437 return freq;
|
Chris@41
|
438 }
|
Chris@41
|
439
|
Chris@31
|
440 bool
|
Chris@31
|
441 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
442 {
|
Chris@272
|
443 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
444 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
445 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
446 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
447 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
448 return false;
|
Chris@272
|
449 }
|
Chris@272
|
450
|
Chris@31
|
451 if (channels < getMinChannelCount() ||
|
Chris@272
|
452 channels > getMaxChannelCount()) {
|
Chris@272
|
453 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
454 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
455 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
456 return false;
|
Chris@272
|
457 }
|
Chris@31
|
458
|
Chris@31
|
459 if (stepSize != blockSize) {
|
Chris@31
|
460 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
461 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
462 return false;
|
Chris@31
|
463 }
|
Chris@31
|
464
|
Chris@31
|
465 m_blockSize = blockSize;
|
Chris@31
|
466
|
Chris@31
|
467 reset();
|
Chris@31
|
468
|
Chris@31
|
469 return true;
|
Chris@31
|
470 }
|
Chris@31
|
471
|
Chris@31
|
472 void
|
Chris@31
|
473 Silvet::reset()
|
Chris@31
|
474 {
|
Chris@31
|
475 delete m_resampler;
|
Chris@246
|
476 delete m_flattener;
|
Chris@31
|
477 delete m_cq;
|
Chris@31
|
478
|
Chris@31
|
479 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
480 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
481 } else {
|
Chris@31
|
482 m_resampler = 0;
|
Chris@31
|
483 }
|
Chris@31
|
484
|
Chris@246
|
485 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
486 m_flattener->reset();
|
Chris@246
|
487
|
Chris@301
|
488 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
489 // freq used for the EM templates:
|
Chris@301
|
490 double maxFreq = 14700;
|
Chris@301
|
491
|
Chris@301
|
492 if (m_mode == LiveMode) {
|
Chris@301
|
493 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
494 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
495 // lower than 14700
|
Chris@301
|
496 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
497 }
|
Chris@301
|
498
|
Chris@173
|
499 double minFreq = 27.5;
|
Chris@173
|
500
|
Chris@297
|
501 if (m_mode != HighQualityMode) {
|
Chris@173
|
502 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
503 // so we can just pad with zeros
|
Chris@173
|
504 minFreq *= 2;
|
Chris@173
|
505 }
|
Chris@173
|
506
|
Chris@298
|
507 int bpo = 12 *
|
Chris@298
|
508 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
509
|
Chris@154
|
510 CQParameters params(processingSampleRate,
|
Chris@173
|
511 minFreq,
|
Chris@303
|
512 maxFreq,
|
Chris@298
|
513 bpo);
|
Chris@154
|
514
|
Chris@316
|
515 // For params.q, the MIREX code uses 0.8, but it seems that with
|
Chris@316
|
516 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT
|
Chris@316
|
517 // size to 512 from 1024 and alters some other processing
|
Chris@316
|
518 // parameters, making everything much, much slower. Could be a
|
Chris@316
|
519 // flaw in the CQ parameter calculations, must check. For
|
Chris@316
|
520 // atomHopFactor == 1, q == 0.8 is fine
|
Chris@316
|
521 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8);
|
Chris@316
|
522 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0);
|
Chris@154
|
523 params.threshold = 0.0005;
|
Chris@317
|
524 params.decimator =
|
Chris@317
|
525 (m_mode == LiveMode ?
|
Chris@317
|
526 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
|
Chris@172
|
527 params.window = CQParameters::Hann;
|
Chris@154
|
528
|
Chris@154
|
529 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
530
|
Chris@303
|
531 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
532 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
533
|
Chris@297
|
534 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
535
|
Chris@41
|
536 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
537 delete m_postFilter[i];
|
Chris@41
|
538 }
|
Chris@41
|
539 m_postFilter.clear();
|
Chris@303
|
540 int postFilterLength = 3;
|
Chris@298
|
541 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
542 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
543 }
|
Chris@41
|
544 m_pianoRoll.clear();
|
Chris@246
|
545 m_inputGains.clear();
|
Chris@32
|
546 m_columnCount = 0;
|
Chris@272
|
547 m_resampledCount = 0;
|
Chris@40
|
548 m_startTime = RealTime::zeroTime;
|
Chris@313
|
549 m_haveStartTime = false;
|
Chris@31
|
550 }
|
Chris@31
|
551
|
Chris@31
|
552 Silvet::FeatureSet
|
Chris@31
|
553 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
554 {
|
Chris@302
|
555 FeatureSet fs;
|
Chris@302
|
556
|
Chris@313
|
557 if (!m_haveStartTime) {
|
Chris@314
|
558
|
Chris@40
|
559 m_startTime = timestamp;
|
Chris@313
|
560 m_haveStartTime = true;
|
Chris@314
|
561
|
Chris@302
|
562 insertTemplateFeatures(fs);
|
Chris@40
|
563 }
|
Chris@246
|
564
|
Chris@246
|
565 vector<float> flattened(m_blockSize);
|
Chris@246
|
566 float gain = 1.f;
|
Chris@246
|
567 m_flattener->connectInputPort
|
Chris@246
|
568 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
569 m_flattener->connectOutputPort
|
Chris@246
|
570 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
571 m_flattener->connectOutputPort
|
Chris@246
|
572 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
573 m_flattener->process(m_blockSize);
|
Chris@246
|
574
|
Chris@252
|
575 m_inputGains[timestamp] = gain;
|
Chris@40
|
576
|
Chris@31
|
577 vector<double> data;
|
Chris@40
|
578 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
579 double d = flattened[i];
|
Chris@235
|
580 data.push_back(d);
|
Chris@40
|
581 }
|
Chris@31
|
582
|
Chris@31
|
583 if (m_resampler) {
|
Chris@272
|
584
|
Chris@31
|
585 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
586
|
Chris@272
|
587 int hadCount = m_resampledCount;
|
Chris@272
|
588 m_resampledCount += data.size();
|
Chris@272
|
589
|
Chris@272
|
590 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
591
|
Chris@272
|
592 if (hadCount < resamplerLatency) {
|
Chris@272
|
593 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
594 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
595 return fs;
|
Chris@272
|
596 } else {
|
Chris@272
|
597 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
598 }
|
Chris@272
|
599 }
|
Chris@31
|
600 }
|
Chris@272
|
601
|
Chris@32
|
602 Grid cqout = m_cq->process(data);
|
Chris@302
|
603 transcribe(cqout, fs);
|
Chris@51
|
604 return fs;
|
Chris@34
|
605 }
|
Chris@34
|
606
|
Chris@34
|
607 Silvet::FeatureSet
|
Chris@34
|
608 Silvet::getRemainingFeatures()
|
Chris@34
|
609 {
|
Chris@145
|
610 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
611 FeatureSet fs;
|
Chris@302
|
612 if (m_columnCount == 0) {
|
Chris@302
|
613 // process() was never called, but we still want these
|
Chris@302
|
614 insertTemplateFeatures(fs);
|
Chris@302
|
615 } else {
|
Chris@302
|
616 transcribe(cqout, fs);
|
Chris@302
|
617 }
|
Chris@51
|
618 return fs;
|
Chris@34
|
619 }
|
Chris@34
|
620
|
Chris@302
|
621 void
|
Chris@302
|
622 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
623 {
|
Chris@302
|
624 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
625 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
626 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
627 Feature f;
|
Chris@302
|
628 char buffer[50];
|
Chris@302
|
629 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
630 f.label = buffer;
|
Chris@302
|
631 f.hasTimestamp = true;
|
Chris@302
|
632 f.timestamp = timestamp;
|
Chris@302
|
633 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
634 .data[i % pack.templateNoteCount];
|
Chris@302
|
635 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
636 }
|
Chris@302
|
637 }
|
Chris@302
|
638
|
Chris@302
|
639 void
|
Chris@302
|
640 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
641 {
|
Chris@32
|
642 Grid filtered = preProcess(cqout);
|
Chris@31
|
643
|
Chris@302
|
644 if (filtered.empty()) return;
|
Chris@170
|
645
|
Chris@298
|
646 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
647
|
Chris@178
|
648 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
649 Feature f;
|
Chris@178
|
650 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
651 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
652 }
|
Chris@178
|
653 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
654 }
|
Chris@178
|
655
|
Chris@34
|
656 int width = filtered.size();
|
Chris@34
|
657
|
Chris@311
|
658 Grid localPitches(width);
|
Chris@170
|
659
|
Chris@297
|
660 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
661 int shiftCount = 1;
|
Chris@170
|
662 if (wantShifts) {
|
Chris@170
|
663 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
664 }
|
Chris@170
|
665
|
Chris@170
|
666 vector<vector<int> > localBestShifts;
|
Chris@170
|
667 if (wantShifts) {
|
Chris@311
|
668 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
669 }
|
Chris@170
|
670
|
Chris@312
|
671 #ifndef MAX_EM_THREADS
|
Chris@312
|
672 #define MAX_EM_THREADS 8
|
Chris@312
|
673 #endif
|
Chris@312
|
674
|
Chris@317
|
675 int emThreadCount = MAX_EM_THREADS;
|
Chris@317
|
676 if (m_mode == LiveMode && pack.templates.size() == 1) {
|
Chris@317
|
677 // The EM step is probably not slow enough to merit it
|
Chris@317
|
678 emThreadCount = 1;
|
Chris@317
|
679 }
|
Chris@317
|
680
|
Chris@312
|
681 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@317
|
682 if (emThreadCount > 1) {
|
Chris@317
|
683 for (int i = 0; i < width; ) {
|
Chris@317
|
684 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@317
|
685 vector<EMFuture> results;
|
Chris@317
|
686 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
687 results.push_back
|
Chris@317
|
688 (async(std::launch::async,
|
Chris@317
|
689 [&](int index) {
|
Chris@317
|
690 return applyEM(pack, filtered.at(index), wantShifts);
|
Chris@317
|
691 }, i + j));
|
Chris@317
|
692 }
|
Chris@317
|
693 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
694 auto out = results[j].get();
|
Chris@317
|
695 localPitches[i+j] = out.first;
|
Chris@317
|
696 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@317
|
697 }
|
Chris@317
|
698 i += emThreadCount;
|
Chris@312
|
699 }
|
Chris@123
|
700 }
|
Chris@312
|
701 #endif
|
Chris@317
|
702
|
Chris@317
|
703 if (emThreadCount == 1) {
|
Chris@317
|
704 for (int i = 0; i < width; ++i) {
|
Chris@317
|
705 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@317
|
706 localPitches[i] = out.first;
|
Chris@317
|
707 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@317
|
708 }
|
Chris@317
|
709 }
|
Chris@305
|
710
|
Chris@166
|
711 for (int i = 0; i < width; ++i) {
|
Chris@37
|
712
|
Chris@321
|
713 vector<double> filtered;
|
Chris@321
|
714
|
Chris@321
|
715 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
716 m_postFilter[j]->push(localPitches[i][j]);
|
Chris@321
|
717 filtered.push_back(m_postFilter[j]->get());
|
Chris@321
|
718 }
|
Chris@294
|
719
|
Chris@309
|
720 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
721 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
722
|
Chris@294
|
723 Feature f;
|
Chris@294
|
724 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
725 float v = filtered[j];
|
Chris@294
|
726 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
727 f.values.push_back(v / inputGain);
|
Chris@294
|
728 }
|
Chris@294
|
729 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
730
|
Chris@309
|
731 f.values.clear();
|
Chris@309
|
732 f.values.resize(12);
|
Chris@309
|
733 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
734 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
735 }
|
Chris@309
|
736 fs[m_chromaOutputNo].push_back(f);
|
Chris@38
|
737
|
Chris@321
|
738 // This pushes the up-to-max-polyphony activation column to
|
Chris@321
|
739 // m_pianoRoll
|
Chris@321
|
740 postProcess(localPitches[i], localBestShifts[i], wantShifts);
|
Chris@321
|
741
|
Chris@319
|
742 auto events = noteTrack(shiftCount);
|
Chris@319
|
743
|
Chris@319
|
744 FeatureList noteFeatures = events.first;
|
Chris@123
|
745 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
746 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
747 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
748 }
|
Chris@319
|
749
|
Chris@319
|
750 FeatureList onsetFeatures = events.second;
|
Chris@319
|
751 for (FeatureList::const_iterator fi = onsetFeatures.begin();
|
Chris@319
|
752 fi != onsetFeatures.end(); ++fi) {
|
Chris@319
|
753 fs[m_onsetsOutputNo].push_back(*fi);
|
Chris@319
|
754 }
|
Chris@34
|
755 }
|
Chris@31
|
756 }
|
Chris@31
|
757
|
Chris@311
|
758 pair<vector<double>, vector<int> >
|
Chris@311
|
759 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
760 const vector<double> &column,
|
Chris@311
|
761 bool wantShifts)
|
Chris@311
|
762 {
|
Chris@311
|
763 double columnThreshold = 1e-5;
|
Chris@311
|
764
|
Chris@314
|
765 if (m_mode == LiveMode) {
|
Chris@314
|
766 columnThreshold /= 20;
|
Chris@314
|
767 }
|
Chris@314
|
768
|
Chris@311
|
769 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
770 vector<int> bestShifts;
|
Chris@311
|
771
|
Chris@311
|
772 double sum = 0.0;
|
Chris@311
|
773 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
774 sum += column.at(j);
|
Chris@311
|
775 }
|
Chris@311
|
776 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
777
|
Chris@314
|
778 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
779
|
Chris@311
|
780 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
781 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
782
|
Chris@314
|
783 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
784
|
Chris@311
|
785 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
786 em.iterate(column.data());
|
Chris@311
|
787 }
|
Chris@311
|
788
|
Chris@311
|
789 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
790 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
791
|
Chris@311
|
792 int shiftCount = 1;
|
Chris@311
|
793 if (wantShifts) {
|
Chris@311
|
794 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
795 }
|
Chris@311
|
796
|
Chris@311
|
797 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
798
|
Chris@311
|
799 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
800
|
Chris@311
|
801 int bestShift = 0;
|
Chris@311
|
802 float bestShiftValue = 0.0;
|
Chris@311
|
803 if (wantShifts) {
|
Chris@311
|
804 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
805 float value = shiftDist[k][j];
|
Chris@311
|
806 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
807 bestShiftValue = value;
|
Chris@311
|
808 bestShift = k;
|
Chris@311
|
809 }
|
Chris@311
|
810 }
|
Chris@311
|
811 bestShifts.push_back(bestShift);
|
Chris@311
|
812 }
|
Chris@311
|
813 }
|
Chris@311
|
814
|
Chris@311
|
815 return { pitches, bestShifts };
|
Chris@311
|
816 }
|
Chris@311
|
817
|
Chris@32
|
818 Silvet::Grid
|
Chris@32
|
819 Silvet::preProcess(const Grid &in)
|
Chris@32
|
820 {
|
Chris@32
|
821 int width = in.size();
|
Chris@32
|
822
|
Chris@165
|
823 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
824
|
Chris@165
|
825 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
826 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
827
|
Chris@32
|
828 Grid out;
|
Chris@32
|
829
|
Chris@58
|
830 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
831 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
832 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
833 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
834 // size we reduce to in a moment
|
Chris@33
|
835 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
836
|
Chris@298
|
837 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
838
|
Chris@32
|
839 for (int i = 0; i < width; ++i) {
|
Chris@32
|
840
|
Chris@33
|
841 if (m_columnCount < latentColumns) {
|
Chris@33
|
842 ++m_columnCount;
|
Chris@33
|
843 continue;
|
Chris@33
|
844 }
|
Chris@33
|
845
|
Chris@32
|
846 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
847 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
848
|
Chris@32
|
849 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
850
|
Chris@32
|
851 if (select) {
|
Chris@32
|
852 vector<double> inCol = in[i];
|
Chris@176
|
853 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
854
|
Chris@178
|
855 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@298
|
856 // lowest 55 of them (assuming binsPerSemitone == 5).
|
Chris@178
|
857 //
|
Chris@297
|
858 // In draft and live mode the CQ is an octave shorter,
|
Chris@300
|
859 // returning 540 bins or equivalent, so we instead pad
|
Chris@300
|
860 // them with an additional 5 or equivalent zeros.
|
Chris@178
|
861 //
|
Chris@178
|
862 // We also need to reverse the column as we go, since the
|
Chris@178
|
863 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
864 // the other way around.
|
Chris@32
|
865
|
Chris@298
|
866 int bps = (m_mode == LiveMode ?
|
Chris@298
|
867 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
868
|
Chris@297
|
869 if (m_mode == HighQualityMode) {
|
Chris@178
|
870 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
871 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
872 outCol[j] = inCol[ix];
|
Chris@178
|
873 }
|
Chris@178
|
874 } else {
|
Chris@298
|
875 for (int j = 0; j < bps; ++j) {
|
Chris@178
|
876 outCol[j] = 0.0;
|
Chris@178
|
877 }
|
Chris@298
|
878 for (int j = bps; j < pack.templateHeight; ++j) {
|
Chris@298
|
879 int ix = inCol.size() - j + (bps-1);
|
Chris@178
|
880 outCol[j] = inCol[ix];
|
Chris@178
|
881 }
|
Chris@46
|
882 }
|
Chris@32
|
883
|
Chris@46
|
884 vector<double> noiseLevel1 =
|
Chris@298
|
885 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
886 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
887 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
888 }
|
Chris@32
|
889
|
Chris@46
|
890 vector<double> noiseLevel2 =
|
Chris@298
|
891 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
892 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
893 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
894 }
|
Chris@32
|
895
|
Chris@165
|
896 out.push_back(outCol);
|
Chris@32
|
897 }
|
Chris@32
|
898
|
Chris@32
|
899 ++m_columnCount;
|
Chris@32
|
900 }
|
Chris@32
|
901
|
Chris@32
|
902 return out;
|
Chris@32
|
903 }
|
Chris@32
|
904
|
Chris@321
|
905 void
|
Chris@170
|
906 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
907 const vector<int> &bestShifts,
|
Chris@170
|
908 bool wantShifts)
|
Chris@166
|
909 {
|
Chris@298
|
910 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
911
|
Chris@41
|
912 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
913
|
Chris@41
|
914 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
915
|
Chris@41
|
916 ValueIndexMap strengths;
|
Chris@166
|
917
|
Chris@176
|
918 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
919
|
Chris@321
|
920 double strength = pitches[j];
|
Chris@183
|
921 if (strength < pack.levelThreshold) continue;
|
Chris@321
|
922
|
Chris@321
|
923 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@321
|
924 // get clusters of two or three high scores at a time for
|
Chris@321
|
925 // neighbouring semitones. Eliminate these by picking only the
|
Chris@321
|
926 // peaks. This means we can't recognise actual semitone chords
|
Chris@321
|
927 // if they ever appear, but it's not as if live mode is good
|
Chris@321
|
928 // enough for that to be a big deal anyway.
|
Chris@321
|
929 if (m_mode == LiveMode) {
|
Chris@321
|
930 if (j == 0 || j + 1 == pack.templateNoteCount ||
|
Chris@321
|
931 pitches[j] < pitches[j-1] ||
|
Chris@321
|
932 pitches[j] < pitches[j+1]) {
|
Chris@321
|
933 continue;
|
Chris@321
|
934 }
|
Chris@321
|
935 }
|
Chris@321
|
936
|
Chris@168
|
937 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
938 }
|
Chris@166
|
939
|
Chris@168
|
940 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
941
|
Chris@168
|
942 map<int, double> active;
|
Chris@168
|
943 map<int, int> activeShifts;
|
Chris@168
|
944
|
Chris@183
|
945 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
946
|
Chris@168
|
947 --si;
|
Chris@168
|
948
|
Chris@168
|
949 double strength = si->first;
|
Chris@168
|
950 int j = si->second;
|
Chris@168
|
951
|
Chris@168
|
952 active[j] = strength;
|
Chris@168
|
953
|
Chris@170
|
954 if (wantShifts) {
|
Chris@170
|
955 activeShifts[j] = bestShifts[j];
|
Chris@167
|
956 }
|
Chris@41
|
957 }
|
Chris@41
|
958
|
Chris@168
|
959 m_pianoRoll.push_back(active);
|
Chris@170
|
960
|
Chris@170
|
961 if (wantShifts) {
|
Chris@168
|
962 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
963 }
|
Chris@294
|
964
|
Chris@321
|
965 return;
|
Chris@166
|
966 }
|
Chris@166
|
967
|
Chris@319
|
968 pair<Vamp::Plugin::FeatureList, Vamp::Plugin::FeatureList>
|
Chris@168
|
969 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
970 {
|
Chris@41
|
971 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
972 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
973 // latest active set but present in the prior set in the piano
|
Chris@41
|
974 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
975 // already, and if they haven't ended, we don't know their
|
Chris@41
|
976 // duration.
|
Chris@41
|
977
|
Chris@168
|
978 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
979
|
Chris@168
|
980 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
981
|
Chris@165
|
982 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
983
|
Chris@165
|
984 // only keep notes >= 100ms or thereabouts
|
Chris@321
|
985 double durationThreshSec = 0.1;
|
Chris@321
|
986 if (m_mode == LiveMode) durationThreshSec = 0.07;
|
Chris@321
|
987 int durationThreshold = floor(durationThreshSec / columnDuration); // in cols
|
Chris@165
|
988 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
989
|
Chris@319
|
990 FeatureList noteFeatures, onsetFeatures;
|
Chris@41
|
991
|
Chris@41
|
992 if (width < durationThreshold + 1) {
|
Chris@319
|
993 return { noteFeatures, onsetFeatures };
|
Chris@41
|
994 }
|
Chris@41
|
995
|
Chris@150
|
996 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
997
|
Chris@55
|
998 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
999 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
1000
|
Chris@55
|
1001 int note = ni->first;
|
Chris@41
|
1002
|
Chris@41
|
1003 int end = width;
|
Chris@41
|
1004 int start = end-1;
|
Chris@41
|
1005
|
Chris@41
|
1006 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
1007 --start;
|
Chris@41
|
1008 }
|
Chris@41
|
1009 ++start;
|
Chris@41
|
1010
|
Chris@319
|
1011 int duration = end - start;
|
Chris@319
|
1012
|
Chris@319
|
1013 if (duration < durationThreshold) {
|
Chris@41
|
1014 continue;
|
Chris@41
|
1015 }
|
Chris@41
|
1016
|
Chris@319
|
1017 if (duration == durationThreshold) {
|
Chris@319
|
1018 emitOnset(start, note, shiftCount, onsetFeatures);
|
Chris@319
|
1019 }
|
Chris@319
|
1020
|
Chris@319
|
1021 if (active.find(note) == active.end()) {
|
Chris@319
|
1022 // the note was playing but just ended
|
Chris@319
|
1023 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@319
|
1024 }
|
Chris@41
|
1025 }
|
Chris@41
|
1026
|
Chris@62
|
1027 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
1028
|
Chris@319
|
1029 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1030 }
|
Chris@41
|
1031
|
Chris@169
|
1032 void
|
Chris@169
|
1033 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
1034 FeatureList ¬eFeatures)
|
Chris@169
|
1035 {
|
Chris@169
|
1036 int partStart = start;
|
Chris@169
|
1037 int partShift = 0;
|
Chris@320
|
1038 double partStrength = 0;
|
Chris@169
|
1039
|
Chris@252
|
1040 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1041
|
Chris@169
|
1042 for (int i = start; i != end; ++i) {
|
Chris@169
|
1043
|
Chris@169
|
1044 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1045
|
Chris@169
|
1046 int shift = 0;
|
Chris@169
|
1047
|
Chris@169
|
1048 if (shiftCount > 1) {
|
Chris@169
|
1049
|
Chris@169
|
1050 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1051
|
Chris@169
|
1052 if (i == partStart) {
|
Chris@169
|
1053 partShift = shift;
|
Chris@169
|
1054 }
|
Chris@169
|
1055
|
Chris@169
|
1056 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1057
|
Chris@169
|
1058 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1059
|
Chris@169
|
1060 // pitch has changed, emit an intermediate note
|
Chris@252
|
1061 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1062 i,
|
Chris@252
|
1063 note,
|
Chris@252
|
1064 partShift,
|
Chris@252
|
1065 shiftCount,
|
Chris@320
|
1066 partStrength));
|
Chris@169
|
1067 partStart = i;
|
Chris@169
|
1068 partShift = shift;
|
Chris@320
|
1069 partStrength = 0;
|
Chris@169
|
1070 }
|
Chris@169
|
1071 }
|
Chris@169
|
1072
|
Chris@320
|
1073 if (strength > partStrength) {
|
Chris@320
|
1074 partStrength = strength;
|
Chris@169
|
1075 }
|
Chris@169
|
1076 }
|
Chris@169
|
1077
|
Chris@169
|
1078 if (end >= partStart + partThreshold) {
|
Chris@252
|
1079 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1080 end,
|
Chris@252
|
1081 note,
|
Chris@252
|
1082 partShift,
|
Chris@252
|
1083 shiftCount,
|
Chris@320
|
1084 partStrength));
|
Chris@169
|
1085 }
|
Chris@169
|
1086 }
|
Chris@252
|
1087
|
Chris@319
|
1088 void
|
Chris@319
|
1089 Silvet::emitOnset(int start, int note, int shiftCount,
|
Chris@319
|
1090 FeatureList &onsetFeatures)
|
Chris@319
|
1091 {
|
Chris@319
|
1092 int len = int(m_pianoRoll.size());
|
Chris@320
|
1093
|
Chris@320
|
1094 double onsetStrength = 0;
|
Chris@319
|
1095
|
Chris@319
|
1096 int shift = 0;
|
Chris@319
|
1097 if (shiftCount > 1) {
|
Chris@319
|
1098 shift = m_pianoRollShifts[start][note];
|
Chris@319
|
1099 }
|
Chris@319
|
1100
|
Chris@319
|
1101 for (int i = start; i < len; ++i) {
|
Chris@319
|
1102 double strength = m_pianoRoll[i][note];
|
Chris@320
|
1103 if (strength > onsetStrength) {
|
Chris@320
|
1104 onsetStrength = strength;
|
Chris@319
|
1105 }
|
Chris@319
|
1106 }
|
Chris@319
|
1107
|
Chris@319
|
1108 onsetFeatures.push_back(makeOnsetFeature(start,
|
Chris@319
|
1109 note,
|
Chris@319
|
1110 shift,
|
Chris@319
|
1111 shiftCount,
|
Chris@320
|
1112 onsetStrength));
|
Chris@319
|
1113 }
|
Chris@319
|
1114
|
Chris@309
|
1115 RealTime
|
Chris@309
|
1116 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1117 {
|
Chris@309
|
1118 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1119 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1120
|
Chris@309
|
1121 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1122 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1123 }
|
Chris@309
|
1124
|
Chris@252
|
1125 Silvet::Feature
|
Chris@252
|
1126 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1127 int end,
|
Chris@252
|
1128 int note,
|
Chris@252
|
1129 int shift,
|
Chris@252
|
1130 int shiftCount,
|
Chris@320
|
1131 double strength)
|
Chris@252
|
1132 {
|
Chris@252
|
1133 Feature f;
|
Chris@252
|
1134
|
Chris@252
|
1135 f.hasTimestamp = true;
|
Chris@309
|
1136 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1137
|
Chris@252
|
1138 f.hasDuration = true;
|
Chris@309
|
1139 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1140
|
Chris@252
|
1141 f.values.clear();
|
Chris@320
|
1142 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1143 f.values.push_back(getVelocityFor(strength, start));
|
Chris@252
|
1144
|
Chris@320
|
1145 f.label = getNoteName(note, shift, shiftCount);
|
Chris@252
|
1146
|
Chris@252
|
1147 return f;
|
Chris@252
|
1148 }
|
Chris@252
|
1149
|
Chris@319
|
1150 Silvet::Feature
|
Chris@319
|
1151 Silvet::makeOnsetFeature(int start,
|
Chris@319
|
1152 int note,
|
Chris@319
|
1153 int shift,
|
Chris@319
|
1154 int shiftCount,
|
Chris@320
|
1155 double strength)
|
Chris@319
|
1156 {
|
Chris@319
|
1157 Feature f;
|
Chris@319
|
1158
|
Chris@319
|
1159 f.hasTimestamp = true;
|
Chris@319
|
1160 f.timestamp = getColumnTimestamp(start);
|
Chris@319
|
1161
|
Chris@319
|
1162 f.hasDuration = false;
|
Chris@319
|
1163
|
Chris@319
|
1164 f.values.clear();
|
Chris@320
|
1165 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1166 f.values.push_back(getVelocityFor(strength, start));
|
Chris@319
|
1167
|
Chris@320
|
1168 f.label = getNoteName(note, shift, shiftCount);
|
Chris@319
|
1169
|
Chris@319
|
1170 return f;
|
Chris@319
|
1171 }
|
Chris@319
|
1172
|
Chris@320
|
1173 int
|
Chris@320
|
1174 Silvet::getVelocityFor(double strength, int column)
|
Chris@320
|
1175 {
|
Chris@320
|
1176 RealTime rt = getColumnTimestamp(column + 1);
|
Chris@320
|
1177
|
Chris@320
|
1178 float inputGain = getInputGainAt(rt);
|
Chris@320
|
1179
|
Chris@320
|
1180 double scale = 2.0;
|
Chris@320
|
1181 if (m_mode == LiveMode) scale = 20.0;
|
Chris@320
|
1182
|
Chris@320
|
1183 double velocity = round((strength * scale) / inputGain);
|
Chris@320
|
1184
|
Chris@320
|
1185 if (velocity > 127.0) velocity = 127.0;
|
Chris@320
|
1186 if (velocity < 1.0) velocity = 1.0; // assume surpassed 0 threshold already
|
Chris@320
|
1187
|
Chris@320
|
1188 return int(velocity);
|
Chris@320
|
1189 }
|
Chris@320
|
1190
|
Chris@252
|
1191 float
|
Chris@252
|
1192 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1193 {
|
Chris@252
|
1194 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1195
|
Chris@252
|
1196 if (i == m_inputGains.end()) {
|
Chris@252
|
1197 if (i != m_inputGains.begin()) {
|
Chris@252
|
1198 --i;
|
Chris@252
|
1199 } else {
|
Chris@252
|
1200 return 1.f; // no data
|
Chris@252
|
1201 }
|
Chris@252
|
1202 }
|
Chris@252
|
1203
|
Chris@252
|
1204 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1205
|
Chris@252
|
1206 return i->second;
|
Chris@252
|
1207 }
|
Chris@252
|
1208
|