Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
40 static int processingSampleRate = 44100;
|
Chris@298
|
41
|
Chris@298
|
42 static int binsPerSemitoneLive = 1;
|
Chris@298
|
43 static int binsPerSemitoneNormal = 5;
|
Chris@170
|
44
|
Chris@272
|
45 static int minInputSampleRate = 100;
|
Chris@272
|
46 static int maxInputSampleRate = 192000;
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@271
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@319
|
256 d.identifier = "onsets";
|
Chris@319
|
257 d.name = "Note onsets";
|
Chris@323
|
258 d.description = "Note onsets, without durations. These can be calculated sooner than complete notes, because it isn't necessary to wait for a note to finish before returning its feature. Each event has time, estimated fundamental frequency in Hz, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@319
|
259 d.unit = "Hz";
|
Chris@319
|
260 d.hasFixedBinCount = true;
|
Chris@319
|
261 d.binCount = 2;
|
Chris@319
|
262 d.binNames.push_back("Frequency");
|
Chris@319
|
263 d.binNames.push_back("Velocity");
|
Chris@319
|
264 d.hasKnownExtents = false;
|
Chris@319
|
265 d.isQuantized = false;
|
Chris@319
|
266 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@319
|
267 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@319
|
268 d.hasDuration = false;
|
Chris@319
|
269 m_onsetsOutputNo = list.size();
|
Chris@319
|
270 list.push_back(d);
|
Chris@319
|
271
|
Chris@178
|
272 d.identifier = "timefreq";
|
Chris@178
|
273 d.name = "Time-frequency distribution";
|
Chris@271
|
274 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
275 d.unit = "";
|
Chris@178
|
276 d.hasFixedBinCount = true;
|
Chris@298
|
277 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
278 d.binNames.clear();
|
Chris@178
|
279 if (m_cq) {
|
Chris@294
|
280 char name[50];
|
Chris@298
|
281 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
282 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
283 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
284 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
285 // frequency though, so these are still the first 545 bins
|
Chris@178
|
286 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
287 float freq = m_cq->getBinFrequency
|
Chris@298
|
288 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
289 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
290 d.binNames.push_back(name);
|
Chris@178
|
291 }
|
Chris@178
|
292 }
|
Chris@178
|
293 d.hasKnownExtents = false;
|
Chris@178
|
294 d.isQuantized = false;
|
Chris@178
|
295 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
296 d.sampleRate = m_colsPerSec;
|
Chris@178
|
297 d.hasDuration = false;
|
Chris@178
|
298 m_fcqOutputNo = list.size();
|
Chris@178
|
299 list.push_back(d);
|
Chris@178
|
300
|
Chris@294
|
301 d.identifier = "pitchactivation";
|
Chris@294
|
302 d.name = "Pitch activation distribution";
|
Chris@294
|
303 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
304 d.unit = "";
|
Chris@294
|
305 d.hasFixedBinCount = true;
|
Chris@298
|
306 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
307 d.binNames.clear();
|
Chris@294
|
308 if (m_cq) {
|
Chris@298
|
309 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@320
|
310 d.binNames.push_back(getNoteName(i, 0, 1));
|
Chris@294
|
311 }
|
Chris@294
|
312 }
|
Chris@294
|
313 d.hasKnownExtents = false;
|
Chris@294
|
314 d.isQuantized = false;
|
Chris@294
|
315 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
316 d.sampleRate = m_colsPerSec;
|
Chris@294
|
317 d.hasDuration = false;
|
Chris@294
|
318 m_pitchOutputNo = list.size();
|
Chris@294
|
319 list.push_back(d);
|
Chris@294
|
320
|
Chris@309
|
321 d.identifier = "chroma";
|
Chris@309
|
322 d.name = "Pitch chroma distribution";
|
Chris@309
|
323 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
324 d.unit = "";
|
Chris@309
|
325 d.hasFixedBinCount = true;
|
Chris@309
|
326 d.binCount = 12;
|
Chris@309
|
327 d.binNames.clear();
|
Chris@309
|
328 if (m_cq) {
|
Chris@309
|
329 for (int i = 0; i < 12; ++i) {
|
Chris@320
|
330 d.binNames.push_back(getChromaName(i));
|
Chris@309
|
331 }
|
Chris@309
|
332 }
|
Chris@309
|
333 d.hasKnownExtents = false;
|
Chris@309
|
334 d.isQuantized = false;
|
Chris@309
|
335 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
336 d.sampleRate = m_colsPerSec;
|
Chris@309
|
337 d.hasDuration = false;
|
Chris@309
|
338 m_chromaOutputNo = list.size();
|
Chris@309
|
339 list.push_back(d);
|
Chris@309
|
340
|
Chris@302
|
341 d.identifier = "templates";
|
Chris@302
|
342 d.name = "Templates";
|
Chris@302
|
343 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
344 d.unit = "";
|
Chris@302
|
345 d.hasFixedBinCount = true;
|
Chris@302
|
346 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
347 d.binNames.clear();
|
Chris@302
|
348 if (m_cq) {
|
Chris@302
|
349 char name[50];
|
Chris@302
|
350 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
351 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
352 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
353 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
354 // frequency though, so these are still the first 545 bins
|
Chris@302
|
355 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
356 float freq = m_cq->getBinFrequency
|
Chris@302
|
357 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
358 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
359 d.binNames.push_back(name);
|
Chris@302
|
360 }
|
Chris@302
|
361 }
|
Chris@302
|
362 d.hasKnownExtents = false;
|
Chris@302
|
363 d.isQuantized = false;
|
Chris@302
|
364 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
365 d.sampleRate = m_colsPerSec;
|
Chris@302
|
366 d.hasDuration = false;
|
Chris@302
|
367 m_templateOutputNo = list.size();
|
Chris@302
|
368 list.push_back(d);
|
Chris@302
|
369
|
Chris@31
|
370 return list;
|
Chris@31
|
371 }
|
Chris@31
|
372
|
Chris@38
|
373 std::string
|
Chris@320
|
374 Silvet::getChromaName(int pitch) const
|
Chris@38
|
375 {
|
Chris@38
|
376 static const char *names[] = {
|
Chris@38
|
377 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
378 };
|
Chris@38
|
379
|
Chris@309
|
380 return names[pitch];
|
Chris@309
|
381 }
|
Chris@309
|
382
|
Chris@309
|
383 std::string
|
Chris@320
|
384 Silvet::getNoteName(int note, int shift, int shiftCount) const
|
Chris@309
|
385 {
|
Chris@320
|
386 string n = getChromaName(note % 12);
|
Chris@38
|
387
|
Chris@175
|
388 int oct = (note + 9) / 12;
|
Chris@38
|
389
|
Chris@175
|
390 char buf[30];
|
Chris@175
|
391
|
Chris@175
|
392 float pshift = 0.f;
|
Chris@175
|
393 if (shiftCount > 1) {
|
Chris@320
|
394 // see getNoteFrequency below
|
Chris@175
|
395 pshift =
|
Chris@175
|
396 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
397 }
|
Chris@175
|
398
|
Chris@175
|
399 if (pshift > 0.f) {
|
Chris@309
|
400 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
401 } else if (pshift < 0.f) {
|
Chris@309
|
402 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
403 } else {
|
Chris@309
|
404 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
405 }
|
Chris@38
|
406
|
Chris@38
|
407 return buf;
|
Chris@38
|
408 }
|
Chris@38
|
409
|
Chris@41
|
410 float
|
Chris@320
|
411 Silvet::getNoteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
412 {
|
Chris@169
|
413 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
414 // is an offset into the template array, which starts with some
|
Chris@169
|
415 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
416 //
|
Chris@169
|
417 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
418 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
419 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
420 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
421 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
422 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
423 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
424 // down in pitch, for a negative pitch shift.
|
Chris@169
|
425
|
Chris@175
|
426 float pshift = 0.f;
|
Chris@175
|
427 if (shiftCount > 1) {
|
Chris@175
|
428 pshift =
|
Chris@175
|
429 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
430 }
|
Chris@169
|
431
|
Chris@301
|
432 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
433
|
Chris@303
|
434 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
435 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
436
|
Chris@301
|
437 return freq;
|
Chris@41
|
438 }
|
Chris@41
|
439
|
Chris@31
|
440 bool
|
Chris@31
|
441 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
442 {
|
Chris@272
|
443 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
444 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
445 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
446 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
447 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
448 return false;
|
Chris@272
|
449 }
|
Chris@272
|
450
|
Chris@31
|
451 if (channels < getMinChannelCount() ||
|
Chris@272
|
452 channels > getMaxChannelCount()) {
|
Chris@272
|
453 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
454 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
455 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
456 return false;
|
Chris@272
|
457 }
|
Chris@31
|
458
|
Chris@31
|
459 if (stepSize != blockSize) {
|
Chris@31
|
460 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
461 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
462 return false;
|
Chris@31
|
463 }
|
Chris@31
|
464
|
Chris@31
|
465 m_blockSize = blockSize;
|
Chris@31
|
466
|
Chris@31
|
467 reset();
|
Chris@31
|
468
|
Chris@31
|
469 return true;
|
Chris@31
|
470 }
|
Chris@31
|
471
|
Chris@31
|
472 void
|
Chris@31
|
473 Silvet::reset()
|
Chris@31
|
474 {
|
Chris@31
|
475 delete m_resampler;
|
Chris@246
|
476 delete m_flattener;
|
Chris@31
|
477 delete m_cq;
|
Chris@31
|
478
|
Chris@31
|
479 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
480 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
481 } else {
|
Chris@31
|
482 m_resampler = 0;
|
Chris@31
|
483 }
|
Chris@31
|
484
|
Chris@246
|
485 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
486 m_flattener->reset();
|
Chris@246
|
487
|
Chris@301
|
488 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
489 // freq used for the EM templates:
|
Chris@301
|
490 double maxFreq = 14700;
|
Chris@301
|
491
|
Chris@301
|
492 if (m_mode == LiveMode) {
|
Chris@301
|
493 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
494 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
495 // lower than 14700
|
Chris@301
|
496 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
497 }
|
Chris@301
|
498
|
Chris@173
|
499 double minFreq = 27.5;
|
Chris@173
|
500
|
Chris@297
|
501 if (m_mode != HighQualityMode) {
|
Chris@173
|
502 // We don't actually return any notes from the bottom octave,
|
Chris@327
|
503 // so we can just pad with zeros. In live mode the template is
|
Chris@327
|
504 // an octave shorter as well. Each octave the min frequency is
|
Chris@327
|
505 // raised by halves the processing latency.
|
Chris@327
|
506 if (m_mode == LiveMode) {
|
Chris@327
|
507 minFreq *= 4;
|
Chris@327
|
508 } else {
|
Chris@327
|
509 minFreq *= 2;
|
Chris@327
|
510 }
|
Chris@173
|
511 }
|
Chris@173
|
512
|
Chris@298
|
513 int bpo = 12 *
|
Chris@298
|
514 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
515
|
Chris@154
|
516 CQParameters params(processingSampleRate,
|
Chris@173
|
517 minFreq,
|
Chris@303
|
518 maxFreq,
|
Chris@298
|
519 bpo);
|
Chris@154
|
520
|
Chris@325
|
521 params.q = 0.8;
|
Chris@325
|
522 params.atomHopFactor = (m_mode == LiveMode ? 1.0 : 0.3);
|
Chris@154
|
523 params.threshold = 0.0005;
|
Chris@317
|
524 params.decimator =
|
Chris@317
|
525 (m_mode == LiveMode ?
|
Chris@317
|
526 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
|
Chris@172
|
527 params.window = CQParameters::Hann;
|
Chris@154
|
528
|
Chris@154
|
529 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
530
|
Chris@303
|
531 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
532 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
533
|
Chris@297
|
534 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
535
|
Chris@41
|
536 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
537 delete m_postFilter[i];
|
Chris@41
|
538 }
|
Chris@41
|
539 m_postFilter.clear();
|
Chris@303
|
540 int postFilterLength = 3;
|
Chris@298
|
541 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
542 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
543 }
|
Chris@41
|
544 m_pianoRoll.clear();
|
Chris@246
|
545 m_inputGains.clear();
|
Chris@32
|
546 m_columnCount = 0;
|
Chris@272
|
547 m_resampledCount = 0;
|
Chris@40
|
548 m_startTime = RealTime::zeroTime;
|
Chris@313
|
549 m_haveStartTime = false;
|
Chris@31
|
550 }
|
Chris@31
|
551
|
Chris@31
|
552 Silvet::FeatureSet
|
Chris@31
|
553 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
554 {
|
Chris@302
|
555 FeatureSet fs;
|
Chris@302
|
556
|
Chris@313
|
557 if (!m_haveStartTime) {
|
Chris@314
|
558
|
Chris@40
|
559 m_startTime = timestamp;
|
Chris@313
|
560 m_haveStartTime = true;
|
Chris@314
|
561
|
Chris@302
|
562 insertTemplateFeatures(fs);
|
Chris@40
|
563 }
|
Chris@246
|
564
|
Chris@246
|
565 vector<float> flattened(m_blockSize);
|
Chris@246
|
566 float gain = 1.f;
|
Chris@246
|
567 m_flattener->connectInputPort
|
Chris@246
|
568 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
569 m_flattener->connectOutputPort
|
Chris@246
|
570 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
571 m_flattener->connectOutputPort
|
Chris@246
|
572 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
573 m_flattener->process(m_blockSize);
|
Chris@246
|
574
|
Chris@252
|
575 m_inputGains[timestamp] = gain;
|
Chris@40
|
576
|
Chris@31
|
577 vector<double> data;
|
Chris@40
|
578 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
579 double d = flattened[i];
|
Chris@235
|
580 data.push_back(d);
|
Chris@40
|
581 }
|
Chris@31
|
582
|
Chris@31
|
583 if (m_resampler) {
|
Chris@272
|
584
|
Chris@31
|
585 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
586
|
Chris@272
|
587 int hadCount = m_resampledCount;
|
Chris@272
|
588 m_resampledCount += data.size();
|
Chris@272
|
589
|
Chris@272
|
590 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
591
|
Chris@272
|
592 if (hadCount < resamplerLatency) {
|
Chris@272
|
593 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
594 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
595 return fs;
|
Chris@272
|
596 } else {
|
Chris@272
|
597 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
598 }
|
Chris@272
|
599 }
|
Chris@31
|
600 }
|
Chris@272
|
601
|
Chris@32
|
602 Grid cqout = m_cq->process(data);
|
Chris@302
|
603 transcribe(cqout, fs);
|
Chris@51
|
604 return fs;
|
Chris@34
|
605 }
|
Chris@34
|
606
|
Chris@34
|
607 Silvet::FeatureSet
|
Chris@34
|
608 Silvet::getRemainingFeatures()
|
Chris@34
|
609 {
|
Chris@145
|
610 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
611 FeatureSet fs;
|
Chris@302
|
612 if (m_columnCount == 0) {
|
Chris@302
|
613 // process() was never called, but we still want these
|
Chris@302
|
614 insertTemplateFeatures(fs);
|
Chris@302
|
615 } else {
|
Chris@302
|
616 transcribe(cqout, fs);
|
Chris@302
|
617 }
|
Chris@51
|
618 return fs;
|
Chris@34
|
619 }
|
Chris@34
|
620
|
Chris@302
|
621 void
|
Chris@302
|
622 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
623 {
|
Chris@302
|
624 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
625 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
626 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
627 Feature f;
|
Chris@302
|
628 char buffer[50];
|
Chris@302
|
629 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
630 f.label = buffer;
|
Chris@302
|
631 f.hasTimestamp = true;
|
Chris@302
|
632 f.timestamp = timestamp;
|
Chris@302
|
633 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
634 .data[i % pack.templateNoteCount];
|
Chris@302
|
635 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
636 }
|
Chris@302
|
637 }
|
Chris@302
|
638
|
Chris@302
|
639 void
|
Chris@302
|
640 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
641 {
|
Chris@32
|
642 Grid filtered = preProcess(cqout);
|
Chris@31
|
643
|
Chris@302
|
644 if (filtered.empty()) return;
|
Chris@170
|
645
|
Chris@298
|
646 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
647
|
Chris@325
|
648 int width = filtered.size();
|
Chris@325
|
649
|
Chris@325
|
650 double silenceThreshold = 0.01;
|
Chris@325
|
651
|
Chris@325
|
652 for (int i = 0; i < width; ++i) {
|
Chris@325
|
653
|
Chris@325
|
654 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1 + i);
|
Chris@325
|
655 float inputGain = getInputGainAt(timestamp);
|
Chris@325
|
656
|
Chris@178
|
657 Feature f;
|
Chris@325
|
658 double rms = 0.0;
|
Chris@325
|
659
|
Chris@178
|
660 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@325
|
661 double v = filtered[i][j];
|
Chris@325
|
662 rms += v * v;
|
Chris@325
|
663 f.values.push_back(float(v));
|
Chris@178
|
664 }
|
Chris@325
|
665
|
Chris@325
|
666 rms = sqrt(rms / pack.templateHeight);
|
Chris@325
|
667 if (rms / inputGain < silenceThreshold) {
|
Chris@325
|
668 filtered[i].clear();
|
Chris@325
|
669 }
|
Chris@325
|
670
|
Chris@178
|
671 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
672 }
|
Chris@325
|
673
|
Chris@311
|
674 Grid localPitches(width);
|
Chris@170
|
675
|
Chris@297
|
676 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
677 int shiftCount = 1;
|
Chris@170
|
678 if (wantShifts) {
|
Chris@170
|
679 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
680 }
|
Chris@170
|
681
|
Chris@170
|
682 vector<vector<int> > localBestShifts;
|
Chris@170
|
683 if (wantShifts) {
|
Chris@311
|
684 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
685 }
|
Chris@170
|
686
|
Chris@312
|
687 #ifndef MAX_EM_THREADS
|
Chris@312
|
688 #define MAX_EM_THREADS 8
|
Chris@312
|
689 #endif
|
Chris@312
|
690
|
Chris@317
|
691 int emThreadCount = MAX_EM_THREADS;
|
Chris@317
|
692 if (m_mode == LiveMode && pack.templates.size() == 1) {
|
Chris@317
|
693 // The EM step is probably not slow enough to merit it
|
Chris@317
|
694 emThreadCount = 1;
|
Chris@317
|
695 }
|
Chris@317
|
696
|
Chris@312
|
697 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@317
|
698 if (emThreadCount > 1) {
|
Chris@317
|
699 for (int i = 0; i < width; ) {
|
Chris@317
|
700 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@317
|
701 vector<EMFuture> results;
|
Chris@317
|
702 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
703 results.push_back
|
Chris@317
|
704 (async(std::launch::async,
|
Chris@317
|
705 [&](int index) {
|
Chris@325
|
706 return applyEM
|
Chris@325
|
707 (pack, filtered.at(index), wantShifts);
|
Chris@317
|
708 }, i + j));
|
Chris@317
|
709 }
|
Chris@317
|
710 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
711 auto out = results[j].get();
|
Chris@317
|
712 localPitches[i+j] = out.first;
|
Chris@317
|
713 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@317
|
714 }
|
Chris@317
|
715 i += emThreadCount;
|
Chris@312
|
716 }
|
Chris@123
|
717 }
|
Chris@312
|
718 #endif
|
Chris@317
|
719
|
Chris@317
|
720 if (emThreadCount == 1) {
|
Chris@317
|
721 for (int i = 0; i < width; ++i) {
|
Chris@317
|
722 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@317
|
723 localPitches[i] = out.first;
|
Chris@317
|
724 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@317
|
725 }
|
Chris@317
|
726 }
|
Chris@305
|
727
|
Chris@166
|
728 for (int i = 0; i < width; ++i) {
|
Chris@37
|
729
|
Chris@321
|
730 vector<double> filtered;
|
Chris@321
|
731
|
Chris@321
|
732 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
733 m_postFilter[j]->push(localPitches[i][j]);
|
Chris@321
|
734 filtered.push_back(m_postFilter[j]->get());
|
Chris@321
|
735 }
|
Chris@294
|
736
|
Chris@309
|
737 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
738 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
739
|
Chris@294
|
740 Feature f;
|
Chris@294
|
741 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
742 float v = filtered[j];
|
Chris@294
|
743 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
744 f.values.push_back(v / inputGain);
|
Chris@294
|
745 }
|
Chris@294
|
746 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
747
|
Chris@309
|
748 f.values.clear();
|
Chris@309
|
749 f.values.resize(12);
|
Chris@309
|
750 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
751 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
752 }
|
Chris@309
|
753 fs[m_chromaOutputNo].push_back(f);
|
Chris@38
|
754
|
Chris@321
|
755 // This pushes the up-to-max-polyphony activation column to
|
Chris@321
|
756 // m_pianoRoll
|
Chris@323
|
757 postProcess(filtered, localBestShifts[i], wantShifts);
|
Chris@321
|
758
|
Chris@319
|
759 auto events = noteTrack(shiftCount);
|
Chris@319
|
760
|
Chris@319
|
761 FeatureList noteFeatures = events.first;
|
Chris@123
|
762 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
763 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
764 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
765 }
|
Chris@319
|
766
|
Chris@319
|
767 FeatureList onsetFeatures = events.second;
|
Chris@319
|
768 for (FeatureList::const_iterator fi = onsetFeatures.begin();
|
Chris@319
|
769 fi != onsetFeatures.end(); ++fi) {
|
Chris@319
|
770 fs[m_onsetsOutputNo].push_back(*fi);
|
Chris@319
|
771 }
|
Chris@34
|
772 }
|
Chris@31
|
773 }
|
Chris@31
|
774
|
Chris@311
|
775 pair<vector<double>, vector<int> >
|
Chris@311
|
776 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
777 const vector<double> &column,
|
Chris@311
|
778 bool wantShifts)
|
Chris@311
|
779 {
|
Chris@311
|
780 double columnThreshold = 1e-5;
|
Chris@311
|
781
|
Chris@314
|
782 if (m_mode == LiveMode) {
|
Chris@327
|
783 columnThreshold /= 20;
|
Chris@314
|
784 }
|
Chris@314
|
785
|
Chris@311
|
786 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
787 vector<int> bestShifts;
|
Chris@325
|
788
|
Chris@325
|
789 if (column.empty()) return { pitches, bestShifts };
|
Chris@311
|
790
|
Chris@311
|
791 double sum = 0.0;
|
Chris@311
|
792 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
793 sum += column.at(j);
|
Chris@311
|
794 }
|
Chris@311
|
795 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
796
|
Chris@314
|
797 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
798
|
Chris@311
|
799 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
800 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
801
|
Chris@314
|
802 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
803
|
Chris@311
|
804 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
805 em.iterate(column.data());
|
Chris@311
|
806 }
|
Chris@311
|
807
|
Chris@311
|
808 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
809 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
810
|
Chris@311
|
811 int shiftCount = 1;
|
Chris@311
|
812 if (wantShifts) {
|
Chris@311
|
813 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
814 }
|
Chris@311
|
815
|
Chris@311
|
816 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
817
|
Chris@311
|
818 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
819
|
Chris@311
|
820 int bestShift = 0;
|
Chris@311
|
821 float bestShiftValue = 0.0;
|
Chris@311
|
822 if (wantShifts) {
|
Chris@311
|
823 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
824 float value = shiftDist[k][j];
|
Chris@311
|
825 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
826 bestShiftValue = value;
|
Chris@311
|
827 bestShift = k;
|
Chris@311
|
828 }
|
Chris@311
|
829 }
|
Chris@311
|
830 bestShifts.push_back(bestShift);
|
Chris@311
|
831 }
|
Chris@311
|
832 }
|
Chris@311
|
833
|
Chris@311
|
834 return { pitches, bestShifts };
|
Chris@311
|
835 }
|
Chris@311
|
836
|
Chris@32
|
837 Silvet::Grid
|
Chris@32
|
838 Silvet::preProcess(const Grid &in)
|
Chris@32
|
839 {
|
Chris@32
|
840 int width = in.size();
|
Chris@32
|
841
|
Chris@165
|
842 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
843
|
Chris@165
|
844 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
845 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
846
|
Chris@32
|
847 Grid out;
|
Chris@32
|
848
|
Chris@58
|
849 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
850 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
851 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
852 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
853 // size we reduce to in a moment
|
Chris@33
|
854 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
855
|
Chris@298
|
856 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
857
|
Chris@32
|
858 for (int i = 0; i < width; ++i) {
|
Chris@32
|
859
|
Chris@33
|
860 if (m_columnCount < latentColumns) {
|
Chris@33
|
861 ++m_columnCount;
|
Chris@33
|
862 continue;
|
Chris@33
|
863 }
|
Chris@33
|
864
|
Chris@32
|
865 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
866 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
867
|
Chris@32
|
868 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
869
|
Chris@32
|
870 if (select) {
|
Chris@32
|
871 vector<double> inCol = in[i];
|
Chris@176
|
872 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
873
|
Chris@327
|
874 // In HQ mode, the CQ returns 600 bins (10 octaves at 5
|
Chris@327
|
875 // bins per semitone) and we ignore the lowest 55 of them,
|
Chris@327
|
876 // giving us 545 bins total, which matches the height of
|
Chris@327
|
877 // each of our instrument templates.
|
Chris@327
|
878 //
|
Chris@327
|
879 // In draft mode the CQ is an octave shorter, returning
|
Chris@327
|
880 // 540 bins, so we instead pad with an additional 5 zeros
|
Chris@327
|
881 // at the lowest frequencies to get the same 545 bins.
|
Chris@327
|
882 //
|
Chris@327
|
883 // In live mode the CQ is two octaves shorter and only has
|
Chris@327
|
884 // 1 bin per semitone, and the template is also an octave
|
Chris@327
|
885 // shorter. So we get 96 bins (= 8 * 12) and want 97 (=
|
Chris@327
|
886 // (545 / 5) - 12), meaning we have to pad with one extra
|
Chris@327
|
887 // bin at the lowest frequency position. Essentially this
|
Chris@327
|
888 // is the same as draft mode (pad with bins-per-semitone
|
Chris@327
|
889 // bins), just that the result is a shorter vector.
|
Chris@178
|
890 //
|
Chris@178
|
891 // We also need to reverse the column as we go, since the
|
Chris@178
|
892 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
893 // the other way around.
|
Chris@32
|
894
|
Chris@298
|
895 int bps = (m_mode == LiveMode ?
|
Chris@298
|
896 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
897
|
Chris@297
|
898 if (m_mode == HighQualityMode) {
|
Chris@178
|
899 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
900 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
901 outCol[j] = inCol[ix];
|
Chris@178
|
902 }
|
Chris@178
|
903 } else {
|
Chris@327
|
904 int pad = bps;
|
Chris@327
|
905 for (int j = 0; j < pad; ++j) {
|
Chris@178
|
906 outCol[j] = 0.0;
|
Chris@178
|
907 }
|
Chris@327
|
908 for (int j = pad; j < pack.templateHeight; ++j) {
|
Chris@327
|
909 int ix = inCol.size() - j + (pad-1);
|
Chris@178
|
910 outCol[j] = inCol[ix];
|
Chris@178
|
911 }
|
Chris@46
|
912 }
|
Chris@32
|
913
|
Chris@46
|
914 vector<double> noiseLevel1 =
|
Chris@298
|
915 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
916 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
917 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
918 }
|
Chris@32
|
919
|
Chris@46
|
920 vector<double> noiseLevel2 =
|
Chris@298
|
921 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
922 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
923 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
924 }
|
Chris@32
|
925
|
Chris@165
|
926 out.push_back(outCol);
|
Chris@32
|
927 }
|
Chris@32
|
928
|
Chris@32
|
929 ++m_columnCount;
|
Chris@32
|
930 }
|
Chris@32
|
931
|
Chris@32
|
932 return out;
|
Chris@32
|
933 }
|
Chris@32
|
934
|
Chris@321
|
935 void
|
Chris@170
|
936 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
937 const vector<int> &bestShifts,
|
Chris@170
|
938 bool wantShifts)
|
Chris@166
|
939 {
|
Chris@298
|
940 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
941
|
Chris@41
|
942 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
943
|
Chris@41
|
944 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
945
|
Chris@41
|
946 ValueIndexMap strengths;
|
Chris@166
|
947
|
Chris@176
|
948 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@321
|
949
|
Chris@321
|
950 double strength = pitches[j];
|
Chris@183
|
951 if (strength < pack.levelThreshold) continue;
|
Chris@321
|
952
|
Chris@321
|
953 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@321
|
954 // get clusters of two or three high scores at a time for
|
Chris@321
|
955 // neighbouring semitones. Eliminate these by picking only the
|
Chris@325
|
956 // peaks (except that we never eliminate a note that has
|
Chris@325
|
957 // already been established as currently playing). This means
|
Chris@325
|
958 // we can't recognise actual semitone chords if they ever
|
Chris@325
|
959 // appear, but it's not as if live mode is good enough for
|
Chris@325
|
960 // that to be a big deal anyway.
|
Chris@321
|
961 if (m_mode == LiveMode) {
|
Chris@325
|
962 if (m_current.find(j) == m_current.end() &&
|
Chris@325
|
963 (j == 0 ||
|
Chris@325
|
964 j + 1 == pack.templateNoteCount ||
|
Chris@325
|
965 pitches[j] < pitches[j-1] ||
|
Chris@325
|
966 pitches[j] < pitches[j+1])) {
|
Chris@325
|
967 // not a peak or a currently-playing note: skip it
|
Chris@321
|
968 continue;
|
Chris@321
|
969 }
|
Chris@321
|
970 }
|
Chris@323
|
971
|
Chris@168
|
972 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
973 }
|
Chris@166
|
974
|
Chris@168
|
975 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
976
|
Chris@168
|
977 map<int, double> active;
|
Chris@168
|
978 map<int, int> activeShifts;
|
Chris@168
|
979
|
Chris@183
|
980 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
981
|
Chris@168
|
982 --si;
|
Chris@168
|
983
|
Chris@168
|
984 double strength = si->first;
|
Chris@168
|
985 int j = si->second;
|
Chris@168
|
986
|
Chris@168
|
987 active[j] = strength;
|
Chris@168
|
988
|
Chris@170
|
989 if (wantShifts) {
|
Chris@170
|
990 activeShifts[j] = bestShifts[j];
|
Chris@167
|
991 }
|
Chris@41
|
992 }
|
Chris@41
|
993
|
Chris@168
|
994 m_pianoRoll.push_back(active);
|
Chris@170
|
995
|
Chris@170
|
996 if (wantShifts) {
|
Chris@168
|
997 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
998 }
|
Chris@294
|
999
|
Chris@321
|
1000 return;
|
Chris@166
|
1001 }
|
Chris@166
|
1002
|
Chris@319
|
1003 pair<Vamp::Plugin::FeatureList, Vamp::Plugin::FeatureList>
|
Chris@168
|
1004 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
1005 {
|
Chris@41
|
1006 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
1007 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
1008 // latest active set but present in the prior set in the piano
|
Chris@41
|
1009 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
1010 // already, and if they haven't ended, we don't know their
|
Chris@41
|
1011 // duration.
|
Chris@41
|
1012
|
Chris@168
|
1013 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
1014
|
Chris@168
|
1015 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
1016
|
Chris@165
|
1017 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
1018
|
Chris@165
|
1019 // only keep notes >= 100ms or thereabouts
|
Chris@323
|
1020 double durationThrSec = 0.1;
|
Chris@323
|
1021 if (m_mode == LiveMode) durationThrSec = 0.07;
|
Chris@323
|
1022 int durationThreshold = floor(durationThrSec / columnDuration); // in cols
|
Chris@165
|
1023 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
1024
|
Chris@319
|
1025 FeatureList noteFeatures, onsetFeatures;
|
Chris@41
|
1026
|
Chris@41
|
1027 if (width < durationThreshold + 1) {
|
Chris@319
|
1028 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1029 }
|
Chris@41
|
1030
|
Chris@150
|
1031 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
1032
|
Chris@55
|
1033 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
1034 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
1035
|
Chris@55
|
1036 int note = ni->first;
|
Chris@41
|
1037
|
Chris@41
|
1038 int end = width;
|
Chris@41
|
1039 int start = end-1;
|
Chris@41
|
1040
|
Chris@41
|
1041 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
1042 --start;
|
Chris@41
|
1043 }
|
Chris@41
|
1044 ++start;
|
Chris@41
|
1045
|
Chris@319
|
1046 int duration = end - start;
|
Chris@319
|
1047
|
Chris@319
|
1048 if (duration < durationThreshold) {
|
Chris@41
|
1049 continue;
|
Chris@41
|
1050 }
|
Chris@41
|
1051
|
Chris@319
|
1052 if (duration == durationThreshold) {
|
Chris@325
|
1053 m_current.insert(note);
|
Chris@319
|
1054 emitOnset(start, note, shiftCount, onsetFeatures);
|
Chris@319
|
1055 }
|
Chris@319
|
1056
|
Chris@319
|
1057 if (active.find(note) == active.end()) {
|
Chris@319
|
1058 // the note was playing but just ended
|
Chris@325
|
1059 m_current.erase(note);
|
Chris@319
|
1060 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@319
|
1061 }
|
Chris@41
|
1062 }
|
Chris@41
|
1063
|
Chris@62
|
1064 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
1065
|
Chris@319
|
1066 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1067 }
|
Chris@41
|
1068
|
Chris@169
|
1069 void
|
Chris@169
|
1070 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
1071 FeatureList ¬eFeatures)
|
Chris@169
|
1072 {
|
Chris@169
|
1073 int partStart = start;
|
Chris@169
|
1074 int partShift = 0;
|
Chris@320
|
1075 double partStrength = 0;
|
Chris@169
|
1076
|
Chris@252
|
1077 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1078
|
Chris@169
|
1079 for (int i = start; i != end; ++i) {
|
Chris@169
|
1080
|
Chris@169
|
1081 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1082
|
Chris@169
|
1083 int shift = 0;
|
Chris@169
|
1084
|
Chris@169
|
1085 if (shiftCount > 1) {
|
Chris@169
|
1086
|
Chris@169
|
1087 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1088
|
Chris@169
|
1089 if (i == partStart) {
|
Chris@169
|
1090 partShift = shift;
|
Chris@169
|
1091 }
|
Chris@169
|
1092
|
Chris@169
|
1093 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1094
|
Chris@169
|
1095 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1096
|
Chris@169
|
1097 // pitch has changed, emit an intermediate note
|
Chris@252
|
1098 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1099 i,
|
Chris@252
|
1100 note,
|
Chris@252
|
1101 partShift,
|
Chris@252
|
1102 shiftCount,
|
Chris@320
|
1103 partStrength));
|
Chris@169
|
1104 partStart = i;
|
Chris@169
|
1105 partShift = shift;
|
Chris@320
|
1106 partStrength = 0;
|
Chris@169
|
1107 }
|
Chris@169
|
1108 }
|
Chris@169
|
1109
|
Chris@320
|
1110 if (strength > partStrength) {
|
Chris@320
|
1111 partStrength = strength;
|
Chris@169
|
1112 }
|
Chris@169
|
1113 }
|
Chris@169
|
1114
|
Chris@169
|
1115 if (end >= partStart + partThreshold) {
|
Chris@252
|
1116 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1117 end,
|
Chris@252
|
1118 note,
|
Chris@252
|
1119 partShift,
|
Chris@252
|
1120 shiftCount,
|
Chris@320
|
1121 partStrength));
|
Chris@169
|
1122 }
|
Chris@169
|
1123 }
|
Chris@252
|
1124
|
Chris@319
|
1125 void
|
Chris@319
|
1126 Silvet::emitOnset(int start, int note, int shiftCount,
|
Chris@319
|
1127 FeatureList &onsetFeatures)
|
Chris@319
|
1128 {
|
Chris@319
|
1129 int len = int(m_pianoRoll.size());
|
Chris@320
|
1130
|
Chris@320
|
1131 double onsetStrength = 0;
|
Chris@319
|
1132
|
Chris@319
|
1133 int shift = 0;
|
Chris@319
|
1134 if (shiftCount > 1) {
|
Chris@319
|
1135 shift = m_pianoRollShifts[start][note];
|
Chris@319
|
1136 }
|
Chris@319
|
1137
|
Chris@319
|
1138 for (int i = start; i < len; ++i) {
|
Chris@319
|
1139 double strength = m_pianoRoll[i][note];
|
Chris@320
|
1140 if (strength > onsetStrength) {
|
Chris@320
|
1141 onsetStrength = strength;
|
Chris@319
|
1142 }
|
Chris@319
|
1143 }
|
Chris@319
|
1144
|
Chris@319
|
1145 onsetFeatures.push_back(makeOnsetFeature(start,
|
Chris@319
|
1146 note,
|
Chris@319
|
1147 shift,
|
Chris@319
|
1148 shiftCount,
|
Chris@320
|
1149 onsetStrength));
|
Chris@319
|
1150 }
|
Chris@319
|
1151
|
Chris@309
|
1152 RealTime
|
Chris@309
|
1153 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1154 {
|
Chris@309
|
1155 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1156 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1157
|
Chris@309
|
1158 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1159 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1160 }
|
Chris@309
|
1161
|
Chris@252
|
1162 Silvet::Feature
|
Chris@252
|
1163 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1164 int end,
|
Chris@252
|
1165 int note,
|
Chris@252
|
1166 int shift,
|
Chris@252
|
1167 int shiftCount,
|
Chris@320
|
1168 double strength)
|
Chris@252
|
1169 {
|
Chris@252
|
1170 Feature f;
|
Chris@252
|
1171
|
Chris@252
|
1172 f.hasTimestamp = true;
|
Chris@309
|
1173 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1174
|
Chris@252
|
1175 f.hasDuration = true;
|
Chris@309
|
1176 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1177
|
Chris@252
|
1178 f.values.clear();
|
Chris@320
|
1179 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1180 f.values.push_back(getVelocityFor(strength, start));
|
Chris@252
|
1181
|
Chris@320
|
1182 f.label = getNoteName(note, shift, shiftCount);
|
Chris@252
|
1183
|
Chris@252
|
1184 return f;
|
Chris@252
|
1185 }
|
Chris@252
|
1186
|
Chris@319
|
1187 Silvet::Feature
|
Chris@319
|
1188 Silvet::makeOnsetFeature(int start,
|
Chris@319
|
1189 int note,
|
Chris@319
|
1190 int shift,
|
Chris@319
|
1191 int shiftCount,
|
Chris@320
|
1192 double strength)
|
Chris@319
|
1193 {
|
Chris@319
|
1194 Feature f;
|
Chris@319
|
1195
|
Chris@319
|
1196 f.hasTimestamp = true;
|
Chris@319
|
1197 f.timestamp = getColumnTimestamp(start);
|
Chris@319
|
1198
|
Chris@319
|
1199 f.hasDuration = false;
|
Chris@319
|
1200
|
Chris@319
|
1201 f.values.clear();
|
Chris@320
|
1202 f.values.push_back(getNoteFrequency(note, shift, shiftCount));
|
Chris@320
|
1203 f.values.push_back(getVelocityFor(strength, start));
|
Chris@319
|
1204
|
Chris@320
|
1205 f.label = getNoteName(note, shift, shiftCount);
|
Chris@319
|
1206
|
Chris@319
|
1207 return f;
|
Chris@319
|
1208 }
|
Chris@319
|
1209
|
Chris@320
|
1210 int
|
Chris@320
|
1211 Silvet::getVelocityFor(double strength, int column)
|
Chris@320
|
1212 {
|
Chris@320
|
1213 RealTime rt = getColumnTimestamp(column + 1);
|
Chris@320
|
1214
|
Chris@320
|
1215 float inputGain = getInputGainAt(rt);
|
Chris@320
|
1216
|
Chris@320
|
1217 double scale = 2.0;
|
Chris@320
|
1218 if (m_mode == LiveMode) scale = 20.0;
|
Chris@320
|
1219
|
Chris@320
|
1220 double velocity = round((strength * scale) / inputGain);
|
Chris@320
|
1221
|
Chris@320
|
1222 if (velocity > 127.0) velocity = 127.0;
|
Chris@320
|
1223 if (velocity < 1.0) velocity = 1.0; // assume surpassed 0 threshold already
|
Chris@320
|
1224
|
Chris@320
|
1225 return int(velocity);
|
Chris@320
|
1226 }
|
Chris@320
|
1227
|
Chris@252
|
1228 float
|
Chris@252
|
1229 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1230 {
|
Chris@252
|
1231 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1232
|
Chris@252
|
1233 if (i == m_inputGains.end()) {
|
Chris@252
|
1234 if (i != m_inputGains.begin()) {
|
Chris@252
|
1235 --i;
|
Chris@252
|
1236 } else {
|
Chris@252
|
1237 return 1.f; // no data
|
Chris@252
|
1238 }
|
Chris@252
|
1239 }
|
Chris@252
|
1240
|
Chris@252
|
1241 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1242
|
Chris@252
|
1243 return i->second;
|
Chris@252
|
1244 }
|
Chris@252
|
1245
|