Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
// Sample rate at which the constant-Q transform is run; input at any
// other rate is resampled to this in reset()/process().
static int processingSampleRate{44100};

// Constant-Q resolution in bins per semitone: Live mode uses the
// coarser setting, the other modes the finer one (see reset()).
static int binsPerSemitoneLive{1};
static int binsPerSemitoneNormal{5};

// Input sample rates outside this inclusive range are rejected by
// initialise().
static int minInputSampleRate{100};
static int maxInputSampleRate{192000};
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@271
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@319
|
256 d.identifier = "onsets";
|
Chris@319
|
257 d.name = "Note onsets";
|
Chris@319
|
258 d.description = "Note onsets, without durations. These can be calculated sooner than complete notes as it isn't necessary to wait for the note to finish. Each event has time, estimated fundamental frequency in Hz, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@319
|
259 d.unit = "Hz";
|
Chris@319
|
260 d.hasFixedBinCount = true;
|
Chris@319
|
261 d.binCount = 2;
|
Chris@319
|
262 d.binNames.push_back("Frequency");
|
Chris@319
|
263 d.binNames.push_back("Velocity");
|
Chris@319
|
264 d.hasKnownExtents = false;
|
Chris@319
|
265 d.isQuantized = false;
|
Chris@319
|
266 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@319
|
267 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@319
|
268 d.hasDuration = false;
|
Chris@319
|
269 m_onsetsOutputNo = list.size();
|
Chris@319
|
270 list.push_back(d);
|
Chris@319
|
271
|
Chris@178
|
272 d.identifier = "timefreq";
|
Chris@178
|
273 d.name = "Time-frequency distribution";
|
Chris@271
|
274 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
275 d.unit = "";
|
Chris@178
|
276 d.hasFixedBinCount = true;
|
Chris@298
|
277 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
278 d.binNames.clear();
|
Chris@178
|
279 if (m_cq) {
|
Chris@294
|
280 char name[50];
|
Chris@298
|
281 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
282 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
283 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
284 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
285 // frequency though, so these are still the first 545 bins
|
Chris@178
|
286 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
287 float freq = m_cq->getBinFrequency
|
Chris@298
|
288 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
289 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
290 d.binNames.push_back(name);
|
Chris@178
|
291 }
|
Chris@178
|
292 }
|
Chris@178
|
293 d.hasKnownExtents = false;
|
Chris@178
|
294 d.isQuantized = false;
|
Chris@178
|
295 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
296 d.sampleRate = m_colsPerSec;
|
Chris@178
|
297 d.hasDuration = false;
|
Chris@178
|
298 m_fcqOutputNo = list.size();
|
Chris@178
|
299 list.push_back(d);
|
Chris@178
|
300
|
Chris@294
|
301 d.identifier = "pitchactivation";
|
Chris@294
|
302 d.name = "Pitch activation distribution";
|
Chris@294
|
303 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
304 d.unit = "";
|
Chris@294
|
305 d.hasFixedBinCount = true;
|
Chris@298
|
306 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
307 d.binNames.clear();
|
Chris@294
|
308 if (m_cq) {
|
Chris@298
|
309 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@294
|
310 d.binNames.push_back(noteName(i, 0, 1));
|
Chris@294
|
311 }
|
Chris@294
|
312 }
|
Chris@294
|
313 d.hasKnownExtents = false;
|
Chris@294
|
314 d.isQuantized = false;
|
Chris@294
|
315 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
316 d.sampleRate = m_colsPerSec;
|
Chris@294
|
317 d.hasDuration = false;
|
Chris@294
|
318 m_pitchOutputNo = list.size();
|
Chris@294
|
319 list.push_back(d);
|
Chris@294
|
320
|
Chris@309
|
321 d.identifier = "chroma";
|
Chris@309
|
322 d.name = "Pitch chroma distribution";
|
Chris@309
|
323 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
324 d.unit = "";
|
Chris@309
|
325 d.hasFixedBinCount = true;
|
Chris@309
|
326 d.binCount = 12;
|
Chris@309
|
327 d.binNames.clear();
|
Chris@309
|
328 if (m_cq) {
|
Chris@309
|
329 for (int i = 0; i < 12; ++i) {
|
Chris@309
|
330 d.binNames.push_back(chromaName(i));
|
Chris@309
|
331 }
|
Chris@309
|
332 }
|
Chris@309
|
333 d.hasKnownExtents = false;
|
Chris@309
|
334 d.isQuantized = false;
|
Chris@309
|
335 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
336 d.sampleRate = m_colsPerSec;
|
Chris@309
|
337 d.hasDuration = false;
|
Chris@309
|
338 m_chromaOutputNo = list.size();
|
Chris@309
|
339 list.push_back(d);
|
Chris@309
|
340
|
Chris@302
|
341 d.identifier = "templates";
|
Chris@302
|
342 d.name = "Templates";
|
Chris@302
|
343 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
344 d.unit = "";
|
Chris@302
|
345 d.hasFixedBinCount = true;
|
Chris@302
|
346 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
347 d.binNames.clear();
|
Chris@302
|
348 if (m_cq) {
|
Chris@302
|
349 char name[50];
|
Chris@302
|
350 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
351 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
352 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
353 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
354 // frequency though, so these are still the first 545 bins
|
Chris@302
|
355 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
356 float freq = m_cq->getBinFrequency
|
Chris@302
|
357 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
358 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
359 d.binNames.push_back(name);
|
Chris@302
|
360 }
|
Chris@302
|
361 }
|
Chris@302
|
362 d.hasKnownExtents = false;
|
Chris@302
|
363 d.isQuantized = false;
|
Chris@302
|
364 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
365 d.sampleRate = m_colsPerSec;
|
Chris@302
|
366 d.hasDuration = false;
|
Chris@302
|
367 m_templateOutputNo = list.size();
|
Chris@302
|
368 list.push_back(d);
|
Chris@302
|
369
|
Chris@31
|
370 return list;
|
Chris@31
|
371 }
|
Chris@31
|
372
|
Chris@38
|
373 std::string
|
Chris@309
|
374 Silvet::chromaName(int pitch) const
|
Chris@38
|
375 {
|
Chris@38
|
376 static const char *names[] = {
|
Chris@38
|
377 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
378 };
|
Chris@38
|
379
|
Chris@309
|
380 return names[pitch];
|
Chris@309
|
381 }
|
Chris@309
|
382
|
Chris@309
|
383 std::string
|
Chris@309
|
384 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@309
|
385 {
|
Chris@309
|
386 string n = chromaName(note % 12);
|
Chris@38
|
387
|
Chris@175
|
388 int oct = (note + 9) / 12;
|
Chris@38
|
389
|
Chris@175
|
390 char buf[30];
|
Chris@175
|
391
|
Chris@175
|
392 float pshift = 0.f;
|
Chris@175
|
393 if (shiftCount > 1) {
|
Chris@175
|
394 // see noteFrequency below
|
Chris@175
|
395 pshift =
|
Chris@175
|
396 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
397 }
|
Chris@175
|
398
|
Chris@175
|
399 if (pshift > 0.f) {
|
Chris@309
|
400 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
401 } else if (pshift < 0.f) {
|
Chris@309
|
402 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
403 } else {
|
Chris@309
|
404 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
405 }
|
Chris@38
|
406
|
Chris@38
|
407 return buf;
|
Chris@38
|
408 }
|
Chris@38
|
409
|
Chris@41
|
410 float
|
Chris@168
|
411 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
412 {
|
Chris@169
|
413 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
414 // is an offset into the template array, which starts with some
|
Chris@169
|
415 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
416 //
|
Chris@169
|
417 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
418 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
419 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
420 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
421 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
422 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
423 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
424 // down in pitch, for a negative pitch shift.
|
Chris@169
|
425
|
Chris@175
|
426 float pshift = 0.f;
|
Chris@175
|
427 if (shiftCount > 1) {
|
Chris@175
|
428 pshift =
|
Chris@175
|
429 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
430 }
|
Chris@169
|
431
|
Chris@301
|
432 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
433
|
Chris@303
|
434 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
435 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
436
|
Chris@301
|
437 return freq;
|
Chris@41
|
438 }
|
Chris@41
|
439
|
Chris@31
|
440 bool
|
Chris@31
|
441 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
442 {
|
Chris@272
|
443 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
444 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
445 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
446 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
447 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
448 return false;
|
Chris@272
|
449 }
|
Chris@272
|
450
|
Chris@31
|
451 if (channels < getMinChannelCount() ||
|
Chris@272
|
452 channels > getMaxChannelCount()) {
|
Chris@272
|
453 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
454 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
455 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
456 return false;
|
Chris@272
|
457 }
|
Chris@31
|
458
|
Chris@31
|
459 if (stepSize != blockSize) {
|
Chris@31
|
460 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
461 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
462 return false;
|
Chris@31
|
463 }
|
Chris@31
|
464
|
Chris@31
|
465 m_blockSize = blockSize;
|
Chris@31
|
466
|
Chris@31
|
467 reset();
|
Chris@31
|
468
|
Chris@31
|
469 return true;
|
Chris@31
|
470 }
|
Chris@31
|
471
|
Chris@31
|
472 void
|
Chris@31
|
473 Silvet::reset()
|
Chris@31
|
474 {
|
Chris@31
|
475 delete m_resampler;
|
Chris@246
|
476 delete m_flattener;
|
Chris@31
|
477 delete m_cq;
|
Chris@31
|
478
|
Chris@31
|
479 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
480 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
481 } else {
|
Chris@31
|
482 m_resampler = 0;
|
Chris@31
|
483 }
|
Chris@31
|
484
|
Chris@246
|
485 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
486 m_flattener->reset();
|
Chris@246
|
487
|
Chris@301
|
488 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
489 // freq used for the EM templates:
|
Chris@301
|
490 double maxFreq = 14700;
|
Chris@301
|
491
|
Chris@301
|
492 if (m_mode == LiveMode) {
|
Chris@301
|
493 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
494 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
495 // lower than 14700
|
Chris@301
|
496 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
497 }
|
Chris@301
|
498
|
Chris@173
|
499 double minFreq = 27.5;
|
Chris@173
|
500
|
Chris@297
|
501 if (m_mode != HighQualityMode) {
|
Chris@173
|
502 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
503 // so we can just pad with zeros
|
Chris@173
|
504 minFreq *= 2;
|
Chris@173
|
505 }
|
Chris@173
|
506
|
Chris@298
|
507 int bpo = 12 *
|
Chris@298
|
508 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
509
|
Chris@154
|
510 CQParameters params(processingSampleRate,
|
Chris@173
|
511 minFreq,
|
Chris@303
|
512 maxFreq,
|
Chris@298
|
513 bpo);
|
Chris@154
|
514
|
Chris@316
|
515 // For params.q, the MIREX code uses 0.8, but it seems that with
|
Chris@316
|
516 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT
|
Chris@316
|
517 // size to 512 from 1024 and alters some other processing
|
Chris@316
|
518 // parameters, making everything much, much slower. Could be a
|
Chris@316
|
519 // flaw in the CQ parameter calculations, must check. For
|
Chris@316
|
520 // atomHopFactor == 1, q == 0.8 is fine
|
Chris@316
|
521 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8);
|
Chris@316
|
522 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0);
|
Chris@154
|
523 params.threshold = 0.0005;
|
Chris@317
|
524 params.decimator =
|
Chris@317
|
525 (m_mode == LiveMode ?
|
Chris@317
|
526 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
|
Chris@172
|
527 params.window = CQParameters::Hann;
|
Chris@154
|
528
|
Chris@154
|
529 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
530
|
Chris@303
|
531 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
532 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
533
|
Chris@297
|
534 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
535
|
Chris@41
|
536 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
537 delete m_postFilter[i];
|
Chris@41
|
538 }
|
Chris@41
|
539 m_postFilter.clear();
|
Chris@303
|
540 int postFilterLength = 3;
|
Chris@298
|
541 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
542 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
543 }
|
Chris@41
|
544 m_pianoRoll.clear();
|
Chris@246
|
545 m_inputGains.clear();
|
Chris@32
|
546 m_columnCount = 0;
|
Chris@272
|
547 m_resampledCount = 0;
|
Chris@40
|
548 m_startTime = RealTime::zeroTime;
|
Chris@313
|
549 m_haveStartTime = false;
|
Chris@31
|
550 }
|
Chris@31
|
551
|
Chris@31
|
552 Silvet::FeatureSet
|
Chris@31
|
553 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
554 {
|
Chris@302
|
555 FeatureSet fs;
|
Chris@302
|
556
|
Chris@313
|
557 if (!m_haveStartTime) {
|
Chris@314
|
558
|
Chris@40
|
559 m_startTime = timestamp;
|
Chris@313
|
560 m_haveStartTime = true;
|
Chris@314
|
561
|
Chris@302
|
562 insertTemplateFeatures(fs);
|
Chris@40
|
563 }
|
Chris@246
|
564
|
Chris@246
|
565 vector<float> flattened(m_blockSize);
|
Chris@246
|
566 float gain = 1.f;
|
Chris@246
|
567 m_flattener->connectInputPort
|
Chris@246
|
568 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
569 m_flattener->connectOutputPort
|
Chris@246
|
570 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
571 m_flattener->connectOutputPort
|
Chris@246
|
572 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
573 m_flattener->process(m_blockSize);
|
Chris@246
|
574
|
Chris@252
|
575 m_inputGains[timestamp] = gain;
|
Chris@40
|
576
|
Chris@31
|
577 vector<double> data;
|
Chris@40
|
578 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
579 double d = flattened[i];
|
Chris@235
|
580 data.push_back(d);
|
Chris@40
|
581 }
|
Chris@31
|
582
|
Chris@31
|
583 if (m_resampler) {
|
Chris@272
|
584
|
Chris@31
|
585 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
586
|
Chris@272
|
587 int hadCount = m_resampledCount;
|
Chris@272
|
588 m_resampledCount += data.size();
|
Chris@272
|
589
|
Chris@272
|
590 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
591
|
Chris@272
|
592 if (hadCount < resamplerLatency) {
|
Chris@272
|
593 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
594 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
595 return fs;
|
Chris@272
|
596 } else {
|
Chris@272
|
597 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
598 }
|
Chris@272
|
599 }
|
Chris@31
|
600 }
|
Chris@272
|
601
|
Chris@32
|
602 Grid cqout = m_cq->process(data);
|
Chris@302
|
603 transcribe(cqout, fs);
|
Chris@51
|
604 return fs;
|
Chris@34
|
605 }
|
Chris@34
|
606
|
Chris@34
|
607 Silvet::FeatureSet
|
Chris@34
|
608 Silvet::getRemainingFeatures()
|
Chris@34
|
609 {
|
Chris@145
|
610 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
611 FeatureSet fs;
|
Chris@302
|
612 if (m_columnCount == 0) {
|
Chris@302
|
613 // process() was never called, but we still want these
|
Chris@302
|
614 insertTemplateFeatures(fs);
|
Chris@302
|
615 } else {
|
Chris@302
|
616 transcribe(cqout, fs);
|
Chris@302
|
617 }
|
Chris@51
|
618 return fs;
|
Chris@34
|
619 }
|
Chris@34
|
620
|
Chris@302
|
621 void
|
Chris@302
|
622 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
623 {
|
Chris@302
|
624 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
625 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
626 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
627 Feature f;
|
Chris@302
|
628 char buffer[50];
|
Chris@302
|
629 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
630 f.label = buffer;
|
Chris@302
|
631 f.hasTimestamp = true;
|
Chris@302
|
632 f.timestamp = timestamp;
|
Chris@302
|
633 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
634 .data[i % pack.templateNoteCount];
|
Chris@302
|
635 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
636 }
|
Chris@302
|
637 }
|
Chris@302
|
638
|
Chris@302
|
639 void
|
Chris@302
|
640 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
641 {
|
Chris@32
|
642 Grid filtered = preProcess(cqout);
|
Chris@31
|
643
|
Chris@302
|
644 if (filtered.empty()) return;
|
Chris@170
|
645
|
Chris@298
|
646 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
647
|
Chris@178
|
648 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
649 Feature f;
|
Chris@178
|
650 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
651 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
652 }
|
Chris@178
|
653 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
654 }
|
Chris@178
|
655
|
Chris@34
|
656 int width = filtered.size();
|
Chris@34
|
657
|
Chris@311
|
658 Grid localPitches(width);
|
Chris@170
|
659
|
Chris@297
|
660 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
661 int shiftCount = 1;
|
Chris@170
|
662 if (wantShifts) {
|
Chris@170
|
663 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
664 }
|
Chris@170
|
665
|
Chris@170
|
666 vector<vector<int> > localBestShifts;
|
Chris@170
|
667 if (wantShifts) {
|
Chris@311
|
668 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
669 }
|
Chris@170
|
670
|
Chris@312
|
671 #ifndef MAX_EM_THREADS
|
Chris@312
|
672 #define MAX_EM_THREADS 8
|
Chris@312
|
673 #endif
|
Chris@312
|
674
|
Chris@317
|
675 int emThreadCount = MAX_EM_THREADS;
|
Chris@317
|
676 if (m_mode == LiveMode && pack.templates.size() == 1) {
|
Chris@317
|
677 // The EM step is probably not slow enough to merit it
|
Chris@317
|
678 emThreadCount = 1;
|
Chris@317
|
679 }
|
Chris@317
|
680
|
Chris@312
|
681 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@317
|
682 if (emThreadCount > 1) {
|
Chris@317
|
683 for (int i = 0; i < width; ) {
|
Chris@317
|
684 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@317
|
685 vector<EMFuture> results;
|
Chris@317
|
686 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
687 results.push_back
|
Chris@317
|
688 (async(std::launch::async,
|
Chris@317
|
689 [&](int index) {
|
Chris@317
|
690 return applyEM(pack, filtered.at(index), wantShifts);
|
Chris@317
|
691 }, i + j));
|
Chris@317
|
692 }
|
Chris@317
|
693 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
694 auto out = results[j].get();
|
Chris@317
|
695 localPitches[i+j] = out.first;
|
Chris@317
|
696 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@317
|
697 }
|
Chris@317
|
698 i += emThreadCount;
|
Chris@312
|
699 }
|
Chris@123
|
700 }
|
Chris@312
|
701 #endif
|
Chris@317
|
702
|
Chris@317
|
703 if (emThreadCount == 1) {
|
Chris@317
|
704 for (int i = 0; i < width; ++i) {
|
Chris@317
|
705 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@317
|
706 localPitches[i] = out.first;
|
Chris@317
|
707 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@317
|
708 }
|
Chris@317
|
709 }
|
Chris@305
|
710
|
Chris@166
|
711 for (int i = 0; i < width; ++i) {
|
Chris@37
|
712
|
Chris@309
|
713 // This returns a filtered column, and pushes the
|
Chris@309
|
714 // up-to-max-polyphony activation column to m_pianoRoll
|
Chris@294
|
715 vector<double> filtered = postProcess
|
Chris@294
|
716 (localPitches[i], localBestShifts[i], wantShifts);
|
Chris@294
|
717
|
Chris@309
|
718 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
719 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
720
|
Chris@294
|
721 Feature f;
|
Chris@294
|
722 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
723 float v = filtered[j];
|
Chris@294
|
724 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
725 f.values.push_back(v / inputGain);
|
Chris@294
|
726 }
|
Chris@294
|
727 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
728
|
Chris@309
|
729 f.values.clear();
|
Chris@309
|
730 f.values.resize(12);
|
Chris@309
|
731 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
732 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
733 }
|
Chris@309
|
734 fs[m_chromaOutputNo].push_back(f);
|
Chris@38
|
735
|
Chris@319
|
736 auto events = noteTrack(shiftCount);
|
Chris@319
|
737
|
Chris@319
|
738 FeatureList noteFeatures = events.first;
|
Chris@123
|
739 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
740 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
741 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
742 }
|
Chris@319
|
743
|
Chris@319
|
744 FeatureList onsetFeatures = events.second;
|
Chris@319
|
745 for (FeatureList::const_iterator fi = onsetFeatures.begin();
|
Chris@319
|
746 fi != onsetFeatures.end(); ++fi) {
|
Chris@319
|
747 fs[m_onsetsOutputNo].push_back(*fi);
|
Chris@319
|
748 }
|
Chris@34
|
749 }
|
Chris@31
|
750 }
|
Chris@31
|
751
|
Chris@311
|
752 pair<vector<double>, vector<int> >
|
Chris@311
|
753 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
754 const vector<double> &column,
|
Chris@311
|
755 bool wantShifts)
|
Chris@311
|
756 {
|
Chris@311
|
757 double columnThreshold = 1e-5;
|
Chris@311
|
758
|
Chris@314
|
759 if (m_mode == LiveMode) {
|
Chris@314
|
760 columnThreshold /= 20;
|
Chris@314
|
761 }
|
Chris@314
|
762
|
Chris@311
|
763 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
764 vector<int> bestShifts;
|
Chris@311
|
765
|
Chris@311
|
766 double sum = 0.0;
|
Chris@311
|
767 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
768 sum += column.at(j);
|
Chris@311
|
769 }
|
Chris@311
|
770 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
771
|
Chris@314
|
772 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
773
|
Chris@311
|
774 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
775 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
776
|
Chris@314
|
777 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
778
|
Chris@311
|
779 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
780 em.iterate(column.data());
|
Chris@311
|
781 }
|
Chris@311
|
782
|
Chris@311
|
783 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
784 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
785
|
Chris@311
|
786 int shiftCount = 1;
|
Chris@311
|
787 if (wantShifts) {
|
Chris@311
|
788 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
789 }
|
Chris@311
|
790
|
Chris@311
|
791 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
792
|
Chris@311
|
793 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
794
|
Chris@311
|
795 int bestShift = 0;
|
Chris@311
|
796 float bestShiftValue = 0.0;
|
Chris@311
|
797 if (wantShifts) {
|
Chris@311
|
798 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
799 float value = shiftDist[k][j];
|
Chris@311
|
800 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
801 bestShiftValue = value;
|
Chris@311
|
802 bestShift = k;
|
Chris@311
|
803 }
|
Chris@311
|
804 }
|
Chris@311
|
805 bestShifts.push_back(bestShift);
|
Chris@311
|
806 }
|
Chris@311
|
807 }
|
Chris@311
|
808
|
Chris@311
|
809 return { pitches, bestShifts };
|
Chris@311
|
810 }
|
Chris@311
|
811
|
Chris@32
|
812 Silvet::Grid
|
Chris@32
|
813 Silvet::preProcess(const Grid &in)
|
Chris@32
|
814 {
|
Chris@32
|
815 int width = in.size();
|
Chris@32
|
816
|
Chris@165
|
817 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
818
|
Chris@165
|
819 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
820 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
821
|
Chris@32
|
822 Grid out;
|
Chris@32
|
823
|
Chris@58
|
824 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
825 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
826 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
827 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
828 // size we reduce to in a moment
|
Chris@33
|
829 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
830
|
Chris@298
|
831 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
832
|
Chris@32
|
833 for (int i = 0; i < width; ++i) {
|
Chris@32
|
834
|
Chris@33
|
835 if (m_columnCount < latentColumns) {
|
Chris@33
|
836 ++m_columnCount;
|
Chris@33
|
837 continue;
|
Chris@33
|
838 }
|
Chris@33
|
839
|
Chris@32
|
840 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
841 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
842
|
Chris@32
|
843 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
844
|
Chris@32
|
845 if (select) {
|
Chris@32
|
846 vector<double> inCol = in[i];
|
Chris@176
|
847 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
848
|
Chris@178
|
849 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@298
|
850 // lowest 55 of them (assuming binsPerSemitone == 5).
|
Chris@178
|
851 //
|
Chris@297
|
852 // In draft and live mode the CQ is an octave shorter,
|
Chris@300
|
853 // returning 540 bins or equivalent, so we instead pad
|
Chris@300
|
854 // them with an additional 5 or equivalent zeros.
|
Chris@178
|
855 //
|
Chris@178
|
856 // We also need to reverse the column as we go, since the
|
Chris@178
|
857 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
858 // the other way around.
|
Chris@32
|
859
|
Chris@298
|
860 int bps = (m_mode == LiveMode ?
|
Chris@298
|
861 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
862
|
Chris@297
|
863 if (m_mode == HighQualityMode) {
|
Chris@178
|
864 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
865 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
866 outCol[j] = inCol[ix];
|
Chris@178
|
867 }
|
Chris@178
|
868 } else {
|
Chris@298
|
869 for (int j = 0; j < bps; ++j) {
|
Chris@178
|
870 outCol[j] = 0.0;
|
Chris@178
|
871 }
|
Chris@298
|
872 for (int j = bps; j < pack.templateHeight; ++j) {
|
Chris@298
|
873 int ix = inCol.size() - j + (bps-1);
|
Chris@178
|
874 outCol[j] = inCol[ix];
|
Chris@178
|
875 }
|
Chris@46
|
876 }
|
Chris@32
|
877
|
Chris@46
|
878 vector<double> noiseLevel1 =
|
Chris@298
|
879 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
880 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
881 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
882 }
|
Chris@32
|
883
|
Chris@46
|
884 vector<double> noiseLevel2 =
|
Chris@298
|
885 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
886 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
887 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
888 }
|
Chris@32
|
889
|
Chris@165
|
890 out.push_back(outCol);
|
Chris@32
|
891 }
|
Chris@32
|
892
|
Chris@32
|
893 ++m_columnCount;
|
Chris@32
|
894 }
|
Chris@32
|
895
|
Chris@32
|
896 return out;
|
Chris@32
|
897 }
|
Chris@32
|
898
|
Chris@294
|
899 vector<double>
|
Chris@170
|
900 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
901 const vector<int> &bestShifts,
|
Chris@170
|
902 bool wantShifts)
|
Chris@166
|
903 {
|
Chris@298
|
904 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
905
|
Chris@41
|
906 vector<double> filtered;
|
Chris@41
|
907
|
Chris@176
|
908 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
909 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
910 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
911 }
|
Chris@41
|
912
|
Chris@316
|
913 if (m_mode == LiveMode) {
|
Chris@316
|
914 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@316
|
915 // get clusters of two or three high scores at a time for
|
Chris@316
|
916 // neighbouring semitones. Eliminate these by picking only the
|
Chris@316
|
917 // peaks. This means we can't recognise actual semitone chords
|
Chris@316
|
918 // if they ever appear, but it's not as if live mode is good
|
Chris@316
|
919 // enough for that to be a big deal anyway.
|
Chris@316
|
920 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@316
|
921 if (j > 0 && j + 1 < pack.templateNoteCount &&
|
Chris@316
|
922 filtered[j] >= filtered[j-1] &&
|
Chris@316
|
923 filtered[j] >= filtered[j+1]) {
|
Chris@316
|
924 } else {
|
Chris@316
|
925 filtered[j] = 0.0;
|
Chris@316
|
926 }
|
Chris@316
|
927 }
|
Chris@316
|
928 }
|
Chris@316
|
929
|
Chris@41
|
930 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
931
|
Chris@41
|
932 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
933
|
Chris@41
|
934 ValueIndexMap strengths;
|
Chris@166
|
935
|
Chris@176
|
936 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
937 double strength = filtered[j];
|
Chris@183
|
938 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
939 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
940 }
|
Chris@166
|
941
|
Chris@168
|
942 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
943
|
Chris@168
|
944 map<int, double> active;
|
Chris@168
|
945 map<int, int> activeShifts;
|
Chris@168
|
946
|
Chris@183
|
947 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
948
|
Chris@168
|
949 --si;
|
Chris@168
|
950
|
Chris@168
|
951 double strength = si->first;
|
Chris@168
|
952 int j = si->second;
|
Chris@168
|
953
|
Chris@168
|
954 active[j] = strength;
|
Chris@168
|
955
|
Chris@170
|
956 if (wantShifts) {
|
Chris@170
|
957 activeShifts[j] = bestShifts[j];
|
Chris@167
|
958 }
|
Chris@41
|
959 }
|
Chris@41
|
960
|
Chris@168
|
961 m_pianoRoll.push_back(active);
|
Chris@170
|
962
|
Chris@170
|
963 if (wantShifts) {
|
Chris@168
|
964 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
965 }
|
Chris@294
|
966
|
Chris@294
|
967 return filtered;
|
Chris@166
|
968 }
|
Chris@166
|
969
|
Chris@319
|
970 pair<Vamp::Plugin::FeatureList, Vamp::Plugin::FeatureList>
|
Chris@168
|
971 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
972 {
|
Chris@41
|
973 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
974 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
975 // latest active set but present in the prior set in the piano
|
Chris@41
|
976 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
977 // already, and if they haven't ended, we don't know their
|
Chris@41
|
978 // duration.
|
Chris@41
|
979
|
Chris@168
|
980 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
981
|
Chris@168
|
982 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
983
|
Chris@165
|
984 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
985
|
Chris@165
|
986 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
987 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
988 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
989
|
Chris@319
|
990 FeatureList noteFeatures, onsetFeatures;
|
Chris@41
|
991
|
Chris@41
|
992 if (width < durationThreshold + 1) {
|
Chris@319
|
993 return { noteFeatures, onsetFeatures };
|
Chris@41
|
994 }
|
Chris@41
|
995
|
Chris@150
|
996 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
997
|
Chris@55
|
998 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
999 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
1000
|
Chris@55
|
1001 int note = ni->first;
|
Chris@41
|
1002
|
Chris@41
|
1003 int end = width;
|
Chris@41
|
1004 int start = end-1;
|
Chris@41
|
1005
|
Chris@41
|
1006 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
1007 --start;
|
Chris@41
|
1008 }
|
Chris@41
|
1009 ++start;
|
Chris@41
|
1010
|
Chris@319
|
1011 int duration = end - start;
|
Chris@319
|
1012
|
Chris@319
|
1013 if (duration < durationThreshold) {
|
Chris@41
|
1014 continue;
|
Chris@41
|
1015 }
|
Chris@41
|
1016
|
Chris@319
|
1017 if (duration == durationThreshold) {
|
Chris@319
|
1018 emitOnset(start, note, shiftCount, onsetFeatures);
|
Chris@319
|
1019 }
|
Chris@319
|
1020
|
Chris@319
|
1021 if (active.find(note) == active.end()) {
|
Chris@319
|
1022 // the note was playing but just ended
|
Chris@319
|
1023 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@319
|
1024 }
|
Chris@41
|
1025 }
|
Chris@41
|
1026
|
Chris@62
|
1027 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
1028
|
Chris@319
|
1029 return { noteFeatures, onsetFeatures };
|
Chris@41
|
1030 }
|
Chris@41
|
1031
|
Chris@169
|
1032 void
|
Chris@169
|
1033 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
1034 FeatureList ¬eFeatures)
|
Chris@169
|
1035 {
|
Chris@169
|
1036 int partStart = start;
|
Chris@169
|
1037 int partShift = 0;
|
Chris@169
|
1038 int partVelocity = 0;
|
Chris@169
|
1039
|
Chris@252
|
1040 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1041
|
Chris@169
|
1042 for (int i = start; i != end; ++i) {
|
Chris@169
|
1043
|
Chris@169
|
1044 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1045
|
Chris@169
|
1046 int shift = 0;
|
Chris@169
|
1047
|
Chris@169
|
1048 if (shiftCount > 1) {
|
Chris@169
|
1049
|
Chris@169
|
1050 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1051
|
Chris@169
|
1052 if (i == partStart) {
|
Chris@169
|
1053 partShift = shift;
|
Chris@169
|
1054 }
|
Chris@169
|
1055
|
Chris@169
|
1056 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1057
|
Chris@169
|
1058 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1059
|
Chris@169
|
1060 // pitch has changed, emit an intermediate note
|
Chris@252
|
1061 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1062 i,
|
Chris@252
|
1063 note,
|
Chris@252
|
1064 partShift,
|
Chris@252
|
1065 shiftCount,
|
Chris@252
|
1066 partVelocity));
|
Chris@169
|
1067 partStart = i;
|
Chris@169
|
1068 partShift = shift;
|
Chris@169
|
1069 partVelocity = 0;
|
Chris@169
|
1070 }
|
Chris@169
|
1071 }
|
Chris@169
|
1072
|
Chris@303
|
1073 int v;
|
Chris@303
|
1074 if (m_mode == LiveMode) {
|
Chris@316
|
1075 v = round(strength * 20);
|
Chris@303
|
1076 } else {
|
Chris@303
|
1077 v = round(strength * 2);
|
Chris@303
|
1078 }
|
Chris@169
|
1079 if (v > partVelocity) {
|
Chris@169
|
1080 partVelocity = v;
|
Chris@169
|
1081 }
|
Chris@169
|
1082 }
|
Chris@169
|
1083
|
Chris@169
|
1084 if (end >= partStart + partThreshold) {
|
Chris@252
|
1085 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1086 end,
|
Chris@252
|
1087 note,
|
Chris@252
|
1088 partShift,
|
Chris@252
|
1089 shiftCount,
|
Chris@252
|
1090 partVelocity));
|
Chris@169
|
1091 }
|
Chris@169
|
1092 }
|
Chris@252
|
1093
|
Chris@319
|
1094 void
|
Chris@319
|
1095 Silvet::emitOnset(int start, int note, int shiftCount,
|
Chris@319
|
1096 FeatureList &onsetFeatures)
|
Chris@319
|
1097 {
|
Chris@319
|
1098 int len = int(m_pianoRoll.size());
|
Chris@319
|
1099 int velocity = 0;
|
Chris@319
|
1100
|
Chris@319
|
1101 int shift = 0;
|
Chris@319
|
1102 if (shiftCount > 1) {
|
Chris@319
|
1103 shift = m_pianoRollShifts[start][note];
|
Chris@319
|
1104 }
|
Chris@319
|
1105
|
Chris@319
|
1106 for (int i = start; i < len; ++i) {
|
Chris@319
|
1107
|
Chris@319
|
1108 double strength = m_pianoRoll[i][note];
|
Chris@319
|
1109
|
Chris@319
|
1110 int v;
|
Chris@319
|
1111 if (m_mode == LiveMode) {
|
Chris@319
|
1112 v = round(strength * 20);
|
Chris@319
|
1113 } else {
|
Chris@319
|
1114 v = round(strength * 2);
|
Chris@319
|
1115 }
|
Chris@319
|
1116 if (v > velocity) {
|
Chris@319
|
1117 velocity = v;
|
Chris@319
|
1118 }
|
Chris@319
|
1119 }
|
Chris@319
|
1120
|
Chris@319
|
1121 onsetFeatures.push_back(makeOnsetFeature(start,
|
Chris@319
|
1122 note,
|
Chris@319
|
1123 shift,
|
Chris@319
|
1124 shiftCount,
|
Chris@319
|
1125 velocity));
|
Chris@319
|
1126 }
|
Chris@319
|
1127
|
Chris@309
|
1128 RealTime
|
Chris@309
|
1129 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1130 {
|
Chris@309
|
1131 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1132 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1133
|
Chris@309
|
1134 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1135 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1136 }
|
Chris@309
|
1137
|
Chris@252
|
1138 Silvet::Feature
|
Chris@252
|
1139 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1140 int end,
|
Chris@252
|
1141 int note,
|
Chris@252
|
1142 int shift,
|
Chris@252
|
1143 int shiftCount,
|
Chris@252
|
1144 int velocity)
|
Chris@252
|
1145 {
|
Chris@252
|
1146 Feature f;
|
Chris@252
|
1147
|
Chris@252
|
1148 f.hasTimestamp = true;
|
Chris@309
|
1149 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1150
|
Chris@252
|
1151 f.hasDuration = true;
|
Chris@309
|
1152 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1153
|
Chris@252
|
1154 f.values.clear();
|
Chris@252
|
1155
|
Chris@252
|
1156 f.values.push_back
|
Chris@252
|
1157 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
1158
|
Chris@252
|
1159 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
1160 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
1161 velocity = round(velocity / inputGain);
|
Chris@252
|
1162 if (velocity > 127) velocity = 127;
|
Chris@252
|
1163 if (velocity < 1) velocity = 1;
|
Chris@252
|
1164 f.values.push_back(velocity);
|
Chris@252
|
1165
|
Chris@252
|
1166 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
1167
|
Chris@252
|
1168 return f;
|
Chris@252
|
1169 }
|
Chris@252
|
1170
|
Chris@319
|
1171 Silvet::Feature
|
Chris@319
|
1172 Silvet::makeOnsetFeature(int start,
|
Chris@319
|
1173 int note,
|
Chris@319
|
1174 int shift,
|
Chris@319
|
1175 int shiftCount,
|
Chris@319
|
1176 int velocity)
|
Chris@319
|
1177 {
|
Chris@319
|
1178 Feature f;
|
Chris@319
|
1179
|
Chris@319
|
1180 f.hasTimestamp = true;
|
Chris@319
|
1181 f.timestamp = getColumnTimestamp(start);
|
Chris@319
|
1182
|
Chris@319
|
1183 f.hasDuration = false;
|
Chris@319
|
1184
|
Chris@319
|
1185 f.values.clear();
|
Chris@319
|
1186
|
Chris@319
|
1187 f.values.push_back
|
Chris@319
|
1188 (noteFrequency(note, shift, shiftCount));
|
Chris@319
|
1189
|
Chris@319
|
1190 float inputGain = getInputGainAt(f.timestamp);
|
Chris@319
|
1191 velocity = round(velocity / inputGain);
|
Chris@319
|
1192 if (velocity > 127) velocity = 127;
|
Chris@319
|
1193 if (velocity < 1) velocity = 1;
|
Chris@319
|
1194 f.values.push_back(velocity);
|
Chris@319
|
1195
|
Chris@319
|
1196 f.label = noteName(note, shift, shiftCount);
|
Chris@319
|
1197
|
Chris@319
|
1198 return f;
|
Chris@319
|
1199 }
|
Chris@319
|
1200
|
Chris@252
|
1201 float
|
Chris@252
|
1202 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1203 {
|
Chris@252
|
1204 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1205
|
Chris@252
|
1206 if (i == m_inputGains.end()) {
|
Chris@252
|
1207 if (i != m_inputGains.begin()) {
|
Chris@252
|
1208 --i;
|
Chris@252
|
1209 } else {
|
Chris@252
|
1210 return 1.f; // no data
|
Chris@252
|
1211 }
|
Chris@252
|
1212 }
|
Chris@252
|
1213
|
Chris@252
|
1214 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1215
|
Chris@252
|
1216 return i->second;
|
Chris@252
|
1217 }
|
Chris@252
|
1218
|