Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
40 static int processingSampleRate = 44100;
|
Chris@298
|
41
|
Chris@298
|
42 static int binsPerSemitoneLive = 1;
|
Chris@298
|
43 static int binsPerSemitoneNormal = 5;
|
Chris@170
|
44
|
Chris@272
|
45 static int minInputSampleRate = 100;
|
Chris@272
|
46 static int maxInputSampleRate = 192000;
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@271
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@178
|
256 d.identifier = "timefreq";
|
Chris@178
|
257 d.name = "Time-frequency distribution";
|
Chris@271
|
258 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
259 d.unit = "";
|
Chris@178
|
260 d.hasFixedBinCount = true;
|
Chris@298
|
261 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
262 d.binNames.clear();
|
Chris@178
|
263 if (m_cq) {
|
Chris@294
|
264 char name[50];
|
Chris@298
|
265 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
266 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
267 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
268 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
269 // frequency though, so these are still the first 545 bins
|
Chris@178
|
270 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
271 float freq = m_cq->getBinFrequency
|
Chris@298
|
272 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
273 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
274 d.binNames.push_back(name);
|
Chris@178
|
275 }
|
Chris@178
|
276 }
|
Chris@178
|
277 d.hasKnownExtents = false;
|
Chris@178
|
278 d.isQuantized = false;
|
Chris@178
|
279 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
280 d.sampleRate = m_colsPerSec;
|
Chris@178
|
281 d.hasDuration = false;
|
Chris@178
|
282 m_fcqOutputNo = list.size();
|
Chris@178
|
283 list.push_back(d);
|
Chris@178
|
284
|
Chris@294
|
285 d.identifier = "pitchactivation";
|
Chris@294
|
286 d.name = "Pitch activation distribution";
|
Chris@294
|
287 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
288 d.unit = "";
|
Chris@294
|
289 d.hasFixedBinCount = true;
|
Chris@298
|
290 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
291 d.binNames.clear();
|
Chris@294
|
292 if (m_cq) {
|
Chris@298
|
293 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@294
|
294 d.binNames.push_back(noteName(i, 0, 1));
|
Chris@294
|
295 }
|
Chris@294
|
296 }
|
Chris@294
|
297 d.hasKnownExtents = false;
|
Chris@294
|
298 d.isQuantized = false;
|
Chris@294
|
299 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
300 d.sampleRate = m_colsPerSec;
|
Chris@294
|
301 d.hasDuration = false;
|
Chris@294
|
302 m_pitchOutputNo = list.size();
|
Chris@294
|
303 list.push_back(d);
|
Chris@294
|
304
|
Chris@309
|
305 d.identifier = "chroma";
|
Chris@309
|
306 d.name = "Pitch chroma distribution";
|
Chris@309
|
307 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
308 d.unit = "";
|
Chris@309
|
309 d.hasFixedBinCount = true;
|
Chris@309
|
310 d.binCount = 12;
|
Chris@309
|
311 d.binNames.clear();
|
Chris@309
|
312 if (m_cq) {
|
Chris@309
|
313 for (int i = 0; i < 12; ++i) {
|
Chris@309
|
314 d.binNames.push_back(chromaName(i));
|
Chris@309
|
315 }
|
Chris@309
|
316 }
|
Chris@309
|
317 d.hasKnownExtents = false;
|
Chris@309
|
318 d.isQuantized = false;
|
Chris@309
|
319 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
320 d.sampleRate = m_colsPerSec;
|
Chris@309
|
321 d.hasDuration = false;
|
Chris@309
|
322 m_chromaOutputNo = list.size();
|
Chris@309
|
323 list.push_back(d);
|
Chris@309
|
324
|
Chris@302
|
325 d.identifier = "templates";
|
Chris@302
|
326 d.name = "Templates";
|
Chris@302
|
327 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
328 d.unit = "";
|
Chris@302
|
329 d.hasFixedBinCount = true;
|
Chris@302
|
330 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
331 d.binNames.clear();
|
Chris@302
|
332 if (m_cq) {
|
Chris@302
|
333 char name[50];
|
Chris@302
|
334 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
335 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
336 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
337 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
338 // frequency though, so these are still the first 545 bins
|
Chris@302
|
339 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
340 float freq = m_cq->getBinFrequency
|
Chris@302
|
341 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
342 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
343 d.binNames.push_back(name);
|
Chris@302
|
344 }
|
Chris@302
|
345 }
|
Chris@302
|
346 d.hasKnownExtents = false;
|
Chris@302
|
347 d.isQuantized = false;
|
Chris@302
|
348 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
349 d.sampleRate = m_colsPerSec;
|
Chris@302
|
350 d.hasDuration = false;
|
Chris@302
|
351 m_templateOutputNo = list.size();
|
Chris@302
|
352 list.push_back(d);
|
Chris@302
|
353
|
Chris@31
|
354 return list;
|
Chris@31
|
355 }
|
Chris@31
|
356
|
Chris@38
|
357 std::string
|
Chris@309
|
358 Silvet::chromaName(int pitch) const
|
Chris@38
|
359 {
|
Chris@38
|
360 static const char *names[] = {
|
Chris@38
|
361 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
362 };
|
Chris@38
|
363
|
Chris@309
|
364 return names[pitch];
|
Chris@309
|
365 }
|
Chris@309
|
366
|
Chris@309
|
367 std::string
|
Chris@309
|
368 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@309
|
369 {
|
Chris@309
|
370 string n = chromaName(note % 12);
|
Chris@38
|
371
|
Chris@175
|
372 int oct = (note + 9) / 12;
|
Chris@38
|
373
|
Chris@175
|
374 char buf[30];
|
Chris@175
|
375
|
Chris@175
|
376 float pshift = 0.f;
|
Chris@175
|
377 if (shiftCount > 1) {
|
Chris@175
|
378 // see noteFrequency below
|
Chris@175
|
379 pshift =
|
Chris@175
|
380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
381 }
|
Chris@175
|
382
|
Chris@175
|
383 if (pshift > 0.f) {
|
Chris@309
|
384 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
385 } else if (pshift < 0.f) {
|
Chris@309
|
386 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
387 } else {
|
Chris@309
|
388 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
389 }
|
Chris@38
|
390
|
Chris@38
|
391 return buf;
|
Chris@38
|
392 }
|
Chris@38
|
393
|
Chris@41
|
394 float
|
Chris@168
|
395 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
396 {
|
Chris@169
|
397 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
398 // is an offset into the template array, which starts with some
|
Chris@169
|
399 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
400 //
|
Chris@169
|
401 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
402 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
403 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
404 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
405 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
406 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
407 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
408 // down in pitch, for a negative pitch shift.
|
Chris@169
|
409
|
Chris@175
|
410 float pshift = 0.f;
|
Chris@175
|
411 if (shiftCount > 1) {
|
Chris@175
|
412 pshift =
|
Chris@175
|
413 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
414 }
|
Chris@169
|
415
|
Chris@301
|
416 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
417
|
Chris@303
|
418 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
419 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
420
|
Chris@301
|
421 return freq;
|
Chris@41
|
422 }
|
Chris@41
|
423
|
Chris@31
|
424 bool
|
Chris@31
|
425 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
426 {
|
Chris@272
|
427 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
428 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
429 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
430 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
431 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
432 return false;
|
Chris@272
|
433 }
|
Chris@272
|
434
|
Chris@31
|
435 if (channels < getMinChannelCount() ||
|
Chris@272
|
436 channels > getMaxChannelCount()) {
|
Chris@272
|
437 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
438 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
439 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
440 return false;
|
Chris@272
|
441 }
|
Chris@31
|
442
|
Chris@31
|
443 if (stepSize != blockSize) {
|
Chris@31
|
444 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
445 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
446 return false;
|
Chris@31
|
447 }
|
Chris@31
|
448
|
Chris@31
|
449 m_blockSize = blockSize;
|
Chris@31
|
450
|
Chris@31
|
451 reset();
|
Chris@31
|
452
|
Chris@31
|
453 return true;
|
Chris@31
|
454 }
|
Chris@31
|
455
|
Chris@31
|
456 void
|
Chris@31
|
457 Silvet::reset()
|
Chris@31
|
458 {
|
Chris@31
|
459 delete m_resampler;
|
Chris@246
|
460 delete m_flattener;
|
Chris@31
|
461 delete m_cq;
|
Chris@31
|
462
|
Chris@31
|
463 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
464 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
465 } else {
|
Chris@31
|
466 m_resampler = 0;
|
Chris@31
|
467 }
|
Chris@31
|
468
|
Chris@246
|
469 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
470 m_flattener->reset();
|
Chris@246
|
471
|
Chris@301
|
472 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
473 // freq used for the EM templates:
|
Chris@301
|
474 double maxFreq = 14700;
|
Chris@301
|
475
|
Chris@301
|
476 if (m_mode == LiveMode) {
|
Chris@301
|
477 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
478 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
479 // lower than 14700
|
Chris@301
|
480 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
481 }
|
Chris@301
|
482
|
Chris@173
|
483 double minFreq = 27.5;
|
Chris@173
|
484
|
Chris@297
|
485 if (m_mode != HighQualityMode) {
|
Chris@173
|
486 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
487 // so we can just pad with zeros
|
Chris@173
|
488 minFreq *= 2;
|
Chris@173
|
489 }
|
Chris@173
|
490
|
Chris@298
|
491 int bpo = 12 *
|
Chris@298
|
492 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
493
|
Chris@154
|
494 CQParameters params(processingSampleRate,
|
Chris@173
|
495 minFreq,
|
Chris@303
|
496 maxFreq,
|
Chris@298
|
497 bpo);
|
Chris@154
|
498
|
Chris@316
|
499 // For params.q, the MIREX code uses 0.8, but it seems that with
|
Chris@316
|
500 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT
|
Chris@316
|
501 // size to 512 from 1024 and alters some other processing
|
Chris@316
|
502 // parameters, making everything much, much slower. Could be a
|
Chris@316
|
503 // flaw in the CQ parameter calculations, must check. For
|
Chris@316
|
504 // atomHopFactor == 1, q == 0.8 is fine
|
Chris@316
|
505 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8);
|
Chris@316
|
506 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0);
|
Chris@154
|
507 params.threshold = 0.0005;
|
Chris@317
|
508 params.decimator =
|
Chris@317
|
509 (m_mode == LiveMode ?
|
Chris@317
|
510 CQParameters::FasterDecimator : CQParameters::BetterDecimator);
|
Chris@172
|
511 params.window = CQParameters::Hann;
|
Chris@154
|
512
|
Chris@154
|
513 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
514
|
Chris@303
|
515 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
516 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
517
|
Chris@297
|
518 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
519
|
Chris@41
|
520 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
521 delete m_postFilter[i];
|
Chris@41
|
522 }
|
Chris@41
|
523 m_postFilter.clear();
|
Chris@303
|
524 int postFilterLength = 3;
|
Chris@298
|
525 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
526 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
527 }
|
Chris@41
|
528 m_pianoRoll.clear();
|
Chris@246
|
529 m_inputGains.clear();
|
Chris@32
|
530 m_columnCount = 0;
|
Chris@272
|
531 m_resampledCount = 0;
|
Chris@40
|
532 m_startTime = RealTime::zeroTime;
|
Chris@313
|
533 m_haveStartTime = false;
|
Chris@31
|
534 }
|
Chris@31
|
535
|
Chris@31
|
536 Silvet::FeatureSet
|
Chris@31
|
537 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
538 {
|
Chris@302
|
539 FeatureSet fs;
|
Chris@302
|
540
|
Chris@313
|
541 if (!m_haveStartTime) {
|
Chris@314
|
542
|
Chris@40
|
543 m_startTime = timestamp;
|
Chris@313
|
544 m_haveStartTime = true;
|
Chris@314
|
545
|
Chris@302
|
546 insertTemplateFeatures(fs);
|
Chris@40
|
547 }
|
Chris@246
|
548
|
Chris@246
|
549 vector<float> flattened(m_blockSize);
|
Chris@246
|
550 float gain = 1.f;
|
Chris@246
|
551 m_flattener->connectInputPort
|
Chris@246
|
552 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
553 m_flattener->connectOutputPort
|
Chris@246
|
554 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
555 m_flattener->connectOutputPort
|
Chris@246
|
556 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
557 m_flattener->process(m_blockSize);
|
Chris@246
|
558
|
Chris@252
|
559 m_inputGains[timestamp] = gain;
|
Chris@40
|
560
|
Chris@31
|
561 vector<double> data;
|
Chris@40
|
562 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
563 double d = flattened[i];
|
Chris@235
|
564 data.push_back(d);
|
Chris@40
|
565 }
|
Chris@31
|
566
|
Chris@31
|
567 if (m_resampler) {
|
Chris@272
|
568
|
Chris@31
|
569 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
570
|
Chris@272
|
571 int hadCount = m_resampledCount;
|
Chris@272
|
572 m_resampledCount += data.size();
|
Chris@272
|
573
|
Chris@272
|
574 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
575
|
Chris@272
|
576 if (hadCount < resamplerLatency) {
|
Chris@272
|
577 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
578 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
579 return fs;
|
Chris@272
|
580 } else {
|
Chris@272
|
581 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
582 }
|
Chris@272
|
583 }
|
Chris@31
|
584 }
|
Chris@272
|
585
|
Chris@32
|
586 Grid cqout = m_cq->process(data);
|
Chris@302
|
587 transcribe(cqout, fs);
|
Chris@51
|
588 return fs;
|
Chris@34
|
589 }
|
Chris@34
|
590
|
Chris@34
|
591 Silvet::FeatureSet
|
Chris@34
|
592 Silvet::getRemainingFeatures()
|
Chris@34
|
593 {
|
Chris@145
|
594 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
595 FeatureSet fs;
|
Chris@302
|
596 if (m_columnCount == 0) {
|
Chris@302
|
597 // process() was never called, but we still want these
|
Chris@302
|
598 insertTemplateFeatures(fs);
|
Chris@302
|
599 } else {
|
Chris@302
|
600 transcribe(cqout, fs);
|
Chris@302
|
601 }
|
Chris@51
|
602 return fs;
|
Chris@34
|
603 }
|
Chris@34
|
604
|
Chris@302
|
605 void
|
Chris@302
|
606 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
607 {
|
Chris@302
|
608 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
609 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
610 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
611 Feature f;
|
Chris@302
|
612 char buffer[50];
|
Chris@302
|
613 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
614 f.label = buffer;
|
Chris@302
|
615 f.hasTimestamp = true;
|
Chris@302
|
616 f.timestamp = timestamp;
|
Chris@302
|
617 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
618 .data[i % pack.templateNoteCount];
|
Chris@302
|
619 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
620 }
|
Chris@302
|
621 }
|
Chris@302
|
622
|
Chris@302
|
623 void
|
Chris@302
|
624 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
625 {
|
Chris@32
|
626 Grid filtered = preProcess(cqout);
|
Chris@31
|
627
|
Chris@302
|
628 if (filtered.empty()) return;
|
Chris@170
|
629
|
Chris@298
|
630 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
631
|
Chris@178
|
632 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
633 Feature f;
|
Chris@178
|
634 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
635 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
636 }
|
Chris@178
|
637 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
638 }
|
Chris@178
|
639
|
Chris@34
|
640 int width = filtered.size();
|
Chris@34
|
641
|
Chris@311
|
642 Grid localPitches(width);
|
Chris@170
|
643
|
Chris@297
|
644 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
645 int shiftCount = 1;
|
Chris@170
|
646 if (wantShifts) {
|
Chris@170
|
647 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
648 }
|
Chris@170
|
649
|
Chris@170
|
650 vector<vector<int> > localBestShifts;
|
Chris@170
|
651 if (wantShifts) {
|
Chris@311
|
652 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
653 }
|
Chris@170
|
654
|
Chris@312
|
655 #ifndef MAX_EM_THREADS
|
Chris@312
|
656 #define MAX_EM_THREADS 8
|
Chris@312
|
657 #endif
|
Chris@312
|
658
|
Chris@317
|
659 int emThreadCount = MAX_EM_THREADS;
|
Chris@317
|
660 if (m_mode == LiveMode && pack.templates.size() == 1) {
|
Chris@317
|
661 // The EM step is probably not slow enough to merit it
|
Chris@317
|
662 emThreadCount = 1;
|
Chris@317
|
663 }
|
Chris@317
|
664
|
Chris@312
|
665 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@317
|
666 if (emThreadCount > 1) {
|
Chris@317
|
667 for (int i = 0; i < width; ) {
|
Chris@317
|
668 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@317
|
669 vector<EMFuture> results;
|
Chris@317
|
670 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
671 results.push_back
|
Chris@317
|
672 (async(std::launch::async,
|
Chris@317
|
673 [&](int index) {
|
Chris@317
|
674 return applyEM(pack, filtered.at(index), wantShifts);
|
Chris@317
|
675 }, i + j));
|
Chris@317
|
676 }
|
Chris@317
|
677 for (int j = 0; j < emThreadCount && i + j < width; ++j) {
|
Chris@317
|
678 auto out = results[j].get();
|
Chris@317
|
679 localPitches[i+j] = out.first;
|
Chris@317
|
680 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@317
|
681 }
|
Chris@317
|
682 i += emThreadCount;
|
Chris@312
|
683 }
|
Chris@123
|
684 }
|
Chris@312
|
685 #endif
|
Chris@317
|
686
|
Chris@317
|
687 if (emThreadCount == 1) {
|
Chris@317
|
688 for (int i = 0; i < width; ++i) {
|
Chris@317
|
689 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@317
|
690 localPitches[i] = out.first;
|
Chris@317
|
691 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@317
|
692 }
|
Chris@317
|
693 }
|
Chris@305
|
694
|
Chris@166
|
695 for (int i = 0; i < width; ++i) {
|
Chris@37
|
696
|
Chris@309
|
697 // This returns a filtered column, and pushes the
|
Chris@309
|
698 // up-to-max-polyphony activation column to m_pianoRoll
|
Chris@294
|
699 vector<double> filtered = postProcess
|
Chris@294
|
700 (localPitches[i], localBestShifts[i], wantShifts);
|
Chris@294
|
701
|
Chris@309
|
702 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
703 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
704
|
Chris@294
|
705 Feature f;
|
Chris@294
|
706 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
707 float v = filtered[j];
|
Chris@294
|
708 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
709 f.values.push_back(v / inputGain);
|
Chris@294
|
710 }
|
Chris@294
|
711 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
712
|
Chris@309
|
713 f.values.clear();
|
Chris@309
|
714 f.values.resize(12);
|
Chris@309
|
715 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
716 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
717 }
|
Chris@309
|
718 fs[m_chromaOutputNo].push_back(f);
|
Chris@166
|
719
|
Chris@168
|
720 FeatureList noteFeatures = noteTrack(shiftCount);
|
Chris@38
|
721
|
Chris@123
|
722 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
723 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
724 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
725 }
|
Chris@34
|
726 }
|
Chris@31
|
727 }
|
Chris@31
|
728
|
Chris@311
|
729 pair<vector<double>, vector<int> >
|
Chris@311
|
730 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
731 const vector<double> &column,
|
Chris@311
|
732 bool wantShifts)
|
Chris@311
|
733 {
|
Chris@311
|
734 double columnThreshold = 1e-5;
|
Chris@311
|
735
|
Chris@314
|
736 if (m_mode == LiveMode) {
|
Chris@314
|
737 columnThreshold /= 20;
|
Chris@314
|
738 }
|
Chris@314
|
739
|
Chris@311
|
740 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
741 vector<int> bestShifts;
|
Chris@311
|
742
|
Chris@311
|
743 double sum = 0.0;
|
Chris@311
|
744 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
745 sum += column.at(j);
|
Chris@311
|
746 }
|
Chris@311
|
747 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
748
|
Chris@314
|
749 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
750
|
Chris@311
|
751 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
752 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
753
|
Chris@314
|
754 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
755
|
Chris@311
|
756 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
757 em.iterate(column.data());
|
Chris@311
|
758 }
|
Chris@311
|
759
|
Chris@311
|
760 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
761 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
762
|
Chris@311
|
763 int shiftCount = 1;
|
Chris@311
|
764 if (wantShifts) {
|
Chris@311
|
765 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
766 }
|
Chris@311
|
767
|
Chris@311
|
768 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
769
|
Chris@311
|
770 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
771
|
Chris@311
|
772 int bestShift = 0;
|
Chris@311
|
773 float bestShiftValue = 0.0;
|
Chris@311
|
774 if (wantShifts) {
|
Chris@311
|
775 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
776 float value = shiftDist[k][j];
|
Chris@311
|
777 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
778 bestShiftValue = value;
|
Chris@311
|
779 bestShift = k;
|
Chris@311
|
780 }
|
Chris@311
|
781 }
|
Chris@311
|
782 bestShifts.push_back(bestShift);
|
Chris@311
|
783 }
|
Chris@311
|
784 }
|
Chris@311
|
785
|
Chris@311
|
786 return { pitches, bestShifts };
|
Chris@311
|
787 }
|
Chris@311
|
788
|
Chris@32
|
789 Silvet::Grid
|
Chris@32
|
790 Silvet::preProcess(const Grid &in)
|
Chris@32
|
791 {
|
Chris@32
|
792 int width = in.size();
|
Chris@32
|
793
|
Chris@165
|
794 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
795
|
Chris@165
|
796 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
797 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
798
|
Chris@32
|
799 Grid out;
|
Chris@32
|
800
|
Chris@58
|
801 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
802 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
803 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
804 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
805 // size we reduce to in a moment
|
Chris@33
|
806 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
807
|
Chris@298
|
808 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
809
|
Chris@32
|
810 for (int i = 0; i < width; ++i) {
|
Chris@32
|
811
|
Chris@33
|
812 if (m_columnCount < latentColumns) {
|
Chris@33
|
813 ++m_columnCount;
|
Chris@33
|
814 continue;
|
Chris@33
|
815 }
|
Chris@33
|
816
|
Chris@32
|
817 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
818 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
819
|
Chris@32
|
820 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
821
|
Chris@32
|
822 if (select) {
|
Chris@32
|
823 vector<double> inCol = in[i];
|
Chris@176
|
824 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
825
|
Chris@178
|
826 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@298
|
827 // lowest 55 of them (assuming binsPerSemitone == 5).
|
Chris@178
|
828 //
|
Chris@297
|
829 // In draft and live mode the CQ is an octave shorter,
|
Chris@300
|
830 // returning 540 bins or equivalent, so we instead pad
|
Chris@300
|
831 // them with an additional 5 or equivalent zeros.
|
Chris@178
|
832 //
|
Chris@178
|
833 // We also need to reverse the column as we go, since the
|
Chris@178
|
834 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
835 // the other way around.
|
Chris@32
|
836
|
Chris@298
|
837 int bps = (m_mode == LiveMode ?
|
Chris@298
|
838 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
839
|
Chris@297
|
840 if (m_mode == HighQualityMode) {
|
Chris@178
|
841 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
842 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
843 outCol[j] = inCol[ix];
|
Chris@178
|
844 }
|
Chris@178
|
845 } else {
|
Chris@298
|
846 for (int j = 0; j < bps; ++j) {
|
Chris@178
|
847 outCol[j] = 0.0;
|
Chris@178
|
848 }
|
Chris@298
|
849 for (int j = bps; j < pack.templateHeight; ++j) {
|
Chris@298
|
850 int ix = inCol.size() - j + (bps-1);
|
Chris@178
|
851 outCol[j] = inCol[ix];
|
Chris@178
|
852 }
|
Chris@46
|
853 }
|
Chris@32
|
854
|
Chris@46
|
855 vector<double> noiseLevel1 =
|
Chris@298
|
856 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
857 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
858 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
859 }
|
Chris@32
|
860
|
Chris@46
|
861 vector<double> noiseLevel2 =
|
Chris@298
|
862 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
863 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
864 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
865 }
|
Chris@32
|
866
|
Chris@165
|
867 out.push_back(outCol);
|
Chris@32
|
868 }
|
Chris@32
|
869
|
Chris@32
|
870 ++m_columnCount;
|
Chris@32
|
871 }
|
Chris@32
|
872
|
Chris@32
|
873 return out;
|
Chris@32
|
874 }
|
Chris@32
|
875
|
Chris@294
|
876 vector<double>
|
Chris@170
|
877 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
878 const vector<int> &bestShifts,
|
Chris@170
|
879 bool wantShifts)
|
Chris@166
|
880 {
|
Chris@298
|
881 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
882
|
Chris@41
|
883 vector<double> filtered;
|
Chris@41
|
884
|
Chris@176
|
885 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
886 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
887 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
888 }
|
Chris@41
|
889
|
Chris@316
|
890 if (m_mode == LiveMode) {
|
Chris@316
|
891 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@316
|
892 // get clusters of two or three high scores at a time for
|
Chris@316
|
893 // neighbouring semitones. Eliminate these by picking only the
|
Chris@316
|
894 // peaks. This means we can't recognise actual semitone chords
|
Chris@316
|
895 // if they ever appear, but it's not as if live mode is good
|
Chris@316
|
896 // enough for that to be a big deal anyway.
|
Chris@316
|
897 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@316
|
898 if (j > 0 && j + 1 < pack.templateNoteCount &&
|
Chris@316
|
899 filtered[j] >= filtered[j-1] &&
|
Chris@316
|
900 filtered[j] >= filtered[j+1]) {
|
Chris@316
|
901 } else {
|
Chris@316
|
902 filtered[j] = 0.0;
|
Chris@316
|
903 }
|
Chris@316
|
904 }
|
Chris@316
|
905 }
|
Chris@316
|
906
|
Chris@41
|
907 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
908
|
Chris@41
|
909 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
910
|
Chris@41
|
911 ValueIndexMap strengths;
|
Chris@166
|
912
|
Chris@176
|
913 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
914 double strength = filtered[j];
|
Chris@183
|
915 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
916 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
917 }
|
Chris@166
|
918
|
Chris@168
|
919 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
920
|
Chris@168
|
921 map<int, double> active;
|
Chris@168
|
922 map<int, int> activeShifts;
|
Chris@168
|
923
|
Chris@183
|
924 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
925
|
Chris@168
|
926 --si;
|
Chris@168
|
927
|
Chris@168
|
928 double strength = si->first;
|
Chris@168
|
929 int j = si->second;
|
Chris@168
|
930
|
Chris@168
|
931 active[j] = strength;
|
Chris@168
|
932
|
Chris@170
|
933 if (wantShifts) {
|
Chris@170
|
934 activeShifts[j] = bestShifts[j];
|
Chris@167
|
935 }
|
Chris@41
|
936 }
|
Chris@41
|
937
|
Chris@168
|
938 m_pianoRoll.push_back(active);
|
Chris@170
|
939
|
Chris@170
|
940 if (wantShifts) {
|
Chris@168
|
941 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
942 }
|
Chris@294
|
943
|
Chris@294
|
944 return filtered;
|
Chris@166
|
945 }
|
Chris@166
|
946
|
Chris@166
|
947 Vamp::Plugin::FeatureList
|
Chris@168
|
948 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
949 {
|
Chris@41
|
950 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
951 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
952 // latest active set but present in the prior set in the piano
|
Chris@41
|
953 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
954 // already, and if they haven't ended, we don't know their
|
Chris@41
|
955 // duration.
|
Chris@41
|
956
|
Chris@168
|
957 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
958
|
Chris@168
|
959 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
960
|
Chris@165
|
961 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
962
|
Chris@165
|
963 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
964 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
965 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
966
|
Chris@41
|
967 FeatureList noteFeatures;
|
Chris@41
|
968
|
Chris@41
|
969 if (width < durationThreshold + 1) {
|
Chris@41
|
970 return noteFeatures;
|
Chris@41
|
971 }
|
Chris@41
|
972
|
Chris@150
|
973 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
974
|
Chris@55
|
975 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
976 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
977
|
Chris@55
|
978 int note = ni->first;
|
Chris@41
|
979
|
Chris@41
|
980 if (active.find(note) != active.end()) {
|
Chris@41
|
981 // the note is still playing
|
Chris@41
|
982 continue;
|
Chris@41
|
983 }
|
Chris@41
|
984
|
Chris@41
|
985 // the note was playing but just ended
|
Chris@41
|
986 int end = width;
|
Chris@41
|
987 int start = end-1;
|
Chris@41
|
988
|
Chris@41
|
989 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
990 --start;
|
Chris@41
|
991 }
|
Chris@41
|
992 ++start;
|
Chris@41
|
993
|
Chris@169
|
994 if ((end - start) < durationThreshold) {
|
Chris@41
|
995 continue;
|
Chris@41
|
996 }
|
Chris@41
|
997
|
Chris@169
|
998 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@41
|
999 }
|
Chris@41
|
1000
|
Chris@62
|
1001 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
1002
|
Chris@41
|
1003 return noteFeatures;
|
Chris@41
|
1004 }
|
Chris@41
|
1005
|
Chris@169
|
1006 void
|
Chris@169
|
1007 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
1008 FeatureList ¬eFeatures)
|
Chris@169
|
1009 {
|
Chris@169
|
1010 int partStart = start;
|
Chris@169
|
1011 int partShift = 0;
|
Chris@169
|
1012 int partVelocity = 0;
|
Chris@169
|
1013
|
Chris@252
|
1014 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1015
|
Chris@169
|
1016 for (int i = start; i != end; ++i) {
|
Chris@169
|
1017
|
Chris@169
|
1018 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1019
|
Chris@169
|
1020 int shift = 0;
|
Chris@169
|
1021
|
Chris@169
|
1022 if (shiftCount > 1) {
|
Chris@169
|
1023
|
Chris@169
|
1024 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1025
|
Chris@169
|
1026 if (i == partStart) {
|
Chris@169
|
1027 partShift = shift;
|
Chris@169
|
1028 }
|
Chris@169
|
1029
|
Chris@169
|
1030 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1031
|
Chris@169
|
1032 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1033
|
Chris@169
|
1034 // pitch has changed, emit an intermediate note
|
Chris@252
|
1035 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1036 i,
|
Chris@252
|
1037 note,
|
Chris@252
|
1038 partShift,
|
Chris@252
|
1039 shiftCount,
|
Chris@252
|
1040 partVelocity));
|
Chris@169
|
1041 partStart = i;
|
Chris@169
|
1042 partShift = shift;
|
Chris@169
|
1043 partVelocity = 0;
|
Chris@169
|
1044 }
|
Chris@169
|
1045 }
|
Chris@169
|
1046
|
Chris@303
|
1047 int v;
|
Chris@303
|
1048 if (m_mode == LiveMode) {
|
Chris@316
|
1049 v = round(strength * 20);
|
Chris@303
|
1050 } else {
|
Chris@303
|
1051 v = round(strength * 2);
|
Chris@303
|
1052 }
|
Chris@169
|
1053 if (v > partVelocity) {
|
Chris@169
|
1054 partVelocity = v;
|
Chris@169
|
1055 }
|
Chris@169
|
1056 }
|
Chris@169
|
1057
|
Chris@169
|
1058 if (end >= partStart + partThreshold) {
|
Chris@252
|
1059 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1060 end,
|
Chris@252
|
1061 note,
|
Chris@252
|
1062 partShift,
|
Chris@252
|
1063 shiftCount,
|
Chris@252
|
1064 partVelocity));
|
Chris@169
|
1065 }
|
Chris@169
|
1066 }
|
Chris@252
|
1067
|
Chris@309
|
1068 RealTime
|
Chris@309
|
1069 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1070 {
|
Chris@309
|
1071 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1072 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1073
|
Chris@309
|
1074 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1075 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1076 }
|
Chris@309
|
1077
|
Chris@252
|
1078 Silvet::Feature
|
Chris@252
|
1079 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1080 int end,
|
Chris@252
|
1081 int note,
|
Chris@252
|
1082 int shift,
|
Chris@252
|
1083 int shiftCount,
|
Chris@252
|
1084 int velocity)
|
Chris@252
|
1085 {
|
Chris@252
|
1086 Feature f;
|
Chris@252
|
1087
|
Chris@252
|
1088 f.hasTimestamp = true;
|
Chris@309
|
1089 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1090
|
Chris@252
|
1091 f.hasDuration = true;
|
Chris@309
|
1092 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1093
|
Chris@252
|
1094 f.values.clear();
|
Chris@252
|
1095
|
Chris@252
|
1096 f.values.push_back
|
Chris@252
|
1097 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
1098
|
Chris@252
|
1099 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
1100 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
1101 velocity = round(velocity / inputGain);
|
Chris@252
|
1102 if (velocity > 127) velocity = 127;
|
Chris@252
|
1103 if (velocity < 1) velocity = 1;
|
Chris@252
|
1104 f.values.push_back(velocity);
|
Chris@252
|
1105
|
Chris@252
|
1106 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
1107
|
Chris@252
|
1108 return f;
|
Chris@252
|
1109 }
|
Chris@252
|
1110
|
Chris@252
|
1111 float
|
Chris@252
|
1112 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1113 {
|
Chris@252
|
1114 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1115
|
Chris@252
|
1116 if (i == m_inputGains.end()) {
|
Chris@252
|
1117 if (i != m_inputGains.begin()) {
|
Chris@252
|
1118 --i;
|
Chris@252
|
1119 } else {
|
Chris@252
|
1120 return 1.f; // no data
|
Chris@252
|
1121 }
|
Chris@252
|
1122 }
|
Chris@252
|
1123
|
Chris@252
|
1124 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1125
|
Chris@252
|
1126 return i->second;
|
Chris@252
|
1127 }
|
Chris@252
|
1128
|