Chris@31
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@31
|
2
|
Chris@31
|
3 /*
|
Chris@31
|
4 Silvet
|
Chris@31
|
5
|
Chris@31
|
6 A Vamp plugin for note transcription.
|
Chris@31
|
7 Centre for Digital Music, Queen Mary University of London.
|
Chris@31
|
8
|
Chris@31
|
9 This program is free software; you can redistribute it and/or
|
Chris@31
|
10 modify it under the terms of the GNU General Public License as
|
Chris@31
|
11 published by the Free Software Foundation; either version 2 of the
|
Chris@31
|
12 License, or (at your option) any later version. See the file
|
Chris@31
|
13 COPYING included with this distribution for more information.
|
Chris@31
|
14 */
|
Chris@31
|
15
|
Chris@31
|
16 #include "Silvet.h"
|
Chris@34
|
17 #include "EM.h"
|
Chris@31
|
18
|
Chris@152
|
19 #include <cq/CQSpectrogram.h>
|
Chris@31
|
20
|
Chris@152
|
21 #include "MedianFilter.h"
|
Chris@152
|
22 #include "constant-q-cpp/src/dsp/Resampler.h"
|
Chris@246
|
23 #include "flattendynamics-ladspa.h"
|
Chris@298
|
24 #include "LiveInstruments.h"
|
Chris@31
|
25
|
Chris@31
|
26 #include <vector>
|
Chris@312
|
27 #include <future>
|
Chris@31
|
28
|
Chris@32
|
29 #include <cstdio>
|
Chris@32
|
30
|
Chris@31
|
31 using std::vector;
|
Chris@48
|
32 using std::cout;
|
Chris@31
|
33 using std::cerr;
|
Chris@31
|
34 using std::endl;
|
Chris@311
|
35 using std::pair;
|
Chris@312
|
36 using std::future;
|
Chris@312
|
37 using std::async;
|
Chris@40
|
38 using Vamp::RealTime;
|
Chris@31
|
39
|
Chris@31
|
40 static int processingSampleRate = 44100;
|
Chris@298
|
41
|
Chris@298
|
42 static int binsPerSemitoneLive = 1;
|
Chris@298
|
43 static int binsPerSemitoneNormal = 5;
|
Chris@170
|
44
|
Chris@272
|
45 static int minInputSampleRate = 100;
|
Chris@272
|
46 static int maxInputSampleRate = 192000;
|
Chris@272
|
47
|
Chris@316
|
48 static const Silvet::ProcessingMode defaultMode = Silvet::HighQualityMode;
|
Chris@316
|
49
|
Chris@31
|
50 Silvet::Silvet(float inputSampleRate) :
|
Chris@31
|
51 Plugin(inputSampleRate),
|
Chris@161
|
52 m_instruments(InstrumentPack::listInstrumentPacks()),
|
Chris@298
|
53 m_liveInstruments(LiveAdapter::adaptAll(m_instruments)),
|
Chris@31
|
54 m_resampler(0),
|
Chris@246
|
55 m_flattener(0),
|
Chris@110
|
56 m_cq(0),
|
Chris@316
|
57 m_mode(defaultMode),
|
Chris@166
|
58 m_fineTuning(false),
|
Chris@178
|
59 m_instrument(0),
|
Chris@313
|
60 m_colsPerSec(50),
|
Chris@313
|
61 m_haveStartTime(false)
|
Chris@31
|
62 {
|
Chris@31
|
63 }
|
Chris@31
|
64
|
Chris@31
|
65 Silvet::~Silvet()
|
Chris@31
|
66 {
|
Chris@31
|
67 delete m_resampler;
|
Chris@246
|
68 delete m_flattener;
|
Chris@31
|
69 delete m_cq;
|
Chris@41
|
70 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
71 delete m_postFilter[i];
|
Chris@41
|
72 }
|
Chris@31
|
73 }
|
Chris@31
|
74
|
Chris@31
|
75 string
|
Chris@31
|
76 Silvet::getIdentifier() const
|
Chris@31
|
77 {
|
Chris@31
|
78 return "silvet";
|
Chris@31
|
79 }
|
Chris@31
|
80
|
Chris@31
|
81 string
|
Chris@31
|
82 Silvet::getName() const
|
Chris@31
|
83 {
|
Chris@31
|
84 return "Silvet Note Transcription";
|
Chris@31
|
85 }
|
Chris@31
|
86
|
Chris@31
|
87 string
|
Chris@31
|
88 Silvet::getDescription() const
|
Chris@31
|
89 {
|
Chris@191
|
90 return "Estimate the note onsets, pitches, and durations that make up a music recording.";
|
Chris@31
|
91 }
|
Chris@31
|
92
|
Chris@31
|
93 string
|
Chris@31
|
94 Silvet::getMaker() const
|
Chris@31
|
95 {
|
Chris@191
|
96 return "Queen Mary, University of London";
|
Chris@31
|
97 }
|
Chris@31
|
98
|
Chris@31
|
99 int
|
Chris@31
|
100 Silvet::getPluginVersion() const
|
Chris@31
|
101 {
|
Chris@309
|
102 return 3;
|
Chris@31
|
103 }
|
Chris@31
|
104
|
Chris@31
|
105 string
|
Chris@31
|
106 Silvet::getCopyright() const
|
Chris@31
|
107 {
|
Chris@191
|
108 return "Method by Emmanouil Benetos and Simon Dixon; plugin by Chris Cannam and Emmanouil Benetos. GPL licence.";
|
Chris@31
|
109 }
|
Chris@31
|
110
|
Chris@31
|
111 Silvet::InputDomain
|
Chris@31
|
112 Silvet::getInputDomain() const
|
Chris@31
|
113 {
|
Chris@31
|
114 return TimeDomain;
|
Chris@31
|
115 }
|
Chris@31
|
116
|
Chris@31
|
117 size_t
|
Chris@31
|
118 Silvet::getPreferredBlockSize() const
|
Chris@31
|
119 {
|
Chris@31
|
120 return 0;
|
Chris@31
|
121 }
|
Chris@31
|
122
|
Chris@31
|
123 size_t
|
Chris@31
|
124 Silvet::getPreferredStepSize() const
|
Chris@31
|
125 {
|
Chris@31
|
126 return 0;
|
Chris@31
|
127 }
|
Chris@31
|
128
|
Chris@31
|
129 size_t
|
Chris@31
|
130 Silvet::getMinChannelCount() const
|
Chris@31
|
131 {
|
Chris@31
|
132 return 1;
|
Chris@31
|
133 }
|
Chris@31
|
134
|
Chris@31
|
135 size_t
|
Chris@31
|
136 Silvet::getMaxChannelCount() const
|
Chris@31
|
137 {
|
Chris@31
|
138 return 1;
|
Chris@31
|
139 }
|
Chris@31
|
140
|
Chris@31
|
141 Silvet::ParameterList
|
Chris@31
|
142 Silvet::getParameterDescriptors() const
|
Chris@31
|
143 {
|
Chris@31
|
144 ParameterList list;
|
Chris@110
|
145
|
Chris@110
|
146 ParameterDescriptor desc;
|
Chris@110
|
147 desc.identifier = "mode";
|
Chris@110
|
148 desc.name = "Processing mode";
|
Chris@110
|
149 desc.unit = "";
|
Chris@297
|
150 desc.description = "Sets the tradeoff of processing speed against transcription quality. Draft mode is tuned in favour of overall speed; Live mode is tuned in favour of lower latency; while Intensive mode (the default) will almost always produce the best results.";
|
Chris@110
|
151 desc.minValue = 0;
|
Chris@297
|
152 desc.maxValue = 2;
|
Chris@316
|
153 desc.defaultValue = int(defaultMode);
|
Chris@110
|
154 desc.isQuantized = true;
|
Chris@110
|
155 desc.quantizeStep = 1;
|
Chris@166
|
156 desc.valueNames.push_back("Draft (faster)");
|
Chris@165
|
157 desc.valueNames.push_back("Intensive (higher quality)");
|
Chris@297
|
158 desc.valueNames.push_back("Live (lower latency)");
|
Chris@161
|
159 list.push_back(desc);
|
Chris@161
|
160
|
Chris@176
|
161 desc.identifier = "instrument";
|
Chris@176
|
162 desc.name = "Instrument";
|
Chris@161
|
163 desc.unit = "";
|
Chris@271
|
164 desc.description = "The instrument or instruments known to be present in the recording. This affects the set of instrument templates used, as well as the expected level of polyphony in the output. Using a more limited set of instruments than the default will also make the plugin run faster.\nNote that this plugin cannot isolate instruments: you can't use this setting to request notes from only one instrument in a recording with several. Instead, use this as a hint to the plugin about which instruments are actually present.";
|
Chris@161
|
165 desc.minValue = 0;
|
Chris@162
|
166 desc.maxValue = m_instruments.size()-1;
|
Chris@162
|
167 desc.defaultValue = 0;
|
Chris@161
|
168 desc.isQuantized = true;
|
Chris@161
|
169 desc.quantizeStep = 1;
|
Chris@161
|
170 desc.valueNames.clear();
|
Chris@162
|
171 for (int i = 0; i < int(m_instruments.size()); ++i) {
|
Chris@162
|
172 desc.valueNames.push_back(m_instruments[i].name);
|
Chris@162
|
173 }
|
Chris@166
|
174 list.push_back(desc);
|
Chris@161
|
175
|
Chris@166
|
176 desc.identifier = "finetune";
|
Chris@166
|
177 desc.name = "Return fine pitch estimates";
|
Chris@166
|
178 desc.unit = "";
|
Chris@271
|
179 desc.description = "Return pitch estimates at finer than semitone resolution. This works only in Intensive mode. Notes that appear to drift in pitch will be split up into shorter notes with individually finer pitches.";
|
Chris@166
|
180 desc.minValue = 0;
|
Chris@166
|
181 desc.maxValue = 1;
|
Chris@166
|
182 desc.defaultValue = 0;
|
Chris@166
|
183 desc.isQuantized = true;
|
Chris@166
|
184 desc.quantizeStep = 1;
|
Chris@166
|
185 desc.valueNames.clear();
|
Chris@110
|
186 list.push_back(desc);
|
Chris@110
|
187
|
Chris@31
|
188 return list;
|
Chris@31
|
189 }
|
Chris@31
|
190
|
Chris@31
|
191 float
|
Chris@31
|
192 Silvet::getParameter(string identifier) const
|
Chris@31
|
193 {
|
Chris@110
|
194 if (identifier == "mode") {
|
Chris@297
|
195 return (float)(int)m_mode;
|
Chris@166
|
196 } else if (identifier == "finetune") {
|
Chris@166
|
197 return m_fineTuning ? 1.f : 0.f;
|
Chris@176
|
198 } else if (identifier == "instrument") {
|
Chris@162
|
199 return m_instrument;
|
Chris@110
|
200 }
|
Chris@31
|
201 return 0;
|
Chris@31
|
202 }
|
Chris@31
|
203
|
Chris@31
|
204 void
|
Chris@31
|
205 Silvet::setParameter(string identifier, float value)
|
Chris@31
|
206 {
|
Chris@110
|
207 if (identifier == "mode") {
|
Chris@297
|
208 m_mode = (ProcessingMode)(int)(value + 0.5);
|
Chris@166
|
209 } else if (identifier == "finetune") {
|
Chris@166
|
210 m_fineTuning = (value > 0.5);
|
Chris@176
|
211 } else if (identifier == "instrument") {
|
Chris@162
|
212 m_instrument = lrintf(value);
|
Chris@110
|
213 }
|
Chris@31
|
214 }
|
Chris@31
|
215
|
Chris@31
|
216 Silvet::ProgramList
|
Chris@31
|
217 Silvet::getPrograms() const
|
Chris@31
|
218 {
|
Chris@31
|
219 ProgramList list;
|
Chris@31
|
220 return list;
|
Chris@31
|
221 }
|
Chris@31
|
222
|
Chris@31
|
223 string
|
Chris@31
|
224 Silvet::getCurrentProgram() const
|
Chris@31
|
225 {
|
Chris@31
|
226 return "";
|
Chris@31
|
227 }
|
Chris@31
|
228
|
Chris@31
|
229 void
|
Chris@31
|
230 Silvet::selectProgram(string name)
|
Chris@31
|
231 {
|
Chris@31
|
232 }
|
Chris@31
|
233
|
Chris@31
|
234 Silvet::OutputList
|
Chris@31
|
235 Silvet::getOutputDescriptors() const
|
Chris@31
|
236 {
|
Chris@31
|
237 OutputList list;
|
Chris@31
|
238
|
Chris@31
|
239 OutputDescriptor d;
|
Chris@51
|
240 d.identifier = "notes";
|
Chris@51
|
241 d.name = "Note transcription";
|
Chris@271
|
242 d.description = "Overall note transcription. Each note has time, duration, estimated pitch, and a synthetic MIDI velocity (1-127) estimated from the strength of the pitch in the mixture.";
|
Chris@41
|
243 d.unit = "Hz";
|
Chris@31
|
244 d.hasFixedBinCount = true;
|
Chris@31
|
245 d.binCount = 2;
|
Chris@41
|
246 d.binNames.push_back("Frequency");
|
Chris@31
|
247 d.binNames.push_back("Velocity");
|
Chris@31
|
248 d.hasKnownExtents = false;
|
Chris@31
|
249 d.isQuantized = false;
|
Chris@31
|
250 d.sampleType = OutputDescriptor::VariableSampleRate;
|
Chris@246
|
251 d.sampleRate = processingSampleRate / (m_cq ? m_cq->getColumnHop() : 62);
|
Chris@31
|
252 d.hasDuration = true;
|
Chris@32
|
253 m_notesOutputNo = list.size();
|
Chris@32
|
254 list.push_back(d);
|
Chris@32
|
255
|
Chris@178
|
256 d.identifier = "timefreq";
|
Chris@178
|
257 d.name = "Time-frequency distribution";
|
Chris@271
|
258 d.description = "Filtered constant-Q time-frequency distribution as used as input to the expectation-maximisation algorithm.";
|
Chris@178
|
259 d.unit = "";
|
Chris@178
|
260 d.hasFixedBinCount = true;
|
Chris@298
|
261 d.binCount = getPack(0).templateHeight;
|
Chris@178
|
262 d.binNames.clear();
|
Chris@178
|
263 if (m_cq) {
|
Chris@294
|
264 char name[50];
|
Chris@298
|
265 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@178
|
266 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@178
|
267 // lowest-frequency 55 bins have been dropped, for a
|
Chris@178
|
268 // 545-bin template. The native CQ bins go high->low
|
Chris@178
|
269 // frequency though, so these are still the first 545 bins
|
Chris@178
|
270 // as reported by getBinFrequency, though in reverse order
|
Chris@178
|
271 float freq = m_cq->getBinFrequency
|
Chris@298
|
272 (getPack(0).templateHeight - i - 1);
|
Chris@178
|
273 sprintf(name, "%.1f Hz", freq);
|
Chris@178
|
274 d.binNames.push_back(name);
|
Chris@178
|
275 }
|
Chris@178
|
276 }
|
Chris@178
|
277 d.hasKnownExtents = false;
|
Chris@178
|
278 d.isQuantized = false;
|
Chris@178
|
279 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@178
|
280 d.sampleRate = m_colsPerSec;
|
Chris@178
|
281 d.hasDuration = false;
|
Chris@178
|
282 m_fcqOutputNo = list.size();
|
Chris@178
|
283 list.push_back(d);
|
Chris@178
|
284
|
Chris@294
|
285 d.identifier = "pitchactivation";
|
Chris@294
|
286 d.name = "Pitch activation distribution";
|
Chris@294
|
287 d.description = "Pitch activation distribution resulting from expectation-maximisation algorithm, prior to note extraction.";
|
Chris@294
|
288 d.unit = "";
|
Chris@294
|
289 d.hasFixedBinCount = true;
|
Chris@298
|
290 d.binCount = getPack(0).templateNoteCount;
|
Chris@294
|
291 d.binNames.clear();
|
Chris@294
|
292 if (m_cq) {
|
Chris@298
|
293 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@294
|
294 d.binNames.push_back(noteName(i, 0, 1));
|
Chris@294
|
295 }
|
Chris@294
|
296 }
|
Chris@294
|
297 d.hasKnownExtents = false;
|
Chris@294
|
298 d.isQuantized = false;
|
Chris@294
|
299 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@294
|
300 d.sampleRate = m_colsPerSec;
|
Chris@294
|
301 d.hasDuration = false;
|
Chris@294
|
302 m_pitchOutputNo = list.size();
|
Chris@294
|
303 list.push_back(d);
|
Chris@294
|
304
|
Chris@309
|
305 d.identifier = "chroma";
|
Chris@309
|
306 d.name = "Pitch chroma distribution";
|
Chris@309
|
307 d.description = "Pitch chroma distribution formed by wrapping the un-thresholded pitch activation distribution into a single octave of semitone bins.";
|
Chris@309
|
308 d.unit = "";
|
Chris@309
|
309 d.hasFixedBinCount = true;
|
Chris@309
|
310 d.binCount = 12;
|
Chris@309
|
311 d.binNames.clear();
|
Chris@309
|
312 if (m_cq) {
|
Chris@309
|
313 for (int i = 0; i < 12; ++i) {
|
Chris@309
|
314 d.binNames.push_back(chromaName(i));
|
Chris@309
|
315 }
|
Chris@309
|
316 }
|
Chris@309
|
317 d.hasKnownExtents = false;
|
Chris@309
|
318 d.isQuantized = false;
|
Chris@309
|
319 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@309
|
320 d.sampleRate = m_colsPerSec;
|
Chris@309
|
321 d.hasDuration = false;
|
Chris@309
|
322 m_chromaOutputNo = list.size();
|
Chris@309
|
323 list.push_back(d);
|
Chris@309
|
324
|
Chris@302
|
325 d.identifier = "templates";
|
Chris@302
|
326 d.name = "Templates";
|
Chris@302
|
327 d.description = "Constant-Q spectral templates for the selected instrument pack.";
|
Chris@302
|
328 d.unit = "";
|
Chris@302
|
329 d.hasFixedBinCount = true;
|
Chris@302
|
330 d.binCount = getPack(0).templateHeight;
|
Chris@302
|
331 d.binNames.clear();
|
Chris@302
|
332 if (m_cq) {
|
Chris@302
|
333 char name[50];
|
Chris@302
|
334 for (int i = 0; i < getPack(0).templateHeight; ++i) {
|
Chris@302
|
335 // We have a 600-bin (10 oct 60-bin CQ) of which the
|
Chris@302
|
336 // lowest-frequency 55 bins have been dropped, for a
|
Chris@302
|
337 // 545-bin template. The native CQ bins go high->low
|
Chris@302
|
338 // frequency though, so these are still the first 545 bins
|
Chris@302
|
339 // as reported by getBinFrequency, though in reverse order
|
Chris@302
|
340 float freq = m_cq->getBinFrequency
|
Chris@302
|
341 (getPack(0).templateHeight - i - 1);
|
Chris@302
|
342 sprintf(name, "%.1f Hz", freq);
|
Chris@302
|
343 d.binNames.push_back(name);
|
Chris@302
|
344 }
|
Chris@302
|
345 }
|
Chris@302
|
346 d.hasKnownExtents = false;
|
Chris@302
|
347 d.isQuantized = false;
|
Chris@302
|
348 d.sampleType = OutputDescriptor::FixedSampleRate;
|
Chris@302
|
349 d.sampleRate = m_colsPerSec;
|
Chris@302
|
350 d.hasDuration = false;
|
Chris@302
|
351 m_templateOutputNo = list.size();
|
Chris@302
|
352 list.push_back(d);
|
Chris@302
|
353
|
Chris@31
|
354 return list;
|
Chris@31
|
355 }
|
Chris@31
|
356
|
Chris@38
|
357 std::string
|
Chris@309
|
358 Silvet::chromaName(int pitch) const
|
Chris@38
|
359 {
|
Chris@38
|
360 static const char *names[] = {
|
Chris@38
|
361 "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"
|
Chris@38
|
362 };
|
Chris@38
|
363
|
Chris@309
|
364 return names[pitch];
|
Chris@309
|
365 }
|
Chris@309
|
366
|
Chris@309
|
367 std::string
|
Chris@309
|
368 Silvet::noteName(int note, int shift, int shiftCount) const
|
Chris@309
|
369 {
|
Chris@309
|
370 string n = chromaName(note % 12);
|
Chris@38
|
371
|
Chris@175
|
372 int oct = (note + 9) / 12;
|
Chris@38
|
373
|
Chris@175
|
374 char buf[30];
|
Chris@175
|
375
|
Chris@175
|
376 float pshift = 0.f;
|
Chris@175
|
377 if (shiftCount > 1) {
|
Chris@175
|
378 // see noteFrequency below
|
Chris@175
|
379 pshift =
|
Chris@175
|
380 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
381 }
|
Chris@175
|
382
|
Chris@175
|
383 if (pshift > 0.f) {
|
Chris@309
|
384 sprintf(buf, "%s%d+%dc", n.c_str(), oct, int(round(pshift * 100)));
|
Chris@175
|
385 } else if (pshift < 0.f) {
|
Chris@309
|
386 sprintf(buf, "%s%d-%dc", n.c_str(), oct, int(round((-pshift) * 100)));
|
Chris@175
|
387 } else {
|
Chris@309
|
388 sprintf(buf, "%s%d", n.c_str(), oct);
|
Chris@175
|
389 }
|
Chris@38
|
390
|
Chris@38
|
391 return buf;
|
Chris@38
|
392 }
|
Chris@38
|
393
|
Chris@41
|
394 float
|
Chris@168
|
395 Silvet::noteFrequency(int note, int shift, int shiftCount) const
|
Chris@41
|
396 {
|
Chris@169
|
397 // Convert shift number to a pitch shift. The given shift number
|
Chris@169
|
398 // is an offset into the template array, which starts with some
|
Chris@169
|
399 // zeros, followed by the template, then some trailing zeros.
|
Chris@169
|
400 //
|
Chris@169
|
401 // Example: if we have templateMaxShift == 2 and thus shiftCount
|
Chris@169
|
402 // == 5, then the number will be in the range 0-4 and the template
|
Chris@169
|
403 // will have 2 zeros at either end. Thus number 2 represents the
|
Chris@169
|
404 // template "as recorded", for a pitch shift of 0; smaller indices
|
Chris@169
|
405 // represent moving the template *up* in pitch (by introducing
|
Chris@169
|
406 // zeros at the start, which is the low-frequency end), for a
|
Chris@169
|
407 // positive pitch shift; and higher values represent moving it
|
Chris@169
|
408 // down in pitch, for a negative pitch shift.
|
Chris@169
|
409
|
Chris@175
|
410 float pshift = 0.f;
|
Chris@175
|
411 if (shiftCount > 1) {
|
Chris@175
|
412 pshift =
|
Chris@175
|
413 float((shiftCount - shift) - int(shiftCount / 2) - 1) / shiftCount;
|
Chris@175
|
414 }
|
Chris@169
|
415
|
Chris@301
|
416 float freq = float(27.5 * pow(2.0, (note + pshift) / 12.0));
|
Chris@301
|
417
|
Chris@303
|
418 // cerr << "note = " << note << ", shift = " << shift << ", shiftCount = "
|
Chris@303
|
419 // << shiftCount << ", obtained freq = " << freq << endl;
|
Chris@301
|
420
|
Chris@301
|
421 return freq;
|
Chris@41
|
422 }
|
Chris@41
|
423
|
Chris@31
|
424 bool
|
Chris@31
|
425 Silvet::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
Chris@31
|
426 {
|
Chris@272
|
427 if (m_inputSampleRate < minInputSampleRate ||
|
Chris@272
|
428 m_inputSampleRate > maxInputSampleRate) {
|
Chris@272
|
429 cerr << "Silvet::initialise: Unsupported input sample rate "
|
Chris@272
|
430 << m_inputSampleRate << " (supported min " << minInputSampleRate
|
Chris@272
|
431 << ", max " << maxInputSampleRate << ")" << endl;
|
Chris@272
|
432 return false;
|
Chris@272
|
433 }
|
Chris@272
|
434
|
Chris@31
|
435 if (channels < getMinChannelCount() ||
|
Chris@272
|
436 channels > getMaxChannelCount()) {
|
Chris@272
|
437 cerr << "Silvet::initialise: Unsupported channel count " << channels
|
Chris@272
|
438 << " (supported min " << getMinChannelCount() << ", max "
|
Chris@272
|
439 << getMaxChannelCount() << ")" << endl;
|
Chris@272
|
440 return false;
|
Chris@272
|
441 }
|
Chris@31
|
442
|
Chris@31
|
443 if (stepSize != blockSize) {
|
Chris@31
|
444 cerr << "Silvet::initialise: Step size must be the same as block size ("
|
Chris@31
|
445 << stepSize << " != " << blockSize << ")" << endl;
|
Chris@31
|
446 return false;
|
Chris@31
|
447 }
|
Chris@31
|
448
|
Chris@31
|
449 m_blockSize = blockSize;
|
Chris@31
|
450
|
Chris@31
|
451 reset();
|
Chris@31
|
452
|
Chris@31
|
453 return true;
|
Chris@31
|
454 }
|
Chris@31
|
455
|
Chris@31
|
456 void
|
Chris@31
|
457 Silvet::reset()
|
Chris@31
|
458 {
|
Chris@31
|
459 delete m_resampler;
|
Chris@246
|
460 delete m_flattener;
|
Chris@31
|
461 delete m_cq;
|
Chris@31
|
462
|
Chris@31
|
463 if (m_inputSampleRate != processingSampleRate) {
|
Chris@31
|
464 m_resampler = new Resampler(m_inputSampleRate, processingSampleRate);
|
Chris@31
|
465 } else {
|
Chris@31
|
466 m_resampler = 0;
|
Chris@31
|
467 }
|
Chris@31
|
468
|
Chris@246
|
469 m_flattener = new FlattenDynamics(m_inputSampleRate); // before resampling
|
Chris@246
|
470 m_flattener->reset();
|
Chris@246
|
471
|
Chris@301
|
472 // this happens to be processingSampleRate / 3, and is the top
|
Chris@301
|
473 // freq used for the EM templates:
|
Chris@301
|
474 double maxFreq = 14700;
|
Chris@301
|
475
|
Chris@301
|
476 if (m_mode == LiveMode) {
|
Chris@301
|
477 // We only have 12 bpo rather than 60, so we need the top bin
|
Chris@301
|
478 // to be the middle one of the top 5, i.e. 2/5 of a semitone
|
Chris@301
|
479 // lower than 14700
|
Chris@301
|
480 maxFreq *= powf(2.0, -1.0 / 30.0);
|
Chris@301
|
481 }
|
Chris@301
|
482
|
Chris@173
|
483 double minFreq = 27.5;
|
Chris@173
|
484
|
Chris@297
|
485 if (m_mode != HighQualityMode) {
|
Chris@173
|
486 // We don't actually return any notes from the bottom octave,
|
Chris@173
|
487 // so we can just pad with zeros
|
Chris@173
|
488 minFreq *= 2;
|
Chris@173
|
489 }
|
Chris@173
|
490
|
Chris@298
|
491 int bpo = 12 *
|
Chris@298
|
492 (m_mode == LiveMode ? binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@301
|
493
|
Chris@154
|
494 CQParameters params(processingSampleRate,
|
Chris@173
|
495 minFreq,
|
Chris@303
|
496 maxFreq,
|
Chris@298
|
497 bpo);
|
Chris@154
|
498
|
Chris@316
|
499 // For params.q, the MIREX code uses 0.8, but it seems that with
|
Chris@316
|
500 // atomHopFactor of 0.3, using q == 0.9 or lower drops the FFT
|
Chris@316
|
501 // size to 512 from 1024 and alters some other processing
|
Chris@316
|
502 // parameters, making everything much, much slower. Could be a
|
Chris@316
|
503 // flaw in the CQ parameter calculations, must check. For
|
Chris@316
|
504 // atomHopFactor == 1, q == 0.8 is fine
|
Chris@316
|
505 params.q = (m_mode == HighQualityMode ? 0.95 : 0.8);
|
Chris@316
|
506 params.atomHopFactor = (m_mode == HighQualityMode ? 0.3 : 1.0);
|
Chris@154
|
507 params.threshold = 0.0005;
|
Chris@172
|
508 params.window = CQParameters::Hann;
|
Chris@154
|
509
|
Chris@154
|
510 m_cq = new CQSpectrogram(params, CQSpectrogram::InterpolateLinear);
|
Chris@31
|
511
|
Chris@303
|
512 // cerr << "CQ bins = " << m_cq->getTotalBins() << endl;
|
Chris@303
|
513 // cerr << "CQ min freq = " << m_cq->getMinFrequency() << " (and for confirmation, freq of bin 0 = " << m_cq->getBinFrequency(0) << ")" << endl;
|
Chris@297
|
514
|
Chris@297
|
515 m_colsPerSec = (m_mode == DraftMode ? 25 : 50);
|
Chris@165
|
516
|
Chris@41
|
517 for (int i = 0; i < (int)m_postFilter.size(); ++i) {
|
Chris@41
|
518 delete m_postFilter[i];
|
Chris@41
|
519 }
|
Chris@41
|
520 m_postFilter.clear();
|
Chris@303
|
521 int postFilterLength = 3;
|
Chris@298
|
522 for (int i = 0; i < getPack(0).templateNoteCount; ++i) {
|
Chris@303
|
523 m_postFilter.push_back(new MedianFilter<double>(postFilterLength));
|
Chris@41
|
524 }
|
Chris@41
|
525 m_pianoRoll.clear();
|
Chris@246
|
526 m_inputGains.clear();
|
Chris@32
|
527 m_columnCount = 0;
|
Chris@272
|
528 m_resampledCount = 0;
|
Chris@40
|
529 m_startTime = RealTime::zeroTime;
|
Chris@313
|
530 m_haveStartTime = false;
|
Chris@31
|
531 }
|
Chris@31
|
532
|
Chris@31
|
533 Silvet::FeatureSet
|
Chris@31
|
534 Silvet::process(const float *const *inputBuffers, Vamp::RealTime timestamp)
|
Chris@31
|
535 {
|
Chris@302
|
536 FeatureSet fs;
|
Chris@302
|
537
|
Chris@313
|
538 if (!m_haveStartTime) {
|
Chris@314
|
539
|
Chris@40
|
540 m_startTime = timestamp;
|
Chris@313
|
541 m_haveStartTime = true;
|
Chris@314
|
542
|
Chris@302
|
543 insertTemplateFeatures(fs);
|
Chris@40
|
544 }
|
Chris@246
|
545
|
Chris@246
|
546 vector<float> flattened(m_blockSize);
|
Chris@246
|
547 float gain = 1.f;
|
Chris@246
|
548 m_flattener->connectInputPort
|
Chris@246
|
549 (FlattenDynamics::AudioInputPort, inputBuffers[0]);
|
Chris@246
|
550 m_flattener->connectOutputPort
|
Chris@246
|
551 (FlattenDynamics::AudioOutputPort, &flattened[0]);
|
Chris@246
|
552 m_flattener->connectOutputPort
|
Chris@246
|
553 (FlattenDynamics::GainOutputPort, &gain);
|
Chris@246
|
554 m_flattener->process(m_blockSize);
|
Chris@246
|
555
|
Chris@252
|
556 m_inputGains[timestamp] = gain;
|
Chris@40
|
557
|
Chris@31
|
558 vector<double> data;
|
Chris@40
|
559 for (int i = 0; i < m_blockSize; ++i) {
|
Chris@246
|
560 double d = flattened[i];
|
Chris@235
|
561 data.push_back(d);
|
Chris@40
|
562 }
|
Chris@31
|
563
|
Chris@31
|
564 if (m_resampler) {
|
Chris@272
|
565
|
Chris@31
|
566 data = m_resampler->process(data.data(), data.size());
|
Chris@272
|
567
|
Chris@272
|
568 int hadCount = m_resampledCount;
|
Chris@272
|
569 m_resampledCount += data.size();
|
Chris@272
|
570
|
Chris@272
|
571 int resamplerLatency = m_resampler->getLatency();
|
Chris@272
|
572
|
Chris@272
|
573 if (hadCount < resamplerLatency) {
|
Chris@272
|
574 int stillToDrop = resamplerLatency - hadCount;
|
Chris@272
|
575 if (stillToDrop >= int(data.size())) {
|
Chris@302
|
576 return fs;
|
Chris@272
|
577 } else {
|
Chris@272
|
578 data = vector<double>(data.begin() + stillToDrop, data.end());
|
Chris@272
|
579 }
|
Chris@272
|
580 }
|
Chris@31
|
581 }
|
Chris@272
|
582
|
Chris@32
|
583 Grid cqout = m_cq->process(data);
|
Chris@302
|
584 transcribe(cqout, fs);
|
Chris@51
|
585 return fs;
|
Chris@34
|
586 }
|
Chris@34
|
587
|
Chris@34
|
588 Silvet::FeatureSet
|
Chris@34
|
589 Silvet::getRemainingFeatures()
|
Chris@34
|
590 {
|
Chris@145
|
591 Grid cqout = m_cq->getRemainingOutput();
|
Chris@302
|
592 FeatureSet fs;
|
Chris@302
|
593 if (m_columnCount == 0) {
|
Chris@302
|
594 // process() was never called, but we still want these
|
Chris@302
|
595 insertTemplateFeatures(fs);
|
Chris@302
|
596 } else {
|
Chris@302
|
597 transcribe(cqout, fs);
|
Chris@302
|
598 }
|
Chris@51
|
599 return fs;
|
Chris@34
|
600 }
|
Chris@34
|
601
|
Chris@302
|
602 void
|
Chris@302
|
603 Silvet::insertTemplateFeatures(FeatureSet &fs)
|
Chris@302
|
604 {
|
Chris@302
|
605 const InstrumentPack &pack = getPack(m_instrument);
|
Chris@302
|
606 for (int i = 0; i < int(pack.templates.size()) * pack.templateNoteCount; ++i) {
|
Chris@302
|
607 RealTime timestamp = RealTime::fromSeconds(double(i) / m_colsPerSec);
|
Chris@302
|
608 Feature f;
|
Chris@302
|
609 char buffer[50];
|
Chris@302
|
610 sprintf(buffer, "Note %d", i + 1);
|
Chris@302
|
611 f.label = buffer;
|
Chris@302
|
612 f.hasTimestamp = true;
|
Chris@302
|
613 f.timestamp = timestamp;
|
Chris@302
|
614 f.values = pack.templates[i / pack.templateNoteCount]
|
Chris@302
|
615 .data[i % pack.templateNoteCount];
|
Chris@302
|
616 fs[m_templateOutputNo].push_back(f);
|
Chris@302
|
617 }
|
Chris@302
|
618 }
|
Chris@302
|
619
|
Chris@302
|
620 void
|
Chris@302
|
621 Silvet::transcribe(const Grid &cqout, Silvet::FeatureSet &fs)
|
Chris@34
|
622 {
|
Chris@32
|
623 Grid filtered = preProcess(cqout);
|
Chris@31
|
624
|
Chris@302
|
625 if (filtered.empty()) return;
|
Chris@170
|
626
|
Chris@298
|
627 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@104
|
628
|
Chris@178
|
629 for (int i = 0; i < (int)filtered.size(); ++i) {
|
Chris@178
|
630 Feature f;
|
Chris@178
|
631 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@178
|
632 f.values.push_back(float(filtered[i][j]));
|
Chris@178
|
633 }
|
Chris@178
|
634 fs[m_fcqOutputNo].push_back(f);
|
Chris@178
|
635 }
|
Chris@178
|
636
|
Chris@34
|
637 int width = filtered.size();
|
Chris@34
|
638
|
Chris@311
|
639 Grid localPitches(width);
|
Chris@170
|
640
|
Chris@297
|
641 bool wantShifts = (m_mode == HighQualityMode) && m_fineTuning;
|
Chris@170
|
642 int shiftCount = 1;
|
Chris@170
|
643 if (wantShifts) {
|
Chris@170
|
644 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@170
|
645 }
|
Chris@170
|
646
|
Chris@170
|
647 vector<vector<int> > localBestShifts;
|
Chris@170
|
648 if (wantShifts) {
|
Chris@311
|
649 localBestShifts = vector<vector<int> >(width);
|
Chris@170
|
650 }
|
Chris@170
|
651
|
Chris@312
|
652 #ifndef MAX_EM_THREADS
|
Chris@312
|
653 #define MAX_EM_THREADS 8
|
Chris@312
|
654 #endif
|
Chris@312
|
655
|
Chris@312
|
656 #if (defined(MAX_EM_THREADS) && (MAX_EM_THREADS > 1))
|
Chris@312
|
657 for (int i = 0; i < width; ) {
|
Chris@312
|
658 typedef future<pair<vector<double>, vector<int>>> EMFuture;
|
Chris@312
|
659 vector<EMFuture> results;
|
Chris@312
|
660 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
|
Chris@312
|
661 results.push_back
|
Chris@312
|
662 (async(std::launch::async,
|
Chris@312
|
663 [&](int index) {
|
Chris@312
|
664 return applyEM(pack, filtered.at(index), wantShifts);
|
Chris@312
|
665 }, i + j));
|
Chris@312
|
666 }
|
Chris@312
|
667 for (int j = 0; j < MAX_EM_THREADS && i + j < width; ++j) {
|
Chris@312
|
668 auto out = results[j].get();
|
Chris@312
|
669 localPitches[i+j] = out.first;
|
Chris@312
|
670 if (wantShifts) localBestShifts[i+j] = out.second;
|
Chris@312
|
671 }
|
Chris@312
|
672 i += MAX_EM_THREADS;
|
Chris@312
|
673 }
|
Chris@312
|
674 #else
|
Chris@123
|
675 for (int i = 0; i < width; ++i) {
|
Chris@311
|
676 auto out = applyEM(pack, filtered.at(i), wantShifts);
|
Chris@311
|
677 localPitches[i] = out.first;
|
Chris@311
|
678 if (wantShifts) localBestShifts[i] = out.second;
|
Chris@123
|
679 }
|
Chris@312
|
680 #endif
|
Chris@305
|
681
|
Chris@166
|
682 for (int i = 0; i < width; ++i) {
|
Chris@37
|
683
|
Chris@309
|
684 // This returns a filtered column, and pushes the
|
Chris@309
|
685 // up-to-max-polyphony activation column to m_pianoRoll
|
Chris@294
|
686 vector<double> filtered = postProcess
|
Chris@294
|
687 (localPitches[i], localBestShifts[i], wantShifts);
|
Chris@294
|
688
|
Chris@309
|
689 RealTime timestamp = getColumnTimestamp(m_pianoRoll.size() - 1);
|
Chris@309
|
690 float inputGain = getInputGainAt(timestamp);
|
Chris@309
|
691
|
Chris@294
|
692 Feature f;
|
Chris@294
|
693 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
694 float v = filtered[j];
|
Chris@294
|
695 if (v < pack.levelThreshold) v = 0.f;
|
Chris@309
|
696 f.values.push_back(v / inputGain);
|
Chris@294
|
697 }
|
Chris@294
|
698 fs[m_pitchOutputNo].push_back(f);
|
Chris@309
|
699
|
Chris@309
|
700 f.values.clear();
|
Chris@309
|
701 f.values.resize(12);
|
Chris@309
|
702 for (int j = 0; j < (int)filtered.size(); ++j) {
|
Chris@309
|
703 f.values[j % 12] += filtered[j] / inputGain;
|
Chris@309
|
704 }
|
Chris@309
|
705 fs[m_chromaOutputNo].push_back(f);
|
Chris@166
|
706
|
Chris@168
|
707 FeatureList noteFeatures = noteTrack(shiftCount);
|
Chris@38
|
708
|
Chris@123
|
709 for (FeatureList::const_iterator fi = noteFeatures.begin();
|
Chris@123
|
710 fi != noteFeatures.end(); ++fi) {
|
Chris@123
|
711 fs[m_notesOutputNo].push_back(*fi);
|
Chris@40
|
712 }
|
Chris@34
|
713 }
|
Chris@31
|
714 }
|
Chris@31
|
715
|
Chris@311
|
716 pair<vector<double>, vector<int> >
|
Chris@311
|
717 Silvet::applyEM(const InstrumentPack &pack,
|
Chris@311
|
718 const vector<double> &column,
|
Chris@311
|
719 bool wantShifts)
|
Chris@311
|
720 {
|
Chris@311
|
721 double columnThreshold = 1e-5;
|
Chris@311
|
722
|
Chris@314
|
723 if (m_mode == LiveMode) {
|
Chris@314
|
724 columnThreshold /= 20;
|
Chris@314
|
725 }
|
Chris@314
|
726
|
Chris@311
|
727 vector<double> pitches(pack.templateNoteCount, 0.0);
|
Chris@311
|
728 vector<int> bestShifts;
|
Chris@311
|
729
|
Chris@311
|
730 double sum = 0.0;
|
Chris@311
|
731 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@311
|
732 sum += column.at(j);
|
Chris@311
|
733 }
|
Chris@311
|
734 if (sum < columnThreshold) return { pitches, bestShifts };
|
Chris@311
|
735
|
Chris@314
|
736 EM em(&pack, m_mode == HighQualityMode);
|
Chris@311
|
737
|
Chris@311
|
738 em.setPitchSparsity(pack.pitchSparsity);
|
Chris@311
|
739 em.setSourceSparsity(pack.sourceSparsity);
|
Chris@311
|
740
|
Chris@314
|
741 int iterations = (m_mode == HighQualityMode ? 20 : 10);
|
Chris@311
|
742
|
Chris@311
|
743 for (int j = 0; j < iterations; ++j) {
|
Chris@311
|
744 em.iterate(column.data());
|
Chris@311
|
745 }
|
Chris@311
|
746
|
Chris@311
|
747 const float *pitchDist = em.getPitchDistribution();
|
Chris@311
|
748 const float *const *shiftDist = em.getShifts();
|
Chris@311
|
749
|
Chris@311
|
750 int shiftCount = 1;
|
Chris@311
|
751 if (wantShifts) {
|
Chris@311
|
752 shiftCount = pack.templateMaxShift * 2 + 1;
|
Chris@311
|
753 }
|
Chris@311
|
754
|
Chris@311
|
755 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@311
|
756
|
Chris@311
|
757 pitches[j] = pitchDist[j] * sum;
|
Chris@311
|
758
|
Chris@311
|
759 int bestShift = 0;
|
Chris@311
|
760 float bestShiftValue = 0.0;
|
Chris@311
|
761 if (wantShifts) {
|
Chris@311
|
762 for (int k = 0; k < shiftCount; ++k) {
|
Chris@311
|
763 float value = shiftDist[k][j];
|
Chris@311
|
764 if (k == 0 || value > bestShiftValue) {
|
Chris@311
|
765 bestShiftValue = value;
|
Chris@311
|
766 bestShift = k;
|
Chris@311
|
767 }
|
Chris@311
|
768 }
|
Chris@311
|
769 bestShifts.push_back(bestShift);
|
Chris@311
|
770 }
|
Chris@311
|
771 }
|
Chris@311
|
772
|
Chris@311
|
773 return { pitches, bestShifts };
|
Chris@311
|
774 }
|
Chris@311
|
775
|
Chris@32
|
776 Silvet::Grid
|
Chris@32
|
777 Silvet::preProcess(const Grid &in)
|
Chris@32
|
778 {
|
Chris@32
|
779 int width = in.size();
|
Chris@32
|
780
|
Chris@165
|
781 int spacing = processingSampleRate / m_colsPerSec;
|
Chris@32
|
782
|
Chris@165
|
783 // need to be careful that col spacing is an integer number of samples!
|
Chris@165
|
784 assert(spacing * m_colsPerSec == processingSampleRate);
|
Chris@32
|
785
|
Chris@32
|
786 Grid out;
|
Chris@32
|
787
|
Chris@58
|
788 // We count the CQ latency in terms of processing hops, but
|
Chris@58
|
789 // actually it probably isn't an exact number of hops so this
|
Chris@58
|
790 // isn't quite accurate. But the small constant offset is
|
Chris@165
|
791 // practically irrelevant compared to the jitter from the frame
|
Chris@165
|
792 // size we reduce to in a moment
|
Chris@33
|
793 int latentColumns = m_cq->getLatency() / m_cq->getColumnHop();
|
Chris@33
|
794
|
Chris@298
|
795 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
796
|
Chris@32
|
797 for (int i = 0; i < width; ++i) {
|
Chris@32
|
798
|
Chris@33
|
799 if (m_columnCount < latentColumns) {
|
Chris@33
|
800 ++m_columnCount;
|
Chris@33
|
801 continue;
|
Chris@33
|
802 }
|
Chris@33
|
803
|
Chris@32
|
804 int prevSampleNo = (m_columnCount - 1) * m_cq->getColumnHop();
|
Chris@32
|
805 int sampleNo = m_columnCount * m_cq->getColumnHop();
|
Chris@32
|
806
|
Chris@32
|
807 bool select = (sampleNo / spacing != prevSampleNo / spacing);
|
Chris@32
|
808
|
Chris@32
|
809 if (select) {
|
Chris@32
|
810 vector<double> inCol = in[i];
|
Chris@176
|
811 vector<double> outCol(pack.templateHeight);
|
Chris@32
|
812
|
Chris@178
|
813 // In HQ mode, the CQ returns 600 bins and we ignore the
|
Chris@298
|
814 // lowest 55 of them (assuming binsPerSemitone == 5).
|
Chris@178
|
815 //
|
Chris@297
|
816 // In draft and live mode the CQ is an octave shorter,
|
Chris@300
|
817 // returning 540 bins or equivalent, so we instead pad
|
Chris@300
|
818 // them with an additional 5 or equivalent zeros.
|
Chris@178
|
819 //
|
Chris@178
|
820 // We also need to reverse the column as we go, since the
|
Chris@178
|
821 // raw CQ has the high frequencies first and we need it
|
Chris@178
|
822 // the other way around.
|
Chris@32
|
823
|
Chris@298
|
824 int bps = (m_mode == LiveMode ?
|
Chris@298
|
825 binsPerSemitoneLive : binsPerSemitoneNormal);
|
Chris@298
|
826
|
Chris@297
|
827 if (m_mode == HighQualityMode) {
|
Chris@178
|
828 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@298
|
829 int ix = inCol.size() - j - (11 * bps);
|
Chris@178
|
830 outCol[j] = inCol[ix];
|
Chris@178
|
831 }
|
Chris@178
|
832 } else {
|
Chris@298
|
833 for (int j = 0; j < bps; ++j) {
|
Chris@178
|
834 outCol[j] = 0.0;
|
Chris@178
|
835 }
|
Chris@298
|
836 for (int j = bps; j < pack.templateHeight; ++j) {
|
Chris@298
|
837 int ix = inCol.size() - j + (bps-1);
|
Chris@178
|
838 outCol[j] = inCol[ix];
|
Chris@178
|
839 }
|
Chris@46
|
840 }
|
Chris@32
|
841
|
Chris@46
|
842 vector<double> noiseLevel1 =
|
Chris@298
|
843 MedianFilter<double>::filter(8 * bps, outCol);
|
Chris@176
|
844 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
845 noiseLevel1[j] = std::min(outCol[j], noiseLevel1[j]);
|
Chris@46
|
846 }
|
Chris@32
|
847
|
Chris@46
|
848 vector<double> noiseLevel2 =
|
Chris@298
|
849 MedianFilter<double>::filter(8 * bps, noiseLevel1);
|
Chris@176
|
850 for (int j = 0; j < pack.templateHeight; ++j) {
|
Chris@46
|
851 outCol[j] = std::max(outCol[j] - noiseLevel2[j], 0.0);
|
Chris@32
|
852 }
|
Chris@32
|
853
|
Chris@165
|
854 out.push_back(outCol);
|
Chris@32
|
855 }
|
Chris@32
|
856
|
Chris@32
|
857 ++m_columnCount;
|
Chris@32
|
858 }
|
Chris@32
|
859
|
Chris@32
|
860 return out;
|
Chris@32
|
861 }
|
Chris@32
|
862
|
Chris@294
|
863 vector<double>
|
Chris@170
|
864 Silvet::postProcess(const vector<double> &pitches,
|
Chris@170
|
865 const vector<int> &bestShifts,
|
Chris@170
|
866 bool wantShifts)
|
Chris@166
|
867 {
|
Chris@298
|
868 const InstrumentPack &pack(getPack(m_instrument));
|
Chris@176
|
869
|
Chris@41
|
870 vector<double> filtered;
|
Chris@41
|
871
|
Chris@176
|
872 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@170
|
873 m_postFilter[j]->push(pitches[j]);
|
Chris@41
|
874 filtered.push_back(m_postFilter[j]->get());
|
Chris@41
|
875 }
|
Chris@41
|
876
|
Chris@316
|
877 if (m_mode == LiveMode) {
|
Chris@316
|
878 // In live mode with only a 12-bpo CQ, we are very likely to
|
Chris@316
|
879 // get clusters of two or three high scores at a time for
|
Chris@316
|
880 // neighbouring semitones. Eliminate these by picking only the
|
Chris@316
|
881 // peaks. This means we can't recognise actual semitone chords
|
Chris@316
|
882 // if they ever appear, but it's not as if live mode is good
|
Chris@316
|
883 // enough for that to be a big deal anyway.
|
Chris@316
|
884 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@316
|
885 if (j > 0 && j + 1 < pack.templateNoteCount &&
|
Chris@316
|
886 filtered[j] >= filtered[j-1] &&
|
Chris@316
|
887 filtered[j] >= filtered[j+1]) {
|
Chris@316
|
888 } else {
|
Chris@316
|
889 filtered[j] = 0.0;
|
Chris@316
|
890 }
|
Chris@316
|
891 }
|
Chris@316
|
892 }
|
Chris@316
|
893
|
Chris@41
|
894 // Threshold for level and reduce number of candidate pitches
|
Chris@41
|
895
|
Chris@41
|
896 typedef std::multimap<double, int> ValueIndexMap;
|
Chris@41
|
897
|
Chris@41
|
898 ValueIndexMap strengths;
|
Chris@166
|
899
|
Chris@176
|
900 for (int j = 0; j < pack.templateNoteCount; ++j) {
|
Chris@166
|
901 double strength = filtered[j];
|
Chris@183
|
902 if (strength < pack.levelThreshold) continue;
|
Chris@168
|
903 strengths.insert(ValueIndexMap::value_type(strength, j));
|
Chris@168
|
904 }
|
Chris@166
|
905
|
Chris@168
|
906 ValueIndexMap::const_iterator si = strengths.end();
|
Chris@167
|
907
|
Chris@168
|
908 map<int, double> active;
|
Chris@168
|
909 map<int, int> activeShifts;
|
Chris@168
|
910
|
Chris@183
|
911 while (int(active.size()) < pack.maxPolyphony && si != strengths.begin()) {
|
Chris@168
|
912
|
Chris@168
|
913 --si;
|
Chris@168
|
914
|
Chris@168
|
915 double strength = si->first;
|
Chris@168
|
916 int j = si->second;
|
Chris@168
|
917
|
Chris@168
|
918 active[j] = strength;
|
Chris@168
|
919
|
Chris@170
|
920 if (wantShifts) {
|
Chris@170
|
921 activeShifts[j] = bestShifts[j];
|
Chris@167
|
922 }
|
Chris@41
|
923 }
|
Chris@41
|
924
|
Chris@168
|
925 m_pianoRoll.push_back(active);
|
Chris@170
|
926
|
Chris@170
|
927 if (wantShifts) {
|
Chris@168
|
928 m_pianoRollShifts.push_back(activeShifts);
|
Chris@41
|
929 }
|
Chris@294
|
930
|
Chris@294
|
931 return filtered;
|
Chris@166
|
932 }
|
Chris@166
|
933
|
Chris@166
|
934 Vamp::Plugin::FeatureList
|
Chris@168
|
935 Silvet::noteTrack(int shiftCount)
|
Chris@166
|
936 {
|
Chris@41
|
937 // Minimum duration pruning, and conversion to notes. We can only
|
Chris@41
|
938 // report notes that have just ended (i.e. that are absent in the
|
Chris@168
|
939 // latest active set but present in the prior set in the piano
|
Chris@41
|
940 // roll) -- any notes that ended earlier will have been reported
|
Chris@41
|
941 // already, and if they haven't ended, we don't know their
|
Chris@41
|
942 // duration.
|
Chris@41
|
943
|
Chris@168
|
944 int width = m_pianoRoll.size() - 1;
|
Chris@168
|
945
|
Chris@168
|
946 const map<int, double> &active = m_pianoRoll[width];
|
Chris@41
|
947
|
Chris@165
|
948 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@165
|
949
|
Chris@165
|
950 // only keep notes >= 100ms or thereabouts
|
Chris@165
|
951 int durationThreshold = floor(0.1 / columnDuration); // columns
|
Chris@165
|
952 if (durationThreshold < 1) durationThreshold = 1;
|
Chris@41
|
953
|
Chris@41
|
954 FeatureList noteFeatures;
|
Chris@41
|
955
|
Chris@41
|
956 if (width < durationThreshold + 1) {
|
Chris@41
|
957 return noteFeatures;
|
Chris@41
|
958 }
|
Chris@41
|
959
|
Chris@150
|
960 //!!! try: repeated note detection? (look for change in first derivative of the pitch matrix)
|
Chris@150
|
961
|
Chris@55
|
962 for (map<int, double>::const_iterator ni = m_pianoRoll[width-1].begin();
|
Chris@41
|
963 ni != m_pianoRoll[width-1].end(); ++ni) {
|
Chris@41
|
964
|
Chris@55
|
965 int note = ni->first;
|
Chris@41
|
966
|
Chris@41
|
967 if (active.find(note) != active.end()) {
|
Chris@41
|
968 // the note is still playing
|
Chris@41
|
969 continue;
|
Chris@41
|
970 }
|
Chris@41
|
971
|
Chris@41
|
972 // the note was playing but just ended
|
Chris@41
|
973 int end = width;
|
Chris@41
|
974 int start = end-1;
|
Chris@41
|
975
|
Chris@41
|
976 while (m_pianoRoll[start].find(note) != m_pianoRoll[start].end()) {
|
Chris@41
|
977 --start;
|
Chris@41
|
978 }
|
Chris@41
|
979 ++start;
|
Chris@41
|
980
|
Chris@169
|
981 if ((end - start) < durationThreshold) {
|
Chris@41
|
982 continue;
|
Chris@41
|
983 }
|
Chris@41
|
984
|
Chris@169
|
985 emitNote(start, end, note, shiftCount, noteFeatures);
|
Chris@41
|
986 }
|
Chris@41
|
987
|
Chris@62
|
988 // cerr << "returning " << noteFeatures.size() << " complete note(s) " << endl;
|
Chris@41
|
989
|
Chris@41
|
990 return noteFeatures;
|
Chris@41
|
991 }
|
Chris@41
|
992
|
Chris@169
|
993 void
|
Chris@169
|
994 Silvet::emitNote(int start, int end, int note, int shiftCount,
|
Chris@169
|
995 FeatureList ¬eFeatures)
|
Chris@169
|
996 {
|
Chris@169
|
997 int partStart = start;
|
Chris@169
|
998 int partShift = 0;
|
Chris@169
|
999 int partVelocity = 0;
|
Chris@169
|
1000
|
Chris@252
|
1001 int partThreshold = floor(0.05 * m_colsPerSec);
|
Chris@169
|
1002
|
Chris@169
|
1003 for (int i = start; i != end; ++i) {
|
Chris@169
|
1004
|
Chris@169
|
1005 double strength = m_pianoRoll[i][note];
|
Chris@169
|
1006
|
Chris@169
|
1007 int shift = 0;
|
Chris@169
|
1008
|
Chris@169
|
1009 if (shiftCount > 1) {
|
Chris@169
|
1010
|
Chris@169
|
1011 shift = m_pianoRollShifts[i][note];
|
Chris@169
|
1012
|
Chris@169
|
1013 if (i == partStart) {
|
Chris@169
|
1014 partShift = shift;
|
Chris@169
|
1015 }
|
Chris@169
|
1016
|
Chris@169
|
1017 if (i > partStart + partThreshold && shift != partShift) {
|
Chris@169
|
1018
|
Chris@169
|
1019 // cerr << "i = " << i << ", partStart = " << partStart << ", shift = " << shift << ", partShift = " << partShift << endl;
|
Chris@169
|
1020
|
Chris@169
|
1021 // pitch has changed, emit an intermediate note
|
Chris@252
|
1022 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1023 i,
|
Chris@252
|
1024 note,
|
Chris@252
|
1025 partShift,
|
Chris@252
|
1026 shiftCount,
|
Chris@252
|
1027 partVelocity));
|
Chris@169
|
1028 partStart = i;
|
Chris@169
|
1029 partShift = shift;
|
Chris@169
|
1030 partVelocity = 0;
|
Chris@169
|
1031 }
|
Chris@169
|
1032 }
|
Chris@169
|
1033
|
Chris@303
|
1034 int v;
|
Chris@303
|
1035 if (m_mode == LiveMode) {
|
Chris@316
|
1036 v = round(strength * 20);
|
Chris@303
|
1037 } else {
|
Chris@303
|
1038 v = round(strength * 2);
|
Chris@303
|
1039 }
|
Chris@169
|
1040 if (v > partVelocity) {
|
Chris@169
|
1041 partVelocity = v;
|
Chris@169
|
1042 }
|
Chris@169
|
1043 }
|
Chris@169
|
1044
|
Chris@169
|
1045 if (end >= partStart + partThreshold) {
|
Chris@252
|
1046 noteFeatures.push_back(makeNoteFeature(partStart,
|
Chris@252
|
1047 end,
|
Chris@252
|
1048 note,
|
Chris@252
|
1049 partShift,
|
Chris@252
|
1050 shiftCount,
|
Chris@252
|
1051 partVelocity));
|
Chris@169
|
1052 }
|
Chris@169
|
1053 }
|
Chris@252
|
1054
|
Chris@309
|
1055 RealTime
|
Chris@309
|
1056 Silvet::getColumnTimestamp(int column)
|
Chris@309
|
1057 {
|
Chris@309
|
1058 double columnDuration = 1.0 / m_colsPerSec;
|
Chris@309
|
1059 int postFilterLatency = int(m_postFilter[0]->getSize() / 2);
|
Chris@309
|
1060
|
Chris@309
|
1061 return m_startTime + RealTime::fromSeconds
|
Chris@309
|
1062 (columnDuration * (column - postFilterLatency) + 0.02);
|
Chris@309
|
1063 }
|
Chris@309
|
1064
|
Chris@252
|
1065 Silvet::Feature
|
Chris@252
|
1066 Silvet::makeNoteFeature(int start,
|
Chris@252
|
1067 int end,
|
Chris@252
|
1068 int note,
|
Chris@252
|
1069 int shift,
|
Chris@252
|
1070 int shiftCount,
|
Chris@252
|
1071 int velocity)
|
Chris@252
|
1072 {
|
Chris@252
|
1073 Feature f;
|
Chris@252
|
1074
|
Chris@252
|
1075 f.hasTimestamp = true;
|
Chris@309
|
1076 f.timestamp = getColumnTimestamp(start);
|
Chris@252
|
1077
|
Chris@252
|
1078 f.hasDuration = true;
|
Chris@309
|
1079 f.duration = getColumnTimestamp(end) - f.timestamp;
|
Chris@252
|
1080
|
Chris@252
|
1081 f.values.clear();
|
Chris@252
|
1082
|
Chris@252
|
1083 f.values.push_back
|
Chris@252
|
1084 (noteFrequency(note, shift, shiftCount));
|
Chris@252
|
1085
|
Chris@252
|
1086 float inputGain = getInputGainAt(f.timestamp);
|
Chris@252
|
1087 // cerr << "adjusting velocity from " << velocity << " to " << round(velocity/inputGain) << endl;
|
Chris@252
|
1088 velocity = round(velocity / inputGain);
|
Chris@252
|
1089 if (velocity > 127) velocity = 127;
|
Chris@252
|
1090 if (velocity < 1) velocity = 1;
|
Chris@252
|
1091 f.values.push_back(velocity);
|
Chris@252
|
1092
|
Chris@252
|
1093 f.label = noteName(note, shift, shiftCount);
|
Chris@252
|
1094
|
Chris@252
|
1095 return f;
|
Chris@252
|
1096 }
|
Chris@252
|
1097
|
Chris@252
|
1098 float
|
Chris@252
|
1099 Silvet::getInputGainAt(RealTime t)
|
Chris@252
|
1100 {
|
Chris@252
|
1101 map<RealTime, float>::const_iterator i = m_inputGains.lower_bound(t);
|
Chris@252
|
1102
|
Chris@252
|
1103 if (i == m_inputGains.end()) {
|
Chris@252
|
1104 if (i != m_inputGains.begin()) {
|
Chris@252
|
1105 --i;
|
Chris@252
|
1106 } else {
|
Chris@252
|
1107 return 1.f; // no data
|
Chris@252
|
1108 }
|
Chris@252
|
1109 }
|
Chris@252
|
1110
|
Chris@252
|
1111 // cerr << "gain at time " << t << " = " << i->second << endl;
|
Chris@252
|
1112
|
Chris@252
|
1113 return i->second;
|
Chris@252
|
1114 }
|
Chris@252
|
1115
|