matthiasm@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@9
|
2
|
matthiasm@0
|
3 /*
|
Chris@9
|
4 pYIN - A fundamental frequency estimator for monophonic audio
|
Chris@9
|
5 Centre for Digital Music, Queen Mary, University of London.
|
Chris@9
|
6
|
Chris@9
|
7 This program is free software; you can redistribute it and/or
|
Chris@9
|
8 modify it under the terms of the GNU General Public License as
|
Chris@9
|
9 published by the Free Software Foundation; either version 2 of the
|
Chris@9
|
10 License, or (at your option) any later version. See the file
|
Chris@9
|
11 COPYING included with this distribution for more information.
|
matthiasm@0
|
12 */
|
matthiasm@0
|
13
|
matthiasm@36
|
14 #include "PYinVamp.h"
|
matthiasm@0
|
15 #include "MonoNote.h"
|
mail@131
|
16 #include "MonoPitchHMM.h"
|
matthiasm@0
|
17
|
matthiasm@0
|
18 #include <vector>
|
matthiasm@0
|
19 #include <algorithm>
|
matthiasm@0
|
20
|
matthiasm@0
|
21 #include <cstdio>
|
matthiasm@0
|
22 #include <cmath>
|
matthiasm@0
|
23 #include <complex>
|
matthiasm@0
|
24
|
matthiasm@0
|
25 using std::string;
|
matthiasm@0
|
26 using std::vector;
|
matthiasm@0
|
27 using Vamp::RealTime;
|
matthiasm@0
|
28
|
matthiasm@0
|
29
|
matthiasm@36
|
30 PYinVamp::PYinVamp(float inputSampleRate) :
|
matthiasm@0
|
31 Plugin(inputSampleRate),
|
matthiasm@0
|
32 m_channels(0),
|
matthiasm@0
|
33 m_stepSize(256),
|
matthiasm@0
|
34 m_blockSize(2048),
|
matthiasm@0
|
35 m_fmin(40),
|
matthiasm@58
|
36 m_fmax(1600),
|
matthiasm@0
|
37 m_yin(2048, inputSampleRate, 0.0),
|
matthiasm@0
|
38 m_oF0Candidates(0),
|
matthiasm@0
|
39 m_oF0Probs(0),
|
matthiasm@0
|
40 m_oVoicedProb(0),
|
matthiasm@0
|
41 m_oCandidateSalience(0),
|
matthiasm@0
|
42 m_oSmoothedPitchTrack(0),
|
matthiasm@0
|
43 m_oNotes(0),
|
matthiasm@0
|
44 m_threshDistr(2.0f),
|
mail@132
|
45 m_fixedLag(1.0f),
|
matthiasm@6
|
46 m_outputUnvoiced(0.0f),
|
matthiasm@70
|
47 m_preciseTime(0.0f),
|
matthiasm@117
|
48 m_lowAmp(0.1f),
|
matthiasm@117
|
49 m_onsetSensitivity(0.7f),
|
matthiasm@117
|
50 m_pruneThresh(0.1f),
|
mail@132
|
51 m_pitchHmm(0),
|
matthiasm@0
|
52 m_pitchProb(0),
|
matthiasm@103
|
53 m_timestamp(0),
|
mail@133
|
54 m_level(0),
|
mail@133
|
55 m_pitchTrack(0)
|
matthiasm@0
|
56 {
|
matthiasm@0
|
57 }
|
matthiasm@0
|
58
|
matthiasm@36
|
59 PYinVamp::~PYinVamp()
|
matthiasm@0
|
60 {
|
matthiasm@0
|
61 }
|
matthiasm@0
|
62
|
matthiasm@0
|
63 string
|
matthiasm@36
|
64 PYinVamp::getIdentifier() const
|
matthiasm@0
|
65 {
|
matthiasm@1
|
66 return "pyin";
|
matthiasm@0
|
67 }
|
matthiasm@0
|
68
|
matthiasm@0
|
69 string
|
matthiasm@36
|
70 PYinVamp::getName() const
|
matthiasm@0
|
71 {
|
matthiasm@1
|
72 return "pYin";
|
matthiasm@0
|
73 }
|
matthiasm@0
|
74
|
matthiasm@0
|
75 string
|
matthiasm@36
|
76 PYinVamp::getDescription() const
|
matthiasm@0
|
77 {
|
matthiasm@0
|
78 return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
|
matthiasm@0
|
79 }
|
matthiasm@0
|
80
|
matthiasm@0
|
81 string
|
matthiasm@36
|
82 PYinVamp::getMaker() const
|
matthiasm@0
|
83 {
|
matthiasm@0
|
84 return "Matthias Mauch";
|
matthiasm@0
|
85 }
|
matthiasm@0
|
86
|
matthiasm@0
|
87 int
|
matthiasm@36
|
88 PYinVamp::getPluginVersion() const
|
matthiasm@0
|
89 {
|
matthiasm@0
|
90 // Increment this each time you release a version that behaves
|
matthiasm@0
|
91 // differently from the previous one
|
Chris@143
|
92 return 3;
|
matthiasm@0
|
93 }
|
matthiasm@0
|
94
|
matthiasm@0
|
95 string
|
matthiasm@36
|
96 PYinVamp::getCopyright() const
|
matthiasm@0
|
97 {
|
matthiasm@0
|
98 return "GPL";
|
matthiasm@0
|
99 }
|
matthiasm@0
|
100
|
matthiasm@36
|
101 PYinVamp::InputDomain
|
matthiasm@36
|
102 PYinVamp::getInputDomain() const
|
matthiasm@0
|
103 {
|
matthiasm@0
|
104 return TimeDomain;
|
matthiasm@0
|
105 }
|
matthiasm@0
|
106
|
matthiasm@0
|
107 size_t
|
matthiasm@36
|
108 PYinVamp::getPreferredBlockSize() const
|
matthiasm@0
|
109 {
|
matthiasm@0
|
110 return 2048;
|
matthiasm@0
|
111 }
|
matthiasm@0
|
112
|
matthiasm@0
|
113 size_t
|
matthiasm@36
|
114 PYinVamp::getPreferredStepSize() const
|
matthiasm@0
|
115 {
|
matthiasm@0
|
116 return 256;
|
matthiasm@0
|
117 }
|
matthiasm@0
|
118
|
matthiasm@0
|
119 size_t
|
matthiasm@36
|
120 PYinVamp::getMinChannelCount() const
|
matthiasm@0
|
121 {
|
matthiasm@0
|
122 return 1;
|
matthiasm@0
|
123 }
|
matthiasm@0
|
124
|
matthiasm@0
|
125 size_t
|
matthiasm@36
|
126 PYinVamp::getMaxChannelCount() const
|
matthiasm@0
|
127 {
|
matthiasm@0
|
128 return 1;
|
matthiasm@0
|
129 }
|
matthiasm@0
|
130
|
matthiasm@36
|
131 PYinVamp::ParameterList
|
matthiasm@36
|
132 PYinVamp::getParameterDescriptors() const
|
matthiasm@0
|
133 {
|
matthiasm@0
|
134 ParameterList list;
|
matthiasm@0
|
135
|
matthiasm@0
|
136 ParameterDescriptor d;
|
matthiasm@0
|
137
|
matthiasm@0
|
138 d.identifier = "threshdistr";
|
matthiasm@0
|
139 d.name = "Yin threshold distribution";
|
matthiasm@0
|
140 d.description = ".";
|
matthiasm@0
|
141 d.unit = "";
|
matthiasm@0
|
142 d.minValue = 0.0f;
|
matthiasm@0
|
143 d.maxValue = 7.0f;
|
matthiasm@0
|
144 d.defaultValue = 2.0f;
|
matthiasm@0
|
145 d.isQuantized = true;
|
matthiasm@0
|
146 d.quantizeStep = 1.0f;
|
matthiasm@0
|
147 d.valueNames.push_back("Uniform");
|
matthiasm@0
|
148 d.valueNames.push_back("Beta (mean 0.10)");
|
matthiasm@0
|
149 d.valueNames.push_back("Beta (mean 0.15)");
|
matthiasm@0
|
150 d.valueNames.push_back("Beta (mean 0.20)");
|
matthiasm@0
|
151 d.valueNames.push_back("Beta (mean 0.30)");
|
matthiasm@0
|
152 d.valueNames.push_back("Single Value 0.10");
|
matthiasm@0
|
153 d.valueNames.push_back("Single Value 0.15");
|
matthiasm@0
|
154 d.valueNames.push_back("Single Value 0.20");
|
matthiasm@0
|
155 list.push_back(d);
|
matthiasm@0
|
156
|
mail@130
|
157 d.valueNames.clear();
|
mail@130
|
158
|
mail@130
|
159 d.identifier = "fixedlag";
|
mail@130
|
160 d.name = "Fixed-lag smoothing";
|
mail@130
|
161 d.description = "Use fixed lag smoothing, not full Viterbi smoothing.";
|
mail@130
|
162 d.unit = "";
|
mail@130
|
163 d.minValue = 0.0f;
|
mail@130
|
164 d.maxValue = 1.0f;
|
Chris@148
|
165 d.defaultValue = 1.0f;
|
mail@130
|
166 d.isQuantized = true;
|
mail@130
|
167 d.quantizeStep = 1.0f;
|
mail@130
|
168 list.push_back(d);
|
mail@130
|
169
|
matthiasm@0
|
170 d.identifier = "outputunvoiced";
|
matthiasm@0
|
171 d.valueNames.clear();
|
matthiasm@0
|
172 d.name = "Output estimates classified as unvoiced?";
|
matthiasm@0
|
173 d.description = ".";
|
matthiasm@0
|
174 d.unit = "";
|
matthiasm@0
|
175 d.minValue = 0.0f;
|
matthiasm@0
|
176 d.maxValue = 2.0f;
|
matthiasm@6
|
177 d.defaultValue = 0.0f;
|
matthiasm@0
|
178 d.isQuantized = true;
|
matthiasm@0
|
179 d.quantizeStep = 1.0f;
|
matthiasm@0
|
180 d.valueNames.push_back("No");
|
matthiasm@0
|
181 d.valueNames.push_back("Yes");
|
matthiasm@0
|
182 d.valueNames.push_back("Yes, as negative frequencies");
|
matthiasm@0
|
183 list.push_back(d);
|
matthiasm@0
|
184
|
matthiasm@70
|
185 d.identifier = "precisetime";
|
matthiasm@70
|
186 d.valueNames.clear();
|
matthiasm@70
|
187 d.name = "Use non-standard precise YIN timing (slow).";
|
matthiasm@70
|
188 d.description = ".";
|
matthiasm@70
|
189 d.unit = "";
|
matthiasm@70
|
190 d.minValue = 0.0f;
|
matthiasm@70
|
191 d.maxValue = 1.0f;
|
matthiasm@70
|
192 d.defaultValue = 0.0f;
|
matthiasm@70
|
193 d.isQuantized = true;
|
matthiasm@70
|
194 d.quantizeStep = 1.0f;
|
matthiasm@70
|
195 list.push_back(d);
|
matthiasm@70
|
196
|
matthiasm@72
|
197 d.identifier = "lowampsuppression";
|
matthiasm@72
|
198 d.valueNames.clear();
|
matthiasm@72
|
199 d.name = "Suppress low amplitude pitch estimates.";
|
matthiasm@72
|
200 d.description = ".";
|
matthiasm@72
|
201 d.unit = "";
|
matthiasm@72
|
202 d.minValue = 0.0f;
|
matthiasm@72
|
203 d.maxValue = 1.0f;
|
matthiasm@73
|
204 d.defaultValue = 0.1f;
|
matthiasm@72
|
205 d.isQuantized = false;
|
matthiasm@72
|
206 list.push_back(d);
|
matthiasm@70
|
207
|
matthiasm@107
|
208 d.identifier = "onsetsensitivity";
|
matthiasm@107
|
209 d.valueNames.clear();
|
matthiasm@107
|
210 d.name = "Onset sensitivity";
|
matthiasm@107
|
211 d.description = "Adds additional note onsets when RMS increases.";
|
matthiasm@107
|
212 d.unit = "";
|
matthiasm@107
|
213 d.minValue = 0.0f;
|
matthiasm@107
|
214 d.maxValue = 1.0f;
|
matthiasm@117
|
215 d.defaultValue = 0.7f;
|
matthiasm@108
|
216 d.isQuantized = false;
|
matthiasm@108
|
217 list.push_back(d);
|
matthiasm@108
|
218
|
matthiasm@108
|
219 d.identifier = "prunethresh";
|
matthiasm@108
|
220 d.valueNames.clear();
|
matthiasm@108
|
221 d.name = "Duration pruning threshold.";
|
matthiasm@108
|
222 d.description = "Prune notes that are shorter than this value.";
|
matthiasm@108
|
223 d.unit = "";
|
matthiasm@108
|
224 d.minValue = 0.0f;
|
matthiasm@108
|
225 d.maxValue = 0.2f;
|
matthiasm@117
|
226 d.defaultValue = 0.1f;
|
matthiasm@107
|
227 d.isQuantized = false;
|
matthiasm@107
|
228 list.push_back(d);
|
matthiasm@107
|
229
|
matthiasm@0
|
230 return list;
|
matthiasm@0
|
231 }
|
matthiasm@0
|
232
|
matthiasm@0
|
233 float
|
matthiasm@36
|
234 PYinVamp::getParameter(string identifier) const
|
matthiasm@0
|
235 {
|
matthiasm@0
|
236 if (identifier == "threshdistr") {
|
matthiasm@0
|
237 return m_threshDistr;
|
matthiasm@0
|
238 }
|
mail@130
|
239 if (identifier == "fixedlag") {
|
mail@130
|
240 return m_fixedLag;
|
mail@130
|
241 }
|
matthiasm@0
|
242 if (identifier == "outputunvoiced") {
|
matthiasm@0
|
243 return m_outputUnvoiced;
|
matthiasm@0
|
244 }
|
matthiasm@70
|
245 if (identifier == "precisetime") {
|
matthiasm@70
|
246 return m_preciseTime;
|
matthiasm@70
|
247 }
|
matthiasm@72
|
248 if (identifier == "lowampsuppression") {
|
matthiasm@72
|
249 return m_lowAmp;
|
matthiasm@72
|
250 }
|
matthiasm@107
|
251 if (identifier == "onsetsensitivity") {
|
matthiasm@107
|
252 return m_onsetSensitivity;
|
matthiasm@107
|
253 }
|
matthiasm@108
|
254 if (identifier == "prunethresh") {
|
matthiasm@108
|
255 return m_pruneThresh;
|
matthiasm@108
|
256 }
|
matthiasm@0
|
257 return 0.f;
|
matthiasm@0
|
258 }
|
matthiasm@0
|
259
|
matthiasm@0
|
260 void
|
matthiasm@36
|
261 PYinVamp::setParameter(string identifier, float value)
|
matthiasm@0
|
262 {
|
matthiasm@0
|
263 if (identifier == "threshdistr")
|
matthiasm@0
|
264 {
|
matthiasm@0
|
265 m_threshDistr = value;
|
matthiasm@0
|
266 }
|
mail@130
|
267 if (identifier == "fixedlag")
|
mail@130
|
268 {
|
mail@130
|
269 m_fixedLag = value;
|
mail@130
|
270 }
|
matthiasm@0
|
271 if (identifier == "outputunvoiced")
|
matthiasm@0
|
272 {
|
matthiasm@0
|
273 m_outputUnvoiced = value;
|
matthiasm@0
|
274 }
|
matthiasm@70
|
275 if (identifier == "precisetime")
|
matthiasm@70
|
276 {
|
matthiasm@70
|
277 m_preciseTime = value;
|
matthiasm@70
|
278 }
|
matthiasm@72
|
279 if (identifier == "lowampsuppression")
|
matthiasm@72
|
280 {
|
matthiasm@72
|
281 m_lowAmp = value;
|
matthiasm@72
|
282 }
|
matthiasm@107
|
283 if (identifier == "onsetsensitivity")
|
matthiasm@107
|
284 {
|
matthiasm@107
|
285 m_onsetSensitivity = value;
|
matthiasm@107
|
286 }
|
matthiasm@108
|
287 if (identifier == "prunethresh")
|
matthiasm@108
|
288 {
|
matthiasm@108
|
289 m_pruneThresh = value;
|
matthiasm@108
|
290 }
|
matthiasm@0
|
291 }
|
matthiasm@0
|
292
|
matthiasm@36
|
293 PYinVamp::ProgramList
|
matthiasm@36
|
294 PYinVamp::getPrograms() const
|
matthiasm@0
|
295 {
|
matthiasm@0
|
296 ProgramList list;
|
matthiasm@0
|
297 return list;
|
matthiasm@0
|
298 }
|
matthiasm@0
|
299
|
matthiasm@0
|
300 string
|
matthiasm@36
|
301 PYinVamp::getCurrentProgram() const
|
matthiasm@0
|
302 {
|
matthiasm@0
|
303 return ""; // no programs
|
matthiasm@0
|
304 }
|
matthiasm@0
|
305
|
matthiasm@0
|
306 void
|
Chris@138
|
307 PYinVamp::selectProgram(string)
|
matthiasm@0
|
308 {
|
matthiasm@0
|
309 }
|
matthiasm@0
|
310
|
matthiasm@36
|
311 PYinVamp::OutputList
|
matthiasm@36
|
312 PYinVamp::getOutputDescriptors() const
|
matthiasm@0
|
313 {
|
matthiasm@0
|
314 OutputList outputs;
|
matthiasm@0
|
315
|
matthiasm@0
|
316 OutputDescriptor d;
|
matthiasm@0
|
317
|
matthiasm@0
|
318 int outputNumber = 0;
|
matthiasm@0
|
319
|
matthiasm@0
|
320 d.identifier = "f0candidates";
|
matthiasm@0
|
321 d.name = "F0 Candidates";
|
matthiasm@0
|
322 d.description = "Estimated fundamental frequency candidates.";
|
matthiasm@0
|
323 d.unit = "Hz";
|
matthiasm@0
|
324 d.hasFixedBinCount = false;
|
matthiasm@0
|
325 d.hasKnownExtents = true;
|
matthiasm@0
|
326 d.minValue = m_fmin;
|
matthiasm@0
|
327 d.maxValue = 500;
|
matthiasm@0
|
328 d.isQuantized = false;
|
matthiasm@0
|
329 d.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
330 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
331 d.hasDuration = false;
|
matthiasm@0
|
332 outputs.push_back(d);
|
matthiasm@0
|
333 m_oF0Candidates = outputNumber++;
|
matthiasm@0
|
334
|
matthiasm@0
|
335 d.identifier = "f0probs";
|
matthiasm@0
|
336 d.name = "Candidate Probabilities";
|
Chris@146
|
337 d.description = "Probabilities of estimated fundamental frequency candidates.";
|
matthiasm@0
|
338 d.unit = "";
|
matthiasm@0
|
339 d.hasFixedBinCount = false;
|
matthiasm@0
|
340 d.hasKnownExtents = true;
|
matthiasm@0
|
341 d.minValue = 0;
|
matthiasm@0
|
342 d.maxValue = 1;
|
matthiasm@0
|
343 d.isQuantized = false;
|
matthiasm@0
|
344 d.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
345 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
346 d.hasDuration = false;
|
matthiasm@0
|
347 outputs.push_back(d);
|
matthiasm@0
|
348 m_oF0Probs = outputNumber++;
|
matthiasm@0
|
349
|
matthiasm@0
|
350 d.identifier = "voicedprob";
|
matthiasm@0
|
351 d.name = "Voiced Probability";
|
matthiasm@0
|
352 d.description = "Probability that the signal is voiced according to Probabilistic Yin.";
|
matthiasm@0
|
353 d.unit = "";
|
matthiasm@0
|
354 d.hasFixedBinCount = true;
|
matthiasm@0
|
355 d.binCount = 1;
|
matthiasm@0
|
356 d.hasKnownExtents = true;
|
matthiasm@0
|
357 d.minValue = 0;
|
matthiasm@0
|
358 d.maxValue = 1;
|
matthiasm@0
|
359 d.isQuantized = false;
|
matthiasm@0
|
360 d.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
361 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
362 d.hasDuration = false;
|
matthiasm@0
|
363 outputs.push_back(d);
|
matthiasm@0
|
364 m_oVoicedProb = outputNumber++;
|
matthiasm@0
|
365
|
matthiasm@0
|
366 d.identifier = "candidatesalience";
|
matthiasm@0
|
367 d.name = "Candidate Salience";
|
matthiasm@0
|
368 d.description = "Candidate Salience";
|
matthiasm@0
|
369 d.hasFixedBinCount = true;
|
matthiasm@0
|
370 d.binCount = m_blockSize / 2;
|
matthiasm@0
|
371 d.hasKnownExtents = true;
|
matthiasm@0
|
372 d.minValue = 0;
|
matthiasm@0
|
373 d.maxValue = 1;
|
matthiasm@0
|
374 d.isQuantized = false;
|
matthiasm@0
|
375 d.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
376 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
377 d.hasDuration = false;
|
matthiasm@0
|
378 outputs.push_back(d);
|
matthiasm@0
|
379 m_oCandidateSalience = outputNumber++;
|
matthiasm@0
|
380
|
matthiasm@0
|
381 d.identifier = "smoothedpitchtrack";
|
matthiasm@0
|
382 d.name = "Smoothed Pitch Track";
|
Chris@146
|
383 d.description = "Frame-by-frame pitch estimate after smoothing";
|
matthiasm@0
|
384 d.unit = "Hz";
|
matthiasm@0
|
385 d.hasFixedBinCount = true;
|
matthiasm@0
|
386 d.binCount = 1;
|
matthiasm@0
|
387 d.hasKnownExtents = false;
|
matthiasm@0
|
388 d.isQuantized = false;
|
matthiasm@0
|
389 d.sampleType = OutputDescriptor::FixedSampleRate;
|
matthiasm@0
|
390 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
391 d.hasDuration = false;
|
matthiasm@0
|
392 outputs.push_back(d);
|
matthiasm@0
|
393 m_oSmoothedPitchTrack = outputNumber++;
|
matthiasm@0
|
394
|
matthiasm@0
|
395 d.identifier = "notes";
|
matthiasm@0
|
396 d.name = "Notes";
|
matthiasm@0
|
397 d.description = "Derived fixed-pitch note frequencies";
|
matthiasm@0
|
398 d.unit = "Hz";
|
matthiasm@0
|
399 d.hasFixedBinCount = true;
|
matthiasm@0
|
400 d.binCount = 1;
|
matthiasm@0
|
401 d.hasKnownExtents = false;
|
matthiasm@0
|
402 d.isQuantized = false;
|
matthiasm@0
|
403 d.sampleType = OutputDescriptor::VariableSampleRate;
|
matthiasm@0
|
404 d.sampleRate = (m_inputSampleRate / m_stepSize);
|
matthiasm@0
|
405 d.hasDuration = true;
|
matthiasm@0
|
406 outputs.push_back(d);
|
matthiasm@0
|
407 m_oNotes = outputNumber++;
|
matthiasm@0
|
408
|
matthiasm@0
|
409 return outputs;
|
matthiasm@0
|
410 }
|
matthiasm@0
|
411
|
matthiasm@0
|
412 bool
|
matthiasm@36
|
413 PYinVamp::initialise(size_t channels, size_t stepSize, size_t blockSize)
|
matthiasm@0
|
414 {
|
matthiasm@0
|
415 if (channels < getMinChannelCount() ||
|
matthiasm@0
|
416 channels > getMaxChannelCount()) return false;
|
matthiasm@0
|
417
|
matthiasm@0
|
418 m_channels = channels;
|
matthiasm@0
|
419 m_stepSize = stepSize;
|
matthiasm@0
|
420 m_blockSize = blockSize;
|
matthiasm@0
|
421
|
matthiasm@0
|
422 reset();
|
matthiasm@0
|
423
|
matthiasm@0
|
424 return true;
|
matthiasm@0
|
425 }
|
matthiasm@0
|
426
|
matthiasm@0
|
427 void
|
matthiasm@36
|
428 PYinVamp::reset()
|
matthiasm@0
|
429 {
|
matthiasm@0
|
430 m_yin.setThresholdDistr(m_threshDistr);
|
matthiasm@0
|
431 m_yin.setFrameSize(m_blockSize);
|
matthiasm@117
|
432 m_yin.setFast(!m_preciseTime);
|
mail@132
|
433
|
Chris@150
|
434 if (m_fixedLag > 0.5f) m_pitchHmm = MonoPitchHMM(100);
|
mail@132
|
435 else m_pitchHmm = MonoPitchHMM(0);
|
matthiasm@0
|
436
|
matthiasm@0
|
437 m_pitchProb.clear();
|
matthiasm@0
|
438 m_timestamp.clear();
|
matthiasm@103
|
439 m_level.clear();
|
mail@133
|
440 m_pitchTrack.clear();
|
matthiasm@0
|
441 }
|
matthiasm@0
|
442
|
matthiasm@36
|
443 PYinVamp::FeatureSet
|
matthiasm@36
|
444 PYinVamp::process(const float *const *inputBuffers, RealTime timestamp)
|
matthiasm@0
|
445 {
|
matthiasm@77
|
446 int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
|
mail@133
|
447 timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset,
|
mail@133
|
448 lrintf(m_inputSampleRate));
|
matthiasm@77
|
449
|
matthiasm@0
|
450 FeatureSet fs;
|
matthiasm@0
|
451
|
matthiasm@46
|
452 float rms = 0;
|
matthiasm@46
|
453
|
matthiasm@0
|
454 double *dInputBuffers = new double[m_blockSize];
|
matthiasm@46
|
455 for (size_t i = 0; i < m_blockSize; ++i) {
|
matthiasm@46
|
456 dInputBuffers[i] = inputBuffers[0][i];
|
matthiasm@46
|
457 rms += inputBuffers[0][i] * inputBuffers[0][i];
|
matthiasm@46
|
458 }
|
matthiasm@46
|
459 rms /= m_blockSize;
|
matthiasm@46
|
460 rms = sqrt(rms);
|
matthiasm@116
|
461
|
matthiasm@72
|
462 bool isLowAmplitude = (rms < m_lowAmp);
|
matthiasm@0
|
463
|
matthiasm@0
|
464 Yin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
|
matthiasm@27
|
465 delete [] dInputBuffers;
|
matthiasm@27
|
466
|
matthiasm@103
|
467 m_level.push_back(yo.rms);
|
matthiasm@103
|
468
|
matthiasm@27
|
469 vector<pair<double, double> > tempPitchProb;
|
matthiasm@27
|
470 for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
|
matthiasm@27
|
471 {
|
matthiasm@27
|
472 double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
|
matthiasm@50
|
473 if (!isLowAmplitude)
|
matthiasm@116
|
474 {
|
matthiasm@46
|
475 tempPitchProb.push_back(pair<double, double>
|
matthiasm@46
|
476 (tempPitch, yo.freqProb[iCandidate].second));
|
matthiasm@116
|
477 } else {
|
matthiasm@116
|
478 float factor = ((rms+0.01*m_lowAmp)/(1.01*m_lowAmp));
|
matthiasm@46
|
479 tempPitchProb.push_back(pair<double, double>
|
matthiasm@65
|
480 (tempPitch, yo.freqProb[iCandidate].second*factor));
|
matthiasm@65
|
481 }
|
matthiasm@27
|
482 }
|
mail@130
|
483
|
mail@132
|
484 vector<double> tempObsProb = m_pitchHmm.calculateObsProb(tempPitchProb);
|
mail@132
|
485 if (m_timestamp.empty())
|
mail@130
|
486 {
|
mail@132
|
487 m_pitchHmm.initialise(tempObsProb);
|
mail@132
|
488 } else {
|
mail@132
|
489 m_pitchHmm.process(tempObsProb);
|
mail@132
|
490 }
|
mail@132
|
491
|
matthiasm@27
|
492 m_pitchProb.push_back(tempPitchProb);
|
matthiasm@27
|
493 m_timestamp.push_back(timestamp);
|
matthiasm@27
|
494
|
mail@132
|
495 int lag = m_pitchHmm.m_fixedLag;
|
mail@132
|
496
|
Chris@150
|
497 if (m_fixedLag > 0.5f) // do fixed-lag smoothing instead of full Viterbi
|
mail@132
|
498 {
|
Chris@141
|
499 if (int(m_timestamp.size()) == lag + 1)
|
mail@131
|
500 {
|
mail@132
|
501 m_timestamp.pop_front();
|
mail@132
|
502 m_pitchProb.pop_front();
|
mail@132
|
503
|
mail@132
|
504 Feature f;
|
mail@132
|
505 f.hasTimestamp = true;
|
mail@132
|
506 vector<int> rawPitchPath = m_pitchHmm.track();
|
mail@132
|
507 float freq = m_pitchHmm.nearestFreq(rawPitchPath[0],
|
mail@132
|
508 m_pitchProb[0]);
|
mail@133
|
509 m_pitchTrack.push_back(freq);
|
mail@132
|
510 f.timestamp = m_timestamp[0];
|
mail@132
|
511 f.values.clear();
|
mail@132
|
512
|
mail@132
|
513 // different output modes
|
mail@132
|
514 if (freq < 0 && (m_outputUnvoiced==0))
|
mail@132
|
515 {
|
mail@132
|
516
|
mail@132
|
517 } else {
|
mail@132
|
518 if (m_outputUnvoiced == 1)
|
mail@132
|
519 {
|
mail@132
|
520 f.values.push_back(fabs(freq));
|
mail@132
|
521 } else {
|
mail@132
|
522 f.values.push_back(freq);
|
mail@132
|
523 }
|
mail@132
|
524 fs[m_oSmoothedPitchTrack].push_back(f);
|
mail@132
|
525 }
|
mail@131
|
526 }
|
mail@130
|
527 }
|
mail@132
|
528
|
matthiasm@27
|
529 // F0 CANDIDATES
|
matthiasm@0
|
530 Feature f;
|
matthiasm@0
|
531 f.hasTimestamp = true;
|
matthiasm@0
|
532 f.timestamp = timestamp;
|
matthiasm@0
|
533 for (size_t i = 0; i < yo.freqProb.size(); ++i)
|
matthiasm@0
|
534 {
|
matthiasm@0
|
535 f.values.push_back(yo.freqProb[i].first);
|
matthiasm@0
|
536 }
|
matthiasm@0
|
537 fs[m_oF0Candidates].push_back(f);
|
matthiasm@0
|
538
|
matthiasm@27
|
539 // VOICEDPROB
|
matthiasm@0
|
540 f.values.clear();
|
matthiasm@0
|
541 float voicedProb = 0;
|
matthiasm@0
|
542 for (size_t i = 0; i < yo.freqProb.size(); ++i)
|
matthiasm@0
|
543 {
|
matthiasm@0
|
544 f.values.push_back(yo.freqProb[i].second);
|
matthiasm@0
|
545 voicedProb += yo.freqProb[i].second;
|
matthiasm@0
|
546 }
|
matthiasm@0
|
547 fs[m_oF0Probs].push_back(f);
|
matthiasm@0
|
548
|
mail@128
|
549 f.values.clear();
|
matthiasm@0
|
550 f.values.push_back(voicedProb);
|
matthiasm@0
|
551 fs[m_oVoicedProb].push_back(f);
|
matthiasm@0
|
552
|
matthiasm@27
|
553 // SALIENCE -- maybe this should eventually disappear
|
matthiasm@0
|
554 f.values.clear();
|
matthiasm@0
|
555 float salienceSum = 0;
|
matthiasm@0
|
556 for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
|
matthiasm@0
|
557 {
|
matthiasm@0
|
558 f.values.push_back(yo.salience[iBin]);
|
matthiasm@0
|
559 salienceSum += yo.salience[iBin];
|
matthiasm@0
|
560 }
|
matthiasm@0
|
561 fs[m_oCandidateSalience].push_back(f);
|
matthiasm@0
|
562
|
matthiasm@0
|
563 return fs;
|
matthiasm@0
|
564 }
|
matthiasm@0
|
565
|
matthiasm@36
|
566 PYinVamp::FeatureSet
|
matthiasm@36
|
567 PYinVamp::getRemainingFeatures()
|
matthiasm@0
|
568 {
|
matthiasm@0
|
569 FeatureSet fs;
|
Chris@146
|
570
|
Chris@4
|
571 if (m_pitchProb.empty()) {
|
Chris@4
|
572 return fs;
|
Chris@4
|
573 }
|
Chris@4
|
574
|
Chris@146
|
575 Feature f;
|
Chris@146
|
576 f.hasTimestamp = true;
|
Chris@146
|
577 f.hasDuration = false;
|
Chris@146
|
578
|
mail@131
|
579 // ================== P I T C H T R A C K =================================
|
mail@131
|
580
|
Chris@146
|
581 // NB we do this even in fixed-lag mode, as we still have the last
|
Chris@146
|
582 // lag's-worth of pitch probs to consume
|
Chris@146
|
583
|
mail@132
|
584 vector<int> rawPitchPath = m_pitchHmm.track();
|
mail@131
|
585
|
mail@131
|
586 for (size_t iFrame = 0; iFrame < rawPitchPath.size(); ++iFrame)
|
matthiasm@0
|
587 {
|
mail@132
|
588 float freq = m_pitchHmm.nearestFreq(rawPitchPath[iFrame],
|
mail@132
|
589 m_pitchProb[iFrame]);
|
mail@133
|
590 m_pitchTrack.push_back(freq); // for note processing below
|
Chris@146
|
591
|
matthiasm@0
|
592 f.timestamp = m_timestamp[iFrame];
|
matthiasm@0
|
593 f.values.clear();
|
Chris@146
|
594
|
mail@131
|
595 // different output modes
|
mail@131
|
596 if (freq < 0 && (m_outputUnvoiced==0)) continue;
|
matthiasm@0
|
597 if (m_outputUnvoiced == 1)
|
matthiasm@0
|
598 {
|
mail@131
|
599 f.values.push_back(fabs(freq));
|
matthiasm@0
|
600 } else {
|
mail@131
|
601 f.values.push_back(freq);
|
matthiasm@0
|
602 }
|
matthiasm@0
|
603 fs[m_oSmoothedPitchTrack].push_back(f);
|
matthiasm@0
|
604 }
|
Chris@146
|
605
|
Chris@146
|
606 addNoteFeatures(fs);
|
Chris@146
|
607
|
Chris@146
|
608 return fs;
|
Chris@146
|
609 }
|
Chris@146
|
610
|
Chris@146
|
611 void
|
Chris@146
|
612 PYinVamp::addNoteFeatures(FeatureSet &fs)
|
Chris@146
|
613 {
|
matthiasm@1
|
614 std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
|
mail@133
|
615 for (size_t iFrame = 0; iFrame < m_pitchTrack.size(); ++iFrame) {
|
matthiasm@1
|
616 std::vector<std::pair<double, double> > temp;
|
mail@133
|
617 if (m_pitchTrack[iFrame] > 0)
|
matthiasm@1
|
618 {
|
mail@133
|
619 double tempPitch = 12 *
|
mail@133
|
620 std::log(m_pitchTrack[iFrame]/440)/std::log(2.) + 69;
|
matthiasm@1
|
621 temp.push_back(std::pair<double,double>(tempPitch, .9));
|
matthiasm@1
|
622 }
|
matthiasm@1
|
623 smoothedPitch.push_back(temp);
|
matthiasm@1
|
624 }
|
mail@133
|
625
|
Chris@150
|
626 // In fixed-lag mode, we use fixed-lag processing for the note
|
Chris@150
|
627 // transitions here as well as for the pitch transitions in
|
Chris@150
|
628 // process. The main reason we provide the fixed-lag option is so
|
Chris@150
|
629 // that we can get pitch results incrementally from process; we
|
Chris@150
|
630 // don't get that outcome here, but we do benefit from its bounded
|
Chris@150
|
631 // memory usage, which can be quite a big deal. So if the caller
|
Chris@150
|
632 // asked for it there, we use it here too. (It is a bit slower,
|
Chris@150
|
633 // but not much.)
|
Chris@150
|
634
|
Chris@150
|
635 MonoNote mn(m_fixedLag > 0.5f);
|
matthiasm@1
|
636 vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);
|
Chris@146
|
637
|
mail@133
|
638 std::cerr << "mnOut size: " << mnOut.size() << std::endl;
|
mail@133
|
639 std::cerr << "m_pitchTrack size: " << m_pitchTrack.size() << std::endl;
|
matthiasm@1
|
640
|
matthiasm@6
|
641 // turning feature into a note feature
|
Chris@146
|
642 Feature f;
|
matthiasm@1
|
643 f.hasTimestamp = true;
|
matthiasm@1
|
644 f.hasDuration = true;
|
matthiasm@1
|
645 f.values.clear();
|
matthiasm@6
|
646
|
matthiasm@6
|
647 int onsetFrame = 0;
|
matthiasm@6
|
648 bool isVoiced = 0;
|
matthiasm@6
|
649 bool oldIsVoiced = 0;
|
mail@133
|
650 size_t nFrame = m_pitchTrack.size();
|
matthiasm@108
|
651
|
matthiasm@108
|
652 float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;
|
matthiasm@1
|
653
|
mail@133
|
654 // the body of the loop below should be in a function/method
|
mail@133
|
655 // but what does it actually do??
|
mail@133
|
656 // * takes the result of the note tracking HMM
|
mail@133
|
657 // * collects contiguously pitched pitches
|
mail@133
|
658 // * writes a note once it notices the voiced segment has ended
|
mail@133
|
659 // complications:
|
mail@133
|
660 // * it needs a lookahead of two frames for m_level (wtf was I thinking)
|
mail@133
|
661 // * it needs to know the timestamp (which can be guessed from the frame no)
|
mail@133
|
662 // *
|
mail@133
|
663 int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
|
mail@133
|
664 RealTime timestampOffset = Vamp::RealTime::frame2RealTime(offset,
|
mail@133
|
665 lrintf(m_inputSampleRate));
|
mail@133
|
666
|
mail@133
|
667 std::vector<float> notePitchTrack; // collects pitches for 1 note at a time
|
matthiasm@6
|
668 for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
|
matthiasm@1
|
669 {
|
mail@133
|
670 isVoiced = mnOut[iFrame].noteState < 3
|
mail@133
|
671 && smoothedPitch[iFrame].size() > 0
|
mail@133
|
672 && (iFrame >= nFrame-2
|
mail@133
|
673 || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
|
matthiasm@6
|
674 if (isVoiced && iFrame != nFrame-1)
|
matthiasm@1
|
675 {
|
matthiasm@6
|
676 if (oldIsVoiced == 0) // beginning of a note
|
matthiasm@1
|
677 {
|
matthiasm@6
|
678 onsetFrame = iFrame;
|
matthiasm@1
|
679 }
|
matthiasm@6
|
680 float pitch = smoothedPitch[iFrame][0].first;
|
matthiasm@6
|
681 notePitchTrack.push_back(pitch); // add to the note's pitch track
|
matthiasm@6
|
682 } else { // not currently voiced
|
matthiasm@108
|
683 if (oldIsVoiced == 1) // end of note
|
matthiasm@6
|
684 {
|
matthiasm@108
|
685 if (notePitchTrack.size() >= minNoteFrames)
|
matthiasm@108
|
686 {
|
matthiasm@108
|
687 std::sort(notePitchTrack.begin(), notePitchTrack.end());
|
matthiasm@108
|
688 float medianPitch = notePitchTrack[notePitchTrack.size()/2];
|
mail@133
|
689 float medianFreq =
|
mail@133
|
690 std::pow(2,(medianPitch - 69) / 12) * 440;
|
matthiasm@108
|
691 f.values.clear();
|
matthiasm@108
|
692 f.values.push_back(medianFreq);
|
mail@133
|
693 RealTime start = RealTime::frame2RealTime(
|
mail@133
|
694 onsetFrame * m_stepSize, lrintf(m_inputSampleRate)) +
|
mail@133
|
695 timestampOffset;
|
mail@133
|
696 RealTime end = RealTime::frame2RealTime(
|
mail@133
|
697 iFrame * m_stepSize, lrintf(m_inputSampleRate)) +
|
mail@133
|
698 timestampOffset;
|
mail@133
|
699 f.timestamp = start;
|
mail@133
|
700 f.duration = end - start;
|
matthiasm@108
|
701 fs[m_oNotes].push_back(f);
|
matthiasm@108
|
702 }
|
matthiasm@108
|
703 notePitchTrack.clear();
|
matthiasm@1
|
704 }
|
matthiasm@1
|
705 }
|
matthiasm@6
|
706 oldIsVoiced = isVoiced;
|
matthiasm@1
|
707 }
|
matthiasm@0
|
708 }
|