Mercurial > hg > tuning-difference
comparison chroma-compare-plugin/TuningDifference.cpp @ 13:c74071731d74
The bulk of the chroma-compare implementation. Should be using the newly refactored Chromagram class from libcq though (chroma is currently upside-down!)
author | Chris Cannam |
---|---|
date | Wed, 04 Feb 2015 15:10:07 +0000 |
parents | 23572f9d25d9 |
children | 812e4d021443 |
comparison
equal
deleted
inserted
replaced
12:23572f9d25d9 | 13:c74071731d74 |
---|---|
4 #include <iostream> | 4 #include <iostream> |
5 | 5 |
6 #include <cmath> | 6 #include <cmath> |
7 #include <cstdio> | 7 #include <cstdio> |
8 | 8 |
9 using std::cerr; | 9 #include <algorithm> |
10 using std::endl; | 10 |
11 | 11 using namespace std; |
12 static double targetFmin = 60.0; | 12 |
13 static double targetFmax = 1500.0; | 13 static double pitchToFrequency(int pitch, |
14 double centsOffset = 0., | |
15 double concertA = 440.) | |
16 { | |
17 double p = double(pitch) + (centsOffset / 100.); | |
18 return concertA * pow(2.0, (p - 69.0) / 12.0); | |
19 } | |
20 | |
21 static double frequencyForCentsAbove440(double cents) | |
22 { | |
23 return pitchToFrequency(69, cents, 440.); | |
24 } | |
14 | 25 |
15 TuningDifference::TuningDifference(float inputSampleRate) : | 26 TuningDifference::TuningDifference(float inputSampleRate) : |
16 Plugin(inputSampleRate) | 27 Plugin(inputSampleRate), |
28 m_bpo(60), | |
29 m_refCQ(new CQSpectrogram(paramsForTuningFrequency(440.), | |
30 CQSpectrogram::InterpolateHold)) | |
17 { | 31 { |
18 } | 32 } |
19 | 33 |
20 TuningDifference::~TuningDifference() | 34 TuningDifference::~TuningDifference() |
21 { | 35 { |
66 } | 80 } |
67 | 81 |
68 TuningDifference::InputDomain | 82 TuningDifference::InputDomain |
69 TuningDifference::getInputDomain() const | 83 TuningDifference::getInputDomain() const |
70 { | 84 { |
71 return FrequencyDomain; | 85 return TimeDomain; |
72 } | 86 } |
73 | 87 |
74 size_t | 88 size_t |
75 TuningDifference::getPreferredBlockSize() const | 89 TuningDifference::getPreferredBlockSize() const |
76 { | 90 { |
77 return 16384; | 91 return 0; |
78 } | 92 } |
79 | 93 |
80 size_t | 94 size_t |
81 TuningDifference::getPreferredStepSize() const | 95 TuningDifference::getPreferredStepSize() const |
82 { | 96 { |
145 d.binCount = 1; | 159 d.binCount = 1; |
146 d.hasKnownExtents = false; | 160 d.hasKnownExtents = false; |
147 d.isQuantized = false; | 161 d.isQuantized = false; |
148 d.sampleType = OutputDescriptor::VariableSampleRate; | 162 d.sampleType = OutputDescriptor::VariableSampleRate; |
149 d.hasDuration = false; | 163 d.hasDuration = false; |
164 m_outputs[d.identifier] = list.size(); | |
150 list.push_back(d); | 165 list.push_back(d); |
151 | 166 |
152 d.identifier = "tuningfreq"; | 167 d.identifier = "tuningfreq"; |
153 d.name = "Relative Tuning Frequency"; | 168 d.name = "Relative Tuning Frequency"; |
154 d.description = "Tuning frequency of channel 2, if channel 1 is assumed to contain the same music as it at a tuning frequency of A=440Hz."; | 169 d.description = "Tuning frequency of channel 2, if channel 1 is assumed to contain the same music as it at a tuning frequency of A=440Hz."; |
157 d.binCount = 1; | 172 d.binCount = 1; |
158 d.hasKnownExtents = false; | 173 d.hasKnownExtents = false; |
159 d.isQuantized = false; | 174 d.isQuantized = false; |
160 d.sampleType = OutputDescriptor::VariableSampleRate; | 175 d.sampleType = OutputDescriptor::VariableSampleRate; |
161 d.hasDuration = false; | 176 d.hasDuration = false; |
177 m_outputs[d.identifier] = list.size(); | |
162 list.push_back(d); | 178 list.push_back(d); |
163 | 179 |
164 d.identifier = "curve"; | 180 d.identifier = "reffeature"; |
165 d.name = "Shift Correlation Curve"; | 181 d.name = "Reference Feature"; |
166 d.description = "Correlation between shifted and unshifted sources, for each probed shift in cents."; | 182 d.description = "Chroma feature from reference audio."; |
167 d.unit = ""; | 183 d.unit = ""; |
168 d.hasFixedBinCount = true; | 184 d.hasFixedBinCount = true; |
169 d.binCount = 1; | 185 d.binCount = m_bpo; |
170 d.hasKnownExtents = false; | |
171 d.isQuantized = false; | |
172 d.sampleType = OutputDescriptor::FixedSampleRate; | |
173 d.sampleRate = 100; | |
174 d.hasDuration = false; | |
175 list.push_back(d); | |
176 | |
177 int targetBinMin = int(floor(targetFmin * m_blockSize / m_inputSampleRate)); | |
178 int targetBinMax = int(ceil(targetFmax * m_blockSize / m_inputSampleRate)); | |
179 cerr << "target bin range: " << targetBinMin << " -> " << targetBinMax << endl; | |
180 | |
181 d.identifier = "averages"; | |
182 d.name = "Spectrum averages"; | |
183 d.description = "Average magnitude spectrum for each channel."; | |
184 d.unit = ""; | |
185 d.hasFixedBinCount = true; | |
186 d.binCount = (targetBinMax > targetBinMin ? targetBinMax - targetBinMin + 1 : 100); | |
187 d.hasKnownExtents = false; | 186 d.hasKnownExtents = false; |
188 d.isQuantized = false; | 187 d.isQuantized = false; |
189 d.sampleType = OutputDescriptor::FixedSampleRate; | 188 d.sampleType = OutputDescriptor::FixedSampleRate; |
190 d.sampleRate = 1; | 189 d.sampleRate = 1; |
191 d.hasDuration = false; | 190 d.hasDuration = false; |
191 m_outputs[d.identifier] = list.size(); | |
192 list.push_back(d); | |
193 | |
194 d.identifier = "otherfeature"; | |
195 d.name = "Other Feature"; | |
196 d.description = "Chroma feature from other audio, before rotation."; | |
197 d.unit = ""; | |
198 d.hasFixedBinCount = true; | |
199 d.binCount = m_bpo; | |
200 d.hasKnownExtents = false; | |
201 d.isQuantized = false; | |
202 d.sampleType = OutputDescriptor::FixedSampleRate; | |
203 d.sampleRate = 1; | |
204 d.hasDuration = false; | |
205 m_outputs[d.identifier] = list.size(); | |
206 list.push_back(d); | |
207 | |
208 d.identifier = "rotfeature"; | |
209 d.name = "Other Feature at Rotated Frequency"; | |
210 d.description = "Chroma feature from reference audio calculated with the tuning frequency obtained from rotation matching."; | |
211 d.unit = ""; | |
212 d.hasFixedBinCount = true; | |
213 d.binCount = m_bpo; | |
214 d.hasKnownExtents = false; | |
215 d.isQuantized = false; | |
216 d.sampleType = OutputDescriptor::FixedSampleRate; | |
217 d.sampleRate = 1; | |
218 d.hasDuration = false; | |
219 m_outputs[d.identifier] = list.size(); | |
192 list.push_back(d); | 220 list.push_back(d); |
193 | 221 |
194 return list; | 222 return list; |
195 } | 223 } |
196 | 224 |
198 TuningDifference::initialise(size_t channels, size_t stepSize, size_t blockSize) | 226 TuningDifference::initialise(size_t channels, size_t stepSize, size_t blockSize) |
199 { | 227 { |
200 if (channels < getMinChannelCount() || | 228 if (channels < getMinChannelCount() || |
201 channels > getMaxChannelCount()) return false; | 229 channels > getMaxChannelCount()) return false; |
202 | 230 |
203 if (blockSize != getPreferredBlockSize() || | 231 if (stepSize != blockSize) return false; |
204 stepSize != blockSize/2) return false; | |
205 | 232 |
206 m_blockSize = blockSize; | 233 m_blockSize = blockSize; |
207 | 234 |
208 reset(); | 235 reset(); |
209 | 236 |
211 } | 238 } |
212 | 239 |
213 void | 240 void |
214 TuningDifference::reset() | 241 TuningDifference::reset() |
215 { | 242 { |
216 m_sum[0].clear(); | 243 if (m_frameCount > 0) { |
217 m_sum[1].clear(); | 244 m_refCQ.reset(new CQSpectrogram(paramsForTuningFrequency(440.), |
218 m_frameCount = 0; | 245 CQSpectrogram::InterpolateHold)); |
246 m_frameCount = 0; | |
247 } | |
248 m_refTotals = Chroma(m_refCQ->getTotalBins(), 0.0); | |
249 m_other.clear(); | |
250 } | |
251 | |
252 template<typename T> | |
253 void addTo(vector<T> &a, const vector<T> &b) | |
254 { | |
255 transform(a.begin(), a.end(), b.begin(), a.begin(), plus<T>()); | |
256 } | |
257 | |
258 template<typename T> | |
259 T distance(const vector<T> &a, const vector<T> &b) | |
260 { | |
261 return inner_product(a.begin(), a.end(), b.begin(), T(), | |
262 plus<T>(), [](T x, T y) { return fabs(x - y); }); | |
263 } | |
264 | |
265 TuningDifference::TFeature | |
266 TuningDifference::computeFeatureFromTotals(const Chroma &totals) const | |
267 { | |
268 TFeature feature(m_bpo); | |
269 double sum = 0.0; | |
270 | |
271 for (int i = 0; i < (int)totals.size(); ++i) { | |
272 double value = totals[i] / m_frameCount; | |
273 feature[i % m_bpo] += value; | |
274 sum += value; | |
275 } | |
276 | |
277 for (int i = 0; i < m_bpo; ++i) { | |
278 feature[i] /= sum; | |
279 } | |
280 | |
281 cerr << "computeFeatureFromTotals: feature values:" << endl; | |
282 for (auto v: feature) cerr << v << " "; | |
283 cerr << endl; | |
284 | |
285 return feature; | |
286 } | |
287 | |
288 CQParameters | |
289 TuningDifference::paramsForTuningFrequency(double hz) const | |
290 { | |
291 return CQParameters(m_inputSampleRate, | |
292 pitchToFrequency(36, hz), | |
293 pitchToFrequency(96, hz), | |
294 m_bpo); | |
295 } | |
296 | |
297 TuningDifference::TFeature | |
298 TuningDifference::computeFeatureFromSignal(const Signal &signal, double hz) const | |
299 { | |
300 CQSpectrogram cq(paramsForTuningFrequency(hz), | |
301 CQSpectrogram::InterpolateHold); | |
302 | |
303 Chroma totals(m_refCQ->getTotalBins(), 0.0); | |
304 | |
305 for (int i = 0; i < m_frameCount; ++i) { | |
306 Signal::const_iterator first = signal.begin() + i * m_blockSize; | |
307 Signal::const_iterator last = first + m_blockSize; | |
308 if (last > signal.end()) last = signal.end(); | |
309 CQSpectrogram::RealSequence input(first, last); | |
310 input.resize(m_blockSize); | |
311 CQSpectrogram::RealBlock block = cq.process(input); | |
312 for (const auto &v: block) addTo(totals, v); | |
313 } | |
314 | |
315 return computeFeatureFromTotals(totals); | |
219 } | 316 } |
220 | 317 |
221 TuningDifference::FeatureSet | 318 TuningDifference::FeatureSet |
222 TuningDifference::process(const float *const *inputBuffers, Vamp::RealTime timestamp) | 319 TuningDifference::process(const float *const *inputBuffers, Vamp::RealTime) |
223 { | 320 { |
224 for (int c = 0; c < 2; ++c) { | 321 CQSpectrogram::RealBlock block; |
225 if (m_sum[c].size() == 0) { | 322 CQSpectrogram::RealSequence input; |
226 m_sum[c].resize(m_blockSize/2 + 1, 0.0); | 323 |
227 } | 324 input = CQSpectrogram::RealSequence |
228 for (int i = 0; i <= m_blockSize/2; ++i) { | 325 (inputBuffers[0], inputBuffers[0] + m_blockSize); |
229 double energy = | 326 block = m_refCQ->process(input); |
230 inputBuffers[c][i*2 ] * inputBuffers[c][i*2 ] + | 327 for (const auto &v: block) addTo(m_refTotals, v); |
231 inputBuffers[c][i*2+1] * inputBuffers[c][i*2+1]; | 328 |
232 double mag = sqrt(energy); | 329 m_other.insert(m_other.end(), |
233 m_sum[c][i] += mag; | 330 inputBuffers[1], inputBuffers[1] + m_blockSize); |
234 m_sum[c][i/2] += mag; | |
235 } | |
236 } | |
237 | 331 |
238 ++m_frameCount; | 332 ++m_frameCount; |
239 return FeatureSet(); | 333 return FeatureSet(); |
240 } | 334 } |
241 | 335 |
336 double | |
337 TuningDifference::featureDistance(const TFeature &other, int rotation) const | |
338 { | |
339 if (rotation == 0) { | |
340 return distance(m_refFeature, other); | |
341 } else { | |
342 TFeature r(other); | |
343 if (rotation > 0) { | |
344 rotate(r.begin(), r.begin() + rotation, r.end()); | |
345 } else { | |
346 rotate(r.begin(), r.end() + rotation, r.end()); | |
347 } | |
348 return distance(m_refFeature, r); | |
349 } | |
350 } | |
351 | |
352 int | |
353 TuningDifference::findBestRotation(const TFeature &other) const | |
354 { | |
355 map<double, int> dists; | |
356 | |
357 int maxSemis = 6; | |
358 int maxRotation = (m_bpo * maxSemis) / 12; | |
359 | |
360 for (int r = -maxRotation; r <= maxRotation; ++r) { | |
361 double dist = featureDistance(other, r); | |
362 dists[dist] = r; | |
363 cerr << "rotation " << r << ": score " << dist << endl; | |
364 } | |
365 | |
366 int best = dists.begin()->second; | |
367 | |
368 cerr << "best is " << best << endl; | |
369 return best; | |
370 } | |
371 | |
242 TuningDifference::FeatureSet | 372 TuningDifference::FeatureSet |
243 TuningDifference::getRemainingFeatures() | 373 TuningDifference::getRemainingFeatures() |
244 { | 374 { |
245 int n = m_sum[0].size(); | 375 FeatureSet fs; |
246 if (n == 0) return FeatureSet(); | 376 if (m_frameCount == 0) return fs; |
377 | |
378 m_refFeature = computeFeatureFromTotals(m_refTotals); | |
379 TFeature otherFeature = computeFeatureFromSignal(m_other, 440.); | |
247 | 380 |
248 Feature f; | 381 Feature f; |
249 FeatureSet fs; | 382 |
250 | |
251 int maxshift = 2400; // cents | |
252 | |
253 int bestshift = -1; | |
254 double bestdist = 0; | |
255 | |
256 for (int i = 0; i < n; ++i) { | |
257 m_sum[0][i] /= m_frameCount; | |
258 m_sum[1][i] /= m_frameCount; | |
259 } | |
260 | |
261 for (int c = 0; c < 2; ++c) { | |
262 double tot = 0.0; | |
263 for (int i = 0; i < n; ++i) { | |
264 tot += m_sum[c][i]; | |
265 } | |
266 if (tot != 0.0) { | |
267 for (int i = 0; i < n; ++i) { | |
268 m_sum[c][i] /= tot; | |
269 } | |
270 } | |
271 } | |
272 | |
273 int targetBinMin = int(floor(targetFmin * m_blockSize / m_inputSampleRate)); | |
274 int targetBinMax = int(ceil(targetFmax * m_blockSize / m_inputSampleRate)); | |
275 cerr << "target bin range: " << targetBinMin << " -> " << targetBinMax << endl; | |
276 | |
277 f.values.clear(); | 383 f.values.clear(); |
278 for (int i = targetBinMin; i < targetBinMax; ++i) { | 384 for (auto v: m_refFeature) f.values.push_back(v); |
279 f.values.push_back(m_sum[0][i]); | 385 fs[m_outputs["reffeature"]].push_back(f); |
280 } | 386 |
281 fs[3].push_back(f); | |
282 f.values.clear(); | 387 f.values.clear(); |
283 for (int i = targetBinMin; i < targetBinMax; ++i) { | 388 for (auto v: otherFeature) f.values.push_back(v); |
284 f.values.push_back(m_sum[1][i]); | 389 fs[m_outputs["otherfeature"]].push_back(f); |
285 } | 390 |
286 fs[3].push_back(f); | 391 int rotation = findBestRotation(otherFeature); |
392 | |
393 int coarseCents = -(rotation * 100) / (m_bpo / 12); | |
394 | |
395 cerr << "rotation " << rotation << " -> cents " << coarseCents << endl; | |
396 | |
397 double coarseHz = frequencyForCentsAbove440(coarseCents); | |
398 | |
399 TFeature coarseFeature = computeFeatureFromSignal(m_other, coarseHz); | |
400 double coarseScore = featureDistance(coarseFeature); | |
401 | |
402 cerr << "corresponding Hz " << coarseHz << " scores " << coarseScore << endl; | |
287 | 403 |
288 f.values.clear(); | 404 f.values.clear(); |
289 | 405 for (auto v: coarseFeature) f.values.push_back(v); |
290 for (int shift = -maxshift; shift <= maxshift; ++shift) { | 406 fs[m_outputs["rotfeature"]].push_back(f); |
291 | |
292 double multiplier = pow(2.0, double(shift) / 1200.0); | |
293 double dist = 0.0; | |
294 | |
295 // cerr << "shift = " << shift << ", multiplier = " << multiplier << endl; | |
296 | |
297 int contributing = 0; | |
298 | |
299 for (int i = targetBinMin; i < targetBinMax; ++i) { | |
300 | |
301 double source = i / multiplier; | |
302 int s0 = int(source), s1 = s0 + 1; | |
303 double p1 = source - s0; | |
304 double p0 = 1.0 - p1; | |
305 | |
306 double value = 0.0; | |
307 if (s0 >= 0 && s0 < n) { | |
308 value += p0 * m_sum[1][s0]; | |
309 ++contributing; | |
310 } | |
311 if (s1 >= 0 && s1 < n) { | |
312 value += p1 * m_sum[1][s1]; | |
313 ++contributing; | |
314 } | |
315 | |
316 // if (shift == -1) { | |
317 // cerr << "for multiplier " << multiplier << ", target " << i << ", source " << source << ", value " << p0 << " * " << m_sum[1][s0] << " + " << p1 << " * " << m_sum[1][s1] << " = " << value << ", other " << m_sum[0][i] << endl; | |
318 // } | |
319 | |
320 double diff = fabs(m_sum[0][i] - value); | |
321 dist += diff; | |
322 } | |
323 | |
324 dist /= contributing; | |
325 | |
326 f.values.clear(); | |
327 f.values.push_back(dist); | |
328 char label[100]; | |
329 sprintf(label, "%f at shift %d freq mult %f", dist, shift, multiplier); | |
330 f.label = label; | |
331 fs[2].push_back(f); | |
332 | |
333 if (bestshift == -1 || dist < bestdist) { | |
334 bestshift = shift; | |
335 bestdist = dist; | |
336 } | |
337 } | |
338 | |
339 f.timestamp = Vamp::RealTime::zeroTime; | |
340 f.hasTimestamp = true; | |
341 f.label = ""; | |
342 | |
343 f.values.clear(); | |
344 // cerr << "best dist = " << bestdist << " at shift " << bestshift << endl; | |
345 f.values.push_back(-bestshift); | |
346 fs[0].push_back(f); | |
347 | |
348 f.values.clear(); | |
349 f.values.push_back(440.0 / pow(2.0, double(bestshift) / 1200.0)); | |
350 fs[1].push_back(f); | |
351 | 407 |
352 return fs; | 408 return fs; |
353 } | 409 } |
354 | 410 |