comparison chroma-compare-plugin/TuningDifference.cpp @ 13:c74071731d74

The bulk of the chroma-compare implementation. Should be using the newly refactored Chromagram class from libcq though (chroma is currently upside-down!)
author Chris Cannam
date Wed, 04 Feb 2015 15:10:07 +0000
parents 23572f9d25d9
children 812e4d021443
comparison
equal deleted inserted replaced
12:23572f9d25d9 13:c74071731d74
4 #include <iostream> 4 #include <iostream>
5 5
6 #include <cmath> 6 #include <cmath>
7 #include <cstdio> 7 #include <cstdio>
8 8
9 using std::cerr; 9 #include <algorithm>
10 using std::endl; 10
11 11 using namespace std;
12 static double targetFmin = 60.0; 12
13 static double targetFmax = 1500.0; 13 static double pitchToFrequency(int pitch,
14 double centsOffset = 0.,
15 double concertA = 440.)
16 {
17 double p = double(pitch) + (centsOffset / 100.);
18 return concertA * pow(2.0, (p - 69.0) / 12.0);
19 }
20
21 static double frequencyForCentsAbove440(double cents)
22 {
23 return pitchToFrequency(69, cents, 440.);
24 }
14 25
15 TuningDifference::TuningDifference(float inputSampleRate) : 26 TuningDifference::TuningDifference(float inputSampleRate) :
16 Plugin(inputSampleRate) 27 Plugin(inputSampleRate),
28 m_bpo(60),
29 m_refCQ(new CQSpectrogram(paramsForTuningFrequency(440.),
30 CQSpectrogram::InterpolateHold))
17 { 31 {
18 } 32 }
19 33
20 TuningDifference::~TuningDifference() 34 TuningDifference::~TuningDifference()
21 { 35 {
66 } 80 }
67 81
68 TuningDifference::InputDomain 82 TuningDifference::InputDomain
69 TuningDifference::getInputDomain() const 83 TuningDifference::getInputDomain() const
70 { 84 {
71 return FrequencyDomain; 85 return TimeDomain;
72 } 86 }
73 87
74 size_t 88 size_t
75 TuningDifference::getPreferredBlockSize() const 89 TuningDifference::getPreferredBlockSize() const
76 { 90 {
77 return 16384; 91 return 0;
78 } 92 }
79 93
80 size_t 94 size_t
81 TuningDifference::getPreferredStepSize() const 95 TuningDifference::getPreferredStepSize() const
82 { 96 {
145 d.binCount = 1; 159 d.binCount = 1;
146 d.hasKnownExtents = false; 160 d.hasKnownExtents = false;
147 d.isQuantized = false; 161 d.isQuantized = false;
148 d.sampleType = OutputDescriptor::VariableSampleRate; 162 d.sampleType = OutputDescriptor::VariableSampleRate;
149 d.hasDuration = false; 163 d.hasDuration = false;
164 m_outputs[d.identifier] = list.size();
150 list.push_back(d); 165 list.push_back(d);
151 166
152 d.identifier = "tuningfreq"; 167 d.identifier = "tuningfreq";
153 d.name = "Relative Tuning Frequency"; 168 d.name = "Relative Tuning Frequency";
154 d.description = "Tuning frequency of channel 2, if channel 1 is assumed to contain the same music as it at a tuning frequency of A=440Hz."; 169 d.description = "Tuning frequency of channel 2, if channel 1 is assumed to contain the same music as it at a tuning frequency of A=440Hz.";
157 d.binCount = 1; 172 d.binCount = 1;
158 d.hasKnownExtents = false; 173 d.hasKnownExtents = false;
159 d.isQuantized = false; 174 d.isQuantized = false;
160 d.sampleType = OutputDescriptor::VariableSampleRate; 175 d.sampleType = OutputDescriptor::VariableSampleRate;
161 d.hasDuration = false; 176 d.hasDuration = false;
177 m_outputs[d.identifier] = list.size();
162 list.push_back(d); 178 list.push_back(d);
163 179
164 d.identifier = "curve"; 180 d.identifier = "reffeature";
165 d.name = "Shift Correlation Curve"; 181 d.name = "Reference Feature";
166 d.description = "Correlation between shifted and unshifted sources, for each probed shift in cents."; 182 d.description = "Chroma feature from reference audio.";
167 d.unit = ""; 183 d.unit = "";
168 d.hasFixedBinCount = true; 184 d.hasFixedBinCount = true;
169 d.binCount = 1; 185 d.binCount = m_bpo;
170 d.hasKnownExtents = false;
171 d.isQuantized = false;
172 d.sampleType = OutputDescriptor::FixedSampleRate;
173 d.sampleRate = 100;
174 d.hasDuration = false;
175 list.push_back(d);
176
177 int targetBinMin = int(floor(targetFmin * m_blockSize / m_inputSampleRate));
178 int targetBinMax = int(ceil(targetFmax * m_blockSize / m_inputSampleRate));
179 cerr << "target bin range: " << targetBinMin << " -> " << targetBinMax << endl;
180
181 d.identifier = "averages";
182 d.name = "Spectrum averages";
183 d.description = "Average magnitude spectrum for each channel.";
184 d.unit = "";
185 d.hasFixedBinCount = true;
186 d.binCount = (targetBinMax > targetBinMin ? targetBinMax - targetBinMin + 1 : 100);
187 d.hasKnownExtents = false; 186 d.hasKnownExtents = false;
188 d.isQuantized = false; 187 d.isQuantized = false;
189 d.sampleType = OutputDescriptor::FixedSampleRate; 188 d.sampleType = OutputDescriptor::FixedSampleRate;
190 d.sampleRate = 1; 189 d.sampleRate = 1;
191 d.hasDuration = false; 190 d.hasDuration = false;
191 m_outputs[d.identifier] = list.size();
192 list.push_back(d);
193
194 d.identifier = "otherfeature";
195 d.name = "Other Feature";
196 d.description = "Chroma feature from other audio, before rotation.";
197 d.unit = "";
198 d.hasFixedBinCount = true;
199 d.binCount = m_bpo;
200 d.hasKnownExtents = false;
201 d.isQuantized = false;
202 d.sampleType = OutputDescriptor::FixedSampleRate;
203 d.sampleRate = 1;
204 d.hasDuration = false;
205 m_outputs[d.identifier] = list.size();
206 list.push_back(d);
207
208 d.identifier = "rotfeature";
209 d.name = "Other Feature at Rotated Frequency";
210 d.description = "Chroma feature from reference audio calculated with the tuning frequency obtained from rotation matching.";
211 d.unit = "";
212 d.hasFixedBinCount = true;
213 d.binCount = m_bpo;
214 d.hasKnownExtents = false;
215 d.isQuantized = false;
216 d.sampleType = OutputDescriptor::FixedSampleRate;
217 d.sampleRate = 1;
218 d.hasDuration = false;
219 m_outputs[d.identifier] = list.size();
192 list.push_back(d); 220 list.push_back(d);
193 221
194 return list; 222 return list;
195 } 223 }
196 224
198 TuningDifference::initialise(size_t channels, size_t stepSize, size_t blockSize) 226 TuningDifference::initialise(size_t channels, size_t stepSize, size_t blockSize)
199 { 227 {
200 if (channels < getMinChannelCount() || 228 if (channels < getMinChannelCount() ||
201 channels > getMaxChannelCount()) return false; 229 channels > getMaxChannelCount()) return false;
202 230
203 if (blockSize != getPreferredBlockSize() || 231 if (stepSize != blockSize) return false;
204 stepSize != blockSize/2) return false;
205 232
206 m_blockSize = blockSize; 233 m_blockSize = blockSize;
207 234
208 reset(); 235 reset();
209 236
211 } 238 }
212 239
213 void 240 void
214 TuningDifference::reset() 241 TuningDifference::reset()
215 { 242 {
216 m_sum[0].clear(); 243 if (m_frameCount > 0) {
217 m_sum[1].clear(); 244 m_refCQ.reset(new CQSpectrogram(paramsForTuningFrequency(440.),
218 m_frameCount = 0; 245 CQSpectrogram::InterpolateHold));
246 m_frameCount = 0;
247 }
248 m_refTotals = Chroma(m_refCQ->getTotalBins(), 0.0);
249 m_other.clear();
250 }
251
252 template<typename T>
253 void addTo(vector<T> &a, const vector<T> &b)
254 {
255 transform(a.begin(), a.end(), b.begin(), a.begin(), plus<T>());
256 }
257
258 template<typename T>
259 T distance(const vector<T> &a, const vector<T> &b)
260 {
261 return inner_product(a.begin(), a.end(), b.begin(), T(),
262 plus<T>(), [](T x, T y) { return fabs(x - y); });
263 }
264
265 TuningDifference::TFeature
266 TuningDifference::computeFeatureFromTotals(const Chroma &totals) const
267 {
268 TFeature feature(m_bpo);
269 double sum = 0.0;
270
271 for (int i = 0; i < (int)totals.size(); ++i) {
272 double value = totals[i] / m_frameCount;
273 feature[i % m_bpo] += value;
274 sum += value;
275 }
276
277 for (int i = 0; i < m_bpo; ++i) {
278 feature[i] /= sum;
279 }
280
281 cerr << "computeFeatureFromTotals: feature values:" << endl;
282 for (auto v: feature) cerr << v << " ";
283 cerr << endl;
284
285 return feature;
286 }
287
288 CQParameters
289 TuningDifference::paramsForTuningFrequency(double hz) const
290 {
291 return CQParameters(m_inputSampleRate,
292 pitchToFrequency(36, hz),
293 pitchToFrequency(96, hz),
294 m_bpo);
295 }
296
297 TuningDifference::TFeature
298 TuningDifference::computeFeatureFromSignal(const Signal &signal, double hz) const
299 {
300 CQSpectrogram cq(paramsForTuningFrequency(hz),
301 CQSpectrogram::InterpolateHold);
302
303 Chroma totals(m_refCQ->getTotalBins(), 0.0);
304
305 for (int i = 0; i < m_frameCount; ++i) {
306 Signal::const_iterator first = signal.begin() + i * m_blockSize;
307 Signal::const_iterator last = first + m_blockSize;
308 if (last > signal.end()) last = signal.end();
309 CQSpectrogram::RealSequence input(first, last);
310 input.resize(m_blockSize);
311 CQSpectrogram::RealBlock block = cq.process(input);
312 for (const auto &v: block) addTo(totals, v);
313 }
314
315 return computeFeatureFromTotals(totals);
219 } 316 }
220 317
221 TuningDifference::FeatureSet 318 TuningDifference::FeatureSet
222 TuningDifference::process(const float *const *inputBuffers, Vamp::RealTime timestamp) 319 TuningDifference::process(const float *const *inputBuffers, Vamp::RealTime)
223 { 320 {
224 for (int c = 0; c < 2; ++c) { 321 CQSpectrogram::RealBlock block;
225 if (m_sum[c].size() == 0) { 322 CQSpectrogram::RealSequence input;
226 m_sum[c].resize(m_blockSize/2 + 1, 0.0); 323
227 } 324 input = CQSpectrogram::RealSequence
228 for (int i = 0; i <= m_blockSize/2; ++i) { 325 (inputBuffers[0], inputBuffers[0] + m_blockSize);
229 double energy = 326 block = m_refCQ->process(input);
230 inputBuffers[c][i*2 ] * inputBuffers[c][i*2 ] + 327 for (const auto &v: block) addTo(m_refTotals, v);
231 inputBuffers[c][i*2+1] * inputBuffers[c][i*2+1]; 328
232 double mag = sqrt(energy); 329 m_other.insert(m_other.end(),
233 m_sum[c][i] += mag; 330 inputBuffers[1], inputBuffers[1] + m_blockSize);
234 m_sum[c][i/2] += mag;
235 }
236 }
237 331
238 ++m_frameCount; 332 ++m_frameCount;
239 return FeatureSet(); 333 return FeatureSet();
240 } 334 }
241 335
336 double
337 TuningDifference::featureDistance(const TFeature &other, int rotation) const
338 {
339 if (rotation == 0) {
340 return distance(m_refFeature, other);
341 } else {
342 TFeature r(other);
343 if (rotation > 0) {
344 rotate(r.begin(), r.begin() + rotation, r.end());
345 } else {
346 rotate(r.begin(), r.end() + rotation, r.end());
347 }
348 return distance(m_refFeature, r);
349 }
350 }
351
352 int
353 TuningDifference::findBestRotation(const TFeature &other) const
354 {
355 map<double, int> dists;
356
357 int maxSemis = 6;
358 int maxRotation = (m_bpo * maxSemis) / 12;
359
360 for (int r = -maxRotation; r <= maxRotation; ++r) {
361 double dist = featureDistance(other, r);
362 dists[dist] = r;
363 cerr << "rotation " << r << ": score " << dist << endl;
364 }
365
366 int best = dists.begin()->second;
367
368 cerr << "best is " << best << endl;
369 return best;
370 }
371
242 TuningDifference::FeatureSet 372 TuningDifference::FeatureSet
243 TuningDifference::getRemainingFeatures() 373 TuningDifference::getRemainingFeatures()
244 { 374 {
245 int n = m_sum[0].size(); 375 FeatureSet fs;
246 if (n == 0) return FeatureSet(); 376 if (m_frameCount == 0) return fs;
377
378 m_refFeature = computeFeatureFromTotals(m_refTotals);
379 TFeature otherFeature = computeFeatureFromSignal(m_other, 440.);
247 380
248 Feature f; 381 Feature f;
249 FeatureSet fs; 382
250
251 int maxshift = 2400; // cents
252
253 int bestshift = -1;
254 double bestdist = 0;
255
256 for (int i = 0; i < n; ++i) {
257 m_sum[0][i] /= m_frameCount;
258 m_sum[1][i] /= m_frameCount;
259 }
260
261 for (int c = 0; c < 2; ++c) {
262 double tot = 0.0;
263 for (int i = 0; i < n; ++i) {
264 tot += m_sum[c][i];
265 }
266 if (tot != 0.0) {
267 for (int i = 0; i < n; ++i) {
268 m_sum[c][i] /= tot;
269 }
270 }
271 }
272
273 int targetBinMin = int(floor(targetFmin * m_blockSize / m_inputSampleRate));
274 int targetBinMax = int(ceil(targetFmax * m_blockSize / m_inputSampleRate));
275 cerr << "target bin range: " << targetBinMin << " -> " << targetBinMax << endl;
276
277 f.values.clear(); 383 f.values.clear();
278 for (int i = targetBinMin; i < targetBinMax; ++i) { 384 for (auto v: m_refFeature) f.values.push_back(v);
279 f.values.push_back(m_sum[0][i]); 385 fs[m_outputs["reffeature"]].push_back(f);
280 } 386
281 fs[3].push_back(f);
282 f.values.clear(); 387 f.values.clear();
283 for (int i = targetBinMin; i < targetBinMax; ++i) { 388 for (auto v: otherFeature) f.values.push_back(v);
284 f.values.push_back(m_sum[1][i]); 389 fs[m_outputs["otherfeature"]].push_back(f);
285 } 390
286 fs[3].push_back(f); 391 int rotation = findBestRotation(otherFeature);
392
393 int coarseCents = -(rotation * 100) / (m_bpo / 12);
394
395 cerr << "rotation " << rotation << " -> cents " << coarseCents << endl;
396
397 double coarseHz = frequencyForCentsAbove440(coarseCents);
398
399 TFeature coarseFeature = computeFeatureFromSignal(m_other, coarseHz);
400 double coarseScore = featureDistance(coarseFeature);
401
402 cerr << "corresponding Hz " << coarseHz << " scores " << coarseScore << endl;
287 403
288 f.values.clear(); 404 f.values.clear();
289 405 for (auto v: coarseFeature) f.values.push_back(v);
290 for (int shift = -maxshift; shift <= maxshift; ++shift) { 406 fs[m_outputs["rotfeature"]].push_back(f);
291
292 double multiplier = pow(2.0, double(shift) / 1200.0);
293 double dist = 0.0;
294
295 // cerr << "shift = " << shift << ", multiplier = " << multiplier << endl;
296
297 int contributing = 0;
298
299 for (int i = targetBinMin; i < targetBinMax; ++i) {
300
301 double source = i / multiplier;
302 int s0 = int(source), s1 = s0 + 1;
303 double p1 = source - s0;
304 double p0 = 1.0 - p1;
305
306 double value = 0.0;
307 if (s0 >= 0 && s0 < n) {
308 value += p0 * m_sum[1][s0];
309 ++contributing;
310 }
311 if (s1 >= 0 && s1 < n) {
312 value += p1 * m_sum[1][s1];
313 ++contributing;
314 }
315
316 // if (shift == -1) {
317 // cerr << "for multiplier " << multiplier << ", target " << i << ", source " << source << ", value " << p0 << " * " << m_sum[1][s0] << " + " << p1 << " * " << m_sum[1][s1] << " = " << value << ", other " << m_sum[0][i] << endl;
318 // }
319
320 double diff = fabs(m_sum[0][i] - value);
321 dist += diff;
322 }
323
324 dist /= contributing;
325
326 f.values.clear();
327 f.values.push_back(dist);
328 char label[100];
329 sprintf(label, "%f at shift %d freq mult %f", dist, shift, multiplier);
330 f.label = label;
331 fs[2].push_back(f);
332
333 if (bestshift == -1 || dist < bestdist) {
334 bestshift = shift;
335 bestdist = dist;
336 }
337 }
338
339 f.timestamp = Vamp::RealTime::zeroTime;
340 f.hasTimestamp = true;
341 f.label = "";
342
343 f.values.clear();
344 // cerr << "best dist = " << bestdist << " at shift " << bestshift << endl;
345 f.values.push_back(-bestshift);
346 fs[0].push_back(f);
347
348 f.values.clear();
349 f.values.push_back(440.0 / pow(2.0, double(bestshift) / 1200.0));
350 fs[1].push_back(f);
351 407
352 return fs; 408 return fs;
353 } 409 }
354 410