Mercurial > hg > nnls-chroma
comparison NNLSChroma.cpp @ 110:be2e5d44ce79 speechmusic
re-introducing the old code
author | Matthias Mauch <mail@matthiasmauch.net> |
---|---|
date | Tue, 07 Dec 2010 22:21:30 +0900 |
parents | d1398182a072 |
children |
comparison
legend: equal, deleted, inserted, replaced
101:109745cd67c3 | 110:be2e5d44ce79 |
---|---|
183 d7.sampleType = OutputDescriptor::FixedSampleRate; | 183 d7.sampleType = OutputDescriptor::FixedSampleRate; |
184 d7.hasDuration = false; | 184 d7.hasDuration = false; |
185 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; | 185 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; |
186 list.push_back(d7); | 186 list.push_back(d7); |
187 m_outputConsonance = index++; | 187 m_outputConsonance = index++; |
188 | |
189 OutputDescriptor speechity; | |
190 speechity.identifier = "speechity"; | |
191 speechity.name = "Speech vs music segmenter."; | |
192 speechity.description = "."; | |
193 speechity.unit = ""; | |
194 speechity.hasFixedBinCount = true; | |
195 speechity.binCount = 1; | |
196 speechity.hasKnownExtents = false; | |
197 speechity.isQuantized = false; | |
198 speechity.sampleType = OutputDescriptor::FixedSampleRate; | |
199 speechity.hasDuration = false; | |
200 speechity.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; | |
201 list.push_back(speechity); | |
202 m_outputSpeechity = index++; | |
203 | |
204 OutputDescriptor mssegment; | |
205 mssegment.identifier = "mssegment"; | |
206 mssegment.name = "Speech vs music segmenter."; | |
207 mssegment.description = "."; | |
208 mssegment.unit = ""; | |
209 mssegment.hasFixedBinCount = true; | |
210 mssegment.binCount = 1; | |
211 mssegment.hasKnownExtents = false; | |
212 mssegment.isQuantized = false; | |
213 mssegment.sampleType = OutputDescriptor::FixedSampleRate; | |
214 mssegment.hasDuration = false; | |
215 mssegment.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; | |
216 list.push_back(mssegment); | |
217 m_outputMssegment = index++; | |
188 | 218 |
189 return list; | 219 return list; |
190 } | 220 } |
191 | 221 |
192 | 222 |
343 Feature f3; // semitone spectrum | 373 Feature f3; // semitone spectrum |
344 Feature f4; // treble chromagram | 374 Feature f4; // treble chromagram |
345 Feature f5; // bass chromagram | 375 Feature f5; // bass chromagram |
346 Feature f6; // treble and bass chromagram | 376 Feature f6; // treble and bass chromagram |
347 Feature consonance; | 377 Feature consonance; |
378 Feature speechity; | |
348 | 379 |
349 f3.hasTimestamp = true; | 380 f3.hasTimestamp = true; |
350 f3.timestamp = f2.timestamp; | 381 f3.timestamp = f2.timestamp; |
351 | 382 |
352 f4.hasTimestamp = true; | 383 f4.hasTimestamp = true; |
358 f6.hasTimestamp = true; | 389 f6.hasTimestamp = true; |
359 f6.timestamp = f2.timestamp; | 390 f6.timestamp = f2.timestamp; |
360 | 391 |
361 consonance.hasTimestamp = true; | 392 consonance.hasTimestamp = true; |
362 consonance.timestamp = f2.timestamp; | 393 consonance.timestamp = f2.timestamp; |
394 speechity.hasTimestamp = true; | |
395 speechity.timestamp = f2.timestamp; | |
363 | 396 |
364 float b[nNote]; | 397 float b[nNote]; |
365 | 398 |
366 bool some_b_greater_zero = false; | 399 bool some_b_greater_zero = false; |
367 float sumb = 0; | 400 float sumb = 0; |
503 for (size_t i = 0; i < f6.values.size(); i++) { | 536 for (size_t i = 0; i < f6.values.size(); i++) { |
504 f6.values[i] /= chromanorm[2]; | 537 f6.values[i] /= chromanorm[2]; |
505 } | 538 } |
506 } | 539 } |
507 } | 540 } |
508 | 541 // float speechityvalue = 0; |
542 // for (int iPC = 0; iPC < 12; ++iPC) { | |
543 // speechityvalue += abs(f3.values[iPC] - oldchroma[iPC]); | |
544 // oldchroma[iPC] = f3.values[iPC]; | |
545 // } | |
546 // speechity.values.push_back(speechityvalue); | |
547 | |
509 fsOut[m_outputSemiSpec].push_back(f3); | 548 fsOut[m_outputSemiSpec].push_back(f3); |
510 fsOut[m_outputChroma].push_back(f4); | 549 fsOut[m_outputChroma].push_back(f4); |
511 fsOut[m_outputBassChroma].push_back(f5); | 550 fsOut[m_outputBassChroma].push_back(f5); |
512 fsOut[m_outputBothChroma].push_back(f6); | 551 fsOut[m_outputBothChroma].push_back(f6); |
513 fsOut[m_outputConsonance].push_back(consonance); | 552 fsOut[m_outputConsonance].push_back(consonance); |
553 // fsOut[m_outputSpeechity].push_back(speechity); | |
514 count++; | 554 count++; |
515 } | 555 } |
516 cerr << "done." << endl; | 556 cerr << "done." << endl; |
517 | 557 |
558 | |
559 // musicity | |
560 count = 0; | |
561 int oldlabeltype = 0; // start value is 0, music is 1, speech is 2 | |
562 vector<float> musicityValue; | |
563 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) { | |
564 Feature f4 = *it; | |
565 | |
566 int startIndex = max(count - musicitykernelwidth/2,0); | |
567 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1); | |
568 float chromasum = 0; | |
569 float diffsum = 0; | |
570 for (int k = 0; k < 12; k++) { | |
571 for (int i = startIndex + 1; i < endIndex; i++) { | |
572 chromasum += pow(fsOut[4][i].values[k],2); | |
573 diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]); | |
574 } | |
575 } | |
576 diffsum /= chromasum; | |
577 musicityValue.push_back(diffsum); | |
578 count++; | |
579 } | |
580 | |
581 float musicityThreshold = 0.44; | |
582 if (m_stepSize == 4096) { | |
583 musicityThreshold = 0.74; | |
584 } | |
585 if (m_stepSize == 4410) { | |
586 musicityThreshold = 0.77; | |
587 } | |
588 | |
589 count = 0; | |
590 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) { | |
591 Feature f4 = *it; | |
592 Feature speechity; // musicity | |
593 Feature f9; // musicity segmenter | |
594 | |
595 speechity.hasTimestamp = true; | |
596 speechity.timestamp = f4.timestamp; | |
597 mssegment.hasTimestamp = true; | |
598 mssegment.timestamp = f4.timestamp; | |
599 | |
600 int startIndex = max(count - musicitykernelwidth/2,0); | |
601 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1); | |
602 int musicityCount = 0; | |
603 for (int i = startIndex; i <= endIndex; i++) { | |
604 if (musicityValue[i] > musicityThreshold) musicityCount++; | |
605 } | |
606 bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1); | |
607 | |
608 if (isSpeech) { | |
609 if (oldlabeltype != 2) { | |
610 mssegment.label = "Speech"; | |
611 fsOut[m_outputMssegment].push_back(mssegment); | |
612 oldlabeltype = 2; | |
613 } | |
614 } else { | |
615 if (oldlabeltype != 1) { | |
616 mssegment.label = "Music"; | |
617 fsOut[m_outputMssegment].push_back(mssegment); | |
618 oldlabeltype = 1; | |
619 } | |
620 } | |
621 speechity.values.push_back(musicityValue[count]); | |
622 fsOut[m_outputSpeechity].push_back(speechity); | |
623 count++; | |
624 } | |
625 | |
626 | |
518 return fsOut; | 627 return fsOut; |
519 | 628 |
520 } | 629 } |
521 | 630 |