comparison NNLSChroma.cpp @ 110:be2e5d44ce79 speechmusic

re-introducing the old code
author Matthias Mauch <mail@matthiasmauch.net>
date Tue, 07 Dec 2010 22:21:30 +0900
parents d1398182a072
children
comparison of 101:109745cd67c3 with 110:be2e5d44ce79
183 d7.sampleType = OutputDescriptor::FixedSampleRate; 183 d7.sampleType = OutputDescriptor::FixedSampleRate;
184 d7.hasDuration = false; 184 d7.hasDuration = false;
185 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize; 185 d7.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
186 list.push_back(d7); 186 list.push_back(d7);
187 m_outputConsonance = index++; 187 m_outputConsonance = index++;
188
189 OutputDescriptor speechity;
190 speechity.identifier = "speechity";
191 speechity.name = "Speechity";
192 speechity.description = "Frame-by-frame measure of speech vs. music content.";
193 speechity.unit = "";
194 speechity.hasFixedBinCount = true;
195 speechity.binCount = 1;
196 speechity.hasKnownExtents = false;
197 speechity.isQuantized = false;
198 speechity.sampleType = OutputDescriptor::FixedSampleRate;
199 speechity.hasDuration = false;
200 speechity.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
201 list.push_back(speechity);
202 m_outputSpeechity = index++;
203
204 OutputDescriptor mssegment;
205 mssegment.identifier = "mssegment";
206 mssegment.name = "Speech vs music segmenter.";
207 mssegment.description = "Segmentation into speech and music regions.";
208 mssegment.unit = "";
209 mssegment.hasFixedBinCount = true;
210 mssegment.binCount = 1;
211 mssegment.hasKnownExtents = false;
212 mssegment.isQuantized = false;
213 mssegment.sampleType = OutputDescriptor::FixedSampleRate;
214 mssegment.hasDuration = false;
215 mssegment.sampleRate = (m_stepSize == 0) ? m_inputSampleRate/2048 : m_inputSampleRate/m_stepSize;
216 list.push_back(mssegment);
217 m_outputMssegment = index++;
188 218
189 return list; 219 return list;
190 } 220 }
191 221
192 222
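The two new outputs, speechity and mssegment, are configured identically to the existing consonance output apart from their identifier, name and description. A small helper along the following lines could remove the repetition; makeScalarOutput and its placement are illustrative only and not part of the committed code.

#include <string>
#include <vamp-sdk/Plugin.h>

// Sketch only (not in the plugin): fills in the fields shared by the scalar,
// fixed-sample-rate outputs declared above (consonance, speechity, mssegment).
static Vamp::Plugin::OutputDescriptor
makeScalarOutput(const std::string &identifier, const std::string &name,
                 const std::string &description, float sampleRate)
{
    Vamp::Plugin::OutputDescriptor d;
    d.identifier = identifier;
    d.name = name;
    d.description = description;
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = Vamp::Plugin::OutputDescriptor::FixedSampleRate;
    d.hasDuration = false;
    d.sampleRate = sampleRate;
    return d;
}

With such a helper, each of the three blocks above would reduce to a single list.push_back(makeScalarOutput(...)) call.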
343 Feature f3; // semitone spectrum 373 Feature f3; // semitone spectrum
344 Feature f4; // treble chromagram 374 Feature f4; // treble chromagram
345 Feature f5; // bass chromagram 375 Feature f5; // bass chromagram
346 Feature f6; // treble and bass chromagram 376 Feature f6; // treble and bass chromagram
347 Feature consonance; 377 Feature consonance;
378 Feature speechity;
348 379
349 f3.hasTimestamp = true; 380 f3.hasTimestamp = true;
350 f3.timestamp = f2.timestamp; 381 f3.timestamp = f2.timestamp;
351 382
352 f4.hasTimestamp = true; 383 f4.hasTimestamp = true;
358 f6.hasTimestamp = true; 389 f6.hasTimestamp = true;
359 f6.timestamp = f2.timestamp; 390 f6.timestamp = f2.timestamp;
360 391
361 consonance.hasTimestamp = true; 392 consonance.hasTimestamp = true;
362 consonance.timestamp = f2.timestamp; 393 consonance.timestamp = f2.timestamp;
394 speechity.hasTimestamp = true;
395 speechity.timestamp = f2.timestamp;
363 396
364 float b[nNote]; 397 float b[nNote];
365 398
366 bool some_b_greater_zero = false; 399 bool some_b_greater_zero = false;
367 float sumb = 0; 400 float sumb = 0;
503 for (size_t i = 0; i < f6.values.size(); i++) { 536 for (size_t i = 0; i < f6.values.size(); i++) {
504 f6.values[i] /= chromanorm[2]; 537 f6.values[i] /= chromanorm[2];
505 } 538 }
506 } 539 }
507 } 540 }
508 541 // float speechityvalue = 0;
542 // for (int iPC = 0; iPC < 12; ++iPC) {
543 // speechityvalue += abs(f3.values[iPC] - oldchroma[iPC]);
544 // oldchroma[iPC] = f3.values[iPC];
545 // }
546 // speechity.values.push_back(speechityvalue);
547
509 fsOut[m_outputSemiSpec].push_back(f3); 548 fsOut[m_outputSemiSpec].push_back(f3);
510 fsOut[m_outputChroma].push_back(f4); 549 fsOut[m_outputChroma].push_back(f4);
511 fsOut[m_outputBassChroma].push_back(f5); 550 fsOut[m_outputBassChroma].push_back(f5);
512 fsOut[m_outputBothChroma].push_back(f6); 551 fsOut[m_outputBothChroma].push_back(f6);
513 fsOut[m_outputConsonance].push_back(consonance); 552 fsOut[m_outputConsonance].push_back(consonance);
553 // fsOut[m_outputSpeechity].push_back(speechity);
514 count++; 554 count++;
515 } 555 }
516 cerr << "done." << endl; 556 cerr << "done." << endl;
517 557
558
559 // musicity: windowed chroma variation used to separate speech from music
560 count = 0;
561 int oldlabeltype = 0; // start value is 0, music is 1, speech is 2
562 vector<float> musicityValue;
563 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
564 Feature f4 = *it;
565
566 int startIndex = max(count - musicitykernelwidth/2,0);
567 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
568 float chromasum = 0;
569 float diffsum = 0;
570 for (int k = 0; k < 12; k++) {
571 for (int i = startIndex + 1; i < endIndex; i++) {
572 chromasum += pow(fsOut[4][i].values[k],2);
573 diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]);
574 }
575 }
576 diffsum /= chromasum;
577 musicityValue.push_back(diffsum);
578 count++;
579 }
580
581 float musicityThreshold = 0.44;
582 if (m_stepSize == 4096) {
583 musicityThreshold = 0.74;
584 }
585 if (m_stepSize == 4410) {
586 musicityThreshold = 0.77;
587 }
588
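The musicity value computed in the loop above is the sum of absolute frame-to-frame chroma differences over a window of musicitykernelwidth frames, normalised by the sum of squared chroma values in the same window; the threshold it is later compared against is raised for the larger step sizes. Written as a stand-alone function over a chroma sequence, the measure looks roughly as follows (musicityAt and chromaFrames are illustrative names, and the zero-division guard is an addition):

#include <algorithm>
#include <cmath>
#include <vector>

// Windowed chroma-flux ("musicity") around one frame, mirroring the loop above.
float musicityAt(const std::vector<std::vector<float> > &chromaFrames,
                 int frame, int kernelWidth)
{
    int start = std::max(frame - kernelWidth / 2, 0);
    int end = std::min(int(chromaFrames.size()), start + kernelWidth - 1);
    float chromaSum = 0.f;
    float diffSum = 0.f;
    for (int k = 0; k < 12; ++k) {
        for (int i = start + 1; i < end; ++i) {
            chromaSum += chromaFrames[i][k] * chromaFrames[i][k];
            diffSum += std::fabs(chromaFrames[i-1][k] - chromaFrames[i][k]);
        }
    }
    // Large frame-to-frame variation relative to the overall chroma energy
    // suggests speech rather than music.
    return chromaSum > 0.f ? diffSum / chromaSum : 0.f;
}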
589 count = 0;
590 for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
591 Feature f4 = *it;
592 Feature speechity; // per-frame musicity value
593 Feature mssegment; // speech/music segment boundary, pushed below on label changes
594
595 speechity.hasTimestamp = true;
596 speechity.timestamp = f4.timestamp;
597 mssegment.hasTimestamp = true;
598 mssegment.timestamp = f4.timestamp;
599
600 int startIndex = max(count - musicitykernelwidth/2,0);
601 int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
602 int musicityCount = 0;
603 for (int i = startIndex; i <= endIndex; i++) {
604 if (musicityValue[i] > musicityThreshold) musicityCount++;
605 }
606 bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1);
607
608 if (isSpeech) {
609 if (oldlabeltype != 2) {
610 mssegment.label = "Speech";
611 fsOut[m_outputMssegment].push_back(mssegment);
612 oldlabeltype = 2;
613 }
614 } else {
615 if (oldlabeltype != 1) {
616 mssegment.label = "Music";
617 fsOut[m_outputMssegment].push_back(mssegment);
618 oldlabeltype = 1;
619 }
620 }
621 speechity.values.push_back(musicityValue[count]);
622 fsOut[m_outputSpeechity].push_back(speechity);
623 count++;
624 }
625
626
518 return fsOut; 627 return fsOut;
519 628
520 } 629 }
521 630
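The second loop turns the musicity curve into labelled segments by majority vote: a frame is treated as speech when more than half of the musicity values in its window exceed the threshold, and a "Speech" or "Music" feature is emitted only when that decision changes from the previous frame. The same logic, extracted into a stand-alone sketch (segmentSpeechMusic is an illustrative name, and the window's upper bound is clamped to the last valid index here):

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Majority-vote speech/music segmentation over a musicity curve.
// Returns (frame index, label) pairs marking each change of label.
std::vector<std::pair<int, std::string> >
segmentSpeechMusic(const std::vector<float> &musicityValue,
                   float threshold, int kernelWidth)
{
    std::vector<std::pair<int, std::string> > boundaries;
    int oldLabel = 0; // 0 = no label yet, 1 = music, 2 = speech
    for (int frame = 0; frame < int(musicityValue.size()); ++frame) {
        int start = std::max(frame - kernelWidth / 2, 0);
        int end = std::min(int(musicityValue.size()) - 1,
                           start + kernelWidth - 1);
        int speechVotes = 0;
        for (int i = start; i <= end; ++i)
            if (musicityValue[i] > threshold) ++speechVotes;
        bool isSpeech = 2 * speechVotes > end - start + 1;
        int newLabel = isSpeech ? 2 : 1;
        if (newLabel != oldLabel) {
            boundaries.push_back(std::make_pair(
                frame, std::string(isSpeech ? "Speech" : "Music")));
            oldLabel = newLabel;
        }
    }
    return boundaries;
}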