comparison NNLSBase.cpp @ 81:4270f3039ab0 matthiasm-plugin

dont remember, sorry
author Matthias Mauch <mail@matthiasmauch.net>
date Mon, 15 Nov 2010 11:01:36 +0900
parents 026a5c0ee2c2
children e5c16976513d
comparison	80:026a5c0ee2c2 (parent) with 81:4270f3039ab0

In 81:4270f3039ab0 every line of the body of getRemainingFeatures() (a block that is already excluded from the build by #ifdef NOT_DEFINED) is commented out with a leading "//"; only the function frame, the declaration "FeatureSet fsOut;" and the final "return fsOut;" keep their code. The parent revision 80:026a5c0ee2c2 reads:

#ifdef NOT_DEFINED

NNLSBase::FeatureSet
NNLSBase::getRemainingFeatures()
{
    if (debug_on) cerr << "--> getRemainingFeatures" << endl;
    FeatureSet fsOut;
    if (m_logSpectrum.size() == 0) return fsOut;
    int nChord = m_chordnames.size();
    //
    /** Calculate Tuning
        calculate tuning from (using the angle of the complex number defined by the
        cumulative mean real and imag values)
    **/
    float meanTuningImag = sinvalue * m_meanTunings[1] - sinvalue * m_meanTunings[2];
    float meanTuningReal = m_meanTunings[0] + cosvalue * m_meanTunings[1] + cosvalue * m_meanTunings[2];
    float cumulativetuning = 440 * pow(2,atan2(meanTuningImag, meanTuningReal)/(24*M_PI));
    float normalisedtuning = atan2(meanTuningImag, meanTuningReal)/(2*M_PI);
    int intShift = floor(normalisedtuning * 3);
    float floatShift = normalisedtuning * 3 - intShift; // floatShift is a really bad name for this
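    // Note (added, not in the original source): m_meanTunings[0..2] accumulate the three
    // log-frequency bins per semitone; treating them as a phasor (presumably with
    // sinvalue = sin(2*pi/3) and cosvalue = cos(2*pi/3)), atan2 gives the average tuning offset.
    // normalisedtuning is that offset in semitones (roughly -0.5 .. 0.5), so multiplying by 3
    // converts it into a bin shift, split into integer and fractional parts.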

    char buffer0 [50];

    sprintf(buffer0, "estimated tuning: %0.1f Hz", cumulativetuning);

    // cerr << "normalisedtuning: " << normalisedtuning << '\n';

    // push tuning to FeatureSet fsOut
    Feature f0; // tuning
    f0.hasTimestamp = true;
    f0.timestamp = Vamp::RealTime::frame2RealTime(0, lrintf(m_inputSampleRate));;
    f0.label = buffer0;
    fsOut[0].push_back(f0);

    /** Tune Log-Frequency Spectrogram
        calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to
        perform linear interpolation on the existing log-frequency spectrogram (kinda f1).
    **/
    cerr << endl << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... ";

    float tempValue = 0;
    float dbThreshold = 0; // relative to the background spectrum
    float thresh = pow(10,dbThreshold/20);
    // cerr << "tune local ? " << m_tuneLocal << endl;
    int count = 0;

    for (FeatureList::iterator i = m_logSpectrum.begin(); i != m_logSpectrum.end(); ++i) {
        Feature f1 = *i;
        Feature f2; // tuned log-frequency spectrum
        f2.hasTimestamp = true;
        f2.timestamp = f1.timestamp;
        f2.values.push_back(0.0); f2.values.push_back(0.0); // set lower edge to zero

        if (m_tuneLocal == 1.0) {
            intShift = floor(m_localTuning[count] * 3);
            floatShift = m_localTuning[count] * 3 - intShift; // floatShift is a really bad name for this
        }

        // cerr << intShift << " " << floatShift << endl;

        for (unsigned k = 2; k < f1.values.size() - 3; ++k) { // interpolate all inner bins
            tempValue = f1.values[k + intShift] * (1-floatShift) + f1.values[k+intShift+1] * floatShift;
            f2.values.push_back(tempValue);
        }

        f2.values.push_back(0.0); f2.values.push_back(0.0); f2.values.push_back(0.0); // upper edge
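        // Note (added): the next few lines perform a background subtraction / whitening step:
        // SpecialConvolution with the window hw gives a running mean over the log-frequency bins,
        // the same convolution of the squared deviations gives a running standard deviation, and
        // each bin is replaced by (value - mean) / std^m_whitening, floored at zero.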
        vector<float> runningmean = SpecialConvolution(f2.values,hw);
        vector<float> runningstd;
        for (int i = 0; i < nNote; i++) { // first step: squared values into vector (variance)
            runningstd.push_back((f2.values[i] - runningmean[i]) * (f2.values[i] - runningmean[i]));
        }
        runningstd = SpecialConvolution(runningstd,hw); // second step convolve
        for (int i = 0; i < nNote; i++) {
            runningstd[i] = sqrt(runningstd[i]); // square root to finally have running std
            if (runningstd[i] > 0) {
                // f2.values[i] = (f2.values[i] / runningmean[i]) > thresh ?
                //     (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
                f2.values[i] = (f2.values[i] - runningmean[i]) > 0 ?
                    (f2.values[i] - runningmean[i]) / pow(runningstd[i],m_whitening) : 0;
            }
            if (f2.values[i] < 0) {
                cerr << "ERROR: negative value in logfreq spectrum" << endl;
            }
        }
        fsOut[2].push_back(f2);
        count++;
    }
    cerr << "done." << endl;

    /** Semitone spectrum and chromagrams
        Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum
        is inferred using a non-negative least squares algorithm.
        Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means
        bass and treble stacked onto each other).
    **/
    if (m_useNNLS == 0) {
        cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... ";
    } else {
        cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... ";
    }


    vector<vector<float> > chordogram;
    vector<vector<int> > scoreChordogram;
    vector<float> chordchange = vector<float>(fsOut[2].size(),0);
    vector<float> oldchroma = vector<float>(12,0);
    vector<float> oldbasschroma = vector<float>(12,0);
    count = 0;

    for (FeatureList::iterator it = fsOut[2].begin(); it != fsOut[2].end(); ++it) {
        Feature f2 = *it; // logfreq spectrum
        Feature f3; // semitone spectrum
        Feature f4; // treble chromagram
        Feature f5; // bass chromagram
        Feature f6; // treble and bass chromagram

        f3.hasTimestamp = true;
        f3.timestamp = f2.timestamp;

        f4.hasTimestamp = true;
        f4.timestamp = f2.timestamp;

        f5.hasTimestamp = true;
        f5.timestamp = f2.timestamp;

        f6.hasTimestamp = true;
        f6.timestamp = f2.timestamp;

        float b[nNote];

        bool some_b_greater_zero = false;
        float sumb = 0;
        for (int i = 0; i < nNote; i++) {
            // b[i] = m_dict[(nNote * count + i) % (nNote * 84)];
            b[i] = f2.values[i];
            sumb += b[i];
            if (b[i] > 0) {
                some_b_greater_zero = true;
            }
        }

        // here's where the non-negative least squares algorithm calculates the note activation x

        vector<float> chroma = vector<float>(12, 0);
        vector<float> basschroma = vector<float>(12, 0);
        float currval;
        unsigned iSemitone = 0;

        if (some_b_greater_zero) {
            if (m_useNNLS == 0) {
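                // Note (added): with NNLS disabled, each semitone value is apparently just a
                // weighted sum of the three log-frequency bins around its centre (weights 0.5, 1,
                // 0.5); the treble and bass chromagrams then accumulate it through the
                // treblewindow/basswindow profiles.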
                for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
                    currval = 0;
                    currval += b[iNote + 1 + -1] * 0.5;
                    currval += b[iNote + 1 + 0] * 1.0;
                    currval += b[iNote + 1 + 1] * 0.5;
                    f3.values.push_back(currval);
                    chroma[iSemitone % 12] += currval * treblewindow[iSemitone];
                    basschroma[iSemitone % 12] += currval * basswindow[iSemitone];
                    iSemitone++;
                }

            } else {
                float x[84+1000];
                for (int i = 1; i < 1084; ++i) x[i] = 1.0;
                vector<int> signifIndex;
                int index=0;
                sumb /= 84.0;
                for (unsigned iNote = 2; iNote < nNote - 2; iNote += 3) {
                    float currval = 0;
                    currval += b[iNote + 1 + -1];
                    currval += b[iNote + 1 + 0];
                    currval += b[iNote + 1 + 1];
                    if (currval > 0) signifIndex.push_back(index);
                    f3.values.push_back(0); // fill the values, change later
                    index++;
                }
                float rnorm;
                float w[84+1000];
                float zz[84+1000];
                int indx[84+1000];
                int mode;
                int dictsize = nNote*signifIndex.size();
                // cerr << "dictsize is " << dictsize << "and values size" << f3.values.size()<< endl;
                float *curr_dict = new float[dictsize];
                for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
                    for (unsigned iBin = 0; iBin < nNote; iBin++) {
                        curr_dict[iNote * nNote + iBin] = 1.0 * m_dict[signifIndex[iNote] * nNote + iBin];
                    }
                }
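                // Note (added): nnls() appears to be the Lawson/Hanson routine; it solves
                //     minimise || curr_dict * x - b ||   subject to x >= 0,
                // with curr_dict stored column-major (leading dimension nNote, one column per
                // candidate note), so x returns the non-negative note activations.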
                nnls(curr_dict, nNote, nNote, signifIndex.size(), b, x, &rnorm, w, zz, indx, &mode);
                delete [] curr_dict;
                for (unsigned iNote = 0; iNote < signifIndex.size(); ++iNote) {
                    f3.values[signifIndex[iNote]] = x[iNote];
                    // cerr << mode << endl;
                    chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]];
                    basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]];
                }
            }
        }




        f4.values = chroma;
        f5.values = basschroma;
        chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas
        f6.values = chroma;

        if (m_doNormalizeChroma > 0) {
            vector<float> chromanorm = vector<float>(3,0);
            switch (int(m_doNormalizeChroma)) {
            case 0: // should never end up here
                break;
            case 1:
                chromanorm[0] = *max_element(f4.values.begin(), f4.values.end());
                chromanorm[1] = *max_element(f5.values.begin(), f5.values.end());
                chromanorm[2] = max(chromanorm[0], chromanorm[1]);
                break;
            case 2:
                for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
                    chromanorm[0] += *it;
                }
                for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
                    chromanorm[1] += *it;
                }
                for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
                    chromanorm[2] += *it;
                }
                break;
            case 3:
                for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) {
                    chromanorm[0] += pow(*it,2);
                }
                chromanorm[0] = sqrt(chromanorm[0]);
                for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) {
                    chromanorm[1] += pow(*it,2);
                }
                chromanorm[1] = sqrt(chromanorm[1]);
                for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) {
                    chromanorm[2] += pow(*it,2);
                }
                chromanorm[2] = sqrt(chromanorm[2]);
                break;
            }
            if (chromanorm[0] > 0) {
                for (int i = 0; i < f4.values.size(); i++) {
                    f4.values[i] /= chromanorm[0];
                }
            }
            if (chromanorm[1] > 0) {
                for (int i = 0; i < f5.values.size(); i++) {
                    f5.values[i] /= chromanorm[1];
                }
            }
            if (chromanorm[2] > 0) {
                for (int i = 0; i < f6.values.size(); i++) {
                    f6.values[i] /= chromanorm[2];
                }
            }

        }

        // local chord estimation
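        // Note (added): m_chorddict seems to hold one 24-element template per chord, matching the
        // stacked bass+treble chroma built above; each chord's salience is the inner product of
        // its template with that 24-bin vector, normalised below so the saliences of a frame sum to 1.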
        vector<float> currentChordSalience;
        float tempchordvalue = 0;
        float sumchordvalue = 0;

        for (int iChord = 0; iChord < nChord; iChord++) {
            tempchordvalue = 0;
            for (int iBin = 0; iBin < 12; iBin++) {
                tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
            }
            for (int iBin = 12; iBin < 24; iBin++) {
                tempchordvalue += m_chorddict[24 * iChord + iBin] * chroma[iBin];
            }
            sumchordvalue+=tempchordvalue;
            currentChordSalience.push_back(tempchordvalue);
        }
        if (sumchordvalue > 0) {
            for (int iChord = 0; iChord < nChord; iChord++) {
                currentChordSalience[iChord] /= sumchordvalue;
            }
        } else {
            currentChordSalience[nChord-1] = 1.0;
        }
        chordogram.push_back(currentChordSalience);

        fsOut[3].push_back(f3);
        fsOut[4].push_back(f4);
        fsOut[5].push_back(f5);
        fsOut[6].push_back(f6);
        count++;
    }
    cerr << "done." << endl;


    /* Simple chord estimation
       I just take the local chord estimates ("currentChordSalience") and average them over time, then
       take the maximum. Very simple, don't do this at home...
    */
    cerr << "[NNLS Chroma Plugin] Chord Estimation ... ";
    count = 0;
    int halfwindowlength = m_inputSampleRate / m_stepSize;
    vector<int> chordSequence;
    for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { // initialise the score chordogram
        vector<int> temp = vector<int>(nChord,0);
        scoreChordogram.push_back(temp);
    }
    for (FeatureList::iterator it = fsOut[6].begin(); it < fsOut[6].end()-2*halfwindowlength-1; ++it) {
        int startIndex = count + 1;
        int endIndex = count + 2 * halfwindowlength;

        float chordThreshold = 2.5/nChord;//*(2*halfwindowlength+1);

        vector<int> chordCandidates;
        for (unsigned iChord = 0; iChord < nChord-1; iChord++) {
            // float currsum = 0;
            // for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
            //     currsum += chordogram[iFrame][iChord];
            // }
            // if (currsum > chordThreshold) chordCandidates.push_back(iChord);
            for (unsigned iFrame = startIndex; iFrame < endIndex; ++iFrame) {
                if (chordogram[iFrame][iChord] > chordThreshold) {
                    chordCandidates.push_back(iChord);
                    break;
                }
            }
        }
        chordCandidates.push_back(nChord-1);
        // cerr << chordCandidates.size() << endl;

        float maxval = 0; // will be the value of the most salient *chord change* in this frame
        float maxindex = 0; //... and the index thereof
        unsigned bestchordL = nChord-1; // index of the best "left" chord
        unsigned bestchordR = nChord-1; // index of the best "right" chord

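        // Note (added): for every split point iWF of the 2*halfwindowlength-frame window, the loop
        // below finds the chord with the largest summed salience on each side (the no-chord
        // candidate nChord-1 is slightly penalised); the best split votes for its left and right
        // chords in scoreChordogram and adds a weight to chordchange at the split position.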
        for (int iWF = 1; iWF < 2*halfwindowlength; ++iWF) {
            // now find the max values on both sides of iWF
            // left side:
            float maxL = 0;
            unsigned maxindL = nChord-1;
            for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
                unsigned iChord = chordCandidates[kChord];
                float currsum = 0;
                for (unsigned iFrame = 0; iFrame < iWF-1; ++iFrame) {
                    currsum += chordogram[count+iFrame][iChord];
                }
                if (iChord == nChord-1) currsum *= 0.8;
                if (currsum > maxL) {
                    maxL = currsum;
                    maxindL = iChord;
                }
            }
            // right side:
            float maxR = 0;
            unsigned maxindR = nChord-1;
            for (unsigned kChord = 0; kChord < chordCandidates.size(); kChord++) {
                unsigned iChord = chordCandidates[kChord];
                float currsum = 0;
                for (unsigned iFrame = iWF-1; iFrame < 2*halfwindowlength; ++iFrame) {
                    currsum += chordogram[count+iFrame][iChord];
                }
                if (iChord == nChord-1) currsum *= 0.8;
                if (currsum > maxR) {
                    maxR = currsum;
                    maxindR = iChord;
                }
            }
            if (maxL+maxR > maxval) {
                maxval = maxL+maxR;
                maxindex = iWF;
                bestchordL = maxindL;
                bestchordR = maxindR;
            }

        }
        // cerr << "maxindex: " << maxindex << ", bestchordR is " << bestchordR << ", of frame " << count << endl;
        // add a score to every chord-frame-point that was part of a maximum
        for (unsigned iFrame = 0; iFrame < maxindex-1; ++iFrame) {
            scoreChordogram[iFrame+count][bestchordL]++;
        }
        for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) {
            scoreChordogram[iFrame+count][bestchordR]++;
        }
        if (bestchordL != bestchordR) chordchange[maxindex+count] += (halfwindowlength - abs(maxindex-halfwindowlength)) * 2.0 / halfwindowlength;
        count++;
    }
    // cerr << "******* agent finished *******" << endl;
    count = 0;
    for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
        float maxval = 0; // will be the value of the most salient chord in this frame
        float maxindex = 0; //... and the index thereof
        for (unsigned iChord = 0; iChord < nChord; iChord++) {
            if (scoreChordogram[count][iChord] > maxval) {
                maxval = scoreChordogram[count][iChord];
                maxindex = iChord;
                // cerr << iChord << endl;
            }
        }
        chordSequence.push_back(maxindex);
        // cerr << "before modefilter, maxindex: " << maxindex << endl;
        count++;
    }
    // cerr << "******* mode filter done *******" << endl;


    // mode filter on chordSequence
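    // Note (added): for each frame the estimate below is (roughly) the most frequent entry of
    // chordSequence within halfwindowlength/2 frames on either side; a new chord label (f7) is
    // emitted only when that estimate changes, and f8 carries the chord-change strength
    // accumulated above.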
    count = 0;
    string oldChord = "";
    for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) {
        Feature f6 = *it;
        Feature f7; // chord estimate
        f7.hasTimestamp = true;
        f7.timestamp = f6.timestamp;
        Feature f8; // chord estimate
        f8.hasTimestamp = true;
        f8.timestamp = f6.timestamp;

        vector<int> chordCount = vector<int>(nChord,0);
        int maxChordCount = 0;
        int maxChordIndex = nChord-1;
        string maxChord;
        int startIndex = max(count - halfwindowlength/2,0);
        int endIndex = min(int(chordogram.size()), count + halfwindowlength/2);
        for (int i = startIndex; i < endIndex; i++) {
            chordCount[chordSequence[i]]++;
            if (chordCount[chordSequence[i]] > maxChordCount) {
                // cerr << "start index " << startIndex << endl;
                maxChordCount++;
                maxChordIndex = chordSequence[i];
                maxChord = m_chordnames[maxChordIndex];
            }
        }
        // chordSequence[count] = maxChordIndex;
        // cerr << maxChordIndex << endl;
        f8.values.push_back(chordchange[count]/(halfwindowlength*2));
        // cerr << chordchange[count] << endl;
        fsOut[9].push_back(f8);
        if (oldChord != maxChord) {
            oldChord = maxChord;

            // char buffer1 [50];
            // if (maxChordIndex < nChord - 1) {
            //     sprintf(buffer1, "%s%s", notenames[maxChordIndex % 12 + 12], chordtypes[maxChordIndex]);
            // } else {
            //     sprintf(buffer1, "N");
            // }
            // f7.label = buffer1;
            f7.label = m_chordnames[maxChordIndex];
            fsOut[7].push_back(f7);
        }
        count++;
    }
    Feature f7; // last chord estimate
    f7.hasTimestamp = true;
    f7.timestamp = fsOut[6][fsOut[6].size()-1].timestamp;
    f7.label = "N";
    fsOut[7].push_back(f7);
    cerr << "done." << endl;
    // // musicity
    // count = 0;
    // int oldlabeltype = 0; // start value is 0, music is 1, speech is 2
    // vector<float> musicityValue;
    // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
    // Feature f4 = *it;
    //
    // int startIndex = max(count - musicitykernelwidth/2,0);
    // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
    // float chromasum = 0;
    // float diffsum = 0;
    // for (int k = 0; k < 12; k++) {
    // for (int i = startIndex + 1; i < endIndex; i++) {
    // chromasum += pow(fsOut[4][i].values[k],2);
    // diffsum += abs(fsOut[4][i-1].values[k] - fsOut[4][i].values[k]);
    // }
    // }
    // diffsum /= chromasum;
    // musicityValue.push_back(diffsum);
    // count++;
    // }
    //
    // float musicityThreshold = 0.44;
    // if (m_stepSize == 4096) {
    // musicityThreshold = 0.74;
    // }
    // if (m_stepSize == 4410) {
    // musicityThreshold = 0.77;
    // }
    //
    // count = 0;
    // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) {
    // Feature f4 = *it;
    // Feature f8; // musicity
    // Feature f9; // musicity segmenter
    //
    // f8.hasTimestamp = true;
    // f8.timestamp = f4.timestamp;
    // f9.hasTimestamp = true;
    // f9.timestamp = f4.timestamp;
    //
    // int startIndex = max(count - musicitykernelwidth/2,0);
    // int endIndex = min(int(chordogram.size()), startIndex + musicitykernelwidth - 1);
    // int musicityCount = 0;
    // for (int i = startIndex; i <= endIndex; i++) {
    // if (musicityValue[i] > musicityThreshold) musicityCount++;
    // }
    // bool isSpeech = (2 * musicityCount > endIndex - startIndex + 1);
    //
    // if (isSpeech) {
    // if (oldlabeltype != 2) {
    // f9.label = "Speech";
    // fsOut[9].push_back(f9);
    // oldlabeltype = 2;
    // }
    // } else {
    // if (oldlabeltype != 1) {
    // f9.label = "Music";
    // fsOut[9].push_back(f9);
    // oldlabeltype = 1;
    // }
    // }
    // f8.values.push_back(musicityValue[count]);
    // fsOut[8].push_back(f8);
    // count++;
    // }
    return fsOut;

}

#endif