Mercurial > hg > nnls-chroma
comparison NNLSChroma.cpp @ 13:9ae90fa5fa74 matthiasm-plugin
NNLS is now taken from a file without gpl. more chroma normalisation options.
author | matthiasm |
---|---|
date | Wed, 16 Jun 2010 10:16:13 +0000 |
parents | 54f28d8ac098 |
children | 75fb80542cd2 |
comparison
equal
deleted
inserted
replaced
12:54f28d8ac098 | 13:9ae90fa5fa74 |
---|---|
431 string | 431 string |
432 NNLSChroma::getDescription() const | 432 NNLSChroma::getDescription() const |
433 { | 433 { |
434 // Return something helpful here! | 434 // Return something helpful here! |
435 if (debug_on) cerr << "--> getDescription" << endl; | 435 if (debug_on) cerr << "--> getDescription" << endl; |
436 return "This plugin provides a number of features derived from a log-frequency amplitude spectrum (LAS) of the DFT: the LAS itself, a standard-tuned version thereof (the local and global tuning estimates can are also be output), an approximate transcription to semitone activation using non-linear least squares (NNLS). Furthermore chroma features and a simple chord estimate derived from this NNLS semitone transcription."; | 436 return "This plugin provides a number of features derived from a log-frequency amplitude spectrum of the DFT: some variants of the log-frequency spectrum, including a semitone spectrum derived from approximate transcription using the NNLS algorithm; based on this semitone spectrum, chroma features and a simple chord estimate."; |
437 } | 437 } |
438 | 438 |
439 string | 439 string |
440 NNLSChroma::getMaker() const | 440 NNLSChroma::getMaker() const |
441 { | 441 { |
567 d4.identifier = "chromanormalize"; | 567 d4.identifier = "chromanormalize"; |
568 d4.name = "chroma normalization"; | 568 d4.name = "chroma normalization"; |
569 d4.description = "How shall the chroma vector be normalized?"; | 569 d4.description = "How shall the chroma vector be normalized?"; |
570 d4.unit = ""; | 570 d4.unit = ""; |
571 d4.minValue = 0; | 571 d4.minValue = 0; |
572 d4.maxValue = 1; | 572 d4.maxValue = 3; |
573 d4.defaultValue = 0; | 573 d4.defaultValue = 0; |
574 d4.isQuantized = true; | 574 d4.isQuantized = true; |
575 d4.valueNames.push_back("no normalization"); | 575 d4.valueNames.push_back("none"); |
576 d4.valueNames.push_back("maximum normalization"); | 576 d4.valueNames.push_back("maximum norm"); |
577 d4.valueNames.push_back("L1 norm"); | |
578 d4.valueNames.push_back("L2 norm"); | |
577 d4.quantizeStep = 1.0; | 579 d4.quantizeStep = 1.0; |
578 list.push_back(d4); | 580 list.push_back(d4); |
579 | 581 |
580 return list; | 582 return list; |
581 } | 583 } |
1043 fsOut[0].push_back(f0); | 1045 fsOut[0].push_back(f0); |
1044 | 1046 |
1045 /** Tune Log-Frequency Spectrogram | 1047 /** Tune Log-Frequency Spectrogram |
1046 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to | 1048 calculate a tuned log-frequency spectrogram (f2): use the tuning estimated above (kinda f0) to |
1047 perform linear interpolation on the existing log-frequency spectrogram (kinda f1). | 1049 perform linear interpolation on the existing log-frequency spectrogram (kinda f1). |
1048 **/ | 1050 **/ |
1049 | 1051 cerr << "[NNLS Chroma Plugin] Tuning Log-Frequency Spectrogram ... "; |
1052 | |
1050 float tempValue = 0; | 1053 float tempValue = 0; |
1051 float dbThreshold = 0; // relative to the background spectrum | 1054 float dbThreshold = 0; // relative to the background spectrum |
1052 float thresh = pow(10,dbThreshold/20); | 1055 float thresh = pow(10,dbThreshold/20); |
1053 // cerr << "tune local ? " << m_tuneLocal << endl; | 1056 // cerr << "tune local ? " << m_tuneLocal << endl; |
1054 int count = 0; | 1057 int count = 0; |
1092 } | 1095 } |
1093 } | 1096 } |
1094 fsOut[2].push_back(f2); | 1097 fsOut[2].push_back(f2); |
1095 count++; | 1098 count++; |
1096 } | 1099 } |
1100 cerr << "done." << endl; | |
1097 | 1101 |
1098 /** Semitone spectrum and chromagrams | 1102 /** Semitone spectrum and chromagrams |
1099 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum | 1103 Semitone-spaced log-frequency spectrum derived from the tuned log-freq spectrum above. the spectrum |
1100 is inferred using a non-negative least squares algorithm. | 1104 is inferred using a non-negative least squares algorithm. |
1101 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means | 1105 Three different kinds of chromagram are calculated, "treble", "bass", and "both" (which means |
1102 bass and treble stacked onto each other). | 1106 bass and treble stacked onto each other). |
1103 **/ | 1107 **/ |
1104 // taucs_ccs_matrix* A_original_ordering = taucs_construct_sorted_ccs_matrix(nnlsdict06, nnls_m, nnls_n); | 1108 if (m_dictID == 1) { |
1109 cerr << "[NNLS Chroma Plugin] Mapping to semitone spectrum and chroma ... "; | |
1110 } else { | |
1111 cerr << "[NNLS Chroma Plugin] Performing NNLS and mapping to chroma ... "; | |
1112 } | |
1113 | |
1105 | 1114 |
1106 vector<vector<float> > chordogram; | 1115 vector<vector<float> > chordogram; |
1107 vector<vector<int> > scoreChordogram; | 1116 vector<vector<int> > scoreChordogram; |
1108 vector<float> oldchroma = vector<float>(12,0); | 1117 vector<float> oldchroma = vector<float>(12,0); |
1109 vector<float> oldbasschroma = vector<float>(12,0); | 1118 vector<float> oldbasschroma = vector<float>(12,0); |
1197 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]]; | 1206 chroma[signifIndex[iNote] % 12] += x[iNote] * treblewindow[signifIndex[iNote]]; |
1198 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]]; | 1207 basschroma[signifIndex[iNote] % 12] += x[iNote] * basswindow[signifIndex[iNote]]; |
1199 } | 1208 } |
1200 } | 1209 } |
1201 } | 1210 } |
1211 | |
1202 | 1212 |
1203 | 1213 |
1204 if (m_doNormalizeChroma > 0) { | 1214 |
1205 float chromamax = *max_element(chroma.begin(), chroma.end()); | |
1206 for (int i = 0; i < chroma.size(); i++) { | |
1207 chroma[i] /= chromamax; | |
1208 } | |
1209 } | |
1210 f4.values = chroma; | 1215 f4.values = chroma; |
1211 f5.values = basschroma; | 1216 f5.values = basschroma; |
1212 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas | 1217 chroma.insert(chroma.begin(), basschroma.begin(), basschroma.end()); // just stack the both chromas |
1213 f6.values = chroma; | 1218 f6.values = chroma; |
1214 | 1219 |
1220 if (m_doNormalizeChroma > 0) { | |
1221 vector<float> chromanorm = vector<float>(3,0); | |
1222 switch (int(m_doNormalizeChroma)) { | |
1223 case 0: // should never end up here | |
1224 break; | |
1225 case 1: | |
1226 chromanorm[0] = *max_element(f4.values.begin(), f4.values.end()); | |
1227 chromanorm[1] = *max_element(f5.values.begin(), f5.values.end()); | |
1228 chromanorm[2] = max(chromanorm[0], chromanorm[1]); | |
1229 break; | |
1230 case 2: | |
1231 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) { | |
1232 chromanorm[0] += *it; | |
1233 } | |
1234 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) { | |
1235 chromanorm[1] += *it; | |
1236 } | |
1237 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) { | |
1238 chromanorm[2] += *it; | |
1239 } | |
1240 break; | |
1241 case 3: | |
1242 for (vector<float>::iterator it = f4.values.begin(); it != f4.values.end(); ++it) { | |
1243 chromanorm[0] += pow(*it,2); | |
1244 } | |
1245 chromanorm[0] = sqrt(chromanorm[0]); | |
1246 for (vector<float>::iterator it = f5.values.begin(); it != f5.values.end(); ++it) { | |
1247 chromanorm[1] += pow(*it,2); | |
1248 } | |
1249 chromanorm[1] = sqrt(chromanorm[1]); | |
1250 for (vector<float>::iterator it = f6.values.begin(); it != f6.values.end(); ++it) { | |
1251 chromanorm[2] += pow(*it,2); | |
1252 } | |
1253 chromanorm[2] = sqrt(chromanorm[2]); | |
1254 break; | |
1255 } | |
1256 if (chromanorm[0] > 0) { | |
1257 for (int i = 0; i < f4.values.size(); i++) { | |
1258 f4.values[i] /= chromanorm[0]; | |
1259 } | |
1260 } | |
1261 if (chromanorm[1] > 0) { | |
1262 for (int i = 0; i < f5.values.size(); i++) { | |
1263 f5.values[i] /= chromanorm[1]; | |
1264 } | |
1265 } | |
1266 if (chromanorm[2] > 0) { | |
1267 for (int i = 0; i < f6.values.size(); i++) { | |
1268 f6.values[i] /= chromanorm[2]; | |
1269 } | |
1270 } | |
1271 | |
1272 } | |
1273 | |
1215 // local chord estimation | 1274 // local chord estimation |
1216 vector<float> currentChordSalience; | 1275 vector<float> currentChordSalience; |
1217 float tempchordvalue = 0; | 1276 float tempchordvalue = 0; |
1218 float sumchordvalue = 0; | 1277 float sumchordvalue = 0; |
1219 | 1278 |
1237 fsOut[4].push_back(f4); | 1296 fsOut[4].push_back(f4); |
1238 fsOut[5].push_back(f5); | 1297 fsOut[5].push_back(f5); |
1239 fsOut[6].push_back(f6); | 1298 fsOut[6].push_back(f6); |
1240 count++; | 1299 count++; |
1241 } | 1300 } |
1242 cerr << "******* NNLS done *******" << endl; | 1301 cerr << "done." << endl; |
1302 | |
1243 | 1303 |
1244 /* Simple chord estimation | 1304 /* Simple chord estimation |
1245 I just take the local chord estimates ("currentChordSalience") and average them over time, then | 1305 I just take the local chord estimates ("currentChordSalience") and average them over time, then |
1246 take the maximum. Very simple, don't do this at home... | 1306 take the maximum. Very simple, don't do this at home... |
1247 */ | 1307 */ |
1308 cerr << "[NNLS Chroma Plugin] Chord Estimation ... "; | |
1248 count = 0; | 1309 count = 0; |
1249 int halfwindowlength = m_inputSampleRate / m_stepSize; | 1310 int halfwindowlength = m_inputSampleRate / m_stepSize; |
1250 vector<int> chordSequence; | 1311 vector<int> chordSequence; |
1251 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { // initialise the score chordogram | 1312 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { // initialise the score chordogram |
1252 vector<int> temp = vector<int>(nChord,0); | 1313 vector<int> temp = vector<int>(nChord,0); |
1328 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) { | 1389 for (unsigned iFrame = maxindex-1; iFrame < 2*halfwindowlength; ++iFrame) { |
1329 scoreChordogram[iFrame+count][bestchordR]++; | 1390 scoreChordogram[iFrame+count][bestchordR]++; |
1330 } | 1391 } |
1331 count++; | 1392 count++; |
1332 } | 1393 } |
1333 cerr << "******* agent finished *******" << endl; | 1394 // cerr << "******* agent finished *******" << endl; |
1334 count = 0; | 1395 count = 0; |
1335 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { | 1396 for (FeatureList::iterator it = fsOut[6].begin(); it != fsOut[6].end(); ++it) { |
1336 float maxval = 0; // will be the value of the most salient chord in this frame | 1397 float maxval = 0; // will be the value of the most salient chord in this frame |
1337 float maxindex = 0; //... and the index thereof | 1398 float maxindex = 0; //... and the index thereof |
1338 for (unsigned iChord = 0; iChord < nChord; iChord++) { | 1399 for (unsigned iChord = 0; iChord < nChord; iChord++) { |
1344 } | 1405 } |
1345 chordSequence.push_back(maxindex); | 1406 chordSequence.push_back(maxindex); |
1346 // cerr << "before modefilter, maxindex: " << maxindex << endl; | 1407 // cerr << "before modefilter, maxindex: " << maxindex << endl; |
1347 count++; | 1408 count++; |
1348 } | 1409 } |
1349 cerr << "******* mode filter done *******" << endl; | 1410 // cerr << "******* mode filter done *******" << endl; |
1350 | 1411 |
1351 | 1412 |
1352 // mode filter on chordSequence | 1413 // mode filter on chordSequence |
1353 count = 0; | 1414 count = 0; |
1354 string oldChord = ""; | 1415 string oldChord = ""; |
1387 f7.label = m_chordnames[maxChordIndex]; | 1448 f7.label = m_chordnames[maxChordIndex]; |
1388 fsOut[7].push_back(f7); | 1449 fsOut[7].push_back(f7); |
1389 } | 1450 } |
1390 count++; | 1451 count++; |
1391 } | 1452 } |
1453 cerr << "done." << endl; | |
1392 // // musicity | 1454 // // musicity |
1393 // count = 0; | 1455 // count = 0; |
1394 // int oldlabeltype = 0; // start value is 0, music is 1, speech is 2 | 1456 // int oldlabeltype = 0; // start value is 0, music is 1, speech is 2 |
1395 // vector<float> musicityValue; | 1457 // vector<float> musicityValue; |
1396 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) { | 1458 // for (FeatureList::iterator it = fsOut[4].begin(); it != fsOut[4].end(); ++it) { |