Mercurial > hg > classical
changeset 36:48d8fec75afb
* More on track matching
author | Chris Cannam |
---|---|
date | Tue, 30 Mar 2010 07:29:08 +0100 |
parents | 732fb6b754fb |
children | a8ab8c08a668 |
files | common/Matcher.cpp common/Objects.cpp utilities/track/track.cpp |
diffstat | 3 files changed, 68 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/common/Matcher.cpp Sat Mar 27 12:30:25 2010 +0000
+++ b/common/Matcher.cpp Tue Mar 30 07:29:08 2010 +0100
@@ -252,6 +252,18 @@
     std::cerr << "text = " << text.toStdString() << ", quoted = "
               << quoted.toStdString() << std::endl;
 
+    QStringList components =
+        text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
+    QStringList reduced;
+    foreach (QString c, components) {
+        QString r = Composer::reduceName(c.toLower());
+        if (r != "") {
+            if (r.length() > 3 || r[0].isDigit()) {
+                reduced.push_back(r);
+            }
+        }
+    }
+
     EditDistance ed;
 
     foreach (Work *w, m_works) {
@@ -259,20 +271,47 @@
             break;
         }
 
-        float score = 0.f;
-        QString name = w->name();
+        float highScore = 0.f;
 
-        if (quoted != "") {
-            if (quoteRe.indexIn(name) >= 0) {
-                QString q = quoteRe.cap(1);
-                int dist = ed.calculate(quoted, q, qthresh);
-                if (dist < qthresh) {
-                    std::cerr << "quoted name match: " << q.toStdString() << std::endl;
-                    score += 0.7f - 0.1f * dist;
+        QSet<QString> names = w->aliases();
+        names.insert(w->name());
+
+        foreach (QString name, names) {
+
+            float score = 0.f;
+
+            if (quoted != "") {
+                if (quoteRe.indexIn(name) >= 0) {
+                    QString q = quoteRe.cap(1);
+                    int dist = ed.calculate(quoted, q, qthresh);
+                    if (dist < qthresh) {
+                        std::cerr << "quoted name match: " << q.toStdString() << std::endl;
+                        score += 0.7f - 0.1f * dist;
+                        if (score > highScore) {
+                            highScore = score;
+                            continue;
+                        }
+                    }
                 }
             }
-        }
 
+            QStringList wcomp =
+                name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
+            QSet<QString> wr;
+            foreach (QString wc, wcomp) {
+                wr.insert(Composer::reduceName(wc.toLower()));
+            }
+            foreach (QString rc, reduced) {
+                if (wr.contains(rc)) {
+                    std::cerr << "component match: " << rc.toStdString() << std::endl;
+                    score += 0.1;
+                } else {
+                    score -= 0.101;
+                }
+            }
+
+            if (score > highScore) highScore = score;
+/*
         if (score == 0.f) {
             int ml = std::min(name.length(), text.length());
             int thresh = ml / 4;
@@ -283,13 +322,17 @@
             }
         }
 
-        //!!! how to avoid high scores for things that we should be
-        //!!! able to recognise as different? e.g. "Chamber Symphony
-        //!!! No. 2" scoring very highly as a match for "Chamber
-        //!!! Symphony No. 1"
 
-        if (score > 0.f) {
-            matches.insert(Guess(score, w));
+
+        // need to avoid high scores for things with differing
+        // numbers, e.g. "Chamber Symphony No. 2" should not score
+        // highly as a match for "Chamber Symphony No. 1"
+        */
+        }
+
+        if (highScore > 0.f) {
+            std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
+            matches.insert(Guess(highScore, w));
         }
     }
 
--- a/common/Objects.cpp Sat Mar 27 12:30:25 2010 +0000
+++ b/common/Objects.cpp Tue Mar 30 07:29:08 2010 +0100
@@ -206,14 +206,20 @@
         .replace("'", "")
         .replace("x", "ks")
         .replace("y", "i")
+        .replace("ie", "i")
+        .replace("ei", "i")
+        .replace("ii", "i")
         .replace("k", "c")
-        .replace("ch", "c")
-        .replace("cc", "c")
         .replace("aa", "a")
+        .replace("a", "e")
+        .replace("ee", "e")
         .replace("v", "f")
+        .replace("ph", "f")
         .replace("ff", "f")
         .replace("th", "t")
         .replace("tch", "ch")
+        .replace("ch", "c")
+        .replace("cc", "c")
         .replace("er", "r");
     return key;
 }
--- a/utilities/track/track.cpp Sat Mar 27 12:30:25 2010 +0000
+++ b/utilities/track/track.cpp Tue Mar 30 07:29:08 2010 +0100
@@ -413,7 +413,7 @@
 guessWorkFromFilename(QString filename, float scale, GuessSet &guesses)
 {
     cerr << "guessWorkFromFilename: " << filename << endl;
-    QString filepart = QFileInfo(filename).fileName().replace(QRegExp("\\.[^\\.]*"), "");
+    QString filepart = QFileInfo(filename).fileName().replace(QRegExp("\\.[^\\.]*"), "").replace(QRegExp("^\\d+[^\\w]+"), "");
     guessWorkFromTitle(filepart, scale, guesses);
 }
 