Mercurial > hg > classical
changeset 15:701702f8959a classical-rdf
* Better scoring for composer name matches
author | Chris Cannam |
---|---|
date | Fri, 19 Feb 2010 14:53:22 +0000 |
parents | 06b2ea2ca577 |
children | cb315ba61e03 |
files | common/Objects.cpp testapp/Loader.cpp |
diffstat | 2 files changed, 128 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
--- a/common/Objects.cpp Fri Feb 19 12:41:23 2010 +0000 +++ b/common/Objects.cpp Fri Feb 19 14:53:22 2010 +0000 @@ -284,6 +284,20 @@ return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts)); } +static int +calculateThresholdedDistance(EditDistance &ed, const QString &user, + const QString &machine) +{ + int threshold = machine.length()/3; + int dist; + if (threshold == 0) dist = (user == machine ? 0 : -1); + else { + dist = ed.calculate(user, machine, threshold); + if (dist > threshold) dist = -1; + } + return dist; +} + float Composer::matchFuzzyName(QStringList elements) const { @@ -295,105 +309,168 @@ EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true); int score = 0; + bool haveSurname = false; + + // We aim to scale the eventual result such that a score of 1.0 or + // more indicates near-certainty that this is a correct match + // (i.e. that it is properly matched -- not that it is the only + // possible match). To achieve this score, we need to have + // matched with reasonable confidence every element in the passed + // elements list, and to have matched at least one of them to a + // part of our surname. + + int matched = 0; + int unmatched = 0; foreach (QString elt, elements) { bool accept = false; - int initial = 0; if (elt.length() == 1) { - // an initial: search forenames only ignoring connectives + // An initial: search forenames only, ignoring + // connectives. The score contribution here is low, but + // they do not count to matched which means the score can + // only enhance whatever happens elsewhere. They can + // however seriously damage our score if unmatched, which + // is as it should be. foreach (QString s, m_forenameElements) { if (s[0] == elt[0]) { - score += 3; -// std::cerr << "[initial: " << s.toStdString() << "]" << std::endl; + score += 2; accept = true; break; } } if (!accept) { - score -= 10; + foreach (QString s, m_connectiveElements) { + if (s[0] == elt[0]) { + score += 1; + accept = true; + break; + } + } } + if (!accept) { + foreach (QString s, m_surnameElements) { + if (s[0] == elt[0]) { + // no score, but don't call it unmatched + accept = true; + break; + } + } + } + if (!accept) ++unmatched; continue; } foreach (QString s, m_surnameElements) { - if (s[0] == elt[0]) initial = 1; - accept = true; - int dist = ed.calculate(elt, s, s.length()/3); - if (dist == 0) score += 20; - else if (dist*3 <= s.length()) { score += 14 - dist + initial; } - else if (elt.length() > 3 && s.startsWith(elt)) score += 5; - else accept = false; - if (accept) { + int dist = calculateThresholdedDistance(ed, elt, s); + if (dist >= 0) { + score += 22 - dist*2; + if (elt[0] != s[0]) score -= 10; + accept = true; // std::cerr << "[surname: " << s.toStdString() << "]" << std::endl; break; } } - if (accept) continue; + if (accept) { + haveSurname = true; + ++matched; + continue; + } foreach (QString s, m_forenameElements) { - if (s[0] == elt[0]) initial = 1; - accept = true; - int dist = ed.calculate(elt, s, s.length()/3); - if (dist == 0) score += 15; - else if (dist*3 <= s.length()) score += 9 - dist + initial; - else accept = false; - if (accept) { + int dist = calculateThresholdedDistance(ed, elt, s); + if (dist >= 0) { + score += 22 - dist*2; + if (elt[0] != s[0]) score -= 10; + accept = true; // std::cerr << "[forename: " << s.toStdString() << "]" << std::endl; break; } } - if (accept) continue; + if (accept) { + ++matched; + continue; + } foreach (QString s, m_connectiveElements) { - accept = true; - int dist = ed.calculate(elt, s, 1); - if (dist == 0) score += 4; - else if (dist == 1) score += 2; - else accept = false; + // treated much like initials + int dist = calculateThresholdedDistance(ed, elt, s); + if (dist == 0) { + score += 2; + accept = true; + } else if (dist == 1) { + score += 1; + accept = true; + } if (accept) { // std::cerr << "[connective: " << s.toStdString() << "]" << std::endl; break; } } - if (accept) continue; - - if (m_reducedSurnameElements.contains(elt)) { - score += 11; -// std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; + if (accept) { continue; } - if (m_reducedForenameElements.contains(elt)) { + QString reduced = reduceName(elt); + + if (m_reducedSurnameElements.contains(reduced)) { + score += 10; + haveSurname = true; + ++matched; + std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; + continue; + } + + if (m_reducedForenameElements.contains(reduced)) { score += 7; -// std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; + ++matched; + std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; continue; } foreach (QString s, m_otherElements) { - accept = true; - int dist = ed.calculate(elt, s, s.length()/3); - if (dist == 0) score += 12; - else if (dist*3 <= s.length()) score += 8 - dist; - else accept = false; - if (accept) { + int dist = calculateThresholdedDistance(ed, elt, s); + if (dist >= 0) { + score += 22 - dist*2; + if (elt[0] != s[0]) score -= 10; + accept = true; // std::cerr << "[other: " << s.toStdString() << "]" << std::endl; break; } } - if (accept) continue; + if (accept) { + ++matched; + continue; + } - if (initial) score -= 5; - else score -= 10; + ++unmatched; } + +// if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; + score += fameBonus; - if (score > 0) { -// if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; - score += fameBonus; + if (matched == 0) { + if (unmatched == 0) { + return float(score) / 20.f; + } else { + return 0; + } } -// if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl; - return float(score) / (elements.size() * 10); + + float fscore = score; + float divisor = (matched + unmatched) * 20; + + if (!haveSurname) fscore /= 2; + if (unmatched > 0) fscore /= 1.5; + + fscore /= divisor; + + if (matched > 0) { +// std::cerr << "[score " << score << " with divisor " << divisor << " for " << name().toStdString() << " adjusted to " << fscore << "]" << std::endl; + } + + return fscore; } static int
--- a/testapp/Loader.cpp Fri Feb 19 12:41:23 2010 +0000 +++ b/testapp/Loader.cpp Fri Feb 19 14:53:22 2010 +0000 @@ -97,6 +97,7 @@ std::cerr << std::endl << "Enter composer name: "; std::string s; getline(std::cin, s); + std::cerr << "[" << s << "]" << std::endl; QMultiMap<float, QString> matches; QRegExp sre("[\\., -]+"); QStringList elements = QString::fromStdString(s) @@ -111,7 +112,7 @@ for (QMultiMap<float, QString>::const_iterator i = matches.end(); i != matches.begin(); ) { --i; - if (i.key() < 0) continue; + if (i.key() <= 0) continue; if (n == 0) { std::cerr << "Best match:" << std::endl << " * "; } else if (n == 1) { @@ -122,7 +123,7 @@ std::cerr << i.value().toStdString(); for (int c = i.value().length(); c < 40; ++c) std::cerr << " "; std::cerr << "[" << i.key() << "]" << std::endl; - if (++n > 5 || i.key() < 1) break; + if (++n > 5) break; } if (n == 0) std::cerr << "No matches" << std::endl; }