changeset 15:701702f8959a classical-rdf

* Better scoring for composer name matches
author Chris Cannam
date Fri, 19 Feb 2010 14:53:22 +0000
parents 06b2ea2ca577
children cb315ba61e03
files common/Objects.cpp testapp/Loader.cpp
diffstat 2 files changed, 128 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/common/Objects.cpp	Fri Feb 19 12:41:23 2010 +0000
+++ b/common/Objects.cpp	Fri Feb 19 14:53:22 2010 +0000
@@ -284,6 +284,20 @@
     return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts));
 }
 
+static int
+calculateThresholdedDistance(EditDistance &ed, const QString &user,
+                             const QString &machine)
+{
+    int threshold = machine.length()/3;
+    int dist;
+    if (threshold == 0) dist = (user == machine ? 0 : -1);
+    else {
+        dist = ed.calculate(user, machine, threshold);
+        if (dist > threshold) dist = -1;
+    }
+    return dist;
+}
+
 float
 Composer::matchFuzzyName(QStringList elements) const
 {
@@ -295,105 +309,168 @@
     EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true);
     
     int score = 0;
+    bool haveSurname = false;
+
+    // We aim to scale the eventual result such that a score of 1.0 or
+    // more indicates near-certainty that this is a correct match
+    // (i.e. that it is properly matched -- not that it is the only
+    // possible match).  To achieve this score, we need to have
+    // matched with reasonable confidence every element in the passed
+    // elements list, and to have matched at least one of them to a
+    // part of our surname.
+
+    int matched = 0;
+    int unmatched = 0;
 
     foreach (QString elt, elements) {
 
         bool accept = false;
-        int initial = 0;
 
         if (elt.length() == 1) {
-            // an initial: search forenames only ignoring connectives
+            // An initial: search forenames only, ignoring
+            // connectives.  The score contribution here is low, but
+            // they do not count to matched which means the score can
+            // only enhance whatever happens elsewhere.  They can
+            // however seriously damage our score if unmatched, which
+            // is as it should be.
             foreach (QString s, m_forenameElements) {
                 if (s[0] == elt[0]) {
-                    score += 3;
-//                    std::cerr << "[initial: " << s.toStdString() << "]" << std::endl;
+                    score += 2;
                     accept = true;
                     break;
                 }
             }
             if (!accept) {
-                score -= 10;
+                foreach (QString s, m_connectiveElements) {
+                    if (s[0] == elt[0]) {
+                        score += 1;
+                        accept = true;
+                        break;
+                    }
+                }
             }
+            if (!accept) {
+                foreach (QString s, m_surnameElements) {
+                    if (s[0] == elt[0]) {
+                        // no score, but don't call it unmatched
+                        accept = true;
+                        break;
+                    }
+                }
+            }
+            if (!accept) ++unmatched;
             continue;
         }
         
         foreach (QString s, m_surnameElements) {
-            if (s[0] == elt[0]) initial = 1;
-            accept = true;
-            int dist = ed.calculate(elt, s, s.length()/3);
-            if (dist == 0) score += 20;
-            else if (dist*3 <= s.length()) { score += 14 - dist + initial; }
-            else if (elt.length() > 3 && s.startsWith(elt)) score += 5;
-            else accept = false;
-            if (accept) {
+            int dist = calculateThresholdedDistance(ed, elt, s);
+            if (dist >= 0) {
+                score += 22 - dist*2;
+                if (elt[0] != s[0]) score -= 10;
+                accept = true;
 //                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (accept) continue;
+        if (accept) {
+            haveSurname = true;
+            ++matched;
+            continue;
+        }
 
         foreach (QString s, m_forenameElements) {
-            if (s[0] == elt[0]) initial = 1;
-            accept = true;
-            int dist = ed.calculate(elt, s, s.length()/3);
-            if (dist == 0) score += 15;
-            else if (dist*3 <= s.length()) score += 9 - dist + initial;
-            else accept = false;
-            if (accept) {
+            int dist = calculateThresholdedDistance(ed, elt, s);
+            if (dist >= 0) {
+                score += 22 - dist*2;
+                if (elt[0] != s[0]) score -= 10;
+                accept = true;
 //                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (accept) continue;
+        if (accept) {
+            ++matched;
+            continue;
+        }
 
         foreach (QString s, m_connectiveElements) {
-            accept = true;
-            int dist = ed.calculate(elt, s, 1);
-            if (dist == 0) score += 4;
-            else if (dist == 1) score += 2;
-            else accept = false; 
+            // treated much like initials
+            int dist = calculateThresholdedDistance(ed, elt, s);
+            if (dist == 0) {
+                score += 2;
+                accept = true;
+            } else if (dist == 1) {
+                score += 1;
+                accept = true;
+            }
             if (accept) {
 //                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (accept) continue;
-
-        if (m_reducedSurnameElements.contains(elt)) {
-            score += 11;
-//            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
+        if (accept) {
             continue;
         }
 
-        if (m_reducedForenameElements.contains(elt)) {
+        QString reduced = reduceName(elt);
+
+        if (m_reducedSurnameElements.contains(reduced)) {
+            score += 10;
+            haveSurname = true;
+            ++matched;
+            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
+            continue;
+        }
+
+        if (m_reducedForenameElements.contains(reduced)) {
             score += 7;
-//            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
+            ++matched;
+            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
             continue;
         }
 
         foreach (QString s, m_otherElements) {
-            accept = true;
-            int dist = ed.calculate(elt, s, s.length()/3);
-            if (dist == 0) score += 12;
-            else if (dist*3 <= s.length()) score += 8 - dist;
-            else accept = false;
-            if (accept) {
+            int dist = calculateThresholdedDistance(ed, elt, s);
+            if (dist >= 0) {
+                score += 22 - dist*2;
+                if (elt[0] != s[0]) score -= 10;
+                accept = true;
 //                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (accept) continue;
+        if (accept) {
+            ++matched;
+            continue;
+        }
 
-        if (initial) score -= 5;
-        else score -= 10;
+        ++unmatched;
     }
+
+//    if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
+    score += fameBonus;
         
-    if (score > 0) {
-//        if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
-        score += fameBonus;
+    if (matched == 0) {
+        if (unmatched == 0) {
+            return float(score) / 20.f;
+        } else {
+            return 0;
+        }
     }
-//    if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl;
-    return float(score) / (elements.size() * 10);
+
+    float fscore = score;
+    float divisor = (matched + unmatched) * 20;
+
+    if (!haveSurname) fscore /= 2;
+    if (unmatched > 0) fscore /= 1.5;
+
+    fscore /= divisor;
+
+    if (matched > 0) {
+//        std::cerr << "[score " << score << " with divisor " << divisor << " for " << name().toStdString() << " adjusted to " << fscore << "]" << std::endl;
+    }
+
+    return fscore;
 }
 
 static int
--- a/testapp/Loader.cpp	Fri Feb 19 12:41:23 2010 +0000
+++ b/testapp/Loader.cpp	Fri Feb 19 14:53:22 2010 +0000
@@ -97,6 +97,7 @@
         std::cerr << std::endl << "Enter composer name: ";
         std::string s;
         getline(std::cin, s);
+        std::cerr << "[" << s << "]" << std::endl;
         QMultiMap<float, QString> matches;
         QRegExp sre("[\\., -]+");
         QStringList elements = QString::fromStdString(s)
@@ -111,7 +112,7 @@
         for (QMultiMap<float, QString>::const_iterator i = matches.end();
              i != matches.begin(); ) {
             --i;
-            if (i.key() < 0) continue;
+            if (i.key() <= 0) continue;
             if (n == 0) {
                 std::cerr << "Best match:" << std::endl << " * ";
             } else if (n == 1) {
@@ -122,7 +123,7 @@
             std::cerr << i.value().toStdString();
             for (int c = i.value().length(); c < 40; ++c) std::cerr << " ";
             std::cerr << "[" << i.key() << "]" << std::endl;
-            if (++n > 5 || i.key() < 1) break;
+            if (++n > 5) break;
         }
         if (n == 0) std::cerr << "No matches" << std::endl;
     }