changeset 36:48d8fec75afb

* More on track matching
author Chris Cannam
date Tue, 30 Mar 2010 07:29:08 +0100
parents 732fb6b754fb
children a8ab8c08a668
files common/Matcher.cpp common/Objects.cpp utilities/track/track.cpp
diffstat 3 files changed, 68 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/common/Matcher.cpp	Sat Mar 27 12:30:25 2010 +0000
+++ b/common/Matcher.cpp	Tue Mar 30 07:29:08 2010 +0100
@@ -252,6 +252,18 @@
     std::cerr << "text = " << text.toStdString() << ", quoted = "
               << quoted.toStdString() << std::endl;
 
+    QStringList components =
+        text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
+    QStringList reduced;
+    foreach (QString c, components) {
+        QString r = Composer::reduceName(c.toLower());
+        if (r != "") {
+            if (r.length() > 3 || r[0].isDigit()) {
+                reduced.push_back(r);
+            }
+        }
+    }
+
     EditDistance ed;
 
     foreach (Work *w, m_works) {
@@ -259,20 +271,47 @@
             break;
         }
 
-        float score = 0.f;
-        QString name = w->name();
+        float highScore = 0.f;
 
-        if (quoted != "") {
-            if (quoteRe.indexIn(name) >= 0) {
-                QString q = quoteRe.cap(1);
-                int dist = ed.calculate(quoted, q, qthresh);
-                if (dist < qthresh) {
-                    std::cerr << "quoted name match: " << q.toStdString() << std::endl;
-                    score += 0.7f - 0.1f * dist;
+        QSet<QString> names = w->aliases();
+        names.insert(w->name());
+
+        foreach (QString name, names) {
+
+            float score = 0.f;
+
+            if (quoted != "") {
+                if (quoteRe.indexIn(name) >= 0) {
+                    QString q = quoteRe.cap(1);
+                    int dist = ed.calculate(quoted, q, qthresh);
+                    if (dist < qthresh) {
+                        std::cerr << "quoted name match: " << q.toStdString() << std::endl;
+                        score += 0.7f - 0.1f * dist;
+                        if (score > highScore) {
+                            highScore = score;
+                            continue;
+                        }
+                    }
                 }
             }
-        }
 
+            QStringList wcomp =
+                name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
+            QSet<QString> wr;
+            foreach (QString wc, wcomp) {
+                wr.insert(Composer::reduceName(wc.toLower()));
+            }
+            foreach (QString rc, reduced) {
+                if (wr.contains(rc)) {
+                    std::cerr << "component match: " << rc.toStdString() << std::endl;
+                    score += 0.1;
+                } else {
+                    score -= 0.101;
+                }
+            }
+
+            if (score > highScore) highScore = score;
+/*
         if (score == 0.f) {
             int ml = std::min(name.length(), text.length());
             int thresh = ml / 4;
@@ -283,13 +322,17 @@
             }
         }
 
-        //!!! how to avoid high scores for things that we should be
-        //!!! able to recognise as different? e.g. "Chamber Symphony
-        //!!! No. 2" scoring very highly as a match for "Chamber
-        //!!! Symphony No. 1"
-        
         if (score > 0.f) {
-            matches.insert(Guess(score, w));
+
+            // need to avoid high scores for things with differing
+            // numbers, e.g. "Chamber Symphony No. 2" should not score
+            // highly as a match for "Chamber Symphony No. 1"
+            */
+        }
+
+        if (highScore > 0.f) {
+            std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
+            matches.insert(Guess(highScore, w));
         }
     }
     
--- a/common/Objects.cpp	Sat Mar 27 12:30:25 2010 +0000
+++ b/common/Objects.cpp	Tue Mar 30 07:29:08 2010 +0100
@@ -206,14 +206,20 @@
         .replace("'", "")
         .replace("x", "ks")
         .replace("y", "i")
+        .replace("ie", "i")
+        .replace("ei", "i")
+        .replace("ii", "i")
         .replace("k", "c")
-        .replace("ch", "c")
-        .replace("cc", "c")
         .replace("aa", "a")
+        .replace("a", "e")
+        .replace("ee", "e")
         .replace("v", "f")
+        .replace("ph", "f")
         .replace("ff", "f")
         .replace("th", "t")
         .replace("tch", "ch")
+        .replace("ch", "c")
+        .replace("cc", "c")
         .replace("er", "r");
     return key;
 }
--- a/utilities/track/track.cpp	Sat Mar 27 12:30:25 2010 +0000
+++ b/utilities/track/track.cpp	Tue Mar 30 07:29:08 2010 +0100
@@ -413,7 +413,7 @@
 guessWorkFromFilename(QString filename, float scale, GuessSet &guesses)
 {
     cerr << "guessWorkFromFilename: " << filename << endl;
-    QString filepart = QFileInfo(filename).fileName().replace(QRegExp("\\.[^\\.]*"), "");
+    QString filepart = QFileInfo(filename).fileName().replace(QRegExp("\\.[^\\.]*"), "").replace(QRegExp("^\\d+[^\\w]+"), "");
     guessWorkFromTitle(filepart, scale, guesses);
 }