changeset 13:a1d67e136c30 classical-rdf

* Test data, speedups
author Chris Cannam
date Fri, 19 Feb 2010 11:41:57 +0000
parents dc55b0940f15
children 06b2ea2ca577
files common/EditDistance.cpp common/EditDistance.h common/Objects.cpp common/Objects.h import/import.pro testapp/Loader.cpp
diffstat 6 files changed, 62 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/common/EditDistance.cpp	Thu Feb 18 18:22:07 2010 +0000
+++ b/common/EditDistance.cpp	Fri Feb 19 11:41:57 2010 +0000
@@ -1,11 +1,13 @@
 
 #include "EditDistance.h"
 
+#include <cstdlib> // abs
+
 namespace ClassicalData
 {
 
 int
-EditDistance::calculate(QString a, QString b)
+EditDistance::calculate(QString a, QString b, int threshold)
 {
     // Note: restricted Damerau-Levenshtein edit distance, or
     // Levenshtein if NoTransposition
@@ -13,6 +15,8 @@
     int al = a.length() + 1;
     int bl = b.length() + 1;
 
+    if (threshold > 0 && abs(al - bl) > threshold) return threshold + 1;
+
     int *currRow = (int *)alloca(sizeof(int) * bl);
     int *prevRow = (int *)alloca(sizeof(int) * bl);
     int *ppreRow = (int *)alloca(sizeof(int) * bl);
@@ -24,6 +28,7 @@
 
     for (int j = 1; j < al; ++j) {
 	currRow[0] = j * m_editPenalty;
+	bool stillAcceptable = (threshold == 0);
 	for (int i = 1; i < bl; ++i) {
 	    if (a[j-1] == b[i-1]) {
 		currRow[i] = prevRow[i-1];
@@ -42,7 +47,11 @@
 					  ppreRow[i-2] + m_editPenalty);
 		}
 	    }
+	    if (threshold > 0 && currRow[i] <= threshold) {
+		stillAcceptable = true;
+	    }
 	}
+	if (!stillAcceptable) return threshold + 1;
 	tmp = ppreRow;
 	ppreRow = prevRow;
 	prevRow = currRow;
@@ -50,7 +59,7 @@
     }
 
     int result = prevRow[bl-1];
-    if (m_normalise) {
+    if (m_normalise && m_editPenalty > 1) {
 	result = result / m_editPenalty;
 	if (result % m_editPenalty) ++result;
     }
--- a/common/EditDistance.h	Thu Feb 18 18:22:07 2010 +0000
+++ b/common/EditDistance.h	Fri Feb 19 11:41:57 2010 +0000
@@ -25,7 +25,7 @@
 	m_suffixPenalty(suffixPenalty),
 	m_normalise(normalise) { }
 
-    int calculate(QString a, QString b);
+    int calculate(QString a, QString b, int threshold = 0);
 
 private:
     TranspositionMode m_transpositionMode;
--- a/common/Objects.cpp	Thu Feb 18 18:22:07 2010 +0000
+++ b/common/Objects.cpp	Fri Feb 19 11:41:57 2010 +0000
@@ -92,7 +92,9 @@
     m_reducedSurnameElements.clear();
     m_reducedForenameElements.clear();
     
-    foreach (QString s, m_surname.split(' ')) {
+    static QRegExp sre("[\\., -]+");
+
+    foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) {
         if (s[0].isUpper()) {
             m_surnameElements.push_back(s.toLower());
             m_reducedSurnameElements.push_back(reduceName(s));
@@ -101,7 +103,7 @@
         }
     }
 
-    foreach (QString s, m_forenames.split(' ')) {
+    foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) {
         if (s[0].isUpper()) {
             m_forenameElements.push_back(s.toLower());
             m_reducedForenameElements.push_back(reduceName(s));
@@ -111,10 +113,12 @@
     }
 
     foreach (QString a, m_aliases) {
-        foreach (QString ae, a.split(' ')) {
+        foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) {
             m_otherElements.push_back(ae.toLower());
         }
     }
+
+    m_namesCached = true;
 }
 
 QString
@@ -274,15 +278,19 @@
 int
 Composer::matchFuzzyName(QString n) const
 {
+    int fameBonus = m_pages.size();
+    if (n == name()) return 100 + fameBonus;
+    static QRegExp sre("[\\., -]+");
+    return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts));
+}
+
+int
+Composer::matchFuzzyName(QStringList elements) const
+{
     cacheNames();
     int fameBonus = m_pages.size();
-
-    if (n == name()) return 100 + fameBonus;
     
     EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true);
-
-    static QRegExp sre("[\\., ]+");
-    QStringList elements = n.toLower().split(sre);
     
     int score = 0;
 
@@ -296,7 +304,7 @@
             foreach (QString s, m_forenameElements) {
                 if (s[0] == elt[0]) {
                     score += 3;
-                    std::cerr << "[initial: " << s.toStdString() << "]" << std::endl;
+//                    std::cerr << "[initial: " << s.toStdString() << "]" << std::endl;
                     accept = true;
                     break;
                 }
@@ -310,13 +318,13 @@
         foreach (QString s, m_surnameElements) {
             if (s[0] == elt[0]) initial = 1;
             accept = true;
-            int dist = ed.calculate(elt, s);
+            int dist = ed.calculate(elt, s, s.length()/3);
             if (dist == 0) score += 20;
             else if (dist*3 <= s.length()) { score += 14 - dist + initial; }
             else if (elt.length() > 3 && s.startsWith(elt)) score += 5;
             else accept = false;
             if (accept) {
-                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
+//                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
@@ -325,12 +333,12 @@
         foreach (QString s, m_forenameElements) {
             if (s[0] == elt[0]) initial = 1;
             accept = true;
-            int dist = ed.calculate(elt, s);
+            int dist = ed.calculate(elt, s, s.length()/3);
             if (dist == 0) score += 15;
             else if (dist*3 <= s.length()) score += 9 - dist + initial;
             else accept = false;
             if (accept) {
-                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
+//                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
@@ -338,12 +346,12 @@
 
         foreach (QString s, m_connectiveElements) {
             accept = true;
-            int dist = ed.calculate(elt, s);
+            int dist = ed.calculate(elt, s, 1);
             if (dist == 0) score += 4;
             else if (dist == 1) score += 2;
             else accept = false; 
             if (accept) {
-                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
+//                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
@@ -351,24 +359,24 @@
 
         if (m_reducedSurnameElements.contains(elt)) {
             score += 11;
-            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
+//            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
             continue;
         }
 
         if (m_reducedForenameElements.contains(elt)) {
             score += 7;
-            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
+//            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
             continue;
         }
 
         foreach (QString s, m_otherElements) {
             accept = true;
-            int dist = ed.calculate(elt, s);
+            int dist = ed.calculate(elt, s, s.length()/3);
             if (dist == 0) score += 12;
             else if (dist*3 <= s.length()) score += 8 - dist;
             else accept = false;
             if (accept) {
-                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
+//                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
@@ -379,10 +387,10 @@
     }
         
     if (score > 0) {
-        if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
+//        if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
         score += fameBonus;
     }
-    if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl;
+//    if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl;
     return score;
 }
 
--- a/common/Objects.h	Thu Feb 18 18:22:07 2010 +0000
+++ b/common/Objects.h	Fri Feb 19 11:41:57 2010 +0000
@@ -360,6 +360,16 @@
     int matchFuzzyName(QString name) const;
 
     /**
+     * Given another name which is believed to be a user-entered
+     * composer name with unpredictable formatting and spelling (and
+     * probably incomplete), return an estimate for the likelihood
+     * that the intended composer was this one.  Higher return values
+     * indicate greater confidence.  The supplied name should have
+     * been lower-cased and split on non-alphabetical characters.
+     */
+    int matchFuzzyName(QStringList name) const;
+
+    /**
      * Return the supplied name reduced into a "simplified" form,
      * eliminating many of the differences often found particularly in
      * European language names that have been anglicised.  Used in
--- a/import/import.pro	Thu Feb 18 18:22:07 2010 +0000
+++ b/import/import.pro	Fri Feb 19 11:41:57 2010 +0000
@@ -23,3 +23,10 @@
 	}
 
 }
+
+linux* {
+DEFINES += TURBOT_PROFILER WANT_TIMING
+INCLUDEPATH += ../../turbot/lib/
+LIBPATH += ../../turbot/lib/
+LIBS += -pg -lturbot
+}
--- a/testapp/Loader.cpp	Thu Feb 18 18:22:07 2010 +0000
+++ b/testapp/Loader.cpp	Fri Feb 19 11:41:57 2010 +0000
@@ -98,10 +98,13 @@
         std::string s;
         getline(std::cin, s);
         QMultiMap<int, QString> matches;
+        QRegExp sre("[\\., -]+");
+        QStringList elements = QString::fromStdString(s)
+            .toLower().split(sre, QString::SkipEmptyParts);
 	foreach (QObject *o, composers) {
 	    Composer *c = qobject_cast<Composer *>(o);
 	    if (!c) continue;
-            int value = c->matchFuzzyName(QString::fromStdString(s));
+            int value = c->matchFuzzyName(elements);
             matches.insert(value, c->getSortName(false));
 	}
         int n = 0;