changeset 11:98047b91b09d classical-rdf

* Add "proper" composer name matching from user input. Needs to be faster though
author Chris Cannam
date Thu, 18 Feb 2010 18:16:43 +0000
parents d35e5d769c87
children dc55b0940f15
files common/EditDistance.cpp common/EditDistance.h common/Objects.cpp common/Objects.h common/common.pro testapp/Loader.cpp testapp/testapp.pro
diffstat 7 files changed, 327 insertions(+), 188 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/common/EditDistance.cpp	Thu Feb 18 18:16:43 2010 +0000
@@ -0,0 +1,61 @@
+
+#include "EditDistance.h"
+
+namespace ClassicalData
+{
+
+int
+EditDistance::calculate(QString a, QString b)
+{
+    // Note: restricted Damerau-Levenshtein edit distance, or
+    // Levenshtein if NoTransposition
+
+    int al = a.length() + 1;
+    int bl = b.length() + 1;
+
+    int *currRow = (int *)alloca(sizeof(int) * bl);
+    int *prevRow = (int *)alloca(sizeof(int) * bl);
+    int *ppreRow = (int *)alloca(sizeof(int) * bl);
+    int *tmp = 0;
+    
+    for (int i = 0; i < bl; ++i) {
+	prevRow[i] = i * m_editPenalty;
+    }
+
+    for (int j = 1; j < al; ++j) {
+	currRow[0] = j * m_editPenalty;
+	for (int i = 1; i < bl; ++i) {
+	    if (a[j-1] == b[i-1]) {
+		currRow[i] = prevRow[i-1];
+	    } else {
+		int horizontal, vertical, diagonal;
+		horizontal = currRow[i-1] + m_editPenalty;
+		diagonal   = prevRow[i-1] + m_editPenalty;
+		if (i+1 == bl) vertical = prevRow[i] + m_suffixPenalty;
+		else vertical = prevRow[i] + m_editPenalty;
+		currRow[i] =
+		    std::min(horizontal, std::min(vertical, diagonal));
+	    }
+	    if (m_transpositionMode == RestrictedTransposition) {
+		if (i > 1 && j > 1 && a[i-1]==b[j-2] && a[i-2]==b[j-1]) {
+		    currRow[i] = std::min(currRow[i],
+					  ppreRow[i-2] + m_editPenalty);
+		}
+	    }
+	}
+	tmp = ppreRow;
+	ppreRow = prevRow;
+	prevRow = currRow;
+	currRow = tmp;
+    }
+
+    int result = prevRow[bl-1];
+    if (m_normalise) {
+	result = result / m_editPenalty;
+	if (result % m_editPenalty) ++result;
+    }
+    return result;
+}
+
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/common/EditDistance.h	Thu Feb 18 18:16:43 2010 +0000
@@ -0,0 +1,41 @@
+
+#ifndef _EDIT_DISTANCE_H_
+#define _EDIT_DISTANCE_H_
+
+#include <alloca.h>
+
+#include <QString>
+
+namespace ClassicalData {
+
+class EditDistance
+{
+public:
+    enum TranspositionMode {
+	NoTransposition,
+	RestrictedTransposition
+    };
+
+    EditDistance(TranspositionMode tm = RestrictedTransposition,
+		 int editPenalty = 1,
+		 int suffixPenalty = 1,
+		 bool normalise = true) :
+	m_transpositionMode(tm),
+	m_editPenalty(editPenalty),
+	m_suffixPenalty(suffixPenalty),
+	m_normalise(normalise) { }
+
+    int calculate(QString a, QString b);
+
+private:
+    TranspositionMode m_transpositionMode;
+    int m_editPenalty;
+    int m_suffixPenalty;
+    bool m_normalise;
+};
+
+}
+
+#endif
+
+    
--- a/common/Objects.cpp	Wed Feb 17 19:26:48 2010 +0000
+++ b/common/Objects.cpp	Thu Feb 18 18:16:43 2010 +0000
@@ -7,6 +7,8 @@
 #include <cstdlib>
 #include <iostream>
 
+#include "EditDistance.h"
+
 #include <QHash> // to ensure correct qHash(const QString &) is found
 
 unsigned int
@@ -53,6 +55,68 @@
     return true;
 }
 
+void
+Composer::cacheNames() const
+{
+    if (m_namesCached) return;
+
+    QString n = name();
+    QStringList pl = n.split(", ");
+
+    if (pl.size() == 1) {
+        QStringList pl2;
+        pl = n.split(' ');
+        pl2.push_back(pl[pl.size()-1]);
+        pl2.push_back("");
+        for (int i = 0; i+1 < pl.size(); ++i) {
+            if (i > 0) pl2[1] += " ";
+            pl2[1] += pl[i];
+        }
+        pl = pl2;
+    }
+
+    m_surname = pl[0];
+
+    n = "";
+    for (int i = 1; i < pl.size(); ++i) {
+        if (i > 1) n += ", ";
+        n += pl[i];
+    }
+
+    m_forenames = n;
+
+    m_surnameElements.clear();
+    m_connectiveElements.clear();
+    m_forenameElements.clear();
+    m_otherElements.clear();
+    m_reducedSurnameElements.clear();
+    m_reducedForenameElements.clear();
+    
+    foreach (QString s, m_surname.split(' ')) {
+        if (s[0].isUpper()) {
+            m_surnameElements.push_back(s.toLower());
+            m_reducedSurnameElements.push_back(reduceName(s));
+        } else {
+            m_connectiveElements.push_back(s.toLower());
+        }
+    }
+
+    foreach (QString s, m_forenames.split(' ')) {
+        if (s[0].isUpper()) {
+            m_forenameElements.push_back(s.toLower());
+            m_reducedForenameElements.push_back(reduceName(s));
+        } else {
+            m_connectiveElements.push_back(s.toLower());
+        }
+    }
+
+    foreach (QString a, m_aliases) {
+        foreach (QString ae, a.split(' ')) {
+            m_otherElements.push_back(ae.toLower());
+        }
+    }
+}
+
 QString
 Composer::getSortName(bool caps) const
 {
@@ -66,46 +130,15 @@
 QString
 Composer::getSurname() const
 {
-    //!!! slow (dup with getForenames)
-    QString n = name();
-    QStringList pl = n.split(", ");
-    if (pl.size() == 1) {
-        QStringList pl2;
-        pl = n.split(' ');
-        pl2.push_back(pl[pl.size()-1]);
-        pl2.push_back("");
-        for (int i = 0; i+1 < pl.size(); ++i) {
-            if (i > 0) pl2[1] += " ";
-            pl2[1] += pl[i];
-        }
-        pl = pl2;
-    }
-    return pl[0];
+    cacheNames();
+    return m_surname;
 }
 
 QString
 Composer::getForenames() const
 {
-    //!!! slow (dup with getSurname)
-    QString n = name();
-    QStringList pl = n.split(", ");
-    if (pl.size() == 1) {
-        QStringList pl2;
-        pl = n.split(' ');
-        pl2.push_back(pl[pl.size()-1]);
-        pl2.push_back("");
-        for (int i = 0; i+1 < pl.size(); ++i) {
-            if (i > 0) pl2[1] += " ";
-            pl2[1] += pl[i];
-        }
-        pl = pl2;
-    }
-    n = "";
-    for (int i = 1; i < pl.size(); ++i) {
-        if (i > 1) n += ", ";
-        n += pl[i];
-    }
-    return n;
+    cacheNames();
+    return m_forenames;
 }
 
 QString
@@ -241,101 +274,115 @@
 int
 Composer::matchFuzzyName(QString n) const
 {
-    if (n == name()) return 100;
+    cacheNames();
+    int fameBonus = m_pages.size();
+
+    if (n == name()) return 100 + fameBonus;
     
-    QString surname = getSurname();
-    QString forenames = getForenames();
+    EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true);
+
+    static QRegExp sre("[\\., ]+");
+    QStringList elements = n.toLower().split(sre);
     
-    QStringList sl = surname.split(' ');
-    QStringList fl = forenames.split(' ');
-    QStringList nl = n.split(' ');
- 
     int score = 0;
 
-    foreach (QString element, nl) {
-        
-        bool matchedSomething = false;
+    foreach (QString elt, elements) {
 
-        if (element.length() == 1) {
+        bool accept = false;
+        int initial = 0;
+
+        if (elt.length() == 1) {
             // an initial: search forenames only ignoring connectives
-            QChar c = element[0].toUpper();
-            foreach (QString f, fl) {
-                if (f[0] == c) {
+            foreach (QString s, m_forenameElements) {
+                if (s[0] == elt[0]) {
                     score += 3;
-                    matchedSomething = true;
+                    std::cerr << "[initial: " << s.toStdString() << "]" << std::endl;
+                    accept = true;
                     break;
                 }
             }
-            if (!matchedSomething) {
+            if (!accept) {
                 score -= 10;
             }
             continue;
         }
-
-        foreach (QString s, sl) {
-            if (s.toLower() == element.toLower()) {
-                if (s[0].isUpper()) {
-                    score += 20;
-                } else {
-                    score += 6;
-                }
-                matchedSomething = true;
+        
+        foreach (QString s, m_surnameElements) {
+            if (s[0] == elt[0]) initial = 1;
+            accept = true;
+            int dist = ed.calculate(elt, s);
+            if (dist == 0) score += 20;
+            else if (dist*3 <= s.length()) { score += 14 - dist + initial; }
+            else if (elt.length() > 3 && s.startsWith(elt)) score += 5;
+            else accept = false;
+            if (accept) {
+                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (matchedSomething) continue;
+        if (accept) continue;
 
-        foreach (QString f, fl) {
-            if (f.toLower() == element.toLower()) {
-                if (f[0].isUpper()) {
-                    score += 15;
-                } else {
-                    score += 4;
-                }
-                matchedSomething = true;
+        foreach (QString s, m_forenameElements) {
+            if (s[0] == elt[0]) initial = 1;
+            accept = true;
+            int dist = ed.calculate(elt, s);
+            if (dist == 0) score += 15;
+            else if (dist*3 <= s.length()) score += 9 - dist + initial;
+            else accept = false;
+            if (accept) {
+                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (matchedSomething) continue;
+        if (accept) continue;
 
-        QString reduced = reduceName(element);
-
-        foreach (QString s, sl) {
-            if (!s[0].isUpper()) continue;
-            if (reduceName(s) == reduced) {
-                score += 12;
-                matchedSomething = true;
+        foreach (QString s, m_connectiveElements) {
+            accept = true;
+            int dist = ed.calculate(elt, s);
+            if (dist == 0) score += 4;
+            else if (dist == 1) score += 2;
+            else accept = false; 
+            if (accept) {
+                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (matchedSomething) continue;
+        if (accept) continue;
 
-        foreach (QString f, fl) {
-            if (!f[0].isUpper()) continue;
-            if (reduceName(f) == reduced) {
-                score += 10;
-                matchedSomething = true;
+        if (m_reducedSurnameElements.contains(elt)) {
+            score += 11;
+            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
+            continue;
+        }
+
+        if (m_reducedForenameElements.contains(elt)) {
+            score += 7;
+            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
+            continue;
+        }
+
+        foreach (QString s, m_otherElements) {
+            accept = true;
+            int dist = ed.calculate(elt, s);
+            if (dist == 0) score += 12;
+            else if (dist*3 <= s.length()) score += 8 - dist;
+            else accept = false;
+            if (accept) {
+                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                 break;
             }
         }
-        if (matchedSomething) continue;
+        if (accept) continue;
 
-        foreach (QString f, fl) {
-            // smaller penalty if we at least have the right first letter
-            if (!f[0].isUpper()) continue;
-            if (f[0] == element[0].toUpper()) {
-                score -= 4;
-                matchedSomething = true;
-                break;
-            }
-        }
-        if (matchedSomething) continue;
+        if (initial) score -= 5;
+        else score -= 10;
+    }
         
-        score -= 7;
-    }        
-
-    //!!! need to adjust for "fame" (more famous composers get a 1pt bonus)
-
+    if (score > 0) {
+        if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
+        score += fameBonus;
+    }
+    if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl;
     return score;
 }
 
--- a/common/Objects.h	Wed Feb 17 19:26:48 2010 +0000
+++ b/common/Objects.h	Thu Feb 18 18:16:43 2010 +0000
@@ -137,20 +137,20 @@
     NamedEntity(QObject *parent = 0) : QObject(parent) { }
 
     QString name() const { return m_name; }
-    void setName(QString n) { m_name = n; }
+    virtual void setName(QString n) { m_name = n; }
 
     QString remarks() const { return m_remarks; }
     void setRemarks(QString n) { m_remarks = n; }
     
     QSet<QString> aliases() const { return m_aliases; }
-    void setAliases(QSet<QString> l) { m_aliases = l; }
-    void addAlias(QString a) { m_aliases.insert(a); }
+    virtual void setAliases(QSet<QString> l) { m_aliases = l; }
+    virtual void addAlias(QString a) { m_aliases.insert(a); }
 
     QSet<Document *> pages() const { return m_pages; }
     void addPage(Document *p) { m_pages.insert(p); }
     void setPages(QSet<Document *> p) { m_pages = p; } //!!! destroy old ones? do we own?
 
-private:
+protected:
     QString m_name;
     QString m_remarks;
     QSet<QString> m_aliases;
@@ -287,7 +287,8 @@
     Q_PROPERTY(QString forenames READ getForenames STORED false)
 
 public:
-    Composer(QObject *parent = 0) : NamedEntity(parent), m_birth(0), m_death(0) { }
+    Composer(QObject *parent = 0) :
+        NamedEntity(parent), m_birth(0), m_death(0), m_namesCached(false) { }
 
     QString gender() const { return m_gender; }
     void setGender(QString n) { m_gender = n; }
@@ -311,6 +312,19 @@
     const Death *death() const { return m_death; }
     void setDeath(Death *d) { m_death = d; }
 
+    virtual void setName(QString n) {
+        NamedEntity::setName(n);
+        m_namesCached = false;
+    }
+    virtual void setAliases(QSet<QString> l) {
+        NamedEntity::setAliases(l);
+        m_namesCached = false;
+    }
+    virtual void addAlias(QString a) { 
+        NamedEntity::addAlias(a);
+        m_namesCached = false;
+    }
+
     QString getSurname() const;
     QString getForenames() const;
     QString getSortName(bool caps) const;
@@ -360,6 +374,17 @@
     QString m_period;
     Birth *m_birth;
     Death *m_death;
+
+    void cacheNames() const;
+    mutable bool m_namesCached;
+    mutable QString m_surname;
+    mutable QString m_forenames;
+    mutable QStringList m_surnameElements;
+    mutable QStringList m_connectiveElements;
+    mutable QStringList m_forenameElements;
+    mutable QStringList m_otherElements;
+    mutable QStringList m_reducedSurnameElements;
+    mutable QStringList m_reducedForenameElements;
 };
 
 class Form : public NamedEntity
--- a/common/common.pro	Wed Feb 17 19:26:48 2010 +0000
+++ b/common/common.pro	Thu Feb 18 18:16:43 2010 +0000
@@ -1,6 +1,6 @@
 TEMPLATE = lib
 TARGET = 
-CONFIG += debug staticlib
+CONFIG += release staticlib
 DEPENDPATH += .
 INCLUDEPATH += . ../../turbot
 
@@ -8,8 +8,8 @@
 MOC_DIR = m
 
 # Input
-HEADERS += Objects.h TypeRegistrar.h
-SOURCES += Objects.cpp TypeRegistrar.cpp
+HEADERS += EditDistance.h Objects.h TypeRegistrar.h
+SOURCES += EditDistance.cpp Objects.cpp TypeRegistrar.cpp
 
 solaris* {
 	 
--- a/testapp/Loader.cpp	Wed Feb 17 19:26:48 2010 +0000
+++ b/testapp/Loader.cpp	Thu Feb 18 18:16:43 2010 +0000
@@ -1,6 +1,7 @@
 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
 
 #include "Objects.h"
+#include "EditDistance.h"
 #include "TypeRegistrar.h"
 
 #include <dataquay/BasicStore.h>
@@ -12,6 +13,10 @@
 
 #include <iostream>
 
+#ifdef TURBOT_PROFILER
+#include <base/Profiler.h>
+#endif
+
 using namespace Dataquay;
 using namespace ClassicalData;
 
@@ -36,82 +41,6 @@
     return true;
 }
 
-//!!! both nasty and duplicated from Import.cpp
-
-QString makeNameKey(QString name)
-{
-    QString key = name.toLower()
-        .replace("'", "")
-        .replace("x", "ks")
-        .replace("y", "i")
-        .replace("k", "c")
-        .replace("ch", "c")
-        .replace("cc", "c")
-        .replace("v", "f")
-        .replace("ff", "f")
-        .replace("th", "t")
-        .replace("tch", "ch")
-        .replace("er", "r");
-//    DEBUG << "makeNameKey(" << name << "): " << key << endl;
-    return key;
-}
-
-bool namesFuzzyMatch(QString an, Composer *b)
-{
-    // ew!
-
-    QString bn = b->name();
-    if (bn == an) return true;
-    if (b->aliases().contains(an)) return true;
-    int aSurnameIndex = 0, bSurnameIndex = 0;
-    if (an.contains(",")) {
-        an.replace(",", "");
-    } else {
-        aSurnameIndex = -1;
-    }
-    if (bn.contains(",")) {
-        bn.replace(",", "");
-    } else {
-        bSurnameIndex = -1;
-    }
-    QStringList nl = an.split(QRegExp("[ -]"));
-    QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]"));
-    int matchCount = 0;
-    QString surnameMatch = "";
-    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
-    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
-    if (nl[aSurnameIndex][0].isUpper() &&
-        nl[aSurnameIndex] != "Della" &&
-        makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
-        surnameMatch = nl[aSurnameIndex];
-    }
-//    DEBUG << "bnl: " << endl;
-//    for (int i = 0; i < bnl.size(); ++i) DEBUG << bnl[i] << endl;
-    int tested = 0;
-    foreach (QString elt, nl) {
-        int score = 2;
-        if (!elt[0].isUpper() || elt == "Della") score = 1;
-        QString k = makeNameKey(elt);
-//	DEBUG << "Testing " << k << endl;
-        if (bnl.contains(k)) {
-            matchCount += score;
-        }
-        if (++tested == 2 && matchCount == 0) {
-            return false;
-        }
-    }
-    if (surnameMatch != "") {
-//        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
-        if (matchCount > 1) {
-            return true;
-        } else {
-//            DEBUG << "(but not enough else matched)" << endl;
-            return false;
-        }
-    }
-    return false;
-}
-
 int main(int argc, char **argv)
 {
     BasicStore *store = new BasicStore();
@@ -131,9 +60,9 @@
 
     delete mapper;
     delete store;
-    
+
     QObjectList composers;
-    std::cerr << "Known composers:" << std::endl;
+//    std::cerr << "Known composers:" << std::endl;
     foreach (QObject *o, root->children()) {
         Composer *c = qobject_cast<Composer *>(o);
         if (c) {
@@ -141,12 +70,12 @@
             if (sn == "") {
                 std::cerr << "WARNING: Composer " << c->name().toStdString() << " (URI " << c->property("uri").toString().toStdString() << ") has no sort-name" << std::endl;
             } else {
-                std::cerr << sn.toStdString() << std::endl;
+//                std::cerr << sn.toStdString() << std::endl;
             }
             composers.push_back(c);
         }
     }
-
+/*
     for (int i = 1; i < argc; ++i) {
 	QString name = argv[i];
 	std::cerr << "Name: " << name.toStdString() << std::endl;
@@ -163,6 +92,39 @@
             std::cerr << "Score: " << i.key() << " for name: " << i.value().toStdString() << std::endl;
         }
     }
+*/
+    while (!std::cin.eof()) {
+        std::cerr << "Enter composer name: ";
+        std::string s;
+        getline(std::cin, s);
+        QMultiMap<int, QString> matches;
+	foreach (QObject *o, composers) {
+	    Composer *c = qobject_cast<Composer *>(o);
+	    if (!c) continue;
+            int value = c->matchFuzzyName(QString::fromStdString(s));
+            matches.insert(value, c->getSortName(false));
+	}
+        int n = 0;
+        for (QMultiMap<int, QString>::const_iterator i = matches.end();
+             i != matches.begin(); ) {
+            --i;
+            if (i.key() < 0) continue;
+            if (n == 0) {
+                std::cerr << "Best match:" << std::endl << " * ";
+            } else if (n == 1) {
+                std::cerr << "Other candidate(s):" << std::endl << " - ";
+            } else {
+                std::cerr << " - ";
+            }
+            std::cerr << i.value().toStdString() << std::endl;
+            if (++n > 5 || i.key() < 1) break;
+        }
+        if (n == 0) std::cerr << "No matches" << std::endl;
+    }        
+
+#ifdef TURBOT_PROFILER
+    Turbot::Profiler::dump();
+#endif
 
 /*
     std::cerr << "mapped, storing again..." << std::endl;
--- a/testapp/testapp.pro	Wed Feb 17 19:26:48 2010 +0000
+++ b/testapp/testapp.pro	Thu Feb 18 18:16:43 2010 +0000
@@ -28,5 +28,8 @@
 linux* {
 QMAKE_CXXFLAGS_DEBUG += -Wall -Woverloaded-virtual -Wextra -Wformat-nonliteral -Wformat-security -Winit-self -O1 -pg
 
-LIBS += -pg
+DEFINES += TURBOT_PROFILER WANT_TIMING
+INCLUDEPATH += ../../turbot/lib/
+LIBPATH += ../../turbot/lib/
+LIBS += -pg -lturbot
 }