changeset 10:d35e5d769c87 classical-rdf

* some experiments with composer name matching
author Chris Cannam
date Wed, 17 Feb 2010 19:26:48 +0000
parents 9e2b203254ab
children 98047b91b09d
files common/Objects.cpp common/Objects.h common/common.pro import/Import.cpp testapp/Loader.cpp testapp/testapp.pro
diffstat 6 files changed, 330 insertions(+), 103 deletions(-) [+]
line wrap: on
line diff
--- a/common/Objects.cpp	Fri Feb 12 16:56:29 2010 +0000
+++ b/common/Objects.cpp	Wed Feb 17 19:26:48 2010 +0000
@@ -21,7 +21,7 @@
 QMutex Form::m_mutex;
 
 bool
-Composer::datesMatch(const Composer *b) const
+Composer::matchDates(const Composer *b) const
 {
     const Composer *a = this;
     
@@ -56,8 +56,19 @@
 QString
 Composer::getSortName(bool caps) const
 {
+    QString surname = getSurname(); // sort name is assembled as "Surname, Forenames"
+    QString forenames = getForenames();
+    if (caps) surname = surname.toUpper(); // only the surname is upper-cased; forenames keep their case
+    if (forenames != "") return surname + ", " + forenames;
+    else return surname; // no forenames: surname alone, without a trailing separator
+}
+
+QString
+Composer::getSurname() const
+{
+    //!!! slow (dup with getForenames)
     QString n = name();
-    QStringList pl = n.split(QRegExp(", *"));
+    QStringList pl = n.split(", ");
     if (pl.size() == 1) {
         QStringList pl2;
         pl = n.split(' ');
@@ -69,13 +80,29 @@
         }
         pl = pl2;
     }
-    if (caps) {
-        n = pl[0].toUpper();
-    } else {
-        n = pl[0];
+    return pl[0];
+}
+
+QString
+Composer::getForenames() const
+{
+    //!!! slow (dup with getSurname)
+    QString n = name();
+    QStringList pl = n.split(", ");
+    if (pl.size() == 1) {
+        QStringList pl2;
+        pl = n.split(' ');
+        pl2.push_back(pl[pl.size()-1]);
+        pl2.push_back("");
+        for (int i = 0; i+1 < pl.size(); ++i) {
+            if (i > 0) pl2[1] += " ";
+            pl2[1] += pl[i];
+        }
+        pl = pl2;
     }
+    n = "";
     for (int i = 1; i < pl.size(); ++i) {
-        n += ", ";
+        if (i > 1) n += ", ";
         n += pl[i];
     }
     return n;
@@ -116,6 +143,201 @@
 
     return s;
 }
+   
+static QString
+asciify(QString field) // returns a copy of field with decomposable characters, sharp s and some typographic punctuation replaced by plain equivalents
+{
+    QString ascii;
+    for (int i = 0; i < field.length(); ++i) {
+        QString dc = field[i].decomposition();
+        if (dc != "") ascii += dc[0]; // keep only the first character of the decomposition (presumably the unaccented base letter -- TODO confirm)
+        else if (field[i] == QChar(0x00DF)) { // U+00DF, LATIN SMALL LETTER SHARP S
+            ascii += "ss";
+        } else {
+            ascii += field[i]; // anything else is passed through unchanged, so the result is not guaranteed to be pure ASCII
+        }
+    }
+    ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // U+2019 right single quotation mark (typographic apostrophe)
+    ascii.replace(QString::fromUtf8("\342\200\222"), "-"); // U+2012 figure dash
+    ascii.replace(QString::fromUtf8("\342\200\223"), "-"); // U+2013 en dash
+    ascii.replace(QString::fromUtf8("\342\200\224"), "-"); // U+2014 em dash
+    ascii.replace(QString::fromUtf8("\342\200\225"), "-"); // U+2015 horizontal bar
+    return ascii;
+}
+
+QString
+Composer::reduceName(QString name) // returns a lower-cased, asciified key in which common spelling variants are folded together
+{
+    QString key = asciify(name).toLower() // the replacements below run in sequence, each seeing the output of the previous one
+        .replace("'", "")
+        .replace("x", "ks") // the "k" is folded to "c" two steps later, so "x" ends up as "cs"
+        .replace("y", "i")
+        .replace("k", "c")
+        .replace("ch", "c")
+        .replace("cc", "c")
+        .replace("aa", "a")
+        .replace("v", "f")
+        .replace("ff", "f")
+        .replace("th", "t")
+        .replace("tch", "ch") // NOTE(review): "ch" was already collapsed to "c" above, so this rule can rarely if ever fire -- confirm intended order
+        .replace("er", "r");
+    return key;
+}
+
+bool
+Composer::matchCatalogueName(QString an) const // true if catalogue-style name an is judged to match this composer's name
+{
+    // ew! (heuristic: exact or alias match first, then compare reduced name parts)
+
+    QString bn = name();
+    if (bn == an) return true; // identical to our own name
+    if (aliases().contains(an)) return true; // identical to one of our aliases
+
+    int aSurnameIndex = 0, bSurnameIndex = 0; // default: a name containing a comma is taken to have its surname as part 0
+    if (an.contains(",")) {
+        an.replace(",", "");
+    } else {
+        aSurnameIndex = -1; // no comma: resolved to the last part once the name has been split
+    }
+    if (bn.contains(",")) {
+        bn.replace(",", "");
+    } else {
+        bSurnameIndex = -1;
+    }
+    QStringList nl = an.split(QRegExp("[ -]")); // split on spaces and hyphens; parts of an stay unreduced so capitalisation can be tested
+    QStringList bnl = reduceName(bn).split(QRegExp("[ -]")); // our own name is reduced first, then split the same way
+    int matchCount = 0;
+    QString surnameMatch = "";
+    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
+    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
+    if (nl[aSurnameIndex][0].isUpper() && // NOTE(review): indexes [0] of a part that may be empty (e.g. doubled separator) -- confirm this is safe
+        nl[aSurnameIndex] != "Della" &&
+        reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
+        surnameMatch = nl[aSurnameIndex];
+    }
+    int tested = 0;
+    foreach (QString elt, nl) {
+        if (!elt[0].isUpper() || elt == "Della") continue; // parts not starting with a capital, and "Della", are neither tested nor counted
+        QString k = reduceName(elt);
+        if (bnl.contains(k)) {
+            ++matchCount;
+        }
+        if (++tested == 2 && matchCount == 0) { // give up early when neither of the first two tested parts matched
+            return false;
+        }
+    }
+    if (surnameMatch != "") {
+        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl; // NOTE(review): label still names the removed free function namesFuzzyMatch
+        if (matchCount > 1) { // the matched surname is itself counted by the loop, so at least one further part must match
+            return true;
+        } else {
+            DEBUG << "(but not enough else matched)" << endl;
+            return false;
+        }
+    }
+    return false; // surname did not match
+}    
+
+int
+Composer::matchFuzzyName(QString n) const // returns 100 for an exact name match, otherwise a score summed from per-word bonuses and penalties
+{
+    if (n == name()) return 100;
+    
+    QString surname = getSurname();
+    QString forenames = getForenames();
+    
+    QStringList sl = surname.split(' ');
+    QStringList fl = forenames.split(' '); // NOTE(review): presumably yields one empty string when forenames is ""; the f[0] accesses below would then index an empty QString -- confirm
+    QStringList nl = n.split(' ');
+ 
+    int score = 0;
+
+    foreach (QString element, nl) { // each space-separated word of the query is scored independently
+        
+        bool matchedSomething = false;
+
+        if (element.length() == 1) {
+            // an initial: search forenames only ignoring connectives
+            QChar c = element[0].toUpper();
+            foreach (QString f, fl) {
+                if (f[0] == c) { // c is upper-cased, so forename words starting in lower case do not match
+                    score += 3;
+                    matchedSomething = true;
+                    break;
+                }
+            }
+            if (!matchedSomething) {
+                score -= 10; // an initial that fits no forename
+            }
+            continue;
+        }
+
+        foreach (QString s, sl) { // 1: case-insensitive exact match against a surname word
+            if (s.toLower() == element.toLower()) {
+                if (s[0].isUpper()) { // capitalised surname words score higher than lower-case ones
+                    score += 20;
+                } else {
+                    score += 6;
+                }
+                matchedSomething = true;
+                break;
+            }
+        }
+        if (matchedSomething) continue;
+
+        foreach (QString f, fl) { // 2: case-insensitive exact match against a forename word
+            if (f.toLower() == element.toLower()) {
+                if (f[0].isUpper()) {
+                    score += 15;
+                } else {
+                    score += 4;
+                }
+                matchedSomething = true;
+                break;
+            }
+        }
+        if (matchedSomething) continue;
+
+        QString reduced = reduceName(element); // from here on, compare reduceName() keys instead of literal text
+
+        foreach (QString s, sl) { // 3: reduced-form match against a capitalised surname word
+            if (!s[0].isUpper()) continue;
+            if (reduceName(s) == reduced) {
+                score += 12;
+                matchedSomething = true;
+                break;
+            }
+        }
+        if (matchedSomething) continue;
+
+        foreach (QString f, fl) { // 4: reduced-form match against a capitalised forename word
+            if (!f[0].isUpper()) continue;
+            if (reduceName(f) == reduced) {
+                score += 10;
+                matchedSomething = true;
+                break;
+            }
+        }
+        if (matchedSomething) continue;
+
+        foreach (QString f, fl) {
+            // smaller penalty if we at least have the right first letter
+            if (!f[0].isUpper()) continue;
+            if (f[0] == element[0].toUpper()) { // NOTE(review): element may be empty if n contains consecutive spaces -- confirm element[0] is safe
+                score -= 4;
+                matchedSomething = true;
+                break;
+            }
+        }
+        if (matchedSomething) continue;
+        
+        score -= 7; // the word matched nothing at all
+    }        
+
+    //!!! need to adjust for "fame" (more famous composers get a 1pt bonus)
+
+    return score;
+}
 
 static int
 compare(QString a, QString b)
@@ -129,8 +351,8 @@
     }
 }
 
-static int
-compareNumericTexts(QString a, QString b)
+int
+Work::compareCatalogueNumberTexts(QString a, QString b)
 {
 //    std::cout << "compare " << a.toStdString()
 //              << " " << b.toStdString() << std::endl;
@@ -151,7 +373,7 @@
                 if (al[i] != bl[i]) {
 //                    std::cout << "subcompare " << al[i].toStdString()
 //                              << " " << bl[i].toStdString() << std::endl;
-                    return compareNumericTexts(al[i], bl[i]);
+                    return compareCatalogueNumberTexts(al[i], bl[i]);
                 }
             }
         } else {
@@ -196,14 +418,14 @@
 */
     int c = 0;
     if (a->catalogue() != "" && b->catalogue() != "") {
-        c = compareNumericTexts(a->catalogue(), b->catalogue());
+        c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
     }
     if (c == 0 && a->opus() != "" && b->opus() != "") {
-        c = compareNumericTexts(a->opus(), b->opus());
+        c = compareCatalogueNumberTexts(a->opus(), b->opus());
     }
     if (c == 0 && a->partOf() == b->partOf() &&
         a->number() != "" && b->number() != "") {
-        c = compareNumericTexts(a->number(), b->number());
+        c = compareCatalogueNumberTexts(a->number(), b->number());
     }
 
     bool rv = false;
--- a/common/Objects.h	Fri Feb 12 16:56:29 2010 +0000
+++ b/common/Objects.h	Wed Feb 17 19:26:48 2010 +0000
@@ -213,6 +213,15 @@
         bool operator()(Work *, Work *);
     };
 
+    /**
+     * Compare the ordering of two strings that are known to contain
+     * catalogue number texts, such as "Op. 1 no 4" and "Op. 3 no 2"
+     * (which should compare in that order).  Return value is as for
+     * strcmp.
+     */
+    //!!! todo: unit tests
+    static int compareCatalogueNumberTexts(QString a, QString b);
+
 private:
     QString m_key;
     QString m_opus;
@@ -274,6 +283,9 @@
     Q_PROPERTY(ClassicalData::Birth *birth READ birth WRITE setBirth STORED true)
     Q_PROPERTY(ClassicalData::Death *death READ death WRITE setDeath STORED true)
 
+    Q_PROPERTY(QString surname READ getSurname STORED false)
+    Q_PROPERTY(QString forenames READ getForenames STORED false)
+
 public:
     Composer(QObject *parent = 0) : NamedEntity(parent), m_birth(0), m_death(0) { }
 
@@ -299,10 +311,48 @@
     const Death *death() const { return m_death; }
     void setDeath(Death *d) { m_death = d; }
 
-    bool datesMatch(const Composer *other) const; // "well enough"
+    QString getSurname() const;
+    QString getForenames() const;
     QString getSortName(bool caps) const;
     QString getDisplayDates() const;
 
+    /**
+     * Given another composer, return true if the other composer's
+     * dates match ours.  This is mostly intended (like
+     * matchCatalogueName) for use in merging distinct catalogues.
+     * Matching is somewhat fuzzy; more slack is cut when the dates
+     * are very long ago or are marked as approximate.
+     */
+    bool matchDates(const Composer *other) const; // "well enough"
+
+    /**
+     * Given another name which is intended to be a well-formatted
+     * catalogue name for a composer (but which may differ in
+     * ordering, number of forenames, and perhaps in spelling), test
+     * whether the name is a plausible match for our own.  This is
+     * mostly intended (like matchDates) for use in merging distinct
+     * catalogues.  Return true if the given name is highly likely to
+     * match our own.
+     */
+    bool matchCatalogueName(QString otherName) const;
+
+    /**
+     * Given another name which is believed to be a user-entered
+     * composer name with unpredictable formatting and spelling (and
+     * probably incomplete), return an estimate for the likelihood
+     * that the intended composer was this one.  Higher return values
+     * indicate greater confidence.
+     */
+    int matchFuzzyName(QString name) const;
+
+    /**
+     * Return the supplied name reduced into a "simplified" form,
+     * eliminating many of the differences often found particularly in
+     * European language names that have been anglicised.  Used in
+     * catalogue and fuzzy name matching.
+     */
+    static QString reduceName(QString name);
+
 private:
     QString m_gender;
     QSet<QString> m_nationality;
--- a/common/common.pro	Fri Feb 12 16:56:29 2010 +0000
+++ b/common/common.pro	Wed Feb 17 19:26:48 2010 +0000
@@ -20,4 +20,6 @@
 
 }
 
+linux* {
 	QMAKE_CXXFLAGS_DEBUG += -Wall -Woverloaded-virtual -Wextra -Wformat-nonliteral -Wformat-security -Winit-self -O1 -pg
+}
--- a/import/Import.cpp	Fri Feb 12 16:56:29 2010 +0000
+++ b/import/Import.cpp	Wed Feb 17 19:26:48 2010 +0000
@@ -143,76 +143,6 @@
     c->addAlias(nr);
 }
 
-QString makeNameKey(QString name)
-{
-    QString key = name.toLower()
-        .replace("'", "")
-        .replace("x", "ks")
-        .replace("y", "i")
-        .replace("k", "c")
-        .replace("ch", "c")
-        .replace("cc", "c")
-        .replace("v", "f")
-        .replace("ff", "f")
-        .replace("th", "t")
-        .replace("tch", "ch")
-        .replace("er", "r");
-//    DEBUG << "makeNameKey(" << name << "): " << key << endl;
-    return key;
-}
-
-bool namesFuzzyMatch(QString an, Composer *b)
-{
-    // ew!
-
-    QString bn = b->name();
-    if (bn == an) return true;
-    if (b->aliases().contains(an)) return true;
-    int aSurnameIndex = 0, bSurnameIndex = 0;
-    if (an.contains(",")) {
-        an.replace(",", "");
-    } else {
-        aSurnameIndex = -1;
-    }
-    if (bn.contains(",")) {
-        bn.replace(",", "");
-    } else {
-        bSurnameIndex = -1;
-    }
-    QStringList nl = an.split(QRegExp("[ -]"));
-    QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]"));
-    int matchCount = 0;
-    QString surnameMatch = "";
-    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
-    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
-    if (nl[aSurnameIndex][0].isUpper() &&
-        nl[aSurnameIndex] != "Della" &&
-        makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
-        surnameMatch = nl[aSurnameIndex];
-    }
-    int tested = 0;
-    foreach (QString elt, nl) {
-        if (!elt[0].isUpper() || elt == "Della") continue;
-        QString k = makeNameKey(elt);
-        if (bnl.contains(k)) {
-            ++matchCount;
-        }
-        if (++tested == 2 && matchCount == 0) {
-            return false;
-        }
-    }
-    if (surnameMatch != "") {
-        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
-        if (matchCount > 1) {
-            return true;
-        } else {
-            DEBUG << "(but not enough else matched)" << endl;
-            return false;
-        }
-    }
-    return false;
-}
-
 bool
 hasBetterName(Composer *c, Composer *other)
 {
@@ -264,21 +194,21 @@
     QSet<Composer *> matches;
 
     foreach (QString candidateName, allNames) {
-        QString key = makeNameKey(candidateName);
+        QString key = Composer::reduceName(candidateName);
         if (composers.contains(key)) {
             foreach (Composer *candidate, composers[key]) {
                 if (candidateName == dates) {
                     if (c->name() == candidate->name()) {
                         DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl;
-                    } else if (!namesFuzzyMatch(c->name(), candidate) &&
-                               !namesFuzzyMatch(candidate->name(), c)) {
+                    } else if (!candidate->matchCatalogueName(c->name()) &&
+                               !c->matchCatalogueName(candidate->name())) {
                         DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl;
                         continue;
                     } else {
                         DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl;
                     }
                 } else {
-                    if (!c->datesMatch(candidate)) {
+                    if (!c->matchDates(candidate)) {
                         DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl;
                         continue;
                     }
@@ -297,7 +227,7 @@
             for (ComposerMap::iterator i = composers.begin();
                  i != composers.end(); ++i) {
                 foreach (Composer *candidate, *i) {
-                    if (namesFuzzyMatch(c->name(), candidate)) {
+                    if (candidate->matchCatalogueName(c->name())) {
                         DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl;
                         matches.insert(candidate);
                         break;
@@ -309,7 +239,7 @@
 
         if (matches.empty()) {
             foreach (QString candidateName, allNames) {
-                QString key = makeNameKey(candidateName);
+                QString key = Composer::reduceName(candidateName);
                 composers[key].insert(c);
                 DEBUG << "added for alias or date " << candidateName << endl;
             }
@@ -331,14 +261,14 @@
     } else {
         other->addAlias(c->name());
     }
-    composers[makeNameKey(c->name())].insert(other);
+    composers[Composer::reduceName(c->name())].insert(other);
     DEBUG << "linking from alias " << c->name() << endl;
 
     foreach (QString alias, c->aliases()) {
         if (alias != other->name() && 
             !other->aliases().contains(alias)) {
             other->addAlias(alias);
-            composers[makeNameKey(alias)].insert(other);
+            composers[Composer::reduceName(alias)].insert(other);
             DEBUG << "linking from alias " << alias << endl;
         }
     }
@@ -642,7 +572,7 @@
         if (!cn) continue;
         if (!cn->composer()) {
             QString cname = cn->composerName();
-            QString key = makeNameKey(cname);
+            QString key = Composer::reduceName(cname);
             if (cname != "") {
                 if (!composers.contains(key)) {
                     DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
--- a/testapp/Loader.cpp	Fri Feb 12 16:56:29 2010 +0000
+++ b/testapp/Loader.cpp	Wed Feb 17 19:26:48 2010 +0000
@@ -1,11 +1,14 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
 
 #include "Objects.h"
 #include "TypeRegistrar.h"
 
 #include <dataquay/BasicStore.h>
 #include <dataquay/objectmapper/ObjectMapper.h>
+#include <dataquay/Debug.h>
 
 #include <QTemporaryFile>
+#include <QMultiMap>
 
 #include <iostream>
 
@@ -82,12 +85,16 @@
         makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
         surnameMatch = nl[aSurnameIndex];
     }
+//    DEBUG << "bnl: " << endl;
+//    for (int i = 0; i < bnl.size(); ++i) DEBUG << bnl[i] << endl;
     int tested = 0;
     foreach (QString elt, nl) {
-        if (!elt[0].isUpper() || elt == "Della") continue;
+        int score = 2;
+        if (!elt[0].isUpper() || elt == "Della") score = 1;
         QString k = makeNameKey(elt);
+//	DEBUG << "Testing " << k << endl;
         if (bnl.contains(k)) {
-            ++matchCount;
+            matchCount += score;
         }
         if (++tested == 2 && matchCount == 0) {
             return false;
@@ -126,20 +133,35 @@
     delete store;
     
     QObjectList composers;
+    std::cerr << "Known composers:" << std::endl;
     foreach (QObject *o, root->children()) {
-	if (qobject_cast<Composer *>(o)) composers.push_back(o);
+        Composer *c = qobject_cast<Composer *>(o);
+        if (c) {
+            QString sn = c->getSortName(true);
+            if (sn == "") {
+                std::cerr << "WARNING: Composer " << c->name().toStdString() << " (URI " << c->property("uri").toString().toStdString() << ") has no sort-name" << std::endl;
+            } else {
+                std::cerr << sn.toStdString() << std::endl;
+            }
+            composers.push_back(c);
+        }
     }
-    
-    if (argc > 1) {
-	QString name = argv[1];
+
+    for (int i = 1; i < argc; ++i) {
+	QString name = argv[i];
 	std::cerr << "Name: " << name.toStdString() << std::endl;
+        QMultiMap<int, QString> matches;
 	foreach (QObject *o, composers) {
 	    Composer *c = qobject_cast<Composer *>(o);
 	    if (!c) continue;
-	    if (namesFuzzyMatch(name, c)) {
-		std::cerr << "Matches: " << c->name().toStdString() << std::endl;
-	    }
+            int value = c->matchFuzzyName(name);
+            matches.insert(value, c->getSortName(false));
 	}
+        for (QMultiMap<int, QString>::const_iterator i = matches.begin();
+             i != matches.end(); ++i) {
+            if (i.key() < 0) continue;
+            std::cerr << "Score: " << i.key() << " for name: " << i.value().toStdString() << std::endl;
+        }
     }
 
 /*
--- a/testapp/testapp.pro	Fri Feb 12 16:56:29 2010 +0000
+++ b/testapp/testapp.pro	Wed Feb 17 19:26:48 2010 +0000
@@ -25,7 +25,8 @@
 
 }
 
-
+linux* {
 QMAKE_CXXFLAGS_DEBUG += -Wall -Woverloaded-virtual -Wextra -Wformat-nonliteral -Wformat-security -Winit-self -O1 -pg
 
 LIBS += -pg
+}