Mercurial > hg > classical
changeset 10:d35e5d769c87 classical-rdf
* some experiments with composer name matching
author | Chris Cannam |
---|---|
date | Wed, 17 Feb 2010 19:26:48 +0000 |
parents | 9e2b203254ab |
children | 98047b91b09d |
files | common/Objects.cpp common/Objects.h common/common.pro import/Import.cpp testapp/Loader.cpp testapp/testapp.pro |
diffstat | 6 files changed, 330 insertions(+), 103 deletions(-) [+] |
line wrap: on
line diff
--- a/common/Objects.cpp Fri Feb 12 16:56:29 2010 +0000 +++ b/common/Objects.cpp Wed Feb 17 19:26:48 2010 +0000 @@ -21,7 +21,7 @@ QMutex Form::m_mutex; bool -Composer::datesMatch(const Composer *b) const +Composer::matchDates(const Composer *b) const { const Composer *a = this; @@ -56,8 +56,19 @@ QString Composer::getSortName(bool caps) const { + QString surname = getSurname(); + QString forenames = getForenames(); + if (caps) surname = surname.toUpper(); + if (forenames != "") return surname + ", " + forenames; + else return surname; +} + +QString +Composer::getSurname() const +{ + //!!! slow (dup with getForenames) QString n = name(); - QStringList pl = n.split(QRegExp(", *")); + QStringList pl = n.split(", "); if (pl.size() == 1) { QStringList pl2; pl = n.split(' '); @@ -69,13 +80,29 @@ } pl = pl2; } - if (caps) { - n = pl[0].toUpper(); - } else { - n = pl[0]; + return pl[0]; +} + +QString +Composer::getForenames() const +{ + //!!! slow (dup with getSurname) + QString n = name(); + QStringList pl = n.split(", "); + if (pl.size() == 1) { + QStringList pl2; + pl = n.split(' '); + pl2.push_back(pl[pl.size()-1]); + pl2.push_back(""); + for (int i = 0; i+1 < pl.size(); ++i) { + if (i > 0) pl2[1] += " "; + pl2[1] += pl[i]; + } + pl = pl2; } + n = ""; for (int i = 1; i < pl.size(); ++i) { - n += ", "; + if (i > 1) n += ", "; n += pl[i]; } return n; @@ -116,6 +143,201 @@ return s; } + +static QString +asciify(QString field) +{ + QString ascii; + for (int i = 0; i < field.length(); ++i) { + QString dc = field[i].decomposition(); + if (dc != "") ascii += dc[0]; + else if (field[i] == QChar(0x00DF)) { + ascii += "ss"; + } else { + ascii += field[i]; + } + } + ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe + ascii.replace(QString::fromUtf8("\342\200\222"), "-"); + ascii.replace(QString::fromUtf8("\342\200\223"), "-"); + ascii.replace(QString::fromUtf8("\342\200\224"), "-"); + ascii.replace(QString::fromUtf8("\342\200\225"), "-"); + return ascii; +} + 
+QString +Composer::reduceName(QString name) +{ + QString key = asciify(name).toLower() + .replace("'", "") + .replace("x", "ks") + .replace("y", "i") + .replace("k", "c") + .replace("ch", "c") + .replace("cc", "c") + .replace("aa", "a") + .replace("v", "f") + .replace("ff", "f") + .replace("th", "t") + .replace("tch", "ch") + .replace("er", "r"); + return key; +} + +bool +Composer::matchCatalogueName(QString an) const +{ + // ew! + + QString bn = name(); + if (bn == an) return true; + if (aliases().contains(an)) return true; + + int aSurnameIndex = 0, bSurnameIndex = 0; + if (an.contains(",")) { + an.replace(",", ""); + } else { + aSurnameIndex = -1; + } + if (bn.contains(",")) { + bn.replace(",", ""); + } else { + bSurnameIndex = -1; + } + QStringList nl = an.split(QRegExp("[ -]")); + QStringList bnl = reduceName(bn).split(QRegExp("[ -]")); + int matchCount = 0; + QString surnameMatch = ""; + if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1; + if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1; + if (nl[aSurnameIndex][0].isUpper() && + nl[aSurnameIndex] != "Della" && + reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) { + surnameMatch = nl[aSurnameIndex]; + } + int tested = 0; + foreach (QString elt, nl) { + if (!elt[0].isUpper() || elt == "Della") continue; + QString k = reduceName(elt); + if (bnl.contains(k)) { + ++matchCount; + } + if (++tested == 2 && matchCount == 0) { + return false; + } + } + if (surnameMatch != "") { + DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl; + if (matchCount > 1) { + return true; + } else { + DEBUG << "(but not enough else matched)" << endl; + return false; + } + } + return false; +} + +int +Composer::matchFuzzyName(QString n) const +{ + if (n == name()) return 100; + + QString surname = getSurname(); + QString forenames = getForenames(); + + QStringList sl = surname.split(' '); + QStringList fl = forenames.split(' '); + QStringList nl = n.split(' '); + + int score = 0; + + foreach (QString 
element, nl) { + + bool matchedSomething = false; + + if (element.length() == 1) { + // an initial: search forenames only ignoring connectives + QChar c = element[0].toUpper(); + foreach (QString f, fl) { + if (f[0] == c) { + score += 3; + matchedSomething = true; + break; + } + } + if (!matchedSomething) { + score -= 10; + } + continue; + } + + foreach (QString s, sl) { + if (s.toLower() == element.toLower()) { + if (s[0].isUpper()) { + score += 20; + } else { + score += 6; + } + matchedSomething = true; + break; + } + } + if (matchedSomething) continue; + + foreach (QString f, fl) { + if (f.toLower() == element.toLower()) { + if (f[0].isUpper()) { + score += 15; + } else { + score += 4; + } + matchedSomething = true; + break; + } + } + if (matchedSomething) continue; + + QString reduced = reduceName(element); + + foreach (QString s, sl) { + if (!s[0].isUpper()) continue; + if (reduceName(s) == reduced) { + score += 12; + matchedSomething = true; + break; + } + } + if (matchedSomething) continue; + + foreach (QString f, fl) { + if (!f[0].isUpper()) continue; + if (reduceName(f) == reduced) { + score += 10; + matchedSomething = true; + break; + } + } + if (matchedSomething) continue; + + foreach (QString f, fl) { + // smaller penalty if we at least have the right first letter + if (!f[0].isUpper()) continue; + if (f[0] == element[0].toUpper()) { + score -= 4; + matchedSomething = true; + break; + } + } + if (matchedSomething) continue; + + score -= 7; + } + + //!!! 
need to adjust for "fame" (more famous composers get a 1pt bonus) + + return score; +} static int compare(QString a, QString b) @@ -129,8 +351,8 @@ } } -static int -compareNumericTexts(QString a, QString b) +int +Work::compareCatalogueNumberTexts(QString a, QString b) { // std::cout << "compare " << a.toStdString() // << " " << b.toStdString() << std::endl; @@ -151,7 +373,7 @@ if (al[i] != bl[i]) { // std::cout << "subcompare " << al[i].toStdString() // << " " << bl[i].toStdString() << std::endl; - return compareNumericTexts(al[i], bl[i]); + return compareCatalogueNumberTexts(al[i], bl[i]); } } } else { @@ -196,14 +418,14 @@ */ int c = 0; if (a->catalogue() != "" && b->catalogue() != "") { - c = compareNumericTexts(a->catalogue(), b->catalogue()); + c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue()); } if (c == 0 && a->opus() != "" && b->opus() != "") { - c = compareNumericTexts(a->opus(), b->opus()); + c = compareCatalogueNumberTexts(a->opus(), b->opus()); } if (c == 0 && a->partOf() == b->partOf() && a->number() != "" && b->number() != "") { - c = compareNumericTexts(a->number(), b->number()); + c = compareCatalogueNumberTexts(a->number(), b->number()); } bool rv = false;
--- a/common/Objects.h Fri Feb 12 16:56:29 2010 +0000 +++ b/common/Objects.h Wed Feb 17 19:26:48 2010 +0000 @@ -213,6 +213,15 @@ bool operator()(Work *, Work *); }; + /** + * Compare the ordering of two strings that are known to contain + * catalogue number texts, such as "Op. 1 no 4" and "Op. 3 no 2" + * (which should compare in that order). Return value is as for + * strcmp. + */ + //!!! todo: unit tests + static int compareCatalogueNumberTexts(QString a, QString b); + private: QString m_key; QString m_opus; @@ -274,6 +283,9 @@ Q_PROPERTY(ClassicalData::Birth *birth READ birth WRITE setBirth STORED true) Q_PROPERTY(ClassicalData::Death *death READ death WRITE setDeath STORED true) + Q_PROPERTY(QString surname READ getSurname STORED false) + Q_PROPERTY(QString forenames READ getForenames STORED false) + public: Composer(QObject *parent = 0) : NamedEntity(parent), m_birth(0), m_death(0) { } @@ -299,10 +311,48 @@ const Death *death() const { return m_death; } void setDeath(Death *d) { m_death = d; } - bool datesMatch(const Composer *other) const; // "well enough" + QString getSurname() const; + QString getForenames() const; QString getSortName(bool caps) const; QString getDisplayDates() const; + /** + * Given another composer, return true if the other composer's + * dates match ours. This is mostly intended (like + * matchCatalogueName) for use in merging distinct catalogues. + * Matching is somewhat fuzzy; more slack is cut when the dates + * are very long ago or are marked as approximate. + */ + bool matchDates(const Composer *other) const; // "well enough" + + /** + * Given another name which is intended to be a well-formatted + * catalogue name for a composer (but which may differ in + * ordering, number of forenames, and perhaps in spelling), test + * whether the name is a plausible match for our own. This is + * mostly intended (like matchDates) for use in merging distinct + * catalogues. Return true if the given name is highly likely to + * match our own. 
+ */ + bool matchCatalogueName(QString otherName) const; + + /** + * Given another name which is believed to be a user-entered + * composer name with unpredictable formatting and spelling (and + * probably incomplete), return an estimate for the likelihood + * that the intended composer was this one. Higher return values + * indicate greater confidence. + */ + int matchFuzzyName(QString name) const; + + /** + * Return the supplied name reduced into a "simplified" form, + * eliminating many of the differences often found particularly in + * European language names that have been anglicised. Used in + * catalogue and fuzzy name matching. + */ + static QString reduceName(QString name); + private: QString m_gender; QSet<QString> m_nationality;
--- a/common/common.pro Fri Feb 12 16:56:29 2010 +0000 +++ b/common/common.pro Wed Feb 17 19:26:48 2010 +0000 @@ -20,4 +20,6 @@ } +linux* { QMAKE_CXXFLAGS_DEBUG += -Wall -Woverloaded-virtual -Wextra -Wformat-nonliteral -Wformat-security -Winit-self -O1 -pg +}
--- a/import/Import.cpp Fri Feb 12 16:56:29 2010 +0000 +++ b/import/Import.cpp Wed Feb 17 19:26:48 2010 +0000 @@ -143,76 +143,6 @@ c->addAlias(nr); } -QString makeNameKey(QString name) -{ - QString key = name.toLower() - .replace("'", "") - .replace("x", "ks") - .replace("y", "i") - .replace("k", "c") - .replace("ch", "c") - .replace("cc", "c") - .replace("v", "f") - .replace("ff", "f") - .replace("th", "t") - .replace("tch", "ch") - .replace("er", "r"); -// DEBUG << "makeNameKey(" << name << "): " << key << endl; - return key; -} - -bool namesFuzzyMatch(QString an, Composer *b) -{ - // ew! - - QString bn = b->name(); - if (bn == an) return true; - if (b->aliases().contains(an)) return true; - int aSurnameIndex = 0, bSurnameIndex = 0; - if (an.contains(",")) { - an.replace(",", ""); - } else { - aSurnameIndex = -1; - } - if (bn.contains(",")) { - bn.replace(",", ""); - } else { - bSurnameIndex = -1; - } - QStringList nl = an.split(QRegExp("[ -]")); - QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]")); - int matchCount = 0; - QString surnameMatch = ""; - if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1; - if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1; - if (nl[aSurnameIndex][0].isUpper() && - nl[aSurnameIndex] != "Della" && - makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) { - surnameMatch = nl[aSurnameIndex]; - } - int tested = 0; - foreach (QString elt, nl) { - if (!elt[0].isUpper() || elt == "Della") continue; - QString k = makeNameKey(elt); - if (bnl.contains(k)) { - ++matchCount; - } - if (++tested == 2 && matchCount == 0) { - return false; - } - } - if (surnameMatch != "") { - DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl; - if (matchCount > 1) { - return true; - } else { - DEBUG << "(but not enough else matched)" << endl; - return false; - } - } - return false; -} - bool hasBetterName(Composer *c, Composer *other) { @@ -264,21 +194,21 @@ QSet<Composer *> matches; foreach (QString candidateName, allNames) { - 
QString key = makeNameKey(candidateName); + QString key = Composer::reduceName(candidateName); if (composers.contains(key)) { foreach (Composer *candidate, composers[key]) { if (candidateName == dates) { if (c->name() == candidate->name()) { DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl; - } else if (!namesFuzzyMatch(c->name(), candidate) && - !namesFuzzyMatch(candidate->name(), c)) { + } else if (!candidate->matchCatalogueName(c->name()) && + !c->matchCatalogueName(candidate->name())) { DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl; continue; } else { DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl; } } else { - if (!c->datesMatch(candidate)) { + if (!c->matchDates(candidate)) { DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl; continue; } @@ -297,7 +227,7 @@ for (ComposerMap::iterator i = composers.begin(); i != composers.end(); ++i) { foreach (Composer *candidate, *i) { - if (namesFuzzyMatch(c->name(), candidate)) { + if (candidate->matchCatalogueName(c->name())) { DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl; matches.insert(candidate); break; @@ -309,7 +239,7 @@ if (matches.empty()) { foreach (QString candidateName, allNames) { - QString key = makeNameKey(candidateName); + QString key = Composer::reduceName(candidateName); composers[key].insert(c); DEBUG << "added for alias or date " << candidateName << endl; } @@ -331,14 +261,14 @@ } else { other->addAlias(c->name()); } - composers[makeNameKey(c->name())].insert(other); + composers[Composer::reduceName(c->name())].insert(other); DEBUG << "linking from alias " << c->name() << endl; foreach (QString alias, c->aliases()) { if (alias != 
other->name() && !other->aliases().contains(alias)) { other->addAlias(alias); - composers[makeNameKey(alias)].insert(other); + composers[Composer::reduceName(alias)].insert(other); DEBUG << "linking from alias " << alias << endl; } } @@ -642,7 +572,7 @@ if (!cn) continue; if (!cn->composer()) { QString cname = cn->composerName(); - QString key = makeNameKey(cname); + QString key = Composer::reduceName(cname); if (cname != "") { if (!composers.contains(key)) { DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
--- a/testapp/Loader.cpp Fri Feb 12 16:56:29 2010 +0000 +++ b/testapp/Loader.cpp Wed Feb 17 19:26:48 2010 +0000 @@ -1,11 +1,14 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ #include "Objects.h" #include "TypeRegistrar.h" #include <dataquay/BasicStore.h> #include <dataquay/objectmapper/ObjectMapper.h> +#include <dataquay/Debug.h> #include <QTemporaryFile> +#include <QMultiMap> #include <iostream> @@ -82,12 +85,16 @@ makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) { surnameMatch = nl[aSurnameIndex]; } +// DEBUG << "bnl: " << endl; +// for (int i = 0; i < bnl.size(); ++i) DEBUG << bnl[i] << endl; int tested = 0; foreach (QString elt, nl) { - if (!elt[0].isUpper() || elt == "Della") continue; + int score = 2; + if (!elt[0].isUpper() || elt == "Della") score = 1; QString k = makeNameKey(elt); +// DEBUG << "Testing " << k << endl; if (bnl.contains(k)) { - ++matchCount; + matchCount += score; } if (++tested == 2 && matchCount == 0) { return false; @@ -126,20 +133,35 @@ delete store; QObjectList composers; + std::cerr << "Known composers:" << std::endl; foreach (QObject *o, root->children()) { - if (qobject_cast<Composer *>(o)) composers.push_back(o); + Composer *c = qobject_cast<Composer *>(o); + if (c) { + QString sn = c->getSortName(true); + if (sn == "") { + std::cerr << "WARNING: Composer " << c->name().toStdString() << " (URI " << c->property("uri").toString().toStdString() << ") has no sort-name" << std::endl; + } else { + std::cerr << sn.toStdString() << std::endl; + } + composers.push_back(c); + } } - - if (argc > 1) { - QString name = argv[1]; + + for (int i = 1; i < argc; ++i) { + QString name = argv[i]; std::cerr << "Name: " << name.toStdString() << std::endl; + QMultiMap<int, QString> matches; foreach (QObject *o, composers) { Composer *c = qobject_cast<Composer *>(o); if (!c) continue; - if (namesFuzzyMatch(name, c)) { - std::cerr << "Matches: " << c->name().toStdString() << std::endl; - } + int value = 
c->matchFuzzyName(name); + matches.insert(value, c->getSortName(false)); } + for (QMultiMap<int, QString>::const_iterator i = matches.begin(); + i != matches.end(); ++i) { + if (i.key() < 0) continue; + std::cerr << "Score: " << i.key() << " for name: " << i.value().toStdString() << std::endl; + } } /*