Mercurial > hg > classical
changeset 11:98047b91b09d classical-rdf
* Add "proper" composer name matching from user input. Needs to be faster
though
author | Chris Cannam |
---|---|
date | Thu, 18 Feb 2010 18:16:43 +0000 |
parents | d35e5d769c87 |
children | dc55b0940f15 |
files | common/EditDistance.cpp common/EditDistance.h common/Objects.cpp common/Objects.h common/common.pro testapp/Loader.cpp testapp/testapp.pro |
diffstat | 7 files changed, 327 insertions(+), 188 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/common/EditDistance.cpp Thu Feb 18 18:16:43 2010 +0000 @@ -0,0 +1,61 @@ + +#include "EditDistance.h" + +namespace ClassicalData +{ + +int +EditDistance::calculate(QString a, QString b) +{ + // Note: restricted Damerau-Levenshtein edit distance, or + // Levenshtein if NoTransposition + + int al = a.length() + 1; + int bl = b.length() + 1; + + int *currRow = (int *)alloca(sizeof(int) * bl); + int *prevRow = (int *)alloca(sizeof(int) * bl); + int *ppreRow = (int *)alloca(sizeof(int) * bl); + int *tmp = 0; + + for (int i = 0; i < bl; ++i) { + prevRow[i] = i * m_editPenalty; + } + + for (int j = 1; j < al; ++j) { + currRow[0] = j * m_editPenalty; + for (int i = 1; i < bl; ++i) { + if (a[j-1] == b[i-1]) { + currRow[i] = prevRow[i-1]; + } else { + int horizontal, vertical, diagonal; + horizontal = currRow[i-1] + m_editPenalty; + diagonal = prevRow[i-1] + m_editPenalty; + if (i+1 == bl) vertical = prevRow[i] + m_suffixPenalty; + else vertical = prevRow[i] + m_editPenalty; + currRow[i] = + std::min(horizontal, std::min(vertical, diagonal)); + } + if (m_transpositionMode == RestrictedTransposition) { + if (i > 1 && j > 1 && a[i-1]==b[j-2] && a[i-2]==b[j-1]) { + currRow[i] = std::min(currRow[i], + ppreRow[i-2] + m_editPenalty); + } + } + } + tmp = ppreRow; + ppreRow = prevRow; + prevRow = currRow; + currRow = tmp; + } + + int result = prevRow[bl-1]; + if (m_normalise) { + result = result / m_editPenalty; + if (result % m_editPenalty) ++result; + } + return result; +} + +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/common/EditDistance.h Thu Feb 18 18:16:43 2010 +0000 @@ -0,0 +1,41 @@ + +#ifndef _EDIT_DISTANCE_H_ +#define _EDIT_DISTANCE_H_ + +#include <alloca.h> + +#include <QString> + +namespace ClassicalData { + +class EditDistance +{ +public: + enum TranspositionMode { + NoTransposition, + RestrictedTransposition + }; + + EditDistance(TranspositionMode tm = RestrictedTransposition, + int editPenalty = 1, + int suffixPenalty = 1, + bool normalise = true) : + m_transpositionMode(tm), + m_editPenalty(editPenalty), + m_suffixPenalty(suffixPenalty), + m_normalise(normalise) { } + + int calculate(QString a, QString b); + +private: + TranspositionMode m_transpositionMode; + int m_editPenalty; + int m_suffixPenalty; + bool m_normalise; +}; + +} + +#endif + +
--- a/common/Objects.cpp Wed Feb 17 19:26:48 2010 +0000 +++ b/common/Objects.cpp Thu Feb 18 18:16:43 2010 +0000 @@ -7,6 +7,8 @@ #include <cstdlib> #include <iostream> +#include "EditDistance.h" + #include <QHash> // to ensure correct qHash(const QString &) is found unsigned int @@ -53,6 +55,68 @@ return true; } +void +Composer::cacheNames() const +{ + if (m_namesCached) return; + + QString n = name(); + QStringList pl = n.split(", "); + + if (pl.size() == 1) { + QStringList pl2; + pl = n.split(' '); + pl2.push_back(pl[pl.size()-1]); + pl2.push_back(""); + for (int i = 0; i+1 < pl.size(); ++i) { + if (i > 0) pl2[1] += " "; + pl2[1] += pl[i]; + } + pl = pl2; + } + + m_surname = pl[0]; + + n = ""; + for (int i = 1; i < pl.size(); ++i) { + if (i > 1) n += ", "; + n += pl[i]; + } + + m_forenames = n; + + m_surnameElements.clear(); + m_connectiveElements.clear(); + m_forenameElements.clear(); + m_otherElements.clear(); + m_reducedSurnameElements.clear(); + m_reducedForenameElements.clear(); + + foreach (QString s, m_surname.split(' ')) { + if (s[0].isUpper()) { + m_surnameElements.push_back(s.toLower()); + m_reducedSurnameElements.push_back(reduceName(s)); + } else { + m_connectiveElements.push_back(s.toLower()); + } + } + + foreach (QString s, m_forenames.split(' ')) { + if (s[0].isUpper()) { + m_forenameElements.push_back(s.toLower()); + m_reducedForenameElements.push_back(reduceName(s)); + } else { + m_connectiveElements.push_back(s.toLower()); + } + } + + foreach (QString a, m_aliases) { + foreach (QString ae, a.split(' ')) { + m_otherElements.push_back(ae.toLower()); + } + } +} + QString Composer::getSortName(bool caps) const { @@ -66,46 +130,15 @@ QString Composer::getSurname() const { - //!!! slow (dup with getForenames) - QString n = name(); - QStringList pl = n.split(", "); - if (pl.size() == 1) { - QStringList pl2; - pl = n.split(' '); - pl2.push_back(pl[pl.size()-1]); - pl2.push_back(""); - for (int i = 0; i+1 < pl.size(); ++i) { - if (i > 0) pl2[1] += " "; - pl2[1] += pl[i]; - } - pl = pl2; - } - return pl[0]; + cacheNames(); + return m_surname; } QString Composer::getForenames() const { - //!!! slow (dup with getSurname) - QString n = name(); - QStringList pl = n.split(", "); - if (pl.size() == 1) { - QStringList pl2; - pl = n.split(' '); - pl2.push_back(pl[pl.size()-1]); - pl2.push_back(""); - for (int i = 0; i+1 < pl.size(); ++i) { - if (i > 0) pl2[1] += " "; - pl2[1] += pl[i]; - } - pl = pl2; - } - n = ""; - for (int i = 1; i < pl.size(); ++i) { - if (i > 1) n += ", "; - n += pl[i]; - } - return n; + cacheNames(); + return m_forenames; } QString @@ -241,101 +274,115 @@ int Composer::matchFuzzyName(QString n) const { - if (n == name()) return 100; + cacheNames(); + int fameBonus = m_pages.size(); + + if (n == name()) return 100 + fameBonus; - QString surname = getSurname(); - QString forenames = getForenames(); + EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true); + + static QRegExp sre("[\\., ]+"); + QStringList elements = n.toLower().split(sre); - QStringList sl = surname.split(' '); - QStringList fl = forenames.split(' '); - QStringList nl = n.split(' '); - int score = 0; - foreach (QString element, nl) { - - bool matchedSomething = false; + foreach (QString elt, elements) { - if (element.length() == 1) { + bool accept = false; + int initial = 0; + + if (elt.length() == 1) { // an initial: search forenames only ignoring connectives - QChar c = element[0].toUpper(); - foreach (QString f, fl) { - if (f[0] == c) { + foreach (QString s, m_forenameElements) { + if (s[0] == elt[0]) { score += 3; - matchedSomething = true; + std::cerr << "[initial: " << s.toStdString() << "]" << std::endl; + accept = true; break; } } - if (!matchedSomething) { + if (!accept) { score -= 10; } continue; } - - foreach (QString s, sl) { - if (s.toLower() == element.toLower()) { - if (s[0].isUpper()) { - score += 20; - } else { - score += 6; - } - matchedSomething = true; + + foreach (QString s, m_surnameElements) { + if (s[0] == elt[0]) initial = 1; + accept = true; + int dist = ed.calculate(elt, s); + if (dist == 0) score += 20; + else if (dist*3 <= s.length()) { score += 14 - dist + initial; } + else if (elt.length() > 3 && s.startsWith(elt)) score += 5; + else accept = false; + if (accept) { + std::cerr << "[surname: " << s.toStdString() << "]" << std::endl; break; } } - if (matchedSomething) continue; + if (accept) continue; - foreach (QString f, fl) { - if (f.toLower() == element.toLower()) { - if (f[0].isUpper()) { - score += 15; - } else { - score += 4; - } - matchedSomething = true; + foreach (QString s, m_forenameElements) { + if (s[0] == elt[0]) initial = 1; + accept = true; + int dist = ed.calculate(elt, s); + if (dist == 0) score += 15; + else if (dist*3 <= s.length()) score += 9 - dist + initial; + else accept = false; + if (accept) { + std::cerr << "[forename: " << s.toStdString() << "]" << std::endl; break; } } - if (matchedSomething) continue; + if (accept) continue; - QString reduced = reduceName(element); - - foreach (QString s, sl) { - if (!s[0].isUpper()) continue; - if (reduceName(s) == reduced) { - score += 12; - matchedSomething = true; + foreach (QString s, m_connectiveElements) { + accept = true; + int dist = ed.calculate(elt, s); + if (dist == 0) score += 4; + else if (dist == 1) score += 2; + else accept = false; + if (accept) { + std::cerr << "[connective: " << s.toStdString() << "]" << std::endl; break; } } - if (matchedSomething) continue; + if (accept) continue; - foreach (QString f, fl) { - if (!f[0].isUpper()) continue; - if (reduceName(f) == reduced) { - score += 10; - matchedSomething = true; + if (m_reducedSurnameElements.contains(elt)) { + score += 11; + std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; + continue; + } + + if (m_reducedForenameElements.contains(elt)) { + score += 7; + std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; + continue; + } + + foreach (QString s, m_otherElements) { + accept = true; + int dist = ed.calculate(elt, s); + if (dist == 0) score += 12; + else if (dist*3 <= s.length()) score += 8 - dist; + else accept = false; + if (accept) { + std::cerr << "[other: " << s.toStdString() << "]" << std::endl; break; } } - if (matchedSomething) continue; + if (accept) continue; - foreach (QString f, fl) { - // smaller penalty if we at least have the right first letter - if (!f[0].isUpper()) continue; - if (f[0] == element[0].toUpper()) { - score -= 4; - matchedSomething = true; - break; - } - } - if (matchedSomething) continue; + if (initial) score -= 5; + else score -= 10; + } - score -= 7; - } - - //!!! need to adjust for "fame" (more famous composers get a 1pt bonus) - + if (score > 0) { + if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; + score += fameBonus; + } + if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl; return score; }
--- a/common/Objects.h Wed Feb 17 19:26:48 2010 +0000 +++ b/common/Objects.h Thu Feb 18 18:16:43 2010 +0000 @@ -137,20 +137,20 @@ NamedEntity(QObject *parent = 0) : QObject(parent) { } QString name() const { return m_name; } - void setName(QString n) { m_name = n; } + virtual void setName(QString n) { m_name = n; } QString remarks() const { return m_remarks; } void setRemarks(QString n) { m_remarks = n; } QSet<QString> aliases() const { return m_aliases; } - void setAliases(QSet<QString> l) { m_aliases = l; } - void addAlias(QString a) { m_aliases.insert(a); } + virtual void setAliases(QSet<QString> l) { m_aliases = l; } + virtual void addAlias(QString a) { m_aliases.insert(a); } QSet<Document *> pages() const { return m_pages; } void addPage(Document *p) { m_pages.insert(p); } void setPages(QSet<Document *> p) { m_pages = p; } //!!! destroy old ones? do we own? -private: +protected: QString m_name; QString m_remarks; QSet<QString> m_aliases; @@ -287,7 +287,8 @@ Q_PROPERTY(QString forenames READ getForenames STORED false) public: - Composer(QObject *parent = 0) : NamedEntity(parent), m_birth(0), m_death(0) { } + Composer(QObject *parent = 0) : + NamedEntity(parent), m_birth(0), m_death(0), m_namesCached(false) { } QString gender() const { return m_gender; } void setGender(QString n) { m_gender = n; } @@ -311,6 +312,19 @@ const Death *death() const { return m_death; } void setDeath(Death *d) { m_death = d; } + virtual void setName(QString n) { + NamedEntity::setName(n); + m_namesCached = false; + } + virtual void setAliases(QSet<QString> l) { + NamedEntity::setAliases(l); + m_namesCached = false; + } + virtual void addAlias(QString a) { + NamedEntity::addAlias(a); + m_namesCached = false; + } + QString getSurname() const; QString getForenames() const; QString getSortName(bool caps) const; @@ -360,6 +374,17 @@ QString m_period; Birth *m_birth; Death *m_death; + + void cacheNames() const; + mutable bool m_namesCached; + mutable QString m_surname; + mutable QString m_forenames; + mutable QStringList m_surnameElements; + mutable QStringList m_connectiveElements; + mutable QStringList m_forenameElements; + mutable QStringList m_otherElements; + mutable QStringList m_reducedSurnameElements; + mutable QStringList m_reducedForenameElements; }; class Form : public NamedEntity
--- a/common/common.pro Wed Feb 17 19:26:48 2010 +0000 +++ b/common/common.pro Thu Feb 18 18:16:43 2010 +0000 @@ -1,6 +1,6 @@ TEMPLATE = lib TARGET = -CONFIG += debug staticlib +CONFIG += release staticlib DEPENDPATH += . INCLUDEPATH += . ../../turbot @@ -8,8 +8,8 @@ MOC_DIR = m # Input -HEADERS += Objects.h TypeRegistrar.h -SOURCES += Objects.cpp TypeRegistrar.cpp +HEADERS += EditDistance.h Objects.h TypeRegistrar.h +SOURCES += EditDistance.cpp Objects.cpp TypeRegistrar.cpp solaris* {
--- a/testapp/Loader.cpp Wed Feb 17 19:26:48 2010 +0000 +++ b/testapp/Loader.cpp Thu Feb 18 18:16:43 2010 +0000 @@ -1,6 +1,7 @@ /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ #include "Objects.h" +#include "EditDistance.h" #include "TypeRegistrar.h" #include <dataquay/BasicStore.h> @@ -12,6 +13,10 @@ #include <iostream> +#ifdef TURBOT_PROFILER +#include <base/Profiler.h> +#endif + using namespace Dataquay; using namespace ClassicalData; @@ -36,82 +41,6 @@ return true; } -//!!! both nasty and duplicated from Import.cpp - -QString makeNameKey(QString name) -{ - QString key = name.toLower() - .replace("'", "") - .replace("x", "ks") - .replace("y", "i") - .replace("k", "c") - .replace("ch", "c") - .replace("cc", "c") - .replace("v", "f") - .replace("ff", "f") - .replace("th", "t") - .replace("tch", "ch") - .replace("er", "r"); -// DEBUG << "makeNameKey(" << name << "): " << key << endl; - return key; -} - -bool namesFuzzyMatch(QString an, Composer *b) -{ - // ew! - - QString bn = b->name(); - if (bn == an) return true; - if (b->aliases().contains(an)) return true; - int aSurnameIndex = 0, bSurnameIndex = 0; - if (an.contains(",")) { - an.replace(",", ""); - } else { - aSurnameIndex = -1; - } - if (bn.contains(",")) { - bn.replace(",", ""); - } else { - bSurnameIndex = -1; - } - QStringList nl = an.split(QRegExp("[ -]")); - QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]")); - int matchCount = 0; - QString surnameMatch = ""; - if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1; - if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1; - if (nl[aSurnameIndex][0].isUpper() && - nl[aSurnameIndex] != "Della" && - makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) { - surnameMatch = nl[aSurnameIndex]; - } -// DEBUG << "bnl: " << endl; -// for (int i = 0; i < bnl.size(); ++i) DEBUG << bnl[i] << endl; - int tested = 0; - foreach (QString elt, nl) { - int score = 2; - if (!elt[0].isUpper() || elt == "Della") score = 1; - QString k = makeNameKey(elt); -// DEBUG << "Testing " << k << endl; - if (bnl.contains(k)) { - matchCount += score; - } - if (++tested == 2 && matchCount == 0) { - return false; - } - } - if (surnameMatch != "") { -// DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl; - if (matchCount > 1) { - return true; - } else { -// DEBUG << "(but not enough else matched)" << endl; - return false; - } - } - return false; -} - int main(int argc, char **argv) { BasicStore *store = new BasicStore(); @@ -131,9 +60,9 @@ delete mapper; delete store; - + QObjectList composers; - std::cerr << "Known composers:" << std::endl; +// std::cerr << "Known composers:" << std::endl; foreach (QObject *o, root->children()) { Composer *c = qobject_cast<Composer *>(o); if (c) { @@ -141,12 +70,12 @@ if (sn == "") { std::cerr << "WARNING: Composer " << c->name().toStdString() << " (URI " << c->property("uri").toString().toStdString() << ") has no sort-name" << std::endl; } else { - std::cerr << sn.toStdString() << std::endl; +// std::cerr << sn.toStdString() << std::endl; } composers.push_back(c); } } - +/* for (int i = 1; i < argc; ++i) { QString name = argv[i]; std::cerr << "Name: " << name.toStdString() << std::endl; @@ -163,6 +92,39 @@ std::cerr << "Score: " << i.key() << " for name: " << i.value().toStdString() << std::endl; } } +*/ + while (!std::cin.eof()) { + std::cerr << "Enter composer name: "; + std::string s; + getline(std::cin, s); + QMultiMap<int, QString> matches; + foreach (QObject *o, composers) { + Composer *c = qobject_cast<Composer *>(o); + if (!c) continue; + int value = c->matchFuzzyName(QString::fromStdString(s)); + matches.insert(value, c->getSortName(false)); + } + int n = 0; + for (QMultiMap<int, QString>::const_iterator i = matches.end(); + i != matches.begin(); ) { + --i; + if (i.key() < 0) continue; + if (n == 0) { + std::cerr << "Best match:" << std::endl << " * "; + } else if (n == 1) { + std::cerr << "Other candidate(s):" << std::endl << " - "; + } else { + std::cerr << " - "; + } + std::cerr << i.value().toStdString() << std::endl; + if (++n > 5 || i.key() < 1) break; + } + if (n == 0) std::cerr << "No matches" << std::endl; + } + +#ifdef TURBOT_PROFILER + Turbot::Profiler::dump(); +#endif /* std::cerr << "mapped, storing again..." << std::endl;
--- a/testapp/testapp.pro Wed Feb 17 19:26:48 2010 +0000 +++ b/testapp/testapp.pro Thu Feb 18 18:16:43 2010 +0000 @@ -28,5 +28,8 @@ linux* { QMAKE_CXXFLAGS_DEBUG += -Wall -Woverloaded-virtual -Wextra -Wformat-nonliteral -Wformat-security -Winit-self -O1 -pg -LIBS += -pg +DEFINES += TURBOT_PROFILER WANT_TIMING +INCLUDEPATH += ../../turbot/lib/ +LIBPATH += ../../turbot/lib/ +LIBS += -pg -lturbot }