Mercurial > hg > classical
changeset 13:a1d67e136c30 classical-rdf
* Test data, speedups
author | Chris Cannam |
---|---|
date | Fri, 19 Feb 2010 11:41:57 +0000 |
parents | dc55b0940f15 |
children | 06b2ea2ca577 |
files | common/EditDistance.cpp common/EditDistance.h common/Objects.cpp common/Objects.h import/import.pro testapp/Loader.cpp |
diffstat | 6 files changed, 62 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/common/EditDistance.cpp Thu Feb 18 18:22:07 2010 +0000 +++ b/common/EditDistance.cpp Fri Feb 19 11:41:57 2010 +0000 @@ -1,11 +1,13 @@ #include "EditDistance.h" +#include <cstdlib> // abs + namespace ClassicalData { int -EditDistance::calculate(QString a, QString b) +EditDistance::calculate(QString a, QString b, int threshold) { // Note: restricted Damerau-Levenshtein edit distance, or // Levenshtein if NoTransposition @@ -13,6 +15,8 @@ int al = a.length() + 1; int bl = b.length() + 1; + if (threshold > 0 && abs(al - bl) > threshold) return threshold + 1; + int *currRow = (int *)alloca(sizeof(int) * bl); int *prevRow = (int *)alloca(sizeof(int) * bl); int *ppreRow = (int *)alloca(sizeof(int) * bl); @@ -24,6 +28,7 @@ for (int j = 1; j < al; ++j) { currRow[0] = j * m_editPenalty; + bool stillAcceptable = (threshold == 0); for (int i = 1; i < bl; ++i) { if (a[j-1] == b[i-1]) { currRow[i] = prevRow[i-1]; @@ -42,7 +47,11 @@ ppreRow[i-2] + m_editPenalty); } } + if (threshold > 0 && currRow[i] <= threshold) { + stillAcceptable = true; + } } + if (!stillAcceptable) return threshold + 1; tmp = ppreRow; ppreRow = prevRow; prevRow = currRow; @@ -50,7 +59,7 @@ } int result = prevRow[bl-1]; - if (m_normalise) { + if (m_normalise && m_editPenalty > 1) { result = result / m_editPenalty; if (result % m_editPenalty) ++result; }
--- a/common/EditDistance.h Thu Feb 18 18:22:07 2010 +0000 +++ b/common/EditDistance.h Fri Feb 19 11:41:57 2010 +0000 @@ -25,7 +25,7 @@ m_suffixPenalty(suffixPenalty), m_normalise(normalise) { } - int calculate(QString a, QString b); + int calculate(QString a, QString b, int threshold = 0); private: TranspositionMode m_transpositionMode;
--- a/common/Objects.cpp Thu Feb 18 18:22:07 2010 +0000 +++ b/common/Objects.cpp Fri Feb 19 11:41:57 2010 +0000 @@ -92,7 +92,9 @@ m_reducedSurnameElements.clear(); m_reducedForenameElements.clear(); - foreach (QString s, m_surname.split(' ')) { + static QRegExp sre("[\\., -]+"); + + foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) { if (s[0].isUpper()) { m_surnameElements.push_back(s.toLower()); m_reducedSurnameElements.push_back(reduceName(s)); @@ -101,7 +103,7 @@ } } - foreach (QString s, m_forenames.split(' ')) { + foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) { if (s[0].isUpper()) { m_forenameElements.push_back(s.toLower()); m_reducedForenameElements.push_back(reduceName(s)); @@ -111,10 +113,12 @@ } foreach (QString a, m_aliases) { - foreach (QString ae, a.split(' ')) { + foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) { m_otherElements.push_back(ae.toLower()); } } + + m_namesCached = true; } QString @@ -274,15 +278,19 @@ int Composer::matchFuzzyName(QString n) const { + int fameBonus = m_pages.size(); + if (n == name()) return 100 + fameBonus; + static QRegExp sre("[\\., -]+"); + return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts)); +} + +int +Composer::matchFuzzyName(QStringList elements) const +{ cacheNames(); int fameBonus = m_pages.size(); - - if (n == name()) return 100 + fameBonus; EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true); - - static QRegExp sre("[\\., ]+"); - QStringList elements = n.toLower().split(sre); int score = 0; @@ -296,7 +304,7 @@ foreach (QString s, m_forenameElements) { if (s[0] == elt[0]) { score += 3; - std::cerr << "[initial: " << s.toStdString() << "]" << std::endl; +// std::cerr << "[initial: " << s.toStdString() << "]" << std::endl; accept = true; break; } @@ -310,13 +318,13 @@ foreach (QString s, m_surnameElements) { if (s[0] == elt[0]) initial = 1; accept = true; - int dist = ed.calculate(elt, s); + int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 20; else if (dist*3 <= s.length()) { score += 14 - dist + initial; } else if (elt.length() > 3 && s.startsWith(elt)) score += 5; else accept = false; if (accept) { - std::cerr << "[surname: " << s.toStdString() << "]" << std::endl; +// std::cerr << "[surname: " << s.toStdString() << "]" << std::endl; break; } } @@ -325,12 +333,12 @@ foreach (QString s, m_forenameElements) { if (s[0] == elt[0]) initial = 1; accept = true; - int dist = ed.calculate(elt, s); + int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 15; else if (dist*3 <= s.length()) score += 9 - dist + initial; else accept = false; if (accept) { - std::cerr << "[forename: " << s.toStdString() << "]" << std::endl; +// std::cerr << "[forename: " << s.toStdString() << "]" << std::endl; break; } } @@ -338,12 +346,12 @@ foreach (QString s, m_connectiveElements) { accept = true; - int dist = ed.calculate(elt, s); + int dist = ed.calculate(elt, s, 1); if (dist == 0) score += 4; else if (dist == 1) score += 2; else accept = false; if (accept) { - std::cerr << "[connective: " << s.toStdString() << "]" << std::endl; +// std::cerr << "[connective: " << s.toStdString() << "]" << std::endl; break; } } @@ -351,24 +359,24 @@ if (m_reducedSurnameElements.contains(elt)) { score += 11; - std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; +// std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; continue; } if (m_reducedForenameElements.contains(elt)) { score += 7; - std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; +// std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; continue; } foreach (QString s, m_otherElements) { accept = true; - int dist = ed.calculate(elt, s); + int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 12; else if (dist*3 <= s.length()) score += 8 - dist; else accept = false; if (accept) { - std::cerr << "[other: " << s.toStdString() << "]" << std::endl; +// std::cerr << "[other: " << s.toStdString() << "]" << std::endl; break; } } @@ -379,10 +387,10 @@ } if (score > 0) { - if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; +// if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; score += fameBonus; } - if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl; +// if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl; return score; }
--- a/common/Objects.h Thu Feb 18 18:22:07 2010 +0000 +++ b/common/Objects.h Fri Feb 19 11:41:57 2010 +0000 @@ -360,6 +360,16 @@ int matchFuzzyName(QString name) const; /** + * Given another name which is believed to be a user-entered + * composer name with unpredictable formatting and spelling (and + * probably incomplete), return an estimate for the likelihood + * that the intended composer was this one. Higher return values + * indicate greater confidence. The supplied name should have + * been lower-cased and split on non-alphabetical characters. + */ + int matchFuzzyName(QStringList name) const; + + /** * Return the supplied name reduced into a "simplified" form, * eliminating many of the differences often found particularly in * European language names that have been anglicised. Used in
--- a/import/import.pro Thu Feb 18 18:22:07 2010 +0000 +++ b/import/import.pro Fri Feb 19 11:41:57 2010 +0000 @@ -23,3 +23,10 @@ } } + +linux* { +DEFINES += TURBOT_PROFILER WANT_TIMING +INCLUDEPATH += ../../turbot/lib/ +LIBPATH += ../../turbot/lib/ +LIBS += -pg -lturbot +}
--- a/testapp/Loader.cpp Thu Feb 18 18:22:07 2010 +0000 +++ b/testapp/Loader.cpp Fri Feb 19 11:41:57 2010 +0000 @@ -98,10 +98,13 @@ std::string s; getline(std::cin, s); QMultiMap<int, QString> matches; + QRegExp sre("[\\., -]+"); + QStringList elements = QString::fromStdString(s) + .toLower().split(sre, QString::SkipEmptyParts); foreach (QObject *o, composers) { Composer *c = qobject_cast<Composer *>(o); if (!c) continue; - int value = c->matchFuzzyName(QString::fromStdString(s)); + int value = c->matchFuzzyName(elements); matches.insert(value, c->getSortName(false)); } int n = 0;