Mercurial > hg > classical
view common/Objects.cpp @ 14:06b2ea2ca577 classical-rdf
* use floating-point scores for name matching
author | Chris Cannam |
---|---|
date | Fri, 19 Feb 2010 12:41:23 +0000 |
parents | a1d67e136c30 |
children | 701702f8959a |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ #include "Objects.h" #include <dataquay/Debug.h> #include <cstdlib> #include <iostream> #include "EditDistance.h" #include <QHash> // to ensure correct qHash(const QString &) is found unsigned int qHash(const QUrl &u) { return qHash(u.toString()); } namespace ClassicalData { QMap<QString, Form *> Form::m_map; QMutex Form::m_mutex; bool Composer::matchDates(const Composer *b) const { const Composer *a = this; if (a->birth() && b->birth()) { int ay = a->birth()->year(), by = b->birth()->year(); if (ay < 1800 || // birth dates before 1700 tend to be vague! a->birth()->approximate() || b->birth()->approximate()) { if (abs(ay - by) > 25) return false; } else { if (abs(ay - by) > 1) { return false; } } } if (a->death() && b->death()) { int ay = a->death()->year(), by = b->death()->year(); if (a->death()->approximate() || b->death()->approximate()) { if (abs(ay - by) > 10) return false; } else if (ay < 1700) { if (abs(ay - by) > 25) return false; } else if (ay < 1800) { // cut a bit of slack, but not as much as for birth date if (abs(ay - by) > 10) return false; } else { if (abs(ay - by) > 1) return false; } } return true; } void Composer::cacheNames() const { if (m_namesCached) return; QString n = name(); QStringList pl = n.split(", "); if (pl.size() == 1) { QStringList pl2; pl = n.split(' '); pl2.push_back(pl[pl.size()-1]); pl2.push_back(""); for (int i = 0; i+1 < pl.size(); ++i) { if (i > 0) pl2[1] += " "; pl2[1] += pl[i]; } pl = pl2; } m_surname = pl[0]; n = ""; for (int i = 1; i < pl.size(); ++i) { if (i > 1) n += ", "; n += pl[i]; } m_forenames = n; m_surnameElements.clear(); m_connectiveElements.clear(); m_forenameElements.clear(); m_otherElements.clear(); m_reducedSurnameElements.clear(); m_reducedForenameElements.clear(); static QRegExp sre("[\\., -]+"); foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) { if (s[0].isUpper()) { m_surnameElements.push_back(s.toLower()); m_reducedSurnameElements.push_back(reduceName(s)); } else if (s.length() > 1) { m_connectiveElements.push_back(s.toLower()); } } foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) { if (s[0].isUpper()) { m_forenameElements.push_back(s.toLower()); m_reducedForenameElements.push_back(reduceName(s)); } else if (s.length() > 1) { m_connectiveElements.push_back(s.toLower()); } } foreach (QString a, m_aliases) { foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) { m_otherElements.push_back(ae.toLower()); } } m_namesCached = true; } QString Composer::getSortName(bool caps) const { QString surname = getSurname(); QString forenames = getForenames(); if (caps) surname = surname.toUpper(); if (forenames != "") return surname + ", " + forenames; else return surname; } QString Composer::getSurname() const { cacheNames(); return m_surname; } QString Composer::getForenames() const { cacheNames(); return m_forenames; } QString Composer::getDisplayDates() const { QString s; if (birth() || death()) { bool showApprox = false; if ((birth() && birth()->approximate()) || (death() && death()->approximate())) { showApprox = true; } if (birth()) { if (birth()->place() != "") { s += birth()->place() + ", "; } if (showApprox) { s += "c. "; showApprox = false; } s += QString("%1").arg(birth()->year()); } s += "-"; if (death()) { if (death()->place() != "") { s += death()->place() + ", "; } if (showApprox) { s += "c. "; showApprox = false; } s += QString("%1").arg(death()->year()); } } return s; } static QString asciify(QString field) { QString ascii; for (int i = 0; i < field.length(); ++i) { QString dc = field[i].decomposition(); if (dc != "") ascii += dc[0]; else if (field[i] == QChar(0x00DF)) { ascii += "ss"; } else { ascii += field[i]; } } ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe ascii.replace(QString::fromUtf8("\342\200\222"), "-"); ascii.replace(QString::fromUtf8("\342\200\223"), "-"); ascii.replace(QString::fromUtf8("\342\200\224"), "-"); ascii.replace(QString::fromUtf8("\342\200\225"), "-"); return ascii; } QString Composer::reduceName(QString name) { QString key = asciify(name).toLower() .replace("'", "") .replace("x", "ks") .replace("y", "i") .replace("k", "c") .replace("ch", "c") .replace("cc", "c") .replace("aa", "a") .replace("v", "f") .replace("ff", "f") .replace("th", "t") .replace("tch", "ch") .replace("er", "r"); return key; } bool Composer::matchCatalogueName(QString an) const { // ew! QString bn = name(); if (bn == an) return true; if (aliases().contains(an)) return true; int aSurnameIndex = 0, bSurnameIndex = 0; if (an.contains(",")) { an.replace(",", ""); } else { aSurnameIndex = -1; } if (bn.contains(",")) { bn.replace(",", ""); } else { bSurnameIndex = -1; } QStringList nl = an.split(QRegExp("[ -]")); QStringList bnl = reduceName(bn).split(QRegExp("[ -]")); int matchCount = 0; QString surnameMatch = ""; if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1; if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1; if (nl[aSurnameIndex][0].isUpper() && nl[aSurnameIndex] != "Della" && reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) { surnameMatch = nl[aSurnameIndex]; } int tested = 0; foreach (QString elt, nl) { if (!elt[0].isUpper() || elt == "Della") continue; QString k = reduceName(elt); if (bnl.contains(k)) { ++matchCount; } if (++tested == 2 && matchCount == 0) { return false; } } if (surnameMatch != "") { DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl; if (matchCount > 1) { return true; } else { DEBUG << "(but not enough else matched)" << endl; return false; } } return false; } float Composer::matchFuzzyName(QString n) const { int fameBonus = m_pages.size(); if (n == name()) return 100 + fameBonus; static QRegExp sre("[\\., -]+"); return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts)); } float Composer::matchFuzzyName(QStringList elements) const { if (elements.empty()) return 0; cacheNames(); int fameBonus = m_pages.size(); EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true); int score = 0; foreach (QString elt, elements) { bool accept = false; int initial = 0; if (elt.length() == 1) { // an initial: search forenames only ignoring connectives foreach (QString s, m_forenameElements) { if (s[0] == elt[0]) { score += 3; // std::cerr << "[initial: " << s.toStdString() << "]" << std::endl; accept = true; break; } } if (!accept) { score -= 10; } continue; } foreach (QString s, m_surnameElements) { if (s[0] == elt[0]) initial = 1; accept = true; int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 20; else if (dist*3 <= s.length()) { score += 14 - dist + initial; } else if (elt.length() > 3 && s.startsWith(elt)) score += 5; else accept = false; if (accept) { // std::cerr << "[surname: " << s.toStdString() << "]" << std::endl; break; } } if (accept) continue; foreach (QString s, m_forenameElements) { if (s[0] == elt[0]) initial = 1; accept = true; int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 15; else if (dist*3 <= s.length()) score += 9 - dist + initial; else accept = false; if (accept) { // std::cerr << "[forename: " << s.toStdString() << "]" << std::endl; break; } } if (accept) continue; foreach (QString s, m_connectiveElements) { accept = true; int dist = ed.calculate(elt, s, 1); if (dist == 0) score += 4; else if (dist == 1) score += 2; else accept = false; if (accept) { // std::cerr << "[connective: " << s.toStdString() << "]" << std::endl; break; } } if (accept) continue; if (m_reducedSurnameElements.contains(elt)) { score += 11; // std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl; continue; } if (m_reducedForenameElements.contains(elt)) { score += 7; // std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl; continue; } foreach (QString s, m_otherElements) { accept = true; int dist = ed.calculate(elt, s, s.length()/3); if (dist == 0) score += 12; else if (dist*3 <= s.length()) score += 8 - dist; else accept = false; if (accept) { // std::cerr << "[other: " << s.toStdString() << "]" << std::endl; break; } } if (accept) continue; if (initial) score -= 5; else score -= 10; } if (score > 0) { // if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl; score += fameBonus; } // if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl; return float(score) / (elements.size() * 10); } static int compare(QString a, QString b) { if (a < b) { return -1; } else if (a > b) { return 1; } else { return 0; } } int Work::compareCatalogueNumberTexts(QString a, QString b) { // std::cout << "compare " << a.toStdString() // << " " << b.toStdString() << std::endl; if (a == b) return 0; if (!a[0].isDigit()) { if (!b[0].isDigit()) { QStringList al = a.split(QRegExp("[ :-]")); QStringList bl = b.split(QRegExp("[ :-]")); if (al.size() < 2 || bl.size() < 2 || al.size() != bl.size()) { if (a < b) return -1; else if (a > b) return 1; else return 0; } for (int i = 0; i < al.size(); ++i) { if (al[i] != bl[i]) { // std::cout << "subcompare " << al[i].toStdString() // << " " << bl[i].toStdString() << std::endl; return compareCatalogueNumberTexts(al[i], bl[i]); } } } else { return compare(a, b); } } else { if (!b[0].isDigit()) { return compare(a, b); } } // use atoi instead of toInt() because we want it to succeed even // if the text is not only an integer (e.g. 35a) int aoi = atoi(a.toLocal8Bit().data()); int boi = atoi(b.toLocal8Bit().data()); // std::cout << "aoi = " << aoi << ", boi = " << boi << std::endl; if (aoi == boi) return compare(a, b); else return aoi - boi; } bool Work::Ordering::operator()(Work *a, Work *b) { if (!a) { if (!b) return false; else return true; } else { if (!b) { return false; } } /* QString ao = a->catalogue(); if (ao == "") ao = a->opus(); QString bo = b->catalogue(); if (bo == "") bo = b->opus(); std::cout << "ao " << ao.toStdString() << ", bo " << bo.toStdString() << std::endl; */ int c = 0; if (a->catalogue() != "" && b->catalogue() != "") { c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue()); } if (c == 0 && a->opus() != "" && b->opus() != "") { c = compareCatalogueNumberTexts(a->opus(), b->opus()); } if (c == 0 && a->partOf() == b->partOf() && a->number() != "" && b->number() != "") { c = compareCatalogueNumberTexts(a->number(), b->number()); } bool rv = false; if (c == 0) { if (a->name() == b->name()) rv = (a < b); else rv = (a->name() < b->name()); } else { rv = (c < 0); } // std::cout << "result = " << rv << std::endl; return rv; } }