Mercurial > hg > classical
view common/Objects.cpp @ 31:07efb25d24d6
* Merge revs 7200-7222 from SVN (update use of loader API, switch to using
mapper for merge operation)
author | Chris Cannam |
---|---|
date | Thu, 18 Mar 2010 16:59:24 +0000 |
parents | 9729919e589c |
children | 84d6acb6b3ba |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/Debug.h>

#include <cstdlib>
#include <iostream>

#include "EditDistance.h"

#include <QHash> // to ensure correct qHash(const QString &) is found

namespace ClassicalData {

QMap<QString, Form *> Form::m_map;
QMutex Form::m_mutex;

/**
 * Return true if the birth and death years of this composer are
 * compatible with those of b.  A date is only compared when both
 * composers have it.  The permitted difference in years depends on
 * how old the date is and on whether either date is flagged as
 * approximate.
 *
 * b must not be null.
 */
bool
Composer::matchDates(const Composer *b) const
{
    const Composer *a = this;

    if (a->birth() && b->birth()) {
        int ay = a->birth()->year(), by = b->birth()->year();
        if (ay < 1800 || // early birth dates tend to be vague (cutoff used here is 1800)
            a->birth()->approximate() ||
            b->birth()->approximate()) {
            if (abs(ay - by) > 25) return false;
        } else {
            if (abs(ay - by) > 1) {
                return false;
            }
        }
    }

    if (a->death() && b->death()) {
        int ay = a->death()->year(), by = b->death()->year();
        if (a->death()->approximate() || b->death()->approximate()) {
            if (abs(ay - by) > 10) return false;
        } else if (ay < 1700) {
            if (abs(ay - by) > 25) return false;
        } else if (ay < 1800) {
            // cut a bit of slack, but not as much as for birth date
            if (abs(ay - by) > 10) return false;
        } else {
            if (abs(ay - by) > 1) return false;
        }
    }

    return true;
}

/**
 * Populate the cached surname, forenames and the various name element
 * lists from name() and m_aliases.  Does nothing if the cache is
 * already marked valid.
 *
 * If the name contains ", ", the text before the first ", " is the
 * surname and the remaining parts (rejoined with ", ") are the
 * forenames.  Otherwise the last space-separated word is the surname
 * and the preceding words are the forenames.
 */
void
Composer::cacheNames() const
{
    if (m_namesCached) return;

    QString n = name();
    QStringList pl = n.split(", ");

    if (pl.size() == 1) {
        // No "Surname, Forenames" form: rearrange "Forenames Surname"
        // into a two-element list [surname, forenames]
        QStringList pl2;
        pl = n.split(' ');
        pl2.push_back(pl[pl.size()-1]);
        pl2.push_back("");
        for (int i = 0; i+1 < pl.size(); ++i) {
            if (i > 0) pl2[1] += " ";
            pl2[1] += pl[i];
        }
        pl = pl2;
    }

    m_surname = pl[0];

    n = "";
    for (int i = 1; i < pl.size(); ++i) {
        if (i > 1) n += ", ";
        n += pl[i];
    }

    m_forenames = n;

    m_surnameElements.clear();
    m_connectiveElements.clear();
    m_forenameElements.clear();
    m_otherElements.clear();
    m_reducedSurnameElements.clear();
    m_reducedForenameElements.clear();

    static QRegExp sre("[\\., -]+");

    // Capitalised words are name elements proper; lower-case words of
    // more than one character are recorded as connectives.  Empty
    // parts are skipped, so s[0] is always valid here.
    foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_surnameElements.push_back(s.toLower());
            m_reducedSurnameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_forenameElements.push_back(s.toLower());
            m_reducedForenameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString a, m_aliases) {
        foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) {
            m_otherElements.push_back(ae.toLower());
        }
    }

    m_namesCached = true;
}

/**
 * Return the name as "Surname, Forenames", or just the surname if
 * there are no forenames.  If caps is true the surname is
 * upper-cased.
 */
QString
Composer::getSortName(bool caps) const
{
    QString surname = getSurname();
    QString forenames = getForenames();
    if (caps) surname = surname.toUpper();
    if (forenames != "") return surname + ", " + forenames;
    else return surname;
}

/** Return the cached surname, computing the name cache if necessary. */
QString
Composer::getSurname() const
{
    cacheNames();
    return m_surname;
}

/** Return the cached forenames, computing the name cache if necessary. */
QString
Composer::getForenames() const
{
    cacheNames();
    return m_forenames;
}

/**
 * Return a display string of the form "[place, ][c. ]year-[place, ]year"
 * built from whichever of birth and death are present.  Returns an
 * empty string if neither is present.  The "c. " marker is emitted at
 * most once, before the first year shown, if either date is
 * approximate.
 */
QString
Composer::getDisplayDates() const
{
    QString s;
    if (birth() || death()) {
        bool showApprox = false;
        if ((birth() && birth()->approximate()) ||
            (death() && death()->approximate())) {
            showApprox = true;
        }
        if (birth()) {
            if (birth()->place() != "") {
                s += birth()->place() + ", ";
            }
            if (showApprox) {
                s += "c. ";
                showApprox = false;
            }
            s += QString("%1").arg(birth()->year().toInt());
        }
        s += "-";
        if (death()) {
            if (death()->place() != "") {
                s += death()->place() + ", ";
            }
            if (showApprox) {
                s += "c. ";
                showApprox = false;
            }
            s += QString("%1").arg(death()->year().toInt());
        }
    }
    return s;
}

/**
 * Return a simplified copy of field: each character that has a
 * Unicode decomposition is replaced by the first character of that
 * decomposition, U+00DF becomes "ss", and the typographic apostrophe
 * and dashes U+2019 and U+2012..U+2015 become "'" and "-".
 */
static QString
asciify(QString field)
{
    QString ascii;
    for (int i = 0; i < field.length(); ++i) {
        QString dc = field[i].decomposition();
        if (dc != "") ascii += dc[0];
        else if (field[i] == QChar(0x00DF)) {
            ascii += "ss";
        } else {
            ascii += field[i];
        }
    }
    ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe
    ascii.replace(QString::fromUtf8("\342\200\222"), "-");
    ascii.replace(QString::fromUtf8("\342\200\223"), "-");
    ascii.replace(QString::fromUtf8("\342\200\224"), "-");
    ascii.replace(QString::fromUtf8("\342\200\225"), "-");
    return ascii;
}

/**
 * Reduce a name to a rough key for loose comparison: asciify,
 * lower-case, then apply a fixed sequence of textual substitutions.
 * The substitutions are applied in the order written, and existing
 * keys depend on that order, so it must not be changed.
 */
QString
Composer::reduceName(QString name)
{
    QString key = asciify(name).toLower()
        .replace("'", "")
        .replace("x", "ks")
        .replace("y", "i")
        .replace("k", "c")
        .replace("ch", "c")
        .replace("cc", "c")
        .replace("aa", "a")
        .replace("v", "f")
        .replace("ff", "f")
        .replace("th", "t")
        .replace("tch", "ch")
        .replace("er", "r");
    return key;
}

/**
 * Return true if the catalogue-style name an plausibly refers to this
 * composer: either it equals our name or one of our aliases exactly,
 * or its surname element matches ours after reduceName() and more
 * than one capitalised element of an is found among our reduced name
 * elements.
 */
bool
Composer::matchCatalogueName(QString an) const
{
    // ew!

    QString bn = name();
    if (bn == an) return true;
    if (aliases().contains(an)) return true;

    // A comma means "Surname, Forenames", so the surname is the first
    // element; otherwise it is taken to be the last element.
    int aSurnameIndex = 0, bSurnameIndex = 0;
    if (an.contains(",")) {
        an.replace(",", "");
    } else {
        aSurnameIndex = -1;
    }
    if (bn.contains(",")) {
        bn.replace(",", "");
    } else {
        bSurnameIndex = -1;
    }

    QStringList nl = an.split(QRegExp("[ -]"));
    QStringList bnl = reduceName(bn).split(QRegExp("[ -]"));

    int matchCount = 0;
    QString surnameMatch = "";

    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;

    // split() without SkipEmptyParts can yield empty elements; check
    // for that explicitly rather than indexing an empty string.
    const QString aSurname = nl[aSurnameIndex];
    if (!aSurname.isEmpty() &&
        aSurname.at(0).isUpper() &&
        aSurname != "Della" &&
        reduceName(aSurname) == bnl[bSurnameIndex]) {
        surnameMatch = aSurname;
    }

    int tested = 0;

    foreach (QString elt, nl) {
        if (elt.isEmpty() || !elt.at(0).isUpper() || elt == "Della") continue;
        QString k = reduceName(elt);
        if (bnl.contains(k)) {
            ++matchCount;
        }
        // Give up early if neither of the first two capitalised
        // elements matched anything
        if (++tested == 2 && matchCount == 0) {
            return false;
        }
    }

    if (surnameMatch != "") {
        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
        if (matchCount > 1) {
            return true;
        } else {
            DEBUG << "(but not enough else matched)" << endl;
            return false;
        }
    }

    return false;
}

/**
 * Score how well the text n matches this composer's name.  An exact
 * match with name() returns 100 plus the number of pages; otherwise n
 * is lower-cased, split into elements and scored by the QStringList
 * overload.
 */
float
Composer::matchFuzzyName(QString n) const
{
    int fameBonus = m_pages.size();
    if (n == name()) return 100 + fameBonus;
    static QRegExp sre("[\\., -]+");
    return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts));
}

/**
 * Return the edit distance between user and machine as calculated by
 * ed, or -1 if it exceeds a threshold of one third of the length of
 * machine.  When that threshold is zero, only an exact match is
 * accepted (returning 0).
 */
static int
calculateThresholdedDistance(EditDistance &ed,
                             const QString &user,
                             const QString &machine)
{
    int threshold = machine.length()/3;
    int dist;
    if (threshold == 0) dist = (user == machine ? 0 : -1);
    else {
        dist = ed.calculate(user, machine, threshold);
        if (dist > threshold) dist = -1;
    }
    return dist;
}

/**
 * Score how well the given (lower-cased) name elements match this
 * composer.  Returns 0 for an empty list, or when nothing matched and
 * at least one element was unmatched.
 */
float
Composer::matchFuzzyName(QStringList elements) const
{
    if (elements.empty()) return 0;

    cacheNames();
    int fameBonus = m_pages.size();

    EditDistance ed(EditDistance::RestrictedTransposition);

    int score = 0;
    bool haveSurname = false;

    // We aim to scale the eventual result such that a score of 1.0 or
    // more indicates near-certainty that this is a correct match
    // (i.e. that it is properly matched -- not that it is the only
    // possible match).  To achieve this score, we need to have
    // matched with reasonable confidence every element in the passed
    // elements list, and to have matched at least one of them to a
    // part of our surname.

    int matched = 0;
    int unmatched = 0;

    foreach (QString elt, elements) {

        bool accept = false;

        if (elt.length() == 1) {

            // An initial: try forenames first, then connectives, then
            // surnames, comparing first letters only.  The score
            // contribution here is low (and nil for a surname), and
            // initials do not count towards matched, which means the
            // score can only enhance whatever happens elsewhere.
            // They can however seriously damage our score if
            // unmatched, which is as it should be.

            foreach (QString s, m_forenameElements) {
                if (s[0] == elt[0]) {
                    score += 2;
                    accept = true;
                    break;
                }
            }
            if (!accept) {
                foreach (QString s, m_connectiveElements) {
                    if (s[0] == elt[0]) {
                        score += 1;
                        accept = true;
                        break;
                    }
                }
            }
            if (!accept) {
                foreach (QString s, m_surnameElements) {
                    if (s[0] == elt[0]) {
                        // no score, but don't call it unmatched
                        accept = true;
                        break;
                    }
                }
            }
            if (!accept) ++unmatched;
            continue;
        }

        foreach (QString s, m_surnameElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            haveSurname = true;
            ++matched;
            continue;
        }

        foreach (QString s, m_forenameElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            ++matched;
            continue;
        }

        foreach (QString s, m_connectiveElements) {
            // treated much like initials
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist == 0) {
                score += 2;
                accept = true;
            } else if (dist == 1) {
                score += 1;
                accept = true;
            }
            if (accept) {
//                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            continue;
        }

        QString reduced = reduceName(elt);

        //!!! these don't seem to match often...
        if (m_reducedSurnameElements.contains(reduced)) {
            score += 10;
            haveSurname = true;
            ++matched;
            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        if (m_reducedForenameElements.contains(reduced)) {
            score += 7;
            ++matched;
            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        foreach (QString s, m_otherElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            ++matched;
            continue;
        }

        ++unmatched;
    }

//    if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
    score += fameBonus;

    if (matched == 0) {
        // Only initials/connectives were supplied
        if (unmatched == 0) {
            return float(score) / 20.f;
        } else {
            return 0;
        }
    }

    float fscore = score;
    float divisor = (matched + unmatched) * 20;

    if (!haveSurname) fscore /= 2;
    if (unmatched > 0) fscore /= 1.5;

    fscore /= divisor;

//    std::cerr << "[score " << score << " with divisor " << divisor << " for " << name().toStdString() << " adjusted to " << fscore << "]" << std::endl;

    return fscore;
}

/** Score partially-typed text t against this composer (full matching). */
float
Composer::matchTyping(QString t) const
{
    return doMatchTyping(t, false);
}

/**
 * Score partially-typed text t against this composer, skipping the
 * alias/connective elements, edit-distance and fuzzy-name steps.
 */
float
Composer::matchTypingQuick(QString t) const
{
    return doMatchTyping(t, true);
}

/**
 * Common implementation of matchTyping() and matchTypingQuick().
 *
 * Returns 0 for empty text.  An exact (case-insensitive) match with
 * name() scores 1 and a prefix match scores 0.8, each plus a bonus of
 * (number of pages / 400).  Otherwise each typed element contributes
 * the best score it achieves against any of our name elements, and
 * the page bonus is added if the total is non-zero.
 *
 * If quick is true, aliases and connectives are not considered and
 * the edit-distance and matchFuzzyName() steps are skipped.
 */
float
Composer::doMatchTyping(QString t, bool quick) const
{
    if (t == "") return 0;

    cacheNames();
    float fameBonus = m_pages.size() / 400.f;

    QString n = name().toLower();
    t = t.toLower();

    if (n == t) return 1.f + fameBonus;
    if (n.startsWith(t)) return 0.8f + fameBonus;

    // sl: surname elements only; nl: every element we match against
    QSet<QString> sl;
    QSet<QString> nl;
    foreach (QString s, m_surnameElements) {
        sl.insert(s.toLower());
        nl.insert(s.toLower());
    }
    foreach (QString s, m_forenameElements) {
        nl.insert(s.toLower());
    }
    if (!quick) {
        foreach (QString s, m_otherElements) {
            nl.insert(s.toLower());
        }
        foreach (QString s, m_connectiveElements) {
            nl.insert(s.toLower());
        }
    }

    static QRegExp sre("[\\., -]+");
    QStringList tl = t.split(sre, QString::SkipEmptyParts);

    float score = 0.f;

    if (nl.empty() || tl.empty()) return 0.f;

    for (int i = 0; i < tl.size(); ++i) {

        QString tel = tl[i];

        // NOTE: component is deliberately not reset between name
        // elements; the first-letter bonus below accumulates on
        // whatever value an earlier element left behind.  This is the
        // long-standing behaviour and is preserved here.
        float component = 0.f;
        float max = 0.f;

        for (QSet<QString>::const_iterator ni = nl.begin();
             ni != nl.end(); ++ni) {

            QString nel = ni->toLower();

            if (tel == nel) {
                if (tel.length() > 1) {
                    component = 0.2;
                } else {
                    component = 0.1;
                }
                if (sl.contains(nel)) component *= 1.5;
            } else if (nel.startsWith(tel)) {
                component = 0.1;
                if (sl.contains(nel)) component *= 1.5;
            } else {
                if (!quick && tel.length() > 3) {
                    EditDistance ed(EditDistance::RestrictedTransposition);
                    int dist = calculateThresholdedDistance
                        (ed, nel.left(tel.length()), tel);
                    if (dist >= 0) {
                        component = 0.08 - dist * 0.01;
                        if (sl.contains(nel)) component *= 1.5;
                    }
                }
                // In full mode the first-letter bonus applies only if
                // nothing has scored yet; in quick mode it always does
                if (quick || !(component > 0.f)) {
                    if (nel.startsWith(tel[0])) {
                        component += 0.02;
                    }
                }
            }

            if (component > max) max = component;
        }

        score += max;
    }

    if (!quick) {
        if (t.contains(" ")) {
            float fuzzyScore = matchFuzzyName(t);
            if (fuzzyScore >= 0.4f) {
                score += fuzzyScore / 3.f;
            }
        }
    }

    if (score > 0.f) score += fameBonus;
    return score;
}

/**
 * Merge the data of composer c into this one.  Names and aliases of c
 * that we do not already have become aliases; birth, death, gender,
 * period and remarks are copied only if we have none; nationality and
 * URI sets are united; and a new Document is created for each page of
 * c, with this composer as its topic.  A gender conflict is reported
 * on stderr and our own value is kept.
 *
 * c must not be null.
 */
void
Composer::mergeFrom(Composer *c)
{
    QSet<QString> allNames = c->aliases();
    allNames.insert(c->name());

    foreach (QString n, allNames) {
        if (n != m_name && !m_aliases.contains(n)) {
            m_aliases.insert(n);
            m_namesCached = false; // alias elements must be recomputed
        }
    }

    if (!m_birth) {
        if (c->birth()) {
            m_birth = new Birth(*c->birth());
            emit birthChanged(m_birth);
        }
    }

    if (!m_death) {
        if (c->death()) {
            m_death = new Death(*c->death());
            emit deathChanged(m_death);
        }
    }

    if (c->gender() != "") {
        if (m_gender == "") {
            m_gender = c->gender();
            emit genderChanged(m_gender);
        } else if (c->gender() != m_gender) {
            std::cerr << "WARNING: Composer::mergeFrom: Gender mismatch! Composer " << c->name().toStdString() << " has gender " << c->gender().toStdString() << ", but target composer " << m_name.toStdString() << " has gender " << m_gender.toStdString() << std::endl;
        }
    }

    m_nationality.unite(c->nationality());
    m_geonameURIs.unite(c->geonameURIs());
    m_otherURIs.unite(c->otherURIs());

    foreach (Document *d, c->pages()) {
        Document *dd = new Document;
        dd->setUri(d->uri());
        // Copy the site name from the source document d (previously
        // this read it back from the newly created dd, losing it)
        dd->setSiteName(d->siteName());
        dd->setTopic(this);
        m_pages.insert(dd);
    }

    if (m_period == "") m_period = c->period();
    if (m_remarks == "") m_remarks = c->remarks();

    emit nationalityChanged(m_nationality);
    emit geonameURIsChanged(m_geonameURIs);
    emit otherURIsChanged(m_otherURIs);
    emit pagesChanged(m_pages);
    emit periodChanged(m_period);
    emit remarksChanged(m_remarks);
    emit aliasesChanged(m_aliases);
}

/** Three-way comparison of two strings using QString's ordering. */
static int
compare(QString a, QString b)
{
    if (a < b) {
        return -1;
    } else if (a > b) {
        return 1;
    } else {
        return 0;
    }
}

/**
 * Three-way comparison of two catalogue/opus/number texts.  Returns a
 * negative value if a sorts before b, positive if after, 0 if equal.
 *
 * Texts that both start with a digit are compared by leading integer
 * value, then as strings.  If only one starts with a digit they are
 * compared as strings.  If neither does, both are split on space,
 * colon and hyphen and, when they have the same number (at least two)
 * of parts, compared by their first differing part.
 */
int
Work::compareCatalogueNumberTexts(QString a, QString b)
{
//    std::cout << "compare " << a.toStdString()
//              << " " << b.toStdString() << std::endl;

    if (a == b) return 0;

    // An empty text (e.g. an empty part produced by split() below) is
    // treated as non-numeric; check explicitly rather than indexing it
    const bool aNumeric = !a.isEmpty() && a.at(0).isDigit();
    const bool bNumeric = !b.isEmpty() && b.at(0).isDigit();

    if (aNumeric != bNumeric) {
        return compare(a, b);
    }

    if (!aNumeric) {
        QStringList al = a.split(QRegExp("[ :-]"));
        QStringList bl = b.split(QRegExp("[ :-]"));
        if (al.size() < 2 || bl.size() < 2 ||
            al.size() != bl.size()) {
            return compare(a, b);
        }
        for (int i = 0; i < al.size(); ++i) {
            if (al[i] != bl[i]) {
//                std::cout << "subcompare " << al[i].toStdString()
//                          << " " << bl[i].toStdString() << std::endl;
                return compareCatalogueNumberTexts(al[i], bl[i]);
            }
        }
        // All parts equal although the texts differ (they differ only
        // in their separators): fall through to the comparison below
    }

    // use atoi instead of toInt() because we want it to succeed even
    // if the text is not only an integer (e.g. 35a)
    int aoi = atoi(a.toLocal8Bit().data());
    int boi = atoi(b.toLocal8Bit().data());

//    std::cout << "aoi = " << aoi << ", boi = " << boi << std::endl;

    if (aoi == boi) return compare(a, b);
    else return aoi - boi;
}

/**
 * Ordering predicate for works: returns true if a sorts before b.  A
 * null pointer sorts before any non-null one.  Works are compared by
 * catalogue text, then opus, then (if they are part of the same
 * thing) number, in each case only when both have a value; then by
 * name; and finally by pointer value.
 */
bool
Work::Ordering::operator()(Work *a, Work *b)
{
    if (!a) return b != 0;
    if (!b) return false;

/*
    QString ao = a->catalogue();
    if (ao == "") ao = a->opus();

    QString bo = b->catalogue();
    if (bo == "") bo = b->opus();

    std::cout << "ao " << ao.toStdString() << ", bo " << bo.toStdString() << std::endl;
*/
    int c = 0;

    if (a->catalogue() != "" && b->catalogue() != "") {
        c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
    }
    if (c == 0 && a->opus() != "" && b->opus() != "") {
        c = compareCatalogueNumberTexts(a->opus(), b->opus());
    }
    if (c == 0 && a->partOf() == b->partOf() &&
        a->number() != "" && b->number() != "") {
        c = compareCatalogueNumberTexts(a->number(), b->number());
    }

    bool rv = false;

    if (c == 0) {
        if (a->name() == b->name()) rv = (a < b);
        else rv = (a->name() < b->name());
    } else {
        rv = (c < 0);
    }

//    std::cout << "result = " << rv << std::endl;

    return rv;
}

}