view common/Objects.cpp @ 14:06b2ea2ca577 classical-rdf

* use floating-point scores for name matching
author Chris Cannam
date Fri, 19 Feb 2010 12:41:23 +0000
parents a1d67e136c30
children 701702f8959a
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/Debug.h>

#include <cstdlib>
#include <iostream>

#include "EditDistance.h"

#include <QHash> // to ensure correct qHash(const QString &) is found

unsigned int
qHash(const QUrl &u)
{
    return qHash(u.toString());
}

namespace ClassicalData {

QMap<QString, Form *> Form::m_map;
QMutex Form::m_mutex;

bool
Composer::matchDates(const Composer *b) const
{
    const Composer *a = this;
    
    if (a->birth() && b->birth()) {
        int ay = a->birth()->year(), by = b->birth()->year();
        if (ay < 1800 || // birth dates before 1700 tend to be vague!
            a->birth()->approximate() ||
            b->birth()->approximate()) {
            if (abs(ay - by) > 25) return false;
        } else {
            if (abs(ay - by) > 1) {
                return false;
            }
        }
    }
    if (a->death() && b->death()) {
        int ay = a->death()->year(), by = b->death()->year();
        if (a->death()->approximate() || b->death()->approximate()) {
            if (abs(ay - by) > 10) return false;
        } else if (ay < 1700) {
            if (abs(ay - by) > 25) return false;
        } else if (ay < 1800) {
            // cut a bit of slack, but not as much as for birth date
            if (abs(ay - by) > 10) return false;
        } else {
            if (abs(ay - by) > 1) return false;
        }
    }
    return true;
}

void
Composer::cacheNames() const
{
    if (m_namesCached) return;

    QString n = name();
    QStringList pl = n.split(", ");

    if (pl.size() == 1) {
        QStringList pl2;
        pl = n.split(' ');
        pl2.push_back(pl[pl.size()-1]);
        pl2.push_back("");
        for (int i = 0; i+1 < pl.size(); ++i) {
            if (i > 0) pl2[1] += " ";
            pl2[1] += pl[i];
        }
        pl = pl2;
    }

    m_surname = pl[0];

    n = "";
    for (int i = 1; i < pl.size(); ++i) {
        if (i > 1) n += ", ";
        n += pl[i];
    }

    m_forenames = n;

    m_surnameElements.clear();
    m_connectiveElements.clear();
    m_forenameElements.clear();
    m_otherElements.clear();
    m_reducedSurnameElements.clear();
    m_reducedForenameElements.clear();
    
    static QRegExp sre("[\\., -]+");

    foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_surnameElements.push_back(s.toLower());
            m_reducedSurnameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_forenameElements.push_back(s.toLower());
            m_reducedForenameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString a, m_aliases) {
        foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) {
            m_otherElements.push_back(ae.toLower());
        }
    }

    m_namesCached = true;
}

QString
Composer::getSortName(bool caps) const
{
    QString surname = getSurname();
    QString forenames = getForenames();
    if (caps) surname = surname.toUpper();
    if (forenames != "") return surname + ", " + forenames;
    else return surname;
}

QString
Composer::getSurname() const
{
    cacheNames();
    return m_surname;
}

QString
Composer::getForenames() const
{
    cacheNames();
    return m_forenames;
}

QString
Composer::getDisplayDates() const
{
    QString s;
    if (birth() || death()) {
        bool showApprox = false;
        if ((birth() && birth()->approximate()) ||
            (death() && death()->approximate())) {
            showApprox = true;
        }
        if (birth()) {
            if (birth()->place() != "") {
                s += birth()->place() + ", ";
            }
            if (showApprox) {
                s += "c. ";
                showApprox = false;
            }
            s += QString("%1").arg(birth()->year());
        }
        s += "-";
        if (death()) {
            if (death()->place() != "") {
                s += death()->place() + ", ";
            }
            if (showApprox) {
                s += "c. ";
                showApprox = false;
            }
            s += QString("%1").arg(death()->year());
        }
    }

    return s;
}
   
static QString
asciify(QString field)
{
    QString ascii;
    for (int i = 0; i < field.length(); ++i) {
        QString dc = field[i].decomposition();
        if (dc != "") ascii += dc[0];
        else if (field[i] == QChar(0x00DF)) {
            ascii += "ss";
        } else {
            ascii += field[i];
        }
    }
    ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe
    ascii.replace(QString::fromUtf8("\342\200\222"), "-");
    ascii.replace(QString::fromUtf8("\342\200\223"), "-");
    ascii.replace(QString::fromUtf8("\342\200\224"), "-");
    ascii.replace(QString::fromUtf8("\342\200\225"), "-");
    return ascii;
}

QString
Composer::reduceName(QString name)
{
    QString key = asciify(name).toLower()
        .replace("'", "")
        .replace("x", "ks")
        .replace("y", "i")
        .replace("k", "c")
        .replace("ch", "c")
        .replace("cc", "c")
        .replace("aa", "a")
        .replace("v", "f")
        .replace("ff", "f")
        .replace("th", "t")
        .replace("tch", "ch")
        .replace("er", "r");
    return key;
}

bool
Composer::matchCatalogueName(QString an) const
{
    // ew!

    QString bn = name();
    if (bn == an) return true;
    if (aliases().contains(an)) return true;

    int aSurnameIndex = 0, bSurnameIndex = 0;
    if (an.contains(",")) {
        an.replace(",", "");
    } else {
        aSurnameIndex = -1;
    }
    if (bn.contains(",")) {
        bn.replace(",", "");
    } else {
        bSurnameIndex = -1;
    }
    QStringList nl = an.split(QRegExp("[ -]"));
    QStringList bnl = reduceName(bn).split(QRegExp("[ -]"));
    int matchCount = 0;
    QString surnameMatch = "";
    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
    if (nl[aSurnameIndex][0].isUpper() &&
        nl[aSurnameIndex] != "Della" &&
        reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
        surnameMatch = nl[aSurnameIndex];
    }
    int tested = 0;
    foreach (QString elt, nl) {
        if (!elt[0].isUpper() || elt == "Della") continue;
        QString k = reduceName(elt);
        if (bnl.contains(k)) {
            ++matchCount;
        }
        if (++tested == 2 && matchCount == 0) {
            return false;
        }
    }
    if (surnameMatch != "") {
        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
        if (matchCount > 1) {
            return true;
        } else {
            DEBUG << "(but not enough else matched)" << endl;
            return false;
        }
    }
    return false;
}    

float
Composer::matchFuzzyName(QString n) const
{
    int fameBonus = m_pages.size();
    if (n == name()) return 100 + fameBonus;
    static QRegExp sre("[\\., -]+");
    return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts));
}

float
Composer::matchFuzzyName(QStringList elements) const
{
    if (elements.empty()) return 0;

    cacheNames();
    int fameBonus = m_pages.size();
    
    EditDistance ed(EditDistance::RestrictedTransposition, 1, 1, true);
    
    int score = 0;

    foreach (QString elt, elements) {

        bool accept = false;
        int initial = 0;

        if (elt.length() == 1) {
            // an initial: search forenames only ignoring connectives
            foreach (QString s, m_forenameElements) {
                if (s[0] == elt[0]) {
                    score += 3;
//                    std::cerr << "[initial: " << s.toStdString() << "]" << std::endl;
                    accept = true;
                    break;
                }
            }
            if (!accept) {
                score -= 10;
            }
            continue;
        }
        
        foreach (QString s, m_surnameElements) {
            if (s[0] == elt[0]) initial = 1;
            accept = true;
            int dist = ed.calculate(elt, s, s.length()/3);
            if (dist == 0) score += 20;
            else if (dist*3 <= s.length()) { score += 14 - dist + initial; }
            else if (elt.length() > 3 && s.startsWith(elt)) score += 5;
            else accept = false;
            if (accept) {
//                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) continue;

        foreach (QString s, m_forenameElements) {
            if (s[0] == elt[0]) initial = 1;
            accept = true;
            int dist = ed.calculate(elt, s, s.length()/3);
            if (dist == 0) score += 15;
            else if (dist*3 <= s.length()) score += 9 - dist + initial;
            else accept = false;
            if (accept) {
//                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) continue;

        foreach (QString s, m_connectiveElements) {
            accept = true;
            int dist = ed.calculate(elt, s, 1);
            if (dist == 0) score += 4;
            else if (dist == 1) score += 2;
            else accept = false; 
            if (accept) {
//                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) continue;

        if (m_reducedSurnameElements.contains(elt)) {
            score += 11;
//            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        if (m_reducedForenameElements.contains(elt)) {
            score += 7;
//            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        foreach (QString s, m_otherElements) {
            accept = true;
            int dist = ed.calculate(elt, s, s.length()/3);
            if (dist == 0) score += 12;
            else if (dist*3 <= s.length()) score += 8 - dist;
            else accept = false;
            if (accept) {
//                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) continue;

        if (initial) score -= 5;
        else score -= 10;
    }
        
    if (score > 0) {
//        if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
        score += fameBonus;
    }
//    if (score > 1) std::cerr << "[score " << score << " for " << name().toStdString() << "]" << std::endl;
    return float(score) / (elements.size() * 10);
}

static int
compare(QString a, QString b)
{
    if (a < b) {
        return -1;
    } else if (a > b) {
        return 1;
    } else {
        return 0;
    }
}

int
Work::compareCatalogueNumberTexts(QString a, QString b)
{
//    std::cout << "compare " << a.toStdString()
//              << " " << b.toStdString() << std::endl;

    if (a == b) return 0;

    if (!a[0].isDigit()) {
        if (!b[0].isDigit()) {
            QStringList al = a.split(QRegExp("[ :-]"));
            QStringList bl = b.split(QRegExp("[ :-]"));
            if (al.size() < 2 || bl.size() < 2 ||
                al.size() != bl.size()) {
                if (a < b) return -1;
                else if (a > b) return 1;
                else return 0;
            }
            for (int i = 0; i < al.size(); ++i) {
                if (al[i] != bl[i]) {
//                    std::cout << "subcompare " << al[i].toStdString()
//                              << " " << bl[i].toStdString() << std::endl;
                    return compareCatalogueNumberTexts(al[i], bl[i]);
                }
            }
        } else {
            return compare(a, b);
        }
    } else {
        if (!b[0].isDigit()) {
            return compare(a, b);
        }
    }
    
    // use atoi instead of toInt() because we want it to succeed even
    // if the text is not only an integer (e.g. 35a)
    int aoi = atoi(a.toLocal8Bit().data());
    int boi = atoi(b.toLocal8Bit().data());

//    std::cout << "aoi = " << aoi << ", boi = " << boi << std::endl;

    if (aoi == boi) return compare(a, b);
    else return aoi - boi;
}

bool
Work::Ordering::operator()(Work *a, Work *b)
{
    if (!a) {
        if (!b) return false;
        else return true;
    } else {
        if (!b) {
            return false;
        }
    }
/*
    QString ao = a->catalogue();
    if (ao == "") ao = a->opus();

    QString bo = b->catalogue();
    if (bo == "") bo = b->opus();

    std::cout << "ao " << ao.toStdString() << ", bo " << bo.toStdString() << std::endl;
*/
    int c = 0;
    if (a->catalogue() != "" && b->catalogue() != "") {
        c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
    }
    if (c == 0 && a->opus() != "" && b->opus() != "") {
        c = compareCatalogueNumberTexts(a->opus(), b->opus());
    }
    if (c == 0 && a->partOf() == b->partOf() &&
        a->number() != "" && b->number() != "") {
        c = compareCatalogueNumberTexts(a->number(), b->number());
    }

    bool rv = false;

    if (c == 0) {
        if (a->name() == b->name()) rv = (a < b);
        else rv = (a->name() < b->name());
    } else {
        rv = (c < 0);
    }

//    std::cout << "result = " << rv << std::endl;
    return rv;
}


}