view common/Objects.cpp @ 10:d35e5d769c87 classical-rdf

* some experiments with composer name matching
author Chris Cannam
date Wed, 17 Feb 2010 19:26:48 +0000
parents 719a4f477098
children 98047b91b09d
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/Debug.h>

#include <cstdlib>
#include <iostream>

#include <QHash> // to ensure correct qHash(const QString &) is found

unsigned int
qHash(const QUrl &u)
{
    return qHash(u.toString());
}

namespace ClassicalData {

// Out-of-class definitions of Form's static data members.
// m_map associates a QString key with a Form pointer.
QMap<QString, Form *> Form::m_map;
// NOTE(review): presumably serialises access to m_map -- confirm
// against the Form declaration.
QMutex Form::m_mutex;

bool
Composer::matchDates(const Composer *b) const
{
    const Composer *a = this;
    
    if (a->birth() && b->birth()) {
        int ay = a->birth()->year(), by = b->birth()->year();
        if (ay < 1800 || // birth dates before 1700 tend to be vague!
            a->birth()->approximate() ||
            b->birth()->approximate()) {
            if (abs(ay - by) > 25) return false;
        } else {
            if (abs(ay - by) > 1) {
                return false;
            }
        }
    }
    if (a->death() && b->death()) {
        int ay = a->death()->year(), by = b->death()->year();
        if (a->death()->approximate() || b->death()->approximate()) {
            if (abs(ay - by) > 10) return false;
        } else if (ay < 1700) {
            if (abs(ay - by) > 25) return false;
        } else if (ay < 1800) {
            // cut a bit of slack, but not as much as for birth date
            if (abs(ay - by) > 10) return false;
        } else {
            if (abs(ay - by) > 1) return false;
        }
    }
    return true;
}

// Return the name in "Surname, Forenames" order for sorting.  If caps
// is true the surname is upper-cased.  When there are no forenames the
// surname is returned on its own, without a trailing separator.
QString
Composer::getSortName(bool caps) const
{
    QString sortName = getSurname();
    const QString given = getForenames();

    if (caps) {
        sortName = sortName.toUpper();
    }
    if (given == "") {
        return sortName;
    }
    return sortName + ", " + given;
}

// Return the surname part of name().
//
// If the name contains ", " it is taken to be in "Surname, Forenames"
// form and everything before the first ", " is returned.  Otherwise
// the surname is the text after the last space (the whole name if
// there is no space).
//
// This slices the string directly instead of splitting it into lists
// and rebuilding the forenames, which were then thrown away.
QString
Composer::getSurname() const
{
    const QString n = name();

    const int comma = n.indexOf(", ");
    if (comma >= 0) {
        return n.left(comma);
    }

    // lastIndexOf returns -1 when there is no space, so mid(0) then
    // yields the whole name.
    return n.mid(n.lastIndexOf(' ') + 1);
}

// Return the forenames part of name(), or an empty string if the name
// consists of a single word.
//
// If the name contains ", " it is taken to be in "Surname, Forenames"
// form and everything after the first ", " is returned.  Otherwise
// the forenames are the text before the last space.
//
// This slices the string directly instead of splitting it into lists
// and joining the pieces back together again.
QString
Composer::getForenames() const
{
    const QString n = name();

    const int comma = n.indexOf(", ");
    if (comma >= 0) {
        return n.mid(comma + 2); // skip the ", " itself
    }

    const int lastSpace = n.lastIndexOf(' ');
    if (lastSpace < 0) {
        // Single word: it is all surname.  (Must be tested explicitly,
        // as left() with a negative length returns the whole string.)
        return QString("");
    }
    return n.left(lastSpace);
}

// Build a display string of the form
// "[birthplace, ][c. ]birthyear-[deathplace, ][c. ]deathyear".
//
// Returns an empty string if neither birth nor death is known.  If
// only one of them is known, the other side of the "-" is left blank.
// The "c. " marker is written at most once, before the first year
// shown, when either date is flagged approximate.
QString
Composer::getDisplayDates() const
{
    if (!birth() && !death()) {
        return QString();
    }

    bool circaPending =
        (birth() && birth()->approximate()) ||
        (death() && death()->approximate());

    QString text;

    if (birth()) {
        if (birth()->place() != "") {
            text += birth()->place() + ", ";
        }
        if (circaPending) {
            text += "c. ";
            circaPending = false;
        }
        text += QString("%1").arg(birth()->year());
    }

    text += "-";

    if (death()) {
        if (death()->place() != "") {
            text += death()->place() + ", ";
        }
        if (circaPending) {
            text += "c. ";
            circaPending = false;
        }
        text += QString("%1").arg(death()->year());
    }

    return text;
}
   
// Produce a rough plain-letter approximation of field.
//
// Each character that has a Unicode decomposition is replaced by the
// first character of that decomposition; U+00DF (sharp s) becomes
// "ss"; everything else is copied unchanged.  Afterwards U+2019 is
// mapped to "'" and U+2012..U+2015 (dash-like characters) to "-".
static QString
asciify(QString field)
{
    QString out;
    const int len = field.length();

    for (int pos = 0; pos < len; ++pos) {
        const QChar ch = field.at(pos);
        const QString parts = ch.decomposition();
        if (parts != "") {
            out += parts[0];
        } else if (ch == QChar(0x00DF)) {
            out += "ss";
        } else {
            out += ch;
        }
    }

    out.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe

    // UTF-8 encodings of U+2012, U+2013, U+2014 and U+2015
    const char *const dashLike[] = {
        "\342\200\222",
        "\342\200\223",
        "\342\200\224",
        "\342\200\225"
    };
    const int dashCount = sizeof(dashLike) / sizeof(dashLike[0]);
    for (int k = 0; k < dashCount; ++k) {
        out.replace(QString::fromUtf8(dashLike[k]), "-");
    }

    return out;
}

// Reduce a name to a crude lower-case key so that spelling variants of
// the same name compare equal.
//
// The name is passed through asciify() and lower-cased, then the
// substitutions below are applied one after another.  The order
// matters: each rule sees the output of the rules before it (e.g. "x"
// becomes "ks" and then, via the "k" rule, "cs").
QString
Composer::reduceName(QString name)
{
    static const char *const rules[][2] = {
        { "'",   ""   },
        { "x",   "ks" },
        { "y",   "i"  },
        { "k",   "c"  },
        { "ch",  "c"  },
        { "cc",  "c"  },
        { "aa",  "a"  },
        { "v",   "f"  },
        { "ff",  "f"  },
        { "th",  "t"  },
        { "tch", "ch" },
        { "er",  "r"  }
    };
    const int ruleCount = sizeof(rules) / sizeof(rules[0]);

    QString key = asciify(name).toLower();
    for (int r = 0; r < ruleCount; ++r) {
        key.replace(rules[r][0], rules[r][1]);
    }
    return key;
}

// Test whether the catalogue-style name an plausibly refers to this
// composer.
//
// Returns true immediately if an equals name() or is one of aliases().
// Otherwise both names are split on spaces and hyphens and compared
// through reduceName():
//  - the surname token of an (the first token if an contained a comma,
//    otherwise the last) must be capitalised, must not be "Della", and
//    must reduce to the surname token of name();
//  - more than one capitalised token of an (again excluding "Della")
//    must be found among the reduced tokens of name().
// The search gives up early if the first two tokens tested both fail
// to match anything.
//
// Empty tokens (from an empty string or from adjacent separators such
// as " - ") are skipped rather than indexed, since reading character 0
// of an empty QString is out of range.
bool
Composer::matchCatalogueName(QString an) const
{
    QString bn = name();
    if (bn == an) return true;
    if (aliases().contains(an)) return true;

    // A comma marks "Surname, Forenames" order (surname is token 0);
    // without one the surname is taken to be the last token.
    int aSurnameIndex = 0, bSurnameIndex = 0;
    if (an.contains(",")) {
        an.replace(",", "");
    } else {
        aSurnameIndex = -1;
    }
    if (bn.contains(",")) {
        bn.replace(",", "");
    } else {
        bSurnameIndex = -1;
    }

    // an is split unreduced because the capitalisation of its tokens
    // is tested below; bn is reduced (and so lower-cased) as a whole.
    QStringList nl = an.split(QRegExp("[ -]"));
    QStringList bnl = reduceName(bn).split(QRegExp("[ -]"));
    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;

    const QString aSurname = nl[aSurnameIndex];
    QString surnameMatch = "";
    if (!aSurname.isEmpty() &&
        aSurname.at(0).isUpper() &&
        aSurname != "Della" &&
        reduceName(aSurname) == bnl[bSurnameIndex]) {
        surnameMatch = aSurname;
    }

    int matchCount = 0;
    int tested = 0;
    foreach (QString elt, nl) {
        if (elt.isEmpty() || !elt.at(0).isUpper() || elt == "Della") {
            continue;
        }
        QString k = reduceName(elt);
        if (bnl.contains(k)) {
            ++matchCount;
        }
        if (++tested == 2 && matchCount == 0) {
            return false;
        }
    }

    if (surnameMatch == "") {
        return false;
    }

    DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
    if (matchCount > 1) {
        return true;
    }
    DEBUG << "(but not enough else matched)" << endl;
    return false;
}

// Score how well the free-text name n matches this composer.  Higher
// is better; the score may be negative.
//
// An exact match with name() scores 100.  Otherwise n is split on
// spaces and each word contributes:
//   single letter (initial) matching the first letter of a forename  +3
//   single letter matching no forename                              -10
//   case-insensitive match with a surname word      +20 (+6 if that
//                                        surname word is not capitalised)
//   case-insensitive match with a forename word     +15 (+4 likewise)
//   reduceName() match with a capitalised surname word              +12
//   reduceName() match with a capitalised forename word             +10
//   same first letter as a capitalised forename word                 -4
//   nothing of the above                                             -7
//
// Empty words (produced by a composer with no forenames, or by
// leading, trailing or doubled spaces) are discarded up front: the
// code below indexes character 0 of every word, which is out of range
// for an empty QString, and an empty word should not affect the score.
int
Composer::matchFuzzyName(QString n) const
{
    if (n == name()) return 100;

    QStringList sl = getSurname().split(' ');
    QStringList fl = getForenames().split(' ');
    QStringList nl = n.split(' ');

    sl.removeAll(QString(""));
    fl.removeAll(QString(""));
    nl.removeAll(QString(""));

    int score = 0;

    foreach (QString element, nl) {

        bool matchedSomething = false;

        if (element.length() == 1) {
            // an initial: search forenames only ignoring connectives
            QChar c = element.at(0).toUpper();
            foreach (QString f, fl) {
                if (f.at(0) == c) {
                    score += 3;
                    matchedSomething = true;
                    break;
                }
            }
            if (!matchedSomething) {
                score -= 10;
            }
            continue;
        }

        const QString lowered = element.toLower();

        // exact (case-insensitive) surname word
        foreach (QString s, sl) {
            if (s.toLower() == lowered) {
                if (s.at(0).isUpper()) {
                    score += 20;
                } else {
                    score += 6;
                }
                matchedSomething = true;
                break;
            }
        }
        if (matchedSomething) continue;

        // exact (case-insensitive) forename word
        foreach (QString f, fl) {
            if (f.toLower() == lowered) {
                if (f.at(0).isUpper()) {
                    score += 15;
                } else {
                    score += 4;
                }
                matchedSomething = true;
                break;
            }
        }
        if (matchedSomething) continue;

        QString reduced = reduceName(element);

        // approximate match against capitalised surname words
        foreach (QString s, sl) {
            if (!s.at(0).isUpper()) continue;
            if (reduceName(s) == reduced) {
                score += 12;
                matchedSomething = true;
                break;
            }
        }
        if (matchedSomething) continue;

        // approximate match against capitalised forename words
        foreach (QString f, fl) {
            if (!f.at(0).isUpper()) continue;
            if (reduceName(f) == reduced) {
                score += 10;
                matchedSomething = true;
                break;
            }
        }
        if (matchedSomething) continue;

        foreach (QString f, fl) {
            // smaller penalty if we at least have the right first letter
            if (!f.at(0).isUpper()) continue;
            if (f.at(0) == element.at(0).toUpper()) {
                score -= 4;
                matchedSomething = true;
                break;
            }
        }
        if (matchedSomething) continue;

        score -= 7;
    }

    //!!! need to adjust for "fame" (more famous composers get a 1pt bonus)

    return score;
}

// Three-way comparison of two strings using QString's < and >
// operators: returns -1 if a sorts before b, 1 if it sorts after,
// and 0 otherwise.
static int
compare(QString a, QString b)
{
    return (a < b) ? -1 : ((a > b) ? 1 : 0);
}

// Compare two catalogue/opus/number texts such as "BWV 10" and
// "BWV 9", ordering embedded numbers numerically.
//
// Returns a negative value if a sorts before b, a positive value if it
// sorts after, and 0 if the two are equal.
//
//  - If exactly one of the texts starts with a digit, or either is
//    empty, they are compared as plain strings.
//  - If neither starts with a digit, both are split on space, ':' and
//    '-'.  If they have the same number (at least two) of pieces, the
//    first differing pair of pieces is compared recursively; otherwise
//    they are compared as plain strings.
//  - If both start with a digit, their leading integers are compared,
//    with plain string comparison as the tie-break (so "35" and "35a"
//    are still ordered).
int
Work::compareCatalogueNumberTexts(QString a, QString b)
{
    if (a == b) return 0;

    // Recursion on split pieces can hand us an empty string (e.g. from
    // adjacent separators); character 0 must not be read in that case.
    if (a.isEmpty() || b.isEmpty()) {
        return compare(a, b);
    }

    const bool aNumeric = a.at(0).isDigit();
    const bool bNumeric = b.at(0).isDigit();

    if (aNumeric != bNumeric) {
        return compare(a, b);
    }

    if (!aNumeric) {
        QStringList al = a.split(QRegExp("[ :-]"));
        QStringList bl = b.split(QRegExp("[ :-]"));
        if (al.size() < 2 || bl.size() < 2 ||
            al.size() != bl.size()) {
            return compare(a, b);
        }
        for (int i = 0; i < al.size(); ++i) {
            if (al[i] != bl[i]) {
                return compareCatalogueNumberTexts(al[i], bl[i]);
            }
        }
        // All pieces equal, so a and b differ only in their separators:
        // fall through.  atoi yields 0 for both and the final string
        // comparison decides.
    }

    // use atoi instead of toInt() because we want it to succeed even
    // if the text is not only an integer (e.g. 35a)
    int aoi = atoi(a.toLocal8Bit().data());
    int boi = atoi(b.toLocal8Bit().data());

    if (aoi == boi) return compare(a, b);
    return aoi - boi;
}

// Strict "sorts before" predicate for Work pointers.
//
// A null pointer sorts before any non-null one.  Otherwise the works
// are compared by catalogue text, then by opus text, then (only when
// both belong to the same partOf()) by number text; each of these is
// consulted only if both works have a non-empty value and the earlier
// criteria did not decide.  Remaining ties are broken by name, and
// finally by pointer value.
bool
Work::Ordering::operator()(Work *a, Work *b)
{
    if (!a) {
        return (b != 0);
    }
    if (!b) {
        return false;
    }

    int c = 0;

    if (a->catalogue() != "" && b->catalogue() != "") {
        c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
    }
    if (c == 0 && a->opus() != "" && b->opus() != "") {
        c = compareCatalogueNumberTexts(a->opus(), b->opus());
    }
    if (c == 0 && a->partOf() == b->partOf() &&
        a->number() != "" && b->number() != "") {
        c = compareCatalogueNumberTexts(a->number(), b->number());
    }

    if (c != 0) {
        return (c < 0);
    }
    if (a->name() == b->name()) {
        return (a < b);
    }
    return (a->name() < b->name());
}


}