view common/Objects.cpp @ 48:5f23d5b29aaf

* Add original tags to AudioFile
author Chris Cannam
date Wed, 02 Jun 2010 17:29:47 +0100
parents 0033259c6772
children e0e12bd2978d
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/Debug.h>

#include <cstdlib>
#include <iostream>

#include "EditDistance.h"

#include <QHash> // to ensure correct qHash(const QString &) is found
#include <QFile>
#include <QFileInfo>
#include <QCryptographicHash>

namespace ClassicalData {

// Out-of-class definitions for the static data members of Form: a map
// from QString keys to Form pointers, and a mutex.
// NOTE(review): presumably the mutex guards access to the map -- confirm
// against the Form declaration in Objects.h.
QMap<QString, Form *> Form::m_map;
QMutex Form::m_mutex;

// Return the name of the composer of this composition: the name of
// the linked Composer object if one is set, otherwise the separately
// stored composer-name text.
QString
Composition::getComposerName() const
{
    if (!m_composer) {
        return m_cname;
    }
    return m_composer->name();
}

bool
Composer::matchDates(const Composer *b) const
{
    const Composer *a = this;
    
    if (a->birth() && b->birth()) {
        int ay = a->birth()->year(), by = b->birth()->year();
        if (ay < 1800 || // birth dates before 1700 tend to be vague!
            a->birth()->approximate() ||
            b->birth()->approximate()) {
            if (abs(ay - by) > 25) return false;
        } else {
            if (abs(ay - by) > 1) {
                return false;
            }
        }
    }
    if (a->death() && b->death()) {
        int ay = a->death()->year(), by = b->death()->year();
        if (a->death()->approximate() || b->death()->approximate()) {
            if (abs(ay - by) > 10) return false;
        } else if (ay < 1700) {
            if (abs(ay - by) > 25) return false;
        } else if (ay < 1800) {
            // cut a bit of slack, but not as much as for birth date
            if (abs(ay - by) > 10) return false;
        } else {
            if (abs(ay - by) > 1) return false;
        }
    }
    return true;
}

void
Composer::cacheNames() const
{
    if (m_namesCached) return;

    QString n = name();
    QStringList pl = n.split(", ");

    if (pl.size() == 1) {
        QStringList pl2;
        pl = n.split(' ');
        pl2.push_back(pl[pl.size()-1]);
        pl2.push_back("");
        for (int i = 0; i+1 < pl.size(); ++i) {
            if (i > 0) pl2[1] += " ";
            pl2[1] += pl[i];
        }
        pl = pl2;
    }

    m_surname = pl[0];

    n = "";
    for (int i = 1; i < pl.size(); ++i) {
        if (i > 1) n += ", ";
        n += pl[i];
    }

    m_forenames = n;

    m_surnameElements.clear();
    m_connectiveElements.clear();
    m_forenameElements.clear();
    m_otherElements.clear();
    m_reducedSurnameElements.clear();
    m_reducedForenameElements.clear();
    
    static QRegExp sre("[\\., -]+");

    foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_surnameElements.push_back(s.toLower());
            m_reducedSurnameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) {
        if (s[0].isUpper()) {
            m_forenameElements.push_back(s.toLower());
            m_reducedForenameElements.push_back(reduceName(s));
        } else if (s.length() > 1) {
            m_connectiveElements.push_back(s.toLower());
        }
    }

    foreach (QString a, m_aliases) {
        foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) {
            m_otherElements.push_back(ae.toLower());
        }
    }

    m_namesCached = true;
}

// Return the name in "Surname, Forenames" form, or just the surname
// if there are no forenames.  If caps is true the surname part is
// converted to upper case.
QString
Composer::getSortName(bool caps) const
{
    QString sortName = getSurname();
    const QString forenames = getForenames();

    if (caps) {
        sortName = sortName.toUpper();
    }
    if (forenames == "") {
        return sortName;
    }
    return sortName + ", " + forenames;
}

// Return the surname part of the composer's name, refreshing the
// cached name breakdown first if necessary.
QString
Composer::getSurname() const
{
    cacheNames();
    return m_surname;
}

// Return the forenames part of the composer's name, refreshing the
// cached name breakdown first if necessary.
QString
Composer::getForenames() const
{
    cacheNames();
    return m_forenames;
}

// Build a display string of the form "[place, ][c. ]birth-[place, ]death"
// from whichever of the birth and death records exist.  Returns an
// empty string if the composer has neither.
QString
Composer::getDisplayDates() const
{
    QString text;
    if (!birth() && !death()) return text;

    // If either date is approximate, a single "c. " marker is printed
    // in front of the first year that is shown.
    bool approxPending =
        (birth() && birth()->approximate()) ||
        (death() && death()->approximate());

    if (birth()) {
        if (birth()->place() != "") {
            text += birth()->place() + ", ";
        }
        if (approxPending) {
            text += "c. ";
            approxPending = false;
        }
        text += QString("%1").arg(birth()->year().toInt());
    }

    text += "-";

    if (death()) {
        if (death()->place() != "") {
            text += death()->place() + ", ";
        }
        if (approxPending) {
            text += "c. ";
            approxPending = false;
        }
        text += QString("%1").arg(death()->year().toInt());
    }

    return text;
}
   
// Produce a plainer rendering of the given text: each character that
// has a Unicode decomposition is replaced by the first character of
// that decomposition, sharp s (U+00DF) becomes "ss", and a handful of
// typographic apostrophe and dash characters become ' and -.
static QString
asciify(QString field)
{
    QString result;
    const int len = field.length();

    for (int pos = 0; pos < len; ++pos) {
        const QChar ch = field[pos];
        const QString decomposed = ch.decomposition();
        if (decomposed != "") {
            result += decomposed[0];
        } else if (ch == QChar(0x00DF)) {
            result += "ss";
        } else {
            result += ch;
        }
    }

    result.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe

    // UTF-8 encodings of the dash characters U+2012 to U+2015
    static const char *const dashes[] = {
        "\342\200\222",
        "\342\200\223",
        "\342\200\224",
        "\342\200\225"
    };
    const int dashCount = int(sizeof(dashes) / sizeof(dashes[0]));
    for (int d = 0; d < dashCount; ++d) {
        result.replace(QString::fromUtf8(dashes[d]), "-");
    }

    return result;
}

// Reduce a name to a crude key in which many common spelling variants
// collapse to the same text.  The name is asciified and lower-cased,
// then a fixed list of substitutions is applied strictly in order
// (later rules see the output of earlier ones, e.g. "x" -> "ks" is
// subsequently turned into "cs" by the "k" -> "c" rule).
QString
Composer::reduceName(QString name)
{
    static const char *const rules[][2] = {
        { "'",   ""   },
        { "x",   "ks" },
        { "y",   "i"  },
        { "ie",  "i"  },
        { "ei",  "i"  },
        { "ii",  "i"  },
        { "k",   "c"  },
        { "aa",  "a"  },
        { "a",   "e"  },
        { "ee",  "e"  },
        { "v",   "f"  },
        { "ph",  "f"  },
        { "ff",  "f"  },
        { "th",  "t"  },
        { "tch", "ch" },
        { "ch",  "c"  },
        { "cc",  "c"  },
        { "er",  "r"  }
    };
    const int ruleCount = int(sizeof(rules) / sizeof(rules[0]));

    QString key = asciify(name).toLower();
    for (int r = 0; r < ruleCount; ++r) {
        key.replace(rules[r][0], rules[r][1]);
    }
    return key;
}

// Test whether the name an, as found in some catalogue, plausibly
// refers to this composer.
//
// An exact match against the composer's name or one of the aliases
// succeeds immediately.  Otherwise both names are split into words on
// spaces and hyphens, and the test succeeds only if the surname word
// of an matches this composer's surname word (after reduceName) and
// more than one capitalised word of an is found among the reduced
// words of this composer's name.  The surname is taken to be the
// first word if the name contained a comma, the last word otherwise.
//
// Empty words (from doubled, leading or trailing separators) are
// skipped, as in the other name-splitting code in this file, so that
// stray whitespace cannot displace the surname or lead to indexing
// the first character of an empty string.  A name with no words at
// all never matches.
bool
Composer::matchCatalogueName(QString an) const
{
    // ew!

    QString bn = name();
    if (bn == an) return true;
    if (aliases().contains(an)) return true;

    int aSurnameIndex = 0, bSurnameIndex = 0;
    if (an.contains(",")) {
        an.replace(",", "");
    } else {
        aSurnameIndex = -1;
    }
    if (bn.contains(",")) {
        bn.replace(",", "");
    } else {
        bSurnameIndex = -1;
    }

    QStringList nl = an.split(QRegExp("[ -]"), QString::SkipEmptyParts);
    QStringList bnl = reduceName(bn).split(QRegExp("[ -]"),
                                           QString::SkipEmptyParts);
    if (nl.empty() || bnl.empty()) return false;

    int matchCount = 0;
    QString surnameMatch = "";
    if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
    if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;

    // "Della" is capitalised but is never accepted as a name word
    if (nl[aSurnameIndex][0].isUpper() &&
        nl[aSurnameIndex] != "Della" &&
        reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
        surnameMatch = nl[aSurnameIndex];
    }

    int tested = 0;
    foreach (QString elt, nl) {
        if (!elt[0].isUpper() || elt == "Della") continue;
        QString k = reduceName(elt);
        if (bnl.contains(k)) {
            ++matchCount;
        }
        // give up early if neither of the first two candidate words matched
        if (++tested == 2 && matchCount == 0) {
            return false;
        }
    }

    if (surnameMatch != "") {
        DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
        if (matchCount > 1) {
            return true;
        } else {
            DEBUG << "(but not enough else matched)" << endl;
            return false;
        }
    }
    return false;
}

// Score how well the free-text name n matches this composer.  A name
// identical to the composer's own scores 100 plus the number of pages
// known for the composer; anything else is lower-cased, split into
// words and passed to the QStringList overload.
float
Composer::matchFuzzyName(QString n) const
{
    if (n == name()) {
        const int fameBonus = m_pages.size();
        return 100 + fameBonus;
    }

    static QRegExp separators("[\\., -]+");
    const QStringList words =
        n.toLower().split(separators, QString::SkipEmptyParts);
    return matchFuzzyName(words);
}

// Return the edit distance between user and machine, or -1 if the two
// are considered too different.  The permitted distance is one third
// of the length of machine (rounded down); when that is zero, only an
// exact match is accepted.
static int
calculateThresholdedDistance(EditDistance &ed, const QString &user,
                             const QString &machine)
{
    const int threshold = machine.length()/3;

    if (threshold == 0) {
        return (user == machine) ? 0 : -1;
    }

    const int dist = ed.calculate(user, machine, threshold);
    return (dist > threshold) ? -1 : dist;
}

// Score how well a list of (lower-cased) name words matches this
// composer.  Each word is tested in turn against the cached surname,
// forename, connective, reduced-name and alias ("other") elements;
// the first category that accepts it determines its contribution.
// The number of pages known for the composer is added to the total
// as a "fame bonus".  Returns 0 for an empty list, or when nothing
// substantial matched and at least one word was unmatched.
float
Composer::matchFuzzyName(QStringList elements) const
{
    if (elements.empty()) return 0;

    cacheNames();
    int fameBonus = m_pages.size();
    
    EditDistance ed(EditDistance::RestrictedTransposition);
    
    int score = 0;
    bool haveSurname = false;

    // We aim to scale the eventual result such that a score of 1.0 or
    // more indicates near-certainty that this is a correct match
    // (i.e. that it is properly matched -- not that it is the only
    // possible match).  To achieve this score, we need to have
    // matched with reasonable confidence every element in the passed
    // elements list, and to have matched at least one of them to a
    // part of our surname.

    int matched = 0;
    int unmatched = 0;

    foreach (QString elt, elements) {

        bool accept = false;

        if (elt.length() == 1) {
            // An initial: compare it with the first letter of each
            // forename (2 points), then of each connective (1 point),
            // then of each surname element (no points, but accepted).
            // The score contribution here is low, but initials do not
            // count to matched which means the score can only enhance
            // whatever happens elsewhere.  They can however seriously
            // damage our score if unmatched, which is as it should be.
            foreach (QString s, m_forenameElements) {
                if (s[0] == elt[0]) {
                    score += 2;
                    accept = true;
                    break;
                }
            }
            if (!accept) {
                foreach (QString s, m_connectiveElements) {
                    if (s[0] == elt[0]) {
                        score += 1;
                        accept = true;
                        break;
                    }
                }
            }
            if (!accept) {
                foreach (QString s, m_surnameElements) {
                    if (s[0] == elt[0]) {
                        // no score, but don't call it unmatched
                        accept = true;
                        break;
                    }
                }
            }
            if (!accept) ++unmatched;
            continue;
        }
        
        // Surname elements: up to 22 points, less 2 per edit, less a
        // further 10 if the first letters differ
        foreach (QString s, m_surnameElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            haveSurname = true;
            ++matched;
            continue;
        }

        // Forename elements: scored in the same way as surnames, but
        // without setting haveSurname
        foreach (QString s, m_forenameElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            ++matched;
            continue;
        }

        // Connectives: small score, and not counted in matched
        foreach (QString s, m_connectiveElements) {
            // treated much like initials
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist == 0) {
                score += 2;
                accept = true;
            } else if (dist == 1) {
                score += 1;
                accept = true;
            }
            if (accept) {
//                std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            continue;
        }

        // Fall back to comparing reduced (spelling-normalised) forms
        QString reduced = reduceName(elt);

        //!!! these don't seem to match often...

        // NOTE(review): the two branches below write diagnostic text to
        // std::cerr on every reduced match; the equivalent output in
        // the other branches is commented out.
        if (m_reducedSurnameElements.contains(reduced)) {
            score += 10;
            haveSurname = true;
            ++matched;
            std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        if (m_reducedForenameElements.contains(reduced)) {
            score += 7;
            ++matched;
            std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
            continue;
        }

        // Words taken from the aliases: scored like surname elements,
        // but without setting haveSurname
        foreach (QString s, m_otherElements) {
            int dist = calculateThresholdedDistance(ed, elt, s);
            if (dist >= 0) {
                score += 22 - dist*2;
                if (elt[0] != s[0]) score -= 10;
                accept = true;
//                std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
                break;
            }
        }
        if (accept) {
            ++matched;
            continue;
        }

        ++unmatched;
    }

//    if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
    score += fameBonus;
        
    // Nothing counted as matched: only initials and/or connectives
    // were accepted, or nothing was.  Any unmatched word then means
    // no match at all.
    if (matched == 0) {
        if (unmatched == 0) {
            return float(score) / 20.f;
        } else {
            return 0;
        }
    }

    float fscore = score;
    float divisor = (matched + unmatched) * 20;

    // Penalise the absence of any surname match and the presence of
    // any unmatched word, then normalise by the number of words that
    // were counted as matched or unmatched
    if (!haveSurname) fscore /= 2;
    if (unmatched > 0) fscore /= 1.5;

    fscore /= divisor;

    if (matched > 0) {
//        std::cerr << "[score " << score << " with divisor " << divisor << " for " << name().toStdString() << " adjusted to " << fscore << "]" << std::endl;
    }

    return fscore;
}

// Score how well the typed text t matches this composer, using the
// full (non-quick) form of doMatchTyping.
float
Composer::matchTyping(QString t) const
{
    return doMatchTyping(t, false);
}

// Score how well the typed text t matches this composer, using the
// quick form of doMatchTyping (which leaves out the alias and
// connective words and all edit-distance and fuzzy-name tests).
float
Composer::matchTypingQuick(QString t) const
{
    return doMatchTyping(t, true);
}

// Score how well the (possibly partial) typed text t matches this
// composer.  Comparison is case-insensitive.  Returns 0 for empty
// text; 1 plus a fame bonus if t is the whole name; 0.8 plus the
// bonus if the name starts with t; otherwise the sum of the best
// per-word scores found for each typed word, plus the bonus if that
// sum is non-zero.  The fame bonus is the number of pages known for
// the composer divided by 400.
//
// If quick is true, only surname and forename words are considered,
// and the edit-distance and fuzzy-name tests are skipped.
float
Composer::doMatchTyping(QString t, bool quick) const
{
    if (t == "") return 0;

    cacheNames();
    float fameBonus = m_pages.size() / 400.f;
    
    QString n = name().toLower();
    t = t.toLower();

    if (n == t) return 1.f + fameBonus;
    if (n.startsWith(t)) return 0.8f + fameBonus;

    // sl: surname words only (these earn a 1.5x weighting below);
    // nl: every name word that a typed word may be compared against
    QSet<QString> sl;
    QSet<QString> nl;
    foreach (QString s, m_surnameElements) {
        sl.insert(s.toLower());
        nl.insert(s.toLower());
    }
    foreach (QString s, m_forenameElements) {
        nl.insert(s.toLower());
    }
    if (!quick) {
        foreach (QString s, m_otherElements) {
            nl.insert(s.toLower());
        }
        foreach (QString s, m_connectiveElements) {
            nl.insert(s.toLower());
        }
    }

    static QRegExp sre("[\\., -]+");
    QStringList tl = t.split(sre, QString::SkipEmptyParts);
    
    float score = 0.f;

    if (nl.empty() || tl.empty()) return 0.f;
    
    // NOTE(review): unmatched is never read or modified below
    int unmatched = 0;

    for (int i = 0; i < tl.size(); ++i) {

        QString tel = tl[i];
        // NOTE(review): component is initialised once per typed word,
        // not once per name word, so a value left by one name word is
        // still present when the next is examined (and the += 0.02
        // below accumulates).  The result can therefore depend on the
        // iteration order of the QSet -- confirm whether intended.
        float component = 0.f;
        float max = 0.f;

        for (QSet<QString>::const_iterator ni = nl.begin();
             ni != nl.end(); ++ni) {

            QString nel = ni->toLower();

            // Exact match with a name word
            if (tel == nel) {
                if (tel.length() > 1) {
                    component = 0.2;
                } else {
                    component = 0.1;
                }
                if (sl.contains(nel)) component *= 1.5;
                goto calculated;
            }

            // Typed word is a prefix of a name word
            if (nel.startsWith(tel)) {
                component = 0.1;
                if (sl.contains(nel)) component *= 1.5;
                goto calculated;
            }

            if (!quick) {
                // Approximate prefix: compare the typed word with the
                // same number of leading characters of the name word
                if (tel.length() > 3) {
                    EditDistance ed(EditDistance::RestrictedTransposition);
                    int dist = calculateThresholdedDistance
                        (ed, nel.left(tel.length()), tel);
                    if (dist >= 0) {
                        component = 0.08 - dist * 0.01;
                        if (sl.contains(nel)) component *= 1.5;
                    }
                }
                if (component > 0.f) goto calculated;
            }

            // Same first letter only
            if (nel.startsWith(tel[0])) {
                component += 0.02;
            }

        calculated:
            if (component > max) max = component;
        }

        score += max;
    }

    if (!quick) {
        // For multi-word input, add a third of the fuzzy name score
        // if that score is at least 0.4
        if (t.contains(" ")) {
            float fuzzyScore = matchFuzzyName(t);
            if (fuzzyScore >= 0.4f) {
                score += fuzzyScore / 3.f;
            }
        }
    }

    if (score > 0.f) score += fameBonus;
    return score;
}

// Merge the data held by composer c into this composer.
//
// c's name and aliases become aliases of this composer (unless already
// known); birth, death, gender, period and remarks are copied only
// where this composer has none; nationalities and URIs are united;
// and c's page documents are re-pointed at this composer and added to
// its pages.  A gender conflict is reported on stderr and otherwise
// ignored.  Change signals for the set-valued and text properties are
// emitted unconditionally at the end.
void
Composer::mergeFrom(Composer *c)
{
    QSet<QString> incomingNames = c->aliases();
    incomingNames.insert(c->name());

    foreach (QString candidate, incomingNames) {
        if (candidate == m_name || m_aliases.contains(candidate)) continue;
        m_aliases.insert(candidate);
        // the alias words are part of the cached name breakdown
        m_namesCached = false;
    }

    if (!m_birth && c->birth()) {
        m_birth = new Birth(*c->birth());
        emit birthChanged(m_birth);
    }

    if (!m_death && c->death()) {
        m_death = new Death(*c->death());
        emit deathChanged(m_death);
    }

    if (c->gender() == "") {
        // nothing to merge
    } else if (m_gender == "") {
        m_gender = c->gender();
        emit genderChanged(m_gender);
    } else if (c->gender() != m_gender) {
        std::cerr << "WARNING: Composer::mergeFrom: Gender mismatch! Composer " << c->name().toStdString() << " has gender " << c->gender().toStdString() << ", but target composer " << m_name.toStdString() << " has gender " << m_gender.toStdString() << std::endl;
    }

    m_nationality.unite(c->nationality());
    m_geonameURIs.unite(c->geonameURIs());
    m_otherURIs.unite(c->otherURIs());

    // The Document objects themselves are taken over rather than
    // copied: each is given this composer as its topic and inserted
    // into our page set.
    foreach (Document *page, c->pages()) {
        page->setTopic(this);
        m_pages.insert(page);
    }

    if (m_period == "") {
        m_period = c->period();
    }
    if (m_remarks == "") {
        m_remarks = c->remarks();
    }

    emit nationalityChanged(m_nationality);
    emit geonameURIsChanged(m_geonameURIs);
    emit otherURIsChanged(m_otherURIs);
    emit pagesChanged(m_pages);
    emit periodChanged(m_period);
    emit remarksChanged(m_remarks);
    emit aliasesChanged(m_aliases);
}

// Return the name of this work's composer, or an empty string if
// getComposer() yields no composer.
QString
Work::getComposerName() const
{
    Composer *composer = getComposer();
    if (!composer) {
        return "";
    }
    return composer->name();
}

// Three-way comparison of two strings using QString's own ordering:
// returns -1, 1 or 0 as a is less than, greater than or equal to b.
static int
compare(QString a, QString b)
{
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}

// Compare two catalogue / opus / number texts such as "BWV 1001" or
// "35a no 2" so that they sort numerically rather than textually.
//
// Returns zero if the texts are considered equal, a negative value if
// a sorts before b and a positive value if it sorts after.  Any
// leading non-digit prefix is discarded, the remainder is split into
// chunks at word boundaries and runs of non-digit characters, and the
// chunks are compared pairwise: first by integer value, then (if the
// integers agree) as text.  Texts with differing numbers of chunks
// are ordered by chunk count.
//
// Either text may be empty; the first character is only examined
// when there is one.
int
Work::compareCatalogueNumberTexts(QString a, QString b)
{
//    std::cout << "compare " << a.toStdString()
//              << " :: " << b.toStdString() << std::endl;

    if (a == b) return 0;

    if (!a.isEmpty() && !a[0].isDigit()) {
        a.replace(QRegExp("^[^\\d]+"), "");
    }

    if (!b.isEmpty() && !b[0].isDigit()) {
        b.replace(QRegExp("^[^\\d]+"), "");
    }

    QStringList al = a.split(QRegExp("\\b[^\\d]*"), QString::SkipEmptyParts);
    QStringList bl = b.split(QRegExp("\\b[^\\d]*"), QString::SkipEmptyParts);
    if (al.size() != bl.size()) return int(al.size()) - int(bl.size());

    for (int i = 0; i < al.size(); ++i) {
        if (al[i] != bl[i]) {
            // use atoi instead of toInt() because we want it to succeed even
            // if the text is not only an integer (e.g. 35a)
            int aoi = atoi(al[i].toLocal8Bit().data());
            int boi = atoi(bl[i].toLocal8Bit().data());
            if (aoi != boi) return aoi - boi;
            else return compare(al[i], bl[i]);
        }
    }
    return 0;
}

// Find everything in the given text that looks like a catalogue or
// opus number ("Op. 12", "BWV1001", "K 525", "Op 18 no 4", ...) and
// return the matched texts in order of appearance.  Where a match has
// a digit directly after a non-space, non-digit character, a space is
// inserted at the first such position (so "BWV1001" is returned as
// "BWV 1001").
//
// The search starts at the very beginning of the text and resumes
// after the end of each match, so a catalogue number that opens the
// text is found and matches never overlap.
//
// Diagnostic output is written to stderr.
QStringList
Work::extractCatalogueNumberTexts(QString text)
{
    //!!! test this
    QStringList results;
    std::cerr << "Work::extractCatalogueNumberTexts(" << text.toStdString() << ")" << std::endl;

    // Note we explicitly exclude "catalogue identifiers" beginning
    // with N, because we don't want to treat e.g. "Symphony No. 8"
    // as catalogue number 8.  What a fine hack.

    QRegExp catre("\\b([Oo]pu?s?|[A-MP-Z]+)\\.?[\\s_]*(\\d+\\w*)(\\s+[Nn]([OoRrBb]?|umber)(\\.\\s*|\\s+)(\\d+\\w*))?\\b");

    // The pattern needs at least one identifier character and one
    // digit, so matchedLength() is never zero and the loop always
    // advances.
    for (int ix = catre.indexIn(text, 0);
         ix >= 0;
         ix = catre.indexIn(text, ix + catre.matchedLength())) {

        std::cerr << "extractCatalogueNumberTexts: found match \"" << catre.cap(0).toStdString() << "\"" << std::endl;
        QString cat = catre.cap(0);
        // ensure space before digit
        for (int i = 0; i+1 < cat.length(); ++i) {
            if (!cat[i].isDigit() && !cat[i].isSpace() && cat[i+1].isDigit()) {
                QString spaced = cat.left(i+1) + " " + cat.right(cat.length()-i-1);
                std::cerr << "spaced out from " << cat.toStdString() << " to "
                          << spaced.toStdString() << std::endl;
                cat = spaced;
                break;
            }
        }
        results.push_back(cat);
    }
    return results;
}

// Ordering predicate for Work pointers: returns true if a should sort
// before b.  A null pointer sorts before any non-null one.  Otherwise
// works are compared by catalogue text, then opus text, then (only if
// both are parts of the same work) number text -- each only when both
// works have a non-empty value -- and finally by name, with the
// pointer values themselves as the last resort for equal names.
bool
Work::Ordering::operator()(Work *a, Work *b)
{
    if (!a) return (b != 0);
    if (!b) return false;

    int c = 0;

    if (a->catalogue() != "" && b->catalogue() != "") {
        c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
    }
    if (c == 0 && a->opus() != "" && b->opus() != "") {
        c = compareCatalogueNumberTexts(a->opus(), b->opus());
    }
    if (c == 0 &&
        a->partOf() == b->partOf() &&
        a->number() != "" && b->number() != "") {
        c = compareCatalogueNumberTexts(a->number(), b->number());
    }

    if (c != 0) {
        return (c < 0);
    }
    if (a->name() == b->name()) {
        return (a < b);
    }
    return (a->name() < b->name());
}

// Build a display name of the form "Name, Catalogue no. N".  The
// catalogue text is used if present, otherwise "Op. <opus>" if there
// is an opus; " no. <number>" is appended only when one of those is
// present.  If there is neither, the plain name is returned; if there
// is no name, just the catalogue/opus part.
QString
Work::getDisplayName() const
{
    QString reference;
    if (catalogue() != "") {
        reference = catalogue();
    } else if (opus() != "") {
        reference = QString("Op. %1").arg(opus());
    }

    if (reference == "") {
        return name();
    }

    if (number() != "") {
        reference = QString("%1 no. %2").arg(reference).arg(number());
    }

    if (name() == "") {
        return reference;
    }
    return QString("%1, %2").arg(name()).arg(reference);
}

// Construct an AudioFile with the given QObject parent and nothing
// else set explicitly (the body is empty).
AudioFile::AudioFile(QObject *parent) :
    QObject(parent)
{
}

// Construct an AudioFile describing the given file source.
//
// If the source is available, its local file is read and the hex
// SHA-1 digest of the contents is stored as the hash.  The file is
// hashed in fixed-size chunks rather than loaded whole, and if it
// cannot be opened or read the hash is left empty (with a warning on
// stderr) instead of being set to the digest of no data.
//
// The URI is the source location as given if the source is remote or
// the location already contains "://"; otherwise it is a file:// URI
// made from the location if that starts with '/', or from its
// canonical file path if not.
AudioFile::AudioFile(FileSource source, QObject *parent) :
    QObject(parent)
{
    if (source.isAvailable()) {
        QFile f(source.getLocalFilename());
        if (f.open(QIODevice::ReadOnly)) {
            QCryptographicHash hasher(QCryptographicHash::Sha1);
            bool readOk = true;
            while (!f.atEnd()) {
                QByteArray chunk = f.read(65536);
                if (chunk.isEmpty()) {
                    // nothing returned although not at end: read error
                    readOk = false;
                    break;
                }
                hasher.addData(chunk);
            }
            if (readOk) {
                m_hash = QString::fromAscii(hasher.result().toHex());
            } else {
                std::cerr << "AudioFile::AudioFile: error reading file \""
                          << source.getLocalFilename().toStdString()
                          << "\", not calculating hash" << std::endl;
            }
        } else {
            std::cerr << "AudioFile::AudioFile: failed to open file \""
                      << source.getLocalFilename().toStdString()
                      << "\", not calculating hash" << std::endl;
        }
    }

    QString location = source.getLocation();
    if (source.isRemote()) {
        m_uri = Dataquay::Uri(location);
    } else {
        if (location.contains("://")) {
            m_uri = Dataquay::Uri(location);
        } else if (location.startsWith('/')) {
            m_uri = Dataquay::Uri("file://" + location);
        } else {
            m_uri = Dataquay::Uri("file://" + QFileInfo(location).canonicalFilePath());
        }
    }

    std::cerr << "AudioFile::AudioFile: hash = " << m_hash.toStdString()
              << ", uri = " << m_uri.toString().toStdString() << std::endl;
}

// Destroy the AudioFile, deleting every tag object it holds.
AudioFile::~AudioFile()
{
    foreach (AudioFileTag *tag, m_tags) {
        delete tag;
    }
}

// Replace the set of tags with tt.  Tag objects held until now that
// do not appear in tt are deleted; those that do are kept.
void
AudioFile::setTags(QSet<AudioFileTag *> tt)
{
    foreach (AudioFileTag *existing, m_tags) {
        if (tt.contains(existing)) continue;
        delete existing;
    }
    m_tags = tt;
}

// Add the tag t to this file's set of tags.  The destructor and
// setTags() delete tags held in the set, so the object passed here
// will be deleted by this AudioFile.
void
AudioFile::addTag(AudioFileTag *t)
{
    m_tags.insert(t);
}

}