view import/Import.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents 7d8a6167febb
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/BasicStore.h>
#include <dataquay/RDFException.h>
#include <dataquay/objectmapper/ObjectStorer.h>
#include <dataquay/objectmapper/ObjectLoader.h>
#include <dataquay/objectmapper/ObjectBuilder.h>
#include <dataquay/objectmapper/TypeMapping.h>
#include <dataquay/objectmapper/ContainerBuilder.h>

#include "ImportClassicalComposersOrg.h"
#include "ImportClassicalDotNet.h"
#include "ImportClassicalArchives.h"
#include "ImportWikipediaComposers.h"
#include "ImportWikipediaWorks.h"
#include "ImportWikipediaWorksK.h"
#include "ImportWikipediaWorksList.h"
#include "ImportHoboken.h"

#include "TypeRegistrar.h"

#include <dataquay/Debug.h>

using namespace ClassicalData;
using namespace Dataquay;

#include <iostream>
#include <set>

// Index used while merging composers.  Each key is the result of
// Composer::reduceName() applied to a composer's name, to one of its
// aliases, or to its "birth-death" year string (see mergeComposer);
// each value is the set of composers registered under that key.
typedef QMap<QString, QSet<Composer *> > ComposerMap; // name -> composers

// Give the composer extra aliases: a handful of hand-picked alternative
// spellings, the "Forename Surname" form of a "Surname, Forename" name,
// Sergey/Sergei variants, and numeral forms of Sr./Jr.-style suffixes.
// A few special cases also replace the composer's name outright.
void
addMiscExpansions(Composer *c)
{
    // Working copy of the name as it was on entry; the renames below
    // change the composer but not this copy.
    QString n = c->name();

    DEBUG << "addMiscExpansions: n = " << n << endl;

    // lovely hard-coded special cases go here! some of these are
    // needed for works->composer assignments

    // Exact name -> extra alias.  A name may appear more than once;
    // its aliases are added in table order.
    struct FixedAlias {
        const char *name;
        const char *alias;
    };
    static const FixedAlias fixedAliases[] = {
        { "Balakirev, Milii", "Mily Balakirev" },
        { "Handel, George Frideric", "Handel, Georg Friedrich" },
        { "Handel, George Frideric", "Handel" },
        { "Prokofiev, Sergey", "Prokofieff, Sergei" },
        { "Prokofiev, Sergey", "Sergei Prokofieff" },
        { "Rossini, Gioacchino", "Rossini, Gioachino" },
        { "Rossini, Gioacchino", "Gioachino Rossini" },
        { "Edwards, Richard", "Edwardes, Richard" },
        { "Edwards, Richard", "Richard Edwardes" },
        { "Edwards, Richard", "Richard Edwards" },
        { "Rimsky-Korsakov, Nikolay Andreyevich", "Nikolai Rimsky-Korsakov" },
        { "Tchaikovsky, Pyotr Ilyich", "Tchaikovsky, Piotr Ilyitch" },
        { "Mercadante, Saverio Rafaele", "Mercadante, Giuseppe" },
        { "Mario Castelnuovo-Tedesco", "Castelnuovo Tedesco, Mario" },
        { "Mayr, Simon", "Mayr" }
    };
    const int fixedCount = int(sizeof(fixedAliases) / sizeof(fixedAliases[0]));
    for (int i = 0; i < fixedCount; ++i) {
        if (n == fixedAliases[i].name) {
            c->addAlias(fixedAliases[i].alias);
        }
    }

    // Cases that need a prefix match, a non-ASCII alias, or a rename.
    if (n.startsWith("Cui, C")) {
        c->addAlias(QString::fromUtf8("C\303\251sar Cui"));
    }
    if (n.startsWith("Piccinni, Nico")) {
        c->addAlias(n);
        c->setName(QString::fromUtf8("Piccinni, Niccol\303\262"));
    }
    if (n == "Wilhelm Stenhammar") {
        c->addAlias("Stenhammar, Vilhelm Eugene");
        c->setName("Stenhammar, Wilhelm");
        c->addAlias(n);
    }
    if (n == "Johann Wenzel Anton Stamitz") {
        c->addAlias(n);
        c->setName("Stamitz, Johann Wenzel Anton");
        c->addAlias("Stamitz, Jan Vaclav");
    }

    // Detach Sr./Jr. from a preceding comma so that the comma search
    // below only finds the surname/forename separator.
    n.replace(", Sr.", " Sr.");
    n.replace(", Jr.", " Jr.");

    // Turn "Surname, Forenames [suffix]" into "Forenames Surname [suffix]".
    const int comma = n.indexOf(", ");
    if (comma > 0 && comma + 2 < n.length()) {

        QString surname = n.left(comma);
        QString forenames = n.mid(comma + 2);

        // A trailing generation marker travels with the surname.
        QRegExp suffix("( (Sr\\.|Jr\\.|I|II))$");
        if (suffix.indexIn(forenames) >= 0) {
            surname += suffix.cap(1);
            forenames.chop(suffix.matchedLength());
        }
        n = forenames + " " + surname;
    }

    if (n != c->name()) c->addAlias(n);

    // Offer the other common transliteration of this forename.
    if (n.contains("Sergey")) {
        c->addAlias(QString(n).replace("Sergey", "Sergei"));
    } else if (n.contains("Sergei")) {
        c->addAlias(QString(n).replace("Sergei", "Sergey"));
    }

    // Replace a senior/junior marker with the matching Roman numeral.
    struct Generation {
        const char *pattern;
        const char *numeral;
    };
    static const Generation generations[] = {
        { "((, )?Sr\\.|Senior|\\(?the elder\\)?)", " I" },
        { "((, )?Jr\\.|Junior|\\(?the younger\\)?)", " II" }
    };
    for (int g = 0; g < 2; ++g) {
        QRegExp marker(generations[g].pattern, Qt::CaseInsensitive);
        if (marker.indexIn(n) < 0) continue;
        QString numbered = n;
        numbered.replace(marker.pos(0), marker.matchedLength(),
                         generations[g].numeral);
        numbered.replace("  ", " ");
        DEBUG << "addMiscExpansions: trying " << numbered << " for " << n << endl;
        c->addAlias(numbered);
    }

    // Finally add the form with parenthesised numerals unwrapped (this
    // alias is added even when nothing was replaced).
    QString unwrapped = n;
    unwrapped.replace("(I)", "I");
    unwrapped.replace("(II)", "II");
    unwrapped.replace("(III)", "III");
    c->addAlias(unwrapped);
}

// Guess whether c's name is a better "canonical form" of the composer's
// name than other's.  Returns true only when c's name should win.
bool
hasBetterName(Composer *c, Composer *other)
{
    const QString mine = c->name();
    const QString theirs = other->name();

    if (mine == theirs) return false;

    // A name sorted under "van" is the wrong choice of sort key, e.g.
    // for LvB it should be "Beethoven, Ludwig van" and not
    // "van Beethoven, Ludwig" -- so whichever side has it loses.
    if (mine.startsWith("van ")) return false;
    if (theirs.startsWith("van ")) return true;

    // A rather weak heuristic: the record with more aliases wins.
    const int myAliasCount = c->aliases().size();
    const int theirAliasCount = other->aliases().size();
    if (myAliasCount != theirAliasCount) {
        return myAliasCount > theirAliasCount;
    }

    // Another rather weak heuristic: prefer a name that contains a
    // comma over one that does not.
    return mine.contains(',') && !theirs.contains(',');
}

// Fold composer c into the index `composers`.
//
// If no composer already in the index matches c, then c itself is
// registered under the reduced form of its name, of each alias, and of
// its date string.  Otherwise c's name, aliases, pages, dates, gender,
// nationalities, geoname URIs, remarks and period are merged into one
// matching composer, and c is not added to the index.
void mergeComposer(Composer *c, ComposerMap &composers)
{
    QString name = c->name();

    // Every string under which c could be looked up: name + aliases...
    QSet<QString> allNames = c->aliases();
    allNames.insert(name);
    
    // ...plus a "birthyear-deathyear" (or "birthyear-") string.  This is
    // only built when a birth date is known.
    QString dates;
    if (c->birth()) {
        if (c->death()) {
            dates = QString("%1-%2").arg(c->birth()->year()).arg(c->death()->year());
        } else {
            dates = QString("%1-").arg(c->birth()->year());
        }
    }
    if (dates != "") {
        allNames.insert(dates);
    }

    QSet<Composer *> matches;

    // Collect the indexed composers that share a key with c and pass a
    // cross-check: a shared date key needs a compatible name, a shared
    // name key needs compatible dates.
    foreach (QString candidateName, allNames) {
        QString key = Composer::reduceName(candidateName);
        if (composers.contains(key)) {
            foreach (Composer *candidate, composers[key]) {
                if (candidateName == dates) {
                    // Found via the date key: accept only if the names
                    // are equal or match in either direction.
                    if (c->name() == candidate->name()) {
                        DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl;
                    } else if (!candidate->matchCatalogueName(c->name()) &&
                               !c->matchCatalogueName(candidate->name())) {
                        DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl;
                        continue;
                    } else {
                        DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl;
                    }
                } else {
                    // Found via a name or alias key: reject if
                    // matchDates() says the dates disagree.
                    if (!c->matchDates(candidate)) {
                        DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl;
                        continue;
                    }
                }
                matches.insert(candidate);
            }
        }
    }

    if (matches.empty()) {
        DEBUG << "mergeComposer: No existing composer with alias matching any alias of " << c->name() << ", adding" << endl;

        // With no dates to contradict a match, fall back to scanning
        // the whole index and take the first composer whose
        // matchCatalogueName() accepts c's name.
        if (!c->birth() && !c->death()) {
            DEBUG << "Composer has no dates, laboriously searching for all names" << endl;
            // laboriously look for fuzzy match across _all_ composers
            for (ComposerMap::iterator i = composers.begin();
                 i != composers.end(); ++i) {
                foreach (Composer *candidate, *i) {
                    if (candidate->matchCatalogueName(c->name())) {
                        DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl;
                        matches.insert(candidate);
                        break;
                    }
                }
                if (!matches.empty()) break;
            }
        }

        // Still nothing: c is a new composer.  Register it under every
        // one of its lookup strings and stop.
        if (matches.empty()) {
            foreach (QString candidateName, allNames) {
                QString key = Composer::reduceName(candidateName);
                composers[key].insert(c);
                DEBUG << "added for alias or date " << candidateName << endl;
            }
            return;
        }
    }

    if (matches.size() > 1) {
        DEBUG << "mergeComposer: More than one composer matches name and date(s) for " << c->name() << " -- something fishy here" << endl;
    }

    // Merge into the first match only; when there are several, which
    // one comes first is decided by QSet iteration order.
    Composer *other = *matches.begin();

    DEBUG << "mergeComposer: Merging " << c->name() << " with " << other->name() << endl;

    // Keep whichever name hasBetterName() prefers as the name of the
    // merged composer; the losing name becomes an alias.
    if (hasBetterName(c, other)) {
        other->addAlias(other->name());
        other->setName(c->name());
    } else {
        other->addAlias(c->name());
    }
    composers[Composer::reduceName(c->name())].insert(other);
    DEBUG << "linking from alias " << c->name() << endl;

    // Carry over the aliases that `other` does not have yet and index
    // `other` under each of them.
    foreach (QString alias, c->aliases()) {
        if (alias != other->name() && 
            !other->aliases().contains(alias)) {
            other->addAlias(alias);
            composers[Composer::reduceName(alias)].insert(other);
            DEBUG << "linking from alias " << alias << endl;
        }
    }
    
    // Move across the pages whose URI `other` does not already have,
    // re-pointing each moved page's topic at `other`.
    foreach (Document *d, c->pages()) {
        bool found = false;
        foreach (Document *dd, other->pages()) {
            if (d->uri() == dd->uri()) {
                found = true;
                break;
            }
        }
        if (!found) {
            d->setTopic(other);
            other->addPage(d);
        }
    }

    //!!! actually the "approximate" bits of the following are bogus;
    // a source reporting birth or death date as approx is probably
    // more accurate than one reporting an exact date

    // c's birth/death replaces other's only where other has none or has
    // one flagged as approximate.
    if (c->birth()) {
        if (!other->birth() || other->birth()->approximate()) {
            other->setBirth(c->birth());
        }
    }

    if (c->death()) {
        if (!other->death() || other->death()->approximate()) {
            other->setDeath(c->death());
        }
    }

    // Non-empty scalar fields of c overwrite other's; the nationality
    // and geoname entries are passed to other's add methods.
    if (c->gender() != "") other->setGender(c->gender());

    foreach (QString s, c->nationality()) {
        other->addNationality(s);
    }

    foreach (Uri s, c->geonameURIs()) {
        other->addGeonameURI(s);
    }

    if (c->remarks() != "") other->setRemarks(c->remarks());
    if (c->period() != "") other->setPeriod(c->period());

}
    
// Return an "ASCII version" of field for dumb search purposes: each
// character that has a Unicode decomposition is replaced by the first
// character of that decomposition, U+00DF (sharp s) becomes "ss", and
// a few typographic apostrophe/dash characters become ' and -.
QString
asciify(QString field)
{
    QString ascii;
    const int length = field.length();
    for (int i = 0; i < length; ++i) {
        const QChar ch = field.at(i);
        const QString decomposed = ch.decomposition();
        if (!decomposed.isEmpty()) {
            ascii += decomposed.at(0);
        } else if (ch == QChar(0x00DF)) {
            ascii += "ss";
        } else {
            ascii += ch;
        }
    }

    // U+2019 is the typographic apostrophe.
    ascii.replace(QChar(0x2019), QLatin1Char('\''));

    // U+2012 .. U+2015 are the dash characters handled here.
    for (ushort dash = 0x2012; dash <= 0x2015; ++dash) {
        ascii.replace(QChar(dash), QLatin1Char('-'));
    }
    return ascii;
}

// Add, as aliases of composer c, the asciified forms of its name and of
// each existing alias, wherever that form differs from the source text
// and is not already an alias.
void
asciify(Composer *c)
{
    const QString name = c->name();
    const QString plainName = asciify(name);
    if (plainName != name && !c->aliases().contains(plainName)) {
        c->addAlias(plainName);
    }

    foreach (const QString &alias, c->aliases()) {
        const QString plainAlias = asciify(alias);
        if (plainAlias == alias) continue;
        if (c->aliases().contains(plainAlias)) continue;
        c->addAlias(plainAlias);
    }
}

// Add, as aliases of work w, the asciified forms of its name and of
// each existing alias, wherever that form differs from the source text
// and is not already an alias.
void
asciify(Work *w)
{
    const QString name = w->name();
    const QString plainName = asciify(name);
    if (plainName != name && !w->aliases().contains(plainName)) {
        w->addAlias(plainName);
    }

    foreach (const QString &alias, w->aliases()) {
        const QString plainAlias = asciify(alias);
        if (plainAlias == alias) continue;
        if (w->aliases().contains(plainAlias)) continue;
        w->addAlias(plainAlias);
    }
}

// Derive a URI for composer c from its name and store it in c's "uri"
// property.  The local part is the lower-cased, asciified name with
// spaces and hyphens turned into underscores and every other character
// outside [a-zA-Z0-9_] removed; it is expanded by the store from
// ":composer/<local part>".  A "__<n>" suffix (n starting at 2) keeps
// local parts unique across calls.
void
assignUri(Store *s, Composer *c)
{
    // Local parts handed out so far, across all calls.
    static QSet<QString> convSet;

    QString slug = c->name();

    // A name without a comma has its last word moved to the front.
    if (!slug.contains(",")) {
        QStringList words = slug.split(" ");
        if (!words.empty()) {
            words.prepend(words.takeLast());
            slug = words.join(" ");
            DEBUG << "assignUri: " << c->name() << " -> " << slug << endl;
        }
    }

    slug = asciify(slug);
    slug.replace(" ", "_");
    slug.replace("-", "_");
    slug.replace(QRegExp("[^a-zA-Z0-9_-]"), "");
    slug = slug.toLower();

    // Disambiguate against earlier composers with the same local part.
    const QString base = slug;
    for (int suffix = 2; convSet.contains(slug); ++suffix) {
        slug = QString("%1__%2").arg(base).arg(suffix);
    }
    convSet.insert(slug);

    const QString shortForm = ":composer/" + slug;
    c->setProperty("uri", QVariant::fromValue(s->expand(shortForm)));
}

// Derive a URI for work w, composed by c, and store it in w's "uri"
// property as a Uri.
//
// The URI is the composer's URI with "composer/" replaced by "work/",
// followed by "/" and a local part.  If the composer's URI does not
// contain "composer/", the base-relative form ":work/<local part>" is
// used and expanded through store s.  The local part comes from the
// work's catalogue number, else its opus number (dots removed), else
// its lower-cased name, followed by "_no<number>" if the work has a
// number, then asciified and reduced to [a-zA-Z0-9_].
//
// The property is stored as a Uri, not a QString, because the rest of
// this file reads it back with value<Uri>(), and the composer overload
// of assignUri stores a Uri as well.
void
assignUri(Store *s, Work *w, Composer *c)
{
    QString pfx = c->property("uri").value<Uri>().toString();
    DEBUG << "pfx = " << pfx << endl;

    // True when the prefix is derived from the composer's own URI (which
    // came out of a Uri and needs no expansion); false when we fall back
    // to the base-relative ":work/" form.
    const bool underComposer = pfx.contains("composer/");
    if (!underComposer) pfx = ":work/";
    else {
        pfx.replace("composer/", "work/");
        pfx += "/";
    }

    // URI strings handed out so far, across all calls.
    static QSet<QString> convSet;

    QString conv = w->catalogue();
    if (conv == "") conv = w->opus();
    conv = conv.replace(".", "");
    bool hasOpus = (conv != "");
    if (conv == "") conv = w->name().toLower();
    if (w->number() != "") conv = conv + "_no" + w->number();
    conv = asciify(conv);
    conv.replace(" ", "_");
    conv.replace("-", "_");
    conv.replace(":", "_");
    conv.replace(QRegExp("[^a-zA-Z0-9_-]"), "");

    // pfx is never empty at this point (see above).
    conv = pfx + conv;

    // I think actually for works we want to merge duplicates rather than
    // assign them separate URIs, _unless_ they lack a viable opus number
    if (!hasOpus) {
        QString initial = conv;
        int i = 2;
        while (convSet.contains(conv)) {
            conv = QString("%1__%2").arg(initial).arg(i);
            i++;
        }
    }
    convSet.insert(conv);

    const Uri uri = underComposer ? Uri(conv) : s->expand(conv);
    w->setProperty("uri", QVariant::fromValue(uri));
}

// If s is an English Wikipedia page URL and object o has a non-empty
// "uri" property, add two triples about o to the store: mo:wikipedia
// pointing at the page, and owl:sameAs pointing at the same URL with
// the Wikipedia prefix replaced by the dbpedia resource prefix.
// Otherwise do nothing.
void
addDbpediaResource(Store *store, QObject *o, QString s)
{
    Uri subject = o->property("uri").value<Uri>();
    if (subject == Uri()) return;

    if (!s.startsWith("http://en.wikipedia.org/wiki/")) return;

    store->add(Triple(subject, "mo:wikipedia", Uri(s)));

    s.replace("http://en.wikipedia.org/wiki/",
              "http://dbpedia.org/resource/");

    store->add(Triple(subject, "owl:sameAs", Uri(s)));
}

int main(int argc, char **argv)
{
    qRegisterMetaType<ClassicalComposersOrgImporter *>
	("ClassicalData::ClassicalComposersOrgImporter*");
    qRegisterMetaType<ClassicalDotNetImporter *>
	("ClassicalData::ClassicalDotNetImporter*");
    qRegisterMetaType<ClassicalArchivesImporter *>
	("ClassicalData::ClassicalArchivesImporter*");
    qRegisterMetaType<WikipediaComposersImporter *>
	("ClassicalData::WikipediaComposersImporter*");
    qRegisterMetaType<WikipediaWorksImporter *>
	("ClassicalData::WikipediaWorksImporter*");
    qRegisterMetaType<WikipediaWorksKImporter *>
	("ClassicalData::WikipediaWorksKImporter*");
    qRegisterMetaType<WikipediaWorksListImporter *>
	("ClassicalData::WikipediaWorksListImporter*");
    qRegisterMetaType<HobokenImporter *>
	("ClassicalData::HobokenImporter*");

    ObjectBuilder::getInstance()->registerClass
	<ClassicalComposersOrgImporter>("ClassicalData::ClassicalComposersOrgImporter*");
    ObjectBuilder::getInstance()->registerClass
	<ClassicalDotNetImporter>("ClassicalData::ClassicalDotNetImporter*");
    ObjectBuilder::getInstance()->registerClass
	<ClassicalArchivesImporter>("ClassicalData::ClassicalArchivesImporter*");
    ObjectBuilder::getInstance()->registerClass
	<WikipediaComposersImporter>("ClassicalData::WikipediaComposersImporter*");
    ObjectBuilder::getInstance()->registerClass
	<WikipediaWorksImporter>("ClassicalData::WikipediaWorksImporter*");
    ObjectBuilder::getInstance()->registerClass
	<WikipediaWorksKImporter>("ClassicalData::WikipediaWorksKImporter*");
    ObjectBuilder::getInstance()->registerClass
	<WikipediaWorksListImporter>("ClassicalData::WikipediaWorksListImporter*");
    ObjectBuilder::getInstance()->registerClass
	<HobokenImporter>("ClassicalData::HobokenImporter*");

    BasicStore *store = BasicStore::load(QUrl("file:importers.ttl"));
    ObjectLoader loader(store);
    QObject *parentObject = loader.loadAllObjects(new QObject());
    
    BasicStore *outstore = new BasicStore();
    outstore->setBaseUri(Uri("http://dbtune.org/classical/resource/"));
    ObjectStorer storer(outstore);
    TypeMapping tm;

    TypeRegistrar::registerTypes();
    TypeRegistrar::addMappings(outstore, &tm);

    storer.setTypeMapping(tm);
    storer.setPropertyStorePolicy(ObjectStorer::StoreIfChanged);
    storer.setObjectStorePolicy(ObjectStorer::StoreAllObjects);
    storer.setBlankNodePolicy(ObjectStorer::NoBlankNodes);

    QList<Importer *> importers = parentObject->findChildren<Importer *>();
    std::cerr << "have " << importers.size() << " importers" << std::endl;

    ComposerMap composers;

    QList<Composer *> dated;
    QList<Composer *> undated;

    QList<Work *> works;
    QList<Composition *> compositions;
    QList<QObject *> other;
    
    foreach (Importer *importer, importers) {
        QObjectList objects = importer->getImportedObjects();
        foreach (QObject *o, objects) {
            Composer *c;
            if ((c = qobject_cast<Composer *>(o))) {
                addMiscExpansions(c);
                asciify(c);
                if (c->birth() || c->death()) dated.push_back(c);
                else undated.push_back(c);
                continue;
            }
            Work *w;
            if ((w = qobject_cast<Work *>(o))) {
                asciify(w); 
                works.push_back(w);
                continue;
            }
            Composition *cn;
            if ((cn = qobject_cast<Composition *>(o))) {
                compositions.push_back(cn);
                continue;
            }
        }
    }

    // get all the dated composers merged before attempting to match
    // the undated ones
    foreach (Composer *c, dated) {
        mergeComposer(c, composers);
    }
    foreach (Composer *c, undated) {
        mergeComposer(c, composers);
    }

    QObjectList toStore;

    QSet<Composer *> cset;
    for (ComposerMap::iterator i = composers.begin(); i != composers.end(); ++i) {
        foreach (Composer *c, i.value()) {
            if (!cset.contains(c)) {
                assignUri(outstore, c);
                toStore.push_back(c);
                cset.insert(c);
            }
            foreach (Document *d, c->pages()) {
                QString s = d->uri().toString();
                addDbpediaResource(outstore, c, s);
            }                        
        }
    }

    QSet<QString> storedUris;

    foreach (Work *w, works) {
        Composition *cn = w->composition();
        if (!cn) continue;
        if (!cn->composer()) {
            QString cname = cn->composerName();
            QString key = Composer::reduceName(cname);
            if (cname != "") {
                if (!composers.contains(key)) {
                    DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
                } else {
                    QSet<Composer *> cs = composers[key];
                    if (cs.empty()) {
                        DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
                    } else if (cs.size() > 1) {
                        DEBUG << "Failed to assign Composition to composer: "
                              << cs.size() << " composers match name " << cname << endl;
                    } else {
                        cn->setComposer(*cs.begin());
                    }
                }
            } else {
                DEBUG << "Failed to assign Composition to composer: composer name is empty" << endl;
            }
        }

        if (cn->composer()) {
            assignUri(outstore, w, cn->composer());
        }

        foreach (Document *d, w->pages()) {
            QString s = d->uri().toString();
            addDbpediaResource(outstore, w, s);
            if (!storedUris.contains(s)) {
                toStore.push_back(d);
                storedUris.insert(s);
            }
        }                        

        QString u = w->property("uri").value<Uri>().toString();
        if (u == "" || !storedUris.contains(u)) {
            toStore.push_back(w);
            if (u != "") storedUris.insert(u);
        }
    }

    try {
        storer.storeAllObjects(toStore);
        
    } catch (RDFException e) {
        std::cerr << "Caught RDF exception: " << e.what() << std::endl;
    }

    DEBUG << "Stored, now saving" << endl;

    outstore->save("imported.ttl");

    DEBUG << "Saved" << endl;


    QMultiMap<QString, Composer *> cmap;
    foreach (Composer *c, cset) {
        QString n = c->getSortName(true);
        cmap.insert(n, c);
    }

    std::cout << "Composers: " << cmap.size() << std::endl;

    for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
         i != cmap.end(); ++i) {

        QString n = i.key();
        Composer *c = i.value();
        
        std::cout << n.toStdString();
        
        QString d = c->getDisplayDates();
        if (d != "") std::cout << " (" << d.toStdString() << ")";
        std::cout << std::endl;
    }

    std::cout << std::endl;

    std::cout << "Works by composer:" << std::endl;

    for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
         i != cmap.end(); ++i) {

        QString n = i.key();
        Composer *c = i.value();
    
        std::set<Work *, Work::Ordering> wmap;
        foreach (Work *w, works) {
            Composition *cn = w->composition();
            if (!cn) continue;
            if (cn->composer() != c) continue;
            if (w->partOf()) continue;
            wmap.insert(w);
        }

        if (wmap.empty()) continue;
        
        std::cout << n.toStdString() << std::endl;

        foreach (Work *w, wmap) {
            std::cout << " * ";
            std::cout << w->name().toStdString();
            if (w->catalogue() != "") {
                std::cout << " [" << w->catalogue().toStdString() << "]";
            }
            if (w->opus() != "") {
                std::cout << " [op. " << w->opus().toStdString() << "]";
            }
            std::cout << std::endl;
            std::set<Work *, Work::Ordering> orderedParts;
            foreach (Work *ww, w->parts()) {
                orderedParts.insert(ww);
            }
            foreach (Work *ww, orderedParts) {
                std::cout << "    ";
                if (ww->number() != "") {
                    std::cout << ww->number().toStdString() << ". ";
                }
                std::cout << ww->name().toStdString();
                if (ww->catalogue() != "" && ww->catalogue() != w->catalogue()) {
                    std::cout << " [" << ww->catalogue().toStdString() << "]";
                }
                if (ww->opus() != "" && ww->opus() != w->opus()) {
                    std::cout << " [op. " << ww->opus().toStdString() << "]";
                }
                std::cout << std::endl;
            }
        }

        std::cout << std::endl;
    }

    delete outstore;

    DEBUG << "Done" << endl;


}