view import/ImportWikipediaComposers.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaComposers.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source to import from and import it immediately.
 *
 * Logs the given URL through the debug stream and then hands it
 * straight to import(); anything import() throws propagates to the
 * caller.
 */
void
WikipediaComposersImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Build a Composer from the raw text fields scraped from one line of
 * a Wikipedia composer list.  Every field is trimmed before use.
 *
 * @param namefield  Wiki link text for the composer, with or without
 *        the surrounding "[[" "]]".  If it has the form "Page|Label",
 *        "Page" is used for the Wikipedia URL and "Label" as the name.
 * @param birthfield  Birth year as text; may be empty.  Overridden by
 *        a year found in datesfield, if any.
 * @param deathfield  Death year as text; may be empty.  Overridden by
 *        a year found in datesfield, if any.
 * @param datesfield  Free-form date text such as "(1750-1800)",
 *        "b. 1750" or "d. c. 1800"; may be empty.  Ignored entirely
 *        if it contains "fl.".
 * @param nationalityfield  Nationality, added to the composer if
 *        non-empty.
 * @param worksfield  Currently unused.
 * @param summaryfield  Free-text remarks; wiki markup is stripped and
 *        the first letter capitalised before being stored.
 *
 * @return A newly allocated Composer, with a newly allocated Document
 *         for its Wikipedia page attached.  The caller receives the
 *         Composer pointer.
 */
Composer *
addComposer(QString namefield, QString birthfield, QString deathfield,
            QString datesfield, QString nationalityfield, QString worksfield,
            QString summaryfield)
{
    namefield = namefield.trimmed();
    birthfield = birthfield.trimmed();
    deathfield = deathfield.trimmed();
    datesfield = datesfield.trimmed();
    nationalityfield = nationalityfield.trimmed();
    summaryfield = summaryfield.trimmed();

    // The works column is accepted for the caller's convenience but
    // nothing is done with it at present.
    (void)worksfield;

    Composer *composer = new Composer();

    // Strip link brackets; "Page|Label" links carry the page title
    // first and the displayed name second.
    QString name = namefield;
    name.replace("[[", "");
    name.replace("]]", "");
    QString pagename = name;

    if (name.contains('|')) {
        QStringList bits = name.split('|');
        pagename = bits[0];
        name = bits[1];
    }

    composer->setName(name);

    pagename.replace(" ", "_");
    QUrl url;
    url.setScheme("http");
    url.setHost("en.wikipedia.org");

    // NOTE(review): the page name is percent-encoded before being
    // passed to setPath(); confirm that this does not result in
    // double-encoding of '%' with the Qt version in use.
    url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));
    Document *d = new Document;
    d->setUri(Uri(url));
    d->setSiteName("Wikipedia");
    d->setTopic(composer);
    composer->addPage(d);

    if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment

    // Any hint of uncertainty in the date text marks both the birth
    // and the death as approximate.
    bool approx = (datesfield.contains("c.") || datesfield.contains("?")
                   || datesfield.contains("before") || datesfield.contains("after"));

    if (datesfield != "") {
        DEBUG << "dates for " << name << ": " << datesfield << endl;
        datesfield.replace("(", "");
        datesfield.replace(")", "");
        datesfield.replace(" ", "");
        // Normalise U+2012..U+2015 (figure dash, en dash, em dash,
        // horizontal bar) to a plain ASCII hyphen.
        datesfield.replace(QString::fromUtf8("\342\200\222"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\223"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\224"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\225"), "-");
        datesfield.replace("--", "-");
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Birth: a four-digit year (optionally "/NN" or "toNN")
        // followed by a hyphen, or else "b." plus a four-digit year.
        QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
        QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");

        if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
        else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);

        // Death: a hyphen or "d.", an optional qualifier, then a
        // four-digit year.
        QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
        QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");

        if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
        else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);

        DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
    }
    if (birthfield != "") {
        // toInt() yields 0 on failure; don't record a bogus year-0
        // birth when the field is not a plain number.
        bool ok = false;
        int year = birthfield.toInt(&ok);
        if (ok) {
            Birth *e = new Birth(year);
            e->setApproximate(approx);
            composer->setBirth(e);
        } else {
            DEBUG << "Ignoring unparseable birth year \"" << birthfield
                  << "\" for " << name << endl;
        }
    }
    if (deathfield != "") {
        // As for the birth year: skip rather than record year 0.
        bool ok = false;
        int year = deathfield.toInt(&ok);
        if (ok) {
            Death *e = new Death(year);
            e->setApproximate(approx);
            composer->setDeath(e);
        } else {
            DEBUG << "Ignoring unparseable death year \"" << deathfield
                  << "\" for " << name << endl;
        }
    }
    if (nationalityfield != "") {
        composer->addNationality(nationalityfield);
    }
    if (summaryfield != "") {
        summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
        // The summary may have consisted of nothing but the
        // "Composer," prefix, in which case there is no first
        // character left to capitalise.
        if (!summaryfield.isEmpty()) {
            summaryfield[0] = summaryfield[0].toUpper();
        }
        // Reduce piped links "[[Page|Label]]" to "Label", then drop
        // the remaining link brackets.
        summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
        summaryfield.replace("[[", "");
        summaryfield.replace("]]", "");
        summaryfield.replace("''", "\"");
        summaryfield.replace("&quot;", "'");
        // A summary that is nothing but one bracketed item is
        // discarded altogether.
        summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
        summaryfield.replace("[", "");
        summaryfield.replace("]", "");
        composer->setRemarks(summaryfield);
    }

    return composer;
}

/**
 * Import composers from a locally saved Wikipedia composer-list page.
 *
 * The source URL is converted to a local file path and the file is
 * read line by line as UTF-8 text.  A period name is derived from the
 * URL text where it follows one of the recognised "List_of_..." or
 * "..._composers" patterns.  Each line matching one of the known
 * table or list layouts is turned into a Composer via addComposer(),
 * given the period (if one was found), and appended to m_objects.
 *
 * @param source  URL of the file to read.
 * @throws std::exception if the file cannot be opened for reading.
 */
void
WikipediaComposersImporter::import(QUrl source)
{
    //!!! for now
    QFile file(source.toLocalFile());
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");

    const QString sourceText = source.toString();
    DEBUG << "source = " << sourceText << endl;

    // Work out the period from the page name: the first pattern (in
    // this order) that matches supplies it as its first capture.
    QRegExp periodPatterns[] = {
        QRegExp("List_of_([0-9][^_-]+[_-][^_-]+)_"),
        QRegExp("List_of_([^_-]+)[_-]era_"),
        QRegExp("([^_-]+)_composers")
    };
    const int periodPatternCount =
        int(sizeof(periodPatterns) / sizeof(periodPatterns[0]));

    QString period;
    for (int i = 0; i < periodPatternCount; ++i) {
        if (periodPatterns[i].indexIn(sourceText) >= 0) {
            period = periodPatterns[i].cap(1);
            break;
        }
    }
    DEBUG << "period = "<< period << endl;

    // Table form A (used for e.g. Romantic transitional composers):
    //   | Name || birth || death || nationality || summary || flags
    // i.e. five "||" separators.
    QRegExp tableA("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");

    // Table form B (used for e.g. 20th-century composers):
    //   | Name || birth-[death] || nationality || notable works || remarks
    // i.e. four "||" separators.  The name may contain a single "|"
    // if it is in double square brackets.
    QRegExp tableB("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");

    // Table form B with the final column omitted completely (as
    // happens).  Must be tried only after tableB has failed.
    QRegExp tableBShort("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");

    // List form:
    //   * [[Name]] [alias?] (stuff about dates)[,] notes
    QRegExp listForm("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");

    int found = 0;

    while (!stream.atEnd()) {

        const QString text = stream.readLine();
        Composer *composer = 0;

        if (tableA.indexIn(text) >= 0) {

            composer = addComposer(tableA.cap(1),  // name
                                   tableA.cap(2),  // birth
                                   tableA.cap(3),  // death
                                   "",
                                   tableA.cap(4),  // nationality
                                   "",
                                   tableA.cap(5)); // summary

        } else if (tableB.indexIn(text) >= 0) {

            // NOTE(review): the sixth capture (the remarks column)
            // is not passed on here -- confirm that is intended.
            composer = addComposer(tableB.cap(1),  // name
                                   tableB.cap(2),  // birth
                                   tableB.cap(3),  // death
                                   "",
                                   tableB.cap(4),  // nationality
                                   tableB.cap(5),  // works
                                   "");

        } else if (tableBShort.indexIn(text) >= 0) {

            composer = addComposer(tableBShort.cap(1),  // name
                                   tableBShort.cap(2),  // birth
                                   tableBShort.cap(3),  // death
                                   "",
                                   tableBShort.cap(4),  // nationality
                                   "",
                                   "");

        } else if (listForm.indexIn(text) >= 0) {

            composer = addComposer(listForm.cap(1),  // name
                                   "",
                                   "",
                                   listForm.cap(3),  // dates
                                   "",
                                   "",
                                   listForm.cap(5)); // notes

        } else if (text.startsWith("* ") || text.startsWith("| ") ||
                   text.startsWith("*[") || text.startsWith("|[")) {
            // Looks like a list or table entry, but none of the
            // layouts above recognised it.
            DEBUG << "Failed to match promising line: " << text << endl;
        }

        if (!composer) continue;

        if (!period.isEmpty()) composer->setPeriod(period);
        m_objects.push_back(composer);
        ++found;
    }

    DEBUG << "Found " << found << " things" << endl;
}


}