view import/ImportWikipediaWorks.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaWorks.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the location to import from and run the import straight away.
 *
 * The source URL is written to the debug stream and then passed
 * unchanged to import(); any exception thrown by import() propagates
 * to the caller.
 */
void
WikipediaWorksImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaWorksImporter::setSource: " << source << endl;

    // Importing is done eagerly: there is no separate "start" step.
    this->import(source);
}

/**
 * Reduce a raw wiki-markup field to plain display text.
 *
 * Leading wiki links ("[[target]]" or "[[target|label]]", optionally
 * preceded by non-alphabetic characters) are replaced by their visible
 * text.  The target of the first such link encountered is stored in
 * linkText, but only if linkText is still empty on entry, so that a
 * caller can pass the same linkText through several calls and keep
 * the first link found.
 *
 * After link handling the text is trimmed and stripped of remaining
 * square brackets, {{...}} templates, escaped &lt;...&gt; tags,
 * leading asterisks, trailing full stops / commas and one enclosing
 * pair of parentheses.
 *
 * \param field     the raw field text (taken by value and modified)
 * \param linkText  in/out: receives the first link target, if unset
 * \return the cleaned-up text
 */
QString
sanitise(QString field, QString &linkText)
{
    // UTF-8 encodings of U+2012..U+2015 (the various typographic
    // dashes); all are normalised to a plain ASCII hyphen.
    static const char *const dashes[] = {
        "\342\200\222",
        "\342\200\223",
        "\342\200\224",
        "\342\200\225"
    };
    for (unsigned int i = 0; i < sizeof(dashes) / sizeof(dashes[0]); ++i) {
        field.replace(QString::fromUtf8(dashes[i]), "-");
    }

    // "[[target|label]]" and "[[target]]", each anchored to the start
    // of the field (allowing only non-letters in front).
    QRegExp pipedLink("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
    QRegExp plainLink("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");

    // Keep unwrapping leading links until neither form matches.  The
    // piped form is always tried first on every pass.
    for (;;) {
        int at = pipedLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = pipedLink.cap(2);
            }
            const QString visible = pipedLink.cap(1) + pipedLink.cap(3);
            field.replace(at, pipedLink.matchedLength(), visible);
            continue;
        }

        at = plainLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = plainLink.cap(2);
            }
            const QString visible = plainLink.cap(1) + plainLink.cap(2);
            field.replace(at, plainLink.matchedLength(), visible);
            continue;
        }

        break;
    }

    field = field.trimmed();

    // Remove leftover markup: stray brackets, templates, quoting
    // conventions, escaped tags and leading list asterisks.
    field.remove("[");
    field.remove("]");
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
    field.replace("''", "\"");
    field.replace("&quot;", "\"");
    field.replace(QRegExp("&lt;[^&]*&gt;"), "");
    field.replace(QRegExp("^\\**"), "");

    // Drop any run of trailing full stops and commas.
    while (field.endsWith(".") || field.endsWith(",")) {
        field.chop(1);
    }

    // A field that is wholly parenthesised loses its outer parentheses.
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }

    // Unwrapping may have exposed further leading asterisks.
    field.replace(QRegExp("^\\**"), "");

    // A lone parenthesis carries no information.
    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");

    return field;
}

/**
 * Return the first run of four consecutive decimal digits found in
 * datefield, or an empty string if there is none.
 */
QString
extractYear(QString datefield)
{
    QRegExp fourDigits("[0-9]{4}");

    if (fourDigits.indexIn(datefield) < 0) {
        return "";
    }

    return fourDigits.cap(0);
}

/**
 * Pull a musical key out of a title of the form "... in <key> ...".
 *
 * The key is a note letter A-H, optionally followed (after a space or
 * hyphen) by a lower-case word, and then " major" or " minor" --
 * for example "C major" or "E flat minor".  Returns the key text
 * without the leading "in ", or an empty string if the title contains
 * no such phrase.
 */
QString
extractKey(QString titlefield)
{
    QRegExp keyPhrase("in ([A-H]([ -][a-z]+)? (major|minor))");

    if (keyPhrase.indexIn(titlefield) < 0) {
        return "";
    }

    return keyPhrase.cap(1);
}

/**
 * Build a Work (and, where needed, its Composition and Wikipedia
 * Document) from the raw text fields scraped for one catalogue entry.
 *
 * Every text field is passed through sanitise().  The first wiki link
 * target found in the opus, catalogue, number or title field (in that
 * order) is turned into an en.wikipedia.org page attached to the work.
 *
 * \param composerName  composer to record on a newly created Composition
 * \param opfield       raw opus text; "Opus "/"Op. "/"Op " are stripped
 * \param kfield        raw catalogue number text
 * \param numfield      raw number-within-opus text; "No. "/"No " stripped
 * \param titlefield    raw title; also searched for a key ("in X major")
 * \param datefield     raw date text; the first four-digit run is the year
 * \param placefield    raw place of composition
 * \param remarksfield  raw remarks; if empty, remarks may instead be
 *                      split off the end of a quoted title
 * \param main          parent work if this is a part of one, else 0
 * \return a newly allocated Work; it is not deleted here, so the caller
 *         is responsible for it
 *
 * The work shares its parent's Composition when there is a parent with
 * a composition and no differing year was found; otherwise it gets a
 * new Composition of its own.
 */
Work *
makeWork(QString composerName, QString opfield, QString kfield, 
         QString numfield, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    // Accumulates the first wiki link target seen across the fields.
    QString linkText;

    Work *w = new Work;

    QString op = sanitise(opfield, linkText);
    if (op != "") {
        op.replace("Opus ", "");
        op.replace("Op. ", "");
        op.replace("Op ", "");
        w->setOpus(op);
    }

    QString k = sanitise(kfield, linkText);
    if (k != "") {
        w->setCatalogue(k);
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    }

    // The key is looked for in the raw title, before markup removal.
    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    QString title = sanitise(titlefield, linkText);
    if (linkText != "") {
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        // NOTE(review): the path is percent-encoded here and then
        // given to setPath(); depending on the Qt version setPath()
        // may treat '%' as a literal character and encode it again.
        // Confirm the URIs produced for link targets that contain
        // characters needing escaping.
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    // "Quoted title" - explanation: keep the quoted part as the
    // title and record the explanation as an alias.
    QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
    if (explicationRE.indexIn(title) >= 0) {
        w->addAlias(explicationRE.cap(2));
        title = explicationRE.cap(1);
    }

    // With no explicit remarks, text following a quoted title
    // ("... for <forces>", or anything after a comma) becomes the
    // remarks.  The "for" form is tried first.
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }
    
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    w->setName(title);
    
    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
    }

    // Decide once which Composition this work belongs to.  The parent
    // may have no composition, so it must be checked before use, and
    // a part written in a different year from its parent must not also
    // be registered with the parent's composition.
    const bool parentHasComposition = (main && main->composition());
    const bool needsOwnComposition =
        !parentHasComposition ||
        (year != "" && (main->composition()->year() != year.toInt()));

    if (needsOwnComposition) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    } else {
        w->setComposition(main->composition());
        main->composition()->addWork(w);
    }

    return w;
}


void
WikipediaWorksImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
	throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    
    QString composerName;
    if (filename.contains("K%C3%B6chel")) {
	composerName = "Wolfgang Amadeus Mozart";
    } else if (filename.contains("/Schubert_")) {
        composerName = "Franz Schubert";
    } else {
	QRegExp byby("by_(.*)_by");
	if (byby.indexIn(filename) >= 0) {
	    composerName = byby.cap(1).replace('_', ' ');
	} else {
	    QRegExp by("by_(.*)");
	    if (by.indexIn(filename) >= 0) {
		composerName = by.cap(1).replace('_', ' ');
	    }
	}
    }
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());

    DEBUG << "composerName = " << composerName << endl;

    // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
    QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[K\\. *([0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");

    QString all = stream.readAll();

    DEBUG << "Read " << all.length() << " chars" << endl;

    all.replace(QRegExp("^.*<page>"), "");

    int pos = 0, count = 0;

    while ((pos = matcherK.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherK.matchedLength(), "");
	++count;

	QString kfield = matcherK.cap(1);
	QString titlefield = matcherK.cap(2);
	QString datefield = matcherK.cap(3);
	QString placefield = matcherK.cap(4);

	m_objects.push_back
	    (makeWork(composerName, "K. " + kfield, kfield, "",
		      titlefield, datefield, placefield, "", 0));
    }

    // Opus in list form (as used for e.g. Beethoven's works)
    QRegExp matcherB("[\\*:] *'*((Opus|Op\\.|WoO|Anh|H|D) [0-9][^,:'{\n]*)'*[,:{] *([^\n]*)\n");

    // Part of an opus (e.g. op 18 no 1), intended to be anchored to
    // the point at which the last matcherB or matcherB2 match ended
    // (note caret)
    QRegExp matcherB2("^[\\*:]{2} *([A-Za-z ]*)((No\\.* +)?[0-9][^ :\n]*)[: ] *([^\n]*)\n");

    // Date and remarks within titlefield
    QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\)(.*)");

    pos = 0;

    while ((pos = matcherB.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherB.matchedLength(), "");
	++count;

	QString opfield = matcherB.cap(1);
	QString titlefield = matcherB.cap(3);

        QString datefield, remarksfield;

        if (titlefield != "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = matcherDate.cap(2);
                titlefield = titlefield.left(dpos);
            }
        }

	Work *main = makeWork(composerName, opfield, "", "",
                              titlefield, datefield, "", remarksfield, 0);

        m_objects.push_back(main);

        int spos = pos;

        while ((spos = matcherB2.indexIn(all, spos, QRegExp::CaretAtOffset))
               != -1) {

            all.replace(spos, matcherB2.matchedLength(), "");
            ++count;

            QString numfield = matcherB2.cap(2);

            titlefield = matcherB2.cap(4);

            if (matcherB2.cap(1).trimmed() != "") {
                titlefield = matcherB2.cap(1) + matcherB2.cap(2) + " " 
                    + matcherB2.cap(4);
                DEBUG << "prefix to number = " << matcherB2.cap(1) << ", so extending title from " << matcherB2.cap(4) << " to " << titlefield << endl;
            }

            datefield = "";
            remarksfield = "";

            if (titlefield != "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    remarksfield = matcherDate.cap(2);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *sub = makeWork(composerName, opfield, "", numfield,
                                 titlefield, datefield, "", remarksfield, main);

            m_objects.push_back(sub);
        }
    }

    // Title with date but no opus in list form (as used for e.g. Copland)
    QRegExp matcherC("\\* *([^\n]*)\\([^\\)]*([0-9]{4})[^\\)]*\\) *\n");

    // Part of the above (e.g. song in cycle), intended to be anchored to
    // the point at which the last matcherC or matcherC2 match ended
    // (note caret)
    QRegExp matcherC2("^\\*\\* *([^\n]*)\n");

    pos = 0;

    while ((pos = matcherC.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherC.matchedLength(), "");
	++count;

	QString titlefield = matcherC.cap(1);
        QString datefield = matcherC.cap(2);

	Work *main = makeWork(composerName, "", "", "",
                              titlefield, datefield, "", "", 0);

        m_objects.push_back(main);

        int spos = pos;

        while ((spos = matcherC2.indexIn(all, spos, QRegExp::CaretAtOffset))
               != -1) {

            all.replace(spos, matcherC2.matchedLength(), "");
            ++count;

            titlefield = matcherC2.cap(1);

            datefield = "";

            if (titlefield != "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *sub = makeWork(composerName, "", "", "",
                                 titlefield, datefield, "", "", main);

            m_objects.push_back(sub);
        }
    }



    DEBUG << "Left over: " << all << endl;

    // Other forms:
    // *March No. 1 in F major for Military Band, WoO 18 (1808)


    DEBUG << "Found " << count << " things" << endl;
}


}