view import/ImportWikipediaWorksK.cpp @ 0:e8f4c2b55fd8 classical-rdf

* reorganise
author Chris Cannam
date Tue, 01 Dec 2009 17:50:41 +0000
parents
children c8ef23d3888c
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaWorksK.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Accept a source URL and immediately run the import on it.
 *
 * The URL is logged through DEBUG and then handed straight to
 * import(); nothing is stored here.  Any exception raised by import()
 * (it throws a std::exception if the file cannot be opened)
 * propagates to the caller.
 */
void
WikipediaWorksKImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaWorksKImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Strip wiki markup and stray punctuation from one table field.
 *
 * Processing order:
 *  1. Unicode dashes U+2012..U+2015 are folded to an ASCII '-'.
 *  2. Wiki links at the start of the field (optionally preceded by
 *     non-letter characters) are replaced by their display text,
 *     repeatedly, until no such leading link remains.
 *  3. Remaining brackets, {{...}} templates, escaped &lt;...&gt; tags,
 *     leading asterisks, trailing '.'/',' and one pair of enclosing
 *     parentheses are removed; '' and &quot; become a double quote.
 *
 * @param field     raw field text (taken by value, modified locally)
 * @param linkText  in/out: if empty on entry to a link substitution,
 *                  it receives the target of that link; an existing
 *                  non-empty value is never overwritten
 * @return the cleaned-up field text
 */
static QString
sanitise(QString field, QString &linkText)
{
    // UTF-8 encodings of U+2012, U+2013, U+2014 and U+2015
    static const char *const unicodeDashes[] = {
        "\342\200\222",
        "\342\200\223",
        "\342\200\224",
        "\342\200\225"
    };
    const int dashCount = sizeof(unicodeDashes) / sizeof(unicodeDashes[0]);
    for (int i = 0; i < dashCount; ++i) {
        field.replace(QString::fromUtf8(unicodeDashes[i]), "-");
    }

    // [[target|label]] and [[target]] respectively; cap(1) is any
    // non-letter prefix that must be preserved in front of the label
    QRegExp pipedLink("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
    QRegExp plainLink("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");

    // Resolve leading links one at a time, always trying the piped
    // form first after each substitution
    for (;;) {
        int at = pipedLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = pipedLink.cap(2);
            }
            const QString shown = pipedLink.cap(1) + pipedLink.cap(3);
            field.replace(at, pipedLink.matchedLength(), shown);
            continue;
        }

        at = plainLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = plainLink.cap(2);
            }
            const QString shown = plainLink.cap(1) + plainLink.cap(2);
            field.replace(at, plainLink.matchedLength(), shown);
            continue;
        }

        break;
    }

    field = field.trimmed();

    // Leftover markup that was not part of a leading link
    field.remove("[");
    field.remove("]");
    field.remove(QRegExp("\\{+[^\\}]*\\}+ *"));
    field.replace("''", "\"");
    field.replace("&quot;", "\"");
    field.remove(QRegExp("&lt;[^&]*&gt;"));
    field.remove(QRegExp("^\\**"));

    // Drop any run of trailing full stops and commas
    while (field.endsWith(".") || field.endsWith(",")) {
        field.chop(1);
    }

    // Unwrap a field that is entirely enclosed in parentheses
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }

    // Unwrapping may have exposed further leading asterisks
    field.remove(QRegExp("^\\**"));

    // A lone parenthesis carries no information
    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");

    return field;
}

/**
 * Pull a year out of a free-text date field.
 *
 * @param datefield  raw date text
 * @return the first four consecutive decimal digits found in
 *         datefield, or an empty string if there are none
 */
static QString
extractYear(QString datefield)
{
    QRegExp fourDigits("[0-9]{4}");

    if (fourDigits.indexIn(datefield) < 0) {
        return "";
    }

    return fourDigits.cap(0);
}

/**
 * Pull a musical key out of a title field.
 *
 * Looks for the text "in X major" or "in X minor", where X is an
 * upper-case letter A-H optionally followed by a space or hyphen and
 * a lower-case word (so "B-flat major" and "F sharp minor" match).
 *
 * @param titlefield  raw title text
 * @return the key without the leading "in ", or an empty string if
 *         the pattern does not occur
 */
static QString
extractKey(QString titlefield)
{
    QRegExp keyPattern("in ([A-H]([ -][a-z]+)? (major|minor))");

    if (keyPattern.indexIn(titlefield) < 0) {
        return "";
    }

    return keyPattern.cap(1);
}

/**
 * Build a Work from the raw text fields of one table row.
 *
 * Each field is cleaned with sanitise().  The first wiki link target
 * found while cleaning the opus, catalogue, number and title fields
 * (in that order) is turned into a Wikipedia Document attached to the
 * work.  The key is extracted from the unsanitised title, and the
 * title itself may be split into name + alias or name + remarks when
 * it starts with a quoted name.
 *
 * @param composerName  composer name for any newly created Composition
 * @param opfield       raw opus field ("Opus "/"Op. "/"Op " stripped)
 * @param kfield        raw catalogue field ("K. " rewritten to "K ")
 * @param numfield      raw number field ("No. "/"No " stripped)
 * @param titlefield    raw title field
 * @param datefield     raw date field; only a four-digit year is used
 * @param placefield    raw place field
 * @param remarksfield  raw remarks; if empty it may be filled from the
 *                      tail of the title
 * @param main          parent work, or 0 if this is a top-level work
 * @return a newly allocated Work; it is not freed here, so the caller
 *         is responsible for the returned pointer
 */
static Work *
makeWork(QString composerName, QString opfield, QString kfield,
         QString numfield, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    // Shared by all sanitise() calls: only the first link seen is kept
    QString linkText;

    Work *w = new Work;

    QString op = sanitise(opfield, linkText);
    if (op != "") {
        op.replace("Opus ", "");
        op.replace("Op. ", "");
        op.replace("Op ", "");
        w->setOpus(op);
    }

    QString k = sanitise(kfield, linkText);
    if (k != "") {
        k.replace("K. ", "K ");
        w->setCatalogue(k);
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    }

    // The key is read from the raw title, before markup is stripped
    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    QString title = sanitise(titlefield, linkText);
    if (linkText != "") {
        // Wikipedia page names use underscores in place of spaces
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(url);
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    // "Quoted name" - explanation  =>  name + alias
    QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
    int pos;
    if ((pos = explicationRE.indexIn(title)) >= 0) {
        w->addAlias(explicationRE.cap(2));
        title = explicationRE.cap(1);
    }

    // "Quoted name" for ...  =>  name + remarks (only if no remarks given)
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
        if ((pos = remarksRE.indexIn(title)) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    // "Quoted name", ...  =>  name + remarks (only if still no remarks)
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
        if ((pos = remarksRE.indexIn(title)) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    w->setName(title);

    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        // The parent may not have a composition yet (the test below
        // allows for that), so only share it when it actually exists
        // rather than dereferencing a null pointer.
        if (main->composition()) {
            w->setComposition(main->composition());
            main->composition()->addWork(w);
        }
    }

    // Give the work its own Composition when there is no parent
    // composition to share, or when this row names a different year.
    // NOTE(review): in the differing-year case the work has already
    // been added to the parent's composition above as well -- confirm
    // that this is intended.
    if (!main || !main->composition() ||
        (year != "" && (main->composition()->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}


/**
 * Read the local file named by source and create a Work for every
 * K-number table row found in it, appending each to m_objects.
 *
 * The composer name is derived from the file name: a name containing
 * "K%C3%B6chel" means Mozart; otherwise the text after "by_" (up to a
 * following "_by" if present) is used, with underscores turned into
 * spaces and percent-escapes decoded.
 *
 * @param source  URL of the file to read; only its local-file path is
 *                used
 * @throws std::runtime_error (derived from std::exception) naming the
 *         file if it cannot be opened for reading
 */
void
WikipediaWorksKImporter::import(QUrl source)
{
    //!!! for now: only the local-file form of the URL is handled
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        // Say which file failed instead of throwing an empty exception
        std::string message
            ("WikipediaWorksKImporter::import: failed to open file: ");
        message += filename.toLocal8Bit().constData();
        throw std::runtime_error(message);
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");

    QString composerName;
    if (filename.contains("K%C3%B6chel")) {
        composerName = "Wolfgang Amadeus Mozart";
    } else {
        // Prefer the text between "by_" and "_by"; otherwise take
        // everything after "by_"
        QRegExp byby("by_(.*)_by");
        if (byby.indexIn(filename) >= 0) {
            composerName = byby.cap(1).replace('_', ' ');
        } else {
            QRegExp by("by_(.*)");
            if (by.indexIn(filename) >= 0) {
                composerName = by.cap(1).replace('_', ' ');
            }
        }
    }
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());

    DEBUG << "composerName = " << composerName << endl;

    // K numbers in tabular form (as found in "Köchel Catalogue" WP page).
    // One match is a "|-" row separator, one ignored cell line, a cell
    // line holding "{{...[[K nnn" (cap 1 = K number), then three more
    // cell lines (cap 2 = title, cap 3 = date, cap 4 = place).
    QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[(K\\.? *[0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");

    QString all = stream.readAll();

    DEBUG << "Read " << all.length() << " chars" << endl;

    // Discard the leading text up to and including a "<page>" tag
    all.replace(QRegExp("^.*<page>"), "");

    int pos = 0, count = 0;

    while ((pos = matcherK.indexIn(all, pos)) != -1) {

        // Take the captures before the buffer is modified
        QString kfield = matcherK.cap(1);
        QString titlefield = matcherK.cap(2);
        QString datefield = matcherK.cap(3);
        QString placefield = matcherK.cap(4);

        // Remove the consumed row so that what remains at the end is
        // exactly the unparsed text; searching resumes at the same
        // offset
        all.replace(pos, matcherK.matchedLength(), "");
        ++count;

        m_objects.push_back
            (makeWork(composerName, "", kfield, "",
                      titlefield, datefield, placefield, "", 0));
    }

    DEBUG << "Left over: " << all << endl;

    DEBUG << "Found " << count << " things" << endl;
}


}