view import/ImportHoboken.cpp @ 25:e856df83c57f classical-rdf

* checkpoint disambiguation
author Chris Cannam
date Fri, 26 Feb 2010 16:39:16 +0000
parents c8ef23d3888c
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportHoboken.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the location of the Hoboken catalogue data and import it
 * immediately.  Any exception thrown by import() (for instance when
 * the file cannot be opened) propagates to the caller.
 */
void
HobokenImporter::setSource(QUrl source)
{
    DEBUG << "HobokenImporter::setSource: " << source << endl;

    // There is no deferred loading: setting the source is what
    // triggers the import.
    this->import(source);
}

/**
 * Map a Hoboken catalogue designation to the name(s) of the musical
 * form associated with its group.
 *
 * The designation is expected to look like "GROUP:NUMBER" (for
 * example "XVI:52").  An optional leading "Hob" or "Hob." prefix, as
 * found in the catalogue text handed over by the importer in this
 * file ("Hob XVI:52"), is accepted and ignored.
 *
 * Returns a ';'-separated list of form names, most specific first
 * (e.g. "piano sonata;sonata"), or an empty string if the group is
 * not recognised.  A designation without a ':' is treated as having
 * an empty number part rather than being an error.
 */
QString
hobToForm(QString hob)
{
    // Groups whose form does not depend on the number within the group.
    static const struct {
        const char *group;
        const char *forms;
    } formsByGroup[] = {
        { "I",     "symphony" },
        { "III",   "string quartet" },
        { "IV",    "divertimento" },
        { "V",     "string trio;trio" },
        { "VI",    "string duo;duo;sonata" },
        { "VII",   "concerto" },
        { "VIII",  "march" },
        { "IX",    "dance" },
        { "X",     "divertimento" },
        { "XI",    "trio" },
        { "XII",   "duo" },
        { "XIII",  "concerto" },
        { "XIV",   "divertimento" },
        { "XV",    "piano trio;trio" },
        { "XVI",   "piano sonata;sonata" },
        { "XVII",  "work for piano" },
        { "XVIIa", "work for piano" },
        { "XVIII", "piano concerto;concerto" },
        { "XXII",  "mass" },
        { "XXIIa", "requiem" }
        //!!! choral works
    };

    // Drop the catalogue prefix so that "Hob I:1" and "I:1" are
    // treated alike; without this the group would read "Hob I" and
    // never match anything.
    hob = hob.trimmed();
    if (hob.startsWith("Hob")) {
        hob = hob.mid(3);
        if (hob.startsWith('.')) hob = hob.mid(1);
        hob = hob.trimmed();
    }

    // value() yields an empty string for a missing element, so a
    // designation with no ':' cannot index past the end of the list.
    const QStringList bits = hob.split(':');
    const QString group = bits.value(0).trimmed();
    const QString number = bits.value(1).trimmed();

    if (group == "II") {
        // Only entries numbered up to 24, or whose number part does
        // not start with a digit, are classified; toInt() yields 0
        // for anything that is not a plain integer.
        const int num = number.toInt();
        if (num <= 24 || number.isEmpty() || !number.at(0).isDigit()) {
            return "divertimento";
        }
        return "";
    }

    const int count = int(sizeof(formsByGroup) / sizeof(formsByGroup[0]));
    for (int i = 0; i < count; ++i) {
        if (group == formsByGroup[i].group) {
            return formsByGroup[i].forms;
        }
    }

    return "";
}

void
HobokenImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
	throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    
    QString composerName = "Joseph Haydn";

    DEBUG << "composerName = " << composerName << endl;

    QMap<QString, Work *> hobMap;
    QMap<int, Work *> opusMap;

    while (!stream.atEnd()) {

	QString line = stream.readLine();

	QString hob = "";

	QRegExp hobre("^([\\d][^ _]+_([A-Za-z]*)[^ ]+) ");

	if (hobre.indexIn(line) >= 0) {

	    hob = hobre.cap(1);
	    Work *w = 0;
	    Composition *cn = 0;

	    if (!hobMap.contains(hob)) {
		w = new Work();
		QString key = hobre.cap(2);
		if (key != "") {
		    if (key.length() > 1 && key[1] == 's') {
			key = key[0] + "-flat";
		    }
		    if (key[0].isLower()) {
			key[0] = key[0].toUpper();
			key += " minor";
		    } else {
			key += " major";
		    }
		    w->setKey(key);
		}
		cn = new Composition();
		cn->setComposerName(composerName);
		cn->addWork(w);
		w->setComposition(cn);
		hobMap[hob] = w;
	    } else {
		w = hobMap[hob];
		cn = w->composition();
	    }

	    QRegExp hobre2("^[^ ]+ # (Hob [^ ]*)");
	    if (hobre2.indexIn(line) >= 0) {
                QString hobtext = hobre2.cap(1);
		w->setCatalogue(hobtext);
                QStringList forms = hobToForm(hobtext).split(";");
                foreach (QString f, forms) {
                    if (f != "") {
                        w->addForm(Form::getFormByName(f));
                    }
                }
		continue;
	    }

	    QRegExp titlere("^[^ ]+ @([^ ]+) (.*)");
	    if (titlere.indexIn(line) >= 0) {
		QString title = titlere.cap(2).trimmed();
		if (titlere.cap(1) == "en") {
		    if (w->name() != "") {
			w->addAlias(w->name());
		    }
		    w->setName(title);
		} else {
		    if (w->name() == "") {
			w->setName(title);
		    } else {
			w->addAlias(title);
		    }
		}
		continue;
	    }

	    QRegExp httpre("^[^ ]+ (http:[^ ]*) *$");
	    if (httpre.indexIn(line) >= 0) {
		QString url = httpre.cap(1).trimmed();
		Document *d = new Document;
		d->setUri(Uri(url));
		d->setTopic(w);
		if (url.contains("wikipedia")) d->setSiteName("Wikipedia");
		else if (url.contains("klassika.info")) {
		    d->setSiteName("Klassika - Die deutschsprachigen Klassikseiten");
		}
		w->addPage(d);
		continue;
	    }
	    
	    QRegExp datere("^[^ ]+ \\[[^]]*(\\d{4})[^]]*\\]");
	    if (datere.indexIn(line) >= 0) {
		cn->setYear(datere.cap(1).toInt());
		continue;
	    }
	    
	    QRegExp opre("^[^ ]+ -> ([^ ]+)");
	    if (opre.indexIn(line) >= 0) {
		QString optext = opre.cap(1);
		w->setOpus(optext);
		if (optext.contains('/')) {
		    QStringList ops = optext.split('/');
		    int opno = ops[0].toInt();
		    if (opno == 0) {
			DEBUG << "Failed to convert " << optext << " to op no" << endl;
		    } else {
			if (!opusMap.contains(opno)) {
			    opusMap[opno] = new Work();
			    opusMap[opno]->setOpus(ops[0]);
			    opusMap[opno]->setComposition(new Composition());
			    opusMap[opno]->composition()->setComposerName(composerName);
			}
			opusMap[opno]->addPart(w);
			w->setPartOf(opusMap[opno]);
			w->setOpus(ops[0]);
			w->setNumber(ops[1]);
		    }
		}
		continue;
	    }

	    continue;
	}

	QRegExp opre("^Opus ([\\d][^ ]*): (.*)");
	if (opre.indexIn(line) >= 0) {
	    QString optext = opre.cap(1);
	    int opno = optext.toInt();
	    if (!opusMap.contains(opno)) {
		opusMap[opno] = new Work();
		opusMap[opno]->setOpus(optext);
		opusMap[opno]->setComposition(new Composition());
		opusMap[opno]->composition()->setComposerName(composerName);
	    }
	    QString title = opre.cap(2);
	    title.replace("<br>", " - ");
	    opusMap[opno]->setName(title);
	    continue;
	}

	DEBUG << "Failed to match line: " << line << endl;
    }

    foreach (Work *w, hobMap) m_objects.push_back(w);
    foreach (Work *w, opusMap) m_objects.push_back(w);


    DEBUG << "Found " << m_objects.size() << " things" << endl;
}


}