view import/ImportWikipediaWorksList.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaWorksList.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source to import from.  The URL is logged and then handed
 * straight to import(), so the import happens during this call.
 */
void
WikipediaWorksListImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaWorksListImporter::setSource: " << source << endl;

    this->import(source);
}

/**
 * Reduce a field of wiki markup to plain text.
 *
 * Unicode dash variants become "-", the flat and sharp signs become
 * "-flat" and "-sharp", [[target|label]] links and a leading [[target]]
 * link are replaced by their visible text, and then brackets, templates,
 * quote markup, escaped tags and stray punctuation are stripped.
 *
 * linkText is an in/out argument.  If it is empty when a link is
 * resolved, it receives that link's target (for a [[target|label]] link
 * only when the match starts within the first four characters); if it
 * already has a value it is left alone.
 *
 * Returns the cleaned, trimmed text.
 */
static QString
sanitise(QString field, QString &linkText)
{
    // Resolve links one at a time.  After each substitution the whole
    // pass starts again from the top, so the normalisations below are
    // re-applied to the rewritten text before the next link is sought.
    for (;;) {

        field.replace(QString::fromUtf8("\342\200\222"), "-");
        field.replace(QString::fromUtf8("\342\200\223"), "-");
        field.replace(QString::fromUtf8("\342\200\224"), "-");
        field.replace(QString::fromUtf8("\342\200\225"), "-");

        field.replace(QString::fromUtf8("\342\231\255"), "-flat");
        field.replace(QString::fromUtf8("\342\231\257"), "-sharp");

        // [[target|label]], optionally preceded by non-letters
        QRegExp pipedLink("([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
        int at = pipedLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty() && at < 4) {
                linkText = pipedLink.cap(2);
            }
            const QString visible = pipedLink.cap(1) + pipedLink.cap(3);
            field.replace(at, pipedLink.matchedLength(), visible);
            continue;
        }

        // [[target]] at the start of the field (after any non-letters)
        QRegExp plainLink("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
        at = plainLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = plainLink.cap(2);
            }
            const QString visible = plainLink.cap(1) + plainLink.cap(2);
            field.replace(at, plainLink.matchedLength(), visible);
            continue;
        }

        break;
    }

    field = field.trimmed();

    // Strip leftover markup.  The order matters: ''' must be handled
    // before '', and doubled quotes are collapsed only afterwards.
    field.replace("[", "");
    field.replace("]", "");
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), " ");
    field.replace("'''", "\"");
    field.replace("''", "\"");
    field.replace("&quot;", "\"");
    field.replace("\"\"", "\"");
    field.replace(QRegExp("^[\'\"] (\")?"), "\"");
    field.replace(QRegExp("&lt;[^&]*&gt;"), "");
    field.replace(QRegExp("^\\**"), "");

    // historical artifact from removal of Bruckner year indication (c. 1856)
    if (field.endsWith("c.")) {
        field.chop(2);
    }

    // Drop every trailing full stop or comma
    while (field.endsWith(".") || field.endsWith(",")) {
        field.chop(1);
    }

    // Drop a single leading separator character
    const bool leadingSeparator =
        field.startsWith(";") || field.startsWith(":") ||
        field.startsWith(",") || field.startsWith("-");
    if (leadingSeparator) {
        field.remove(0, 1);
    }

    // A field wholly wrapped in parentheses loses the wrapping pair
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }

    field.replace(QRegExp("^\\**"), "");

    // A lone bracket carries no information
    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");
    field.replace("  ", " ");

    return field.trimmed();
}

/**
 * Return the first four consecutive digits found in datefield, or an
 * empty string if there are none.
 */
static QString
extractYear(QString datefield)
{
    QRegExp fourDigits("[0-9]{4}");
    const bool found = (fourDigits.indexIn(datefield) != -1);
    return found ? fourDigits.cap(0) : QString("");
}

/**
 * Return the musical key mentioned in a title, or an empty string if
 * none is found.  A key is recognised as the text following "in ": a
 * letter A-H, an optional lower-case word joined by a space or hyphen
 * (e.g. "-flat", " sharp"), then "major" or "minor".
 */
static QString
extractKey(QString titlefield)
{
    QRegExp keyPattern("in ([A-H]([ -][a-z]+)? (major|minor))");

    if (keyPattern.indexIn(titlefield) < 0) {
        return "";
    }

    // Group 1 is the key itself, without the leading "in "
    return keyPattern.cap(1);
}

/**
 * Build a Work from the raw text fields extracted from one line of a
 * works list, attaching it to a Composition.
 *
 * @param composerName  composer name given to any newly created Composition
 * @param opfield       raw opus/catalogue text; may be empty, in which
 *                      case an opus embedded in the title or remarks is used
 * @param numfield      raw number text; may be empty
 * @param partNumber    fallback number used when no number text is
 *                      available and this value is greater than zero
 * @param titlefield    raw title text (wiki markup)
 * @param datefield     raw date text; the first four-digit run is the year
 * @param placefield    raw place text
 * @param remarksfield  raw remarks text; may be extended with text split
 *                      off the title
 * @param main          parent work if this is a part, otherwise 0
 *
 * @return the new Work, or 0 if the title mentions "List of " or "http:",
 *         or if a top-level work (main == 0) ends up with an empty title.
 *
 * When main is given, the new work is registered as one of its parts
 * and shares main's composition; a separate Composition is created when
 * there is no main, when main has no composition, or when this work has
 * a year that differs from the shared composition's year.
 */
static Work *
makeWork(QString composerName, QString opfield, QString numfield,
         int partNumber, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    if (titlefield.contains("List of ") || titlefield.contains("http:")) return 0;

    QString linkText;

    Work *w = new Work;

    // An opus designation embedded in the title is cut out of the title
    // and used as the opus field if none was supplied.  Failing that,
    // one found in the remarks replaces the opus field.
    QRegExp embeddedOpMatcher("([Oo]pus|[Oo]p.|WAB) (posth[a-z\\.]* *)?([0-9][^ ;:,]*)(,? *([Nn]umber|[Nn]o.|[Nn]r.) ([0-9][^ ;:,]*))?,?");
    if (embeddedOpMatcher.indexIn(titlefield) >= 0) {
        QString opf = embeddedOpMatcher.cap(0);
        if (opfield == "") opfield = opf;
        titlefield.replace(opf, "");
    } else if (embeddedOpMatcher.indexIn(remarksfield) >= 0) {
        opfield = embeddedOpMatcher.cap(0);
    }

    // For parts without an explicit number, look for "No. N" in the
    // title and then in the remarks.
    if (main && numfield == "") {
        QRegExp embeddedNumMatcher("(Number|No.|Nr.) ([0-9][^ ;:,]*)");
        if (embeddedNumMatcher.indexIn(titlefield) >= 0) {
            numfield = embeddedNumMatcher.cap(2);
        } else if (embeddedNumMatcher.indexIn(remarksfield) >= 0) {
            numfield = embeddedNumMatcher.cap(2);
        }
    }

    // Text containing "op" (with the prefix stripped) or consisting only
    // of digits is stored as an opus; anything else as a catalogue entry.
    QString op = sanitise(opfield, linkText);
    if (op != "") {
        if (op.toLower().contains("op")) {
            op.replace("Opus ", "");
            op.replace("Op. ", "");
            op.replace("Op.", "");
            op.replace("Op ", "");
            op.replace("opus ", "");
            op.replace("op. ", "");
            op.replace("op.", "");
            op.replace("op ", "");
            w->setOpus(op);
        } else if (QRegExp("^[0-9]*$").indexIn(op) >= 0) {
            w->setOpus(op);
        } else {
            w->setCatalogue(op);
        }
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    } else if (partNumber > 0) {
        w->setNumber(QString("%1").arg(partNumber));
    }

    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    DEBUG << "title before sanitise: " << titlefield << endl;

    remarksfield = remarksfield.trimmed();

    QString title = sanitise(titlefield, linkText);
    title.replace(QRegExp(", which.*$"), "");

    // If any field so far contained a wiki link, attach the
    // corresponding Wikipedia page to the work.
    if (linkText != "") {
        if (remarksfield == "" && title.startsWith(linkText)) {
            // Anything in the title beyond the link text becomes remarks
            remarksfield = title.right(title.length() - linkText.length());
            title = linkText;
        }
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    DEBUG << "title after sanitise: " << title << ", link text " << linkText << ", remarks " << remarksfield << endl;

    // "Quoted title" - explanation: a capitalised explanation is treated
    // as an alias, any other as remarks (if there are none yet).
    QRegExp explicationRE("^(\"[^-]+\") - (.+)$");
    if (explicationRE.indexIn(title) >= 0) {
        QString part = explicationRE.cap(2);
        if (part[0].isUpper()) w->addAlias(explicationRE.cap(2));
        else if (remarksfield == "") remarksfield = explicationRE.cap(2);
        title = explicationRE.cap(1);
    }

    // "Quoted title" for ...: the "for ..." tail is moved into remarks
    QRegExp remarksRE1("^(\"[^-]+\") (for .*)$");
    if (remarksRE1.indexIn(title) >= 0) {
        if (remarksfield != "") {
            remarksfield = QString("%1 - %2")
                .arg(remarksRE1.cap(2)).arg(remarksfield);
        } else {
            remarksfield = remarksRE1.cap(2);
        }
        title = remarksRE1.cap(1);
    }

    // "Quoted title", ...: the tail after the comma is moved into remarks
    QRegExp remarksRE2("^(\"[^\"]+\"), (.*)$");
    if (remarksRE2.indexIn(title) >= 0) {
        if (remarksfield != "") {
            remarksfield = QString("%1 - %2")
                .arg(remarksRE2.cap(2)).arg(remarksfield);
        } else {
            remarksfield = remarksRE2.cap(2);
        }
        title = remarksRE2.cap(1);
    }

    // ..." (alias) rest: the parenthesised text becomes an alias and the
    // rest becomes remarks if there are none yet.
    QRegExp explicationRE2("^([^\\(]*\") \\(([^\\)]*)\\)(.*)$");
    if (explicationRE2.indexIn(title) >= 0) {
        w->addAlias(explicationRE2.cap(2));
        if (remarksfield == "") remarksfield = explicationRE2.cap(3);
        title = explicationRE2.cap(1);
    }

    // Leading form words are dropped from the title and recorded as forms
    if (title.startsWith("Song \"")) {
        title = title.right(title.length() - 5);
        w->addForm(Form::getFormByName("song"));
    }
    if (!main && title.startsWith("Song cycle \"")) {
        title = title.right(title.length() - 11);
        w->addForm(Form::getFormByName("song cycle"));
    }
    if (main && main->forms().contains(Form::getFormByName("song cycle"))) {
        w->addForm(Form::getFormByName("song"));
    }

    // A top-level work must have a title; parts may be untitled
    if (title == "" && !main) {
        delete w;
        return 0;
    }

    w->setName(title);

    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    // Only touch main's composition if it actually has one; otherwise
    // fall through to creating a fresh Composition below rather than
    // dereferencing a null pointer.
    const bool sharedComposition = (main && main->composition());

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        if (sharedComposition) {
            w->setComposition(main->composition());
            main->composition()->addWork(w);
        }
    }

    // NOTE(review): when the year differs, the work stays in the shared
    // composition's work list as well as in the new one (unchanged
    // behaviour) -- confirm whether that is intended.
    if (!sharedComposition ||
        (year != "" && (main->composition()->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}


/**
 * Read a Wikipedia works-list page (wiki source text) from a local file
 * and append a Work to m_objects for every work and part recognised.
 *
 * The composer name is derived from the file name.  Each line is tried
 * against a series of "work" patterns; after a work line, following
 * lines are tried against "part" patterns until one fails to match, and
 * that line is then reconsidered as a possible work line.
 *
 * @param source  URL of the file to read; only its local-file path is used
 * @throws std::exception if the file cannot be opened for reading
 */
void
WikipediaWorksListImporter::import(QUrl source)
{
    //!!! for now, only local files are handled
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    
    // Work out the composer from the file name: two special cases, then
    // progressively looser "by_X" / "of_X" patterns.
    QString composerName;
    if (filename.contains("K%C3%B6chel")) {
        composerName = "Wolfgang Amadeus Mozart";
    } else if (filename.contains("/Schubert_")) {
        composerName = "Franz Schubert";
    } else {
        QRegExp byby("by_(.*)_by");
        if (byby.indexIn(filename) >= 0) {
            composerName = byby.cap(1).replace('_', ' ');
        } else {
            QRegExp bybr("by_(.*)_\\(");
            if (bybr.indexIn(filename) >= 0) {
                composerName = bybr.cap(1).replace('_', ' ');
            } else {
                QRegExp by("by_(.*)");
                if (by.indexIn(filename) >= 0) {
                    composerName = by.cap(1).replace('_', ' ');
                } else {
                    QRegExp of("of_([A-Z].*)");
                    if (of.indexIn(filename) >= 0) {
                        composerName = of.cap(1).replace('_', ' ');
                    }
                }
            }
        }
    }
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());

    DEBUG << "composerName = " << composerName << endl;


    // We try to keep these matchers specific enough that we can be
    // sure the title field will come out containing _at least_ the
    // title.  i.e. the title field should never end up with just the
    // opus number or date or whatever, even if the line is formatted
    // in a way we hadn't anticipated.  Thus it helps if the title is
    // bookended by '' or [[]], etc

    // e.g. Beethoven
    // *Opus 84: ''[[Egmont (Beethoven)|Egmont]]'', overture and incidental music (1810)
    // opus field - n/a - title - date - n/a - remarks
    QRegExp workMatcher1("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:{]*)[:,] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");

    // e.g. Tchaikovsky
    // *'''Op. 19''' 6 Pieces, for piano (1873)
    // or Ravel
    // * '''1''', Piano Sonata movement (1888), lost

/*
    // opus field - n/a - title - date - n/a - remarks
    QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G)? *[0-9][^ ,:'{]*)'''[:, ] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
*/
    // opus field - n/a - title
    QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|[A-Z]{1,2})?\\.? *[0-9][^ ,:'{]*),?'''[:, ] *(.*)$");

    // e.g. Copland
    // * ''Four Motets'' for mixed voices (1921)
    // title - date field
    // (no opus)
    QRegExp workMatcher2("^\\* *(''.*''\\)?) *(.*)$");
    workMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings

    // e.g. Copland
    // * Arrangement of ''Lincoln Portrait'' for concert band (1942)
    // or Mendelssohn
    // * [[Christe du Lamm Gottes]] (1827), SATB, strings
    // title - date field - remarks
    // (no opus)
    QRegExp workMatcher3("^\\* *([^\\*].*) *\\(([^\\)]*[0-9]{4}[^\\)]*)\\) *(.*)$");
    
    // e.g. Scriabin
    // *[[Sonata No. 2 (Scriabin)|Sonata No. 2 in G sharp minor]], Op. 19  (also known as ''Sonata-Fantasy'')"
    // title - opus field - n/a - remarks
    QRegExp workMatcher4("^\\* *(\\[\\[.*\\]\\]),* (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*) *(.*)$");

    // e.g. Scriabin
    // *Opus 35: [[Opus 35 (Scriabin)|Three Preludes]]
    // opus field - n/a - title - remarks
    QRegExp workMatcher5("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)[:,]* *([\\[']+.*[\\]']+) *(.*)$");

    // e.g. Boccherini
    // *G 1: Cello Sonata in F major
    // or weird Schubert layout
    // * D 505{{nbsp|4}}Adagio in D-flat for Piano
    // or Glazunov
    // :Op. 67: ''[[The Seasons (ballet)|The Seasons]]'', ballet in one act (1900)
    // or even
    // ::Op. 77: ''[[Symphony No. 7 (Glazunov)|Symphony No. 7]]'' &quot;Pastorale&quot; in F major (1902-1903)
    // This one is a real mess, for really messy pages.  Needs to go near
    // the end of the matchers in case it catches something it shouldn't
    // n/a - opus field - n/a - n/a - n/a - title
    QRegExp workMatcher6("^([\\*:]|::) *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)(([:,]| *\\{+[^\\}]+\\}+) *(.*))?$");

    // e.g. Bruch
    // * Adagio appassionato for violin and orchestra in C sharp minor, Op. 57
    // title - opus field - n/a - n/a - date field
    QRegExp workMatcher7("^\\* *(.*),? (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*|[Oo]p. posth[a-z.]*) *(\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\))? *$");

    // e.g. Bruckner
    // * Symphony No. 0 in D minor 1869 WAB 100
    // title - date field - opus field
    QRegExp workMatcher8("^\\* *(.*) ([0-9]{4}[0-9/-]*) *(WAB [0-9][^ ]*)$");

    // e.g. Bach
    // * BWV 506 ? Was bist du doch, o Seele, so betruebet
    // opus field - title
    QRegExp workMatcher9("^\\* *(BWV [^ ]+)(.*)$");

    // Catch-all for things that look at all promising (anything that
    // starts with ' or [ after bullet: take the whole as title)
    QRegExp workMatcher10("^[\\*:] *((['\\[]|&quot;).*)$");



    // e.g. Beethoven
    // **No. 1: [[Piano Trio No. 1 (Beethoven)|Piano Trio No. 1]] in E-flat major
    // number field - n/a - title, remarks etc
    QRegExp partMatcher1("^[\\*:]{2} *((No\\.? *)?[0-9][^ ,:'{]*)[:, ] *(.*)$");

    // e.g. Copland
    // ** ''Help us, O Lord''
    // title - remarks
    QRegExp partMatcher2("^\\*\\* *(''.*'') *(.*)$");
    partMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings

    // e.g. Scriabin
    // **[[Mazurka Op. 40 No. 1 (Scriabin)|Mazurka in D flat major]]
    // title - remarks
    QRegExp partMatcher3("^\\*\\* *(\\[\\[.*\\]\\])(.*)$");

    // e.g. Berlioz
    // ** 1: ''Méditation religieuse''
    // number - title - remarks
    QRegExp partMatcher4("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *([\\[]*''.*''[\\]]*) *(.*)$");

    // e.g. Tchaikovsky
    // **4. Nocturne [???????] (C? minor)
    // number - title - remarks
    QRegExp partMatcher5("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *(.*\\[[^\\]]+\\])(.*)$");

    // e.g. Schubert
    // **2. &quot;Wohin?&quot;
    // n/a - number - title
    QRegExp partMatcher6("^\\*\\* *(([0-9][0-9a-z]*)[\\.:])? *((&quot;|'').*)$");
   
    // e.g. Mendelssohn
    // ** Notturno
    // title only
    QRegExp partMatcher7("^\\*\\* *(.*)$");


    // Date and remarks within titlefield or remarksfield
    QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\),?(.*)");

    
    Work *main = 0;
    int partNumber = 0;

    // "line" holds a line that has been read but not yet consumed.  The
    // part loop below leaves the first non-part line here so that it can
    // be tried as a work line on the next pass.
    QString line;
    QString opfield, numfield, titlefield, remarksfield, datefield;

    // Keep going while there is input left *or* a line carried over from
    // the part loop: testing atEnd() alone would drop the final line of
    // the file whenever it had been read by the part loop.
    while (!stream.atEnd() || line != "") {

        if (line == "") {
            line = stream.readLine();
            DEBUG << "line: " << line << endl;
        }

        opfield = "";
        numfield = "";
        titlefield = "";
        datefield = "";
        remarksfield = "";
        partNumber = 0;

        if (workMatcher1.indexIn(line) >= 0) {

            DEBUG << "matcher 1" << endl;
            opfield = workMatcher1.cap(1);
            titlefield = workMatcher1.cap(3);
            datefield = workMatcher1.cap(4);
            remarksfield = workMatcher1.cap(6);

        } else if (workMatcher1a.indexIn(line) >= 0) {

            DEBUG << "matcher 1a" << endl;
            opfield = workMatcher1a.cap(1);
            titlefield = workMatcher1a.cap(3);
/*
            datefield = workMatcher1a.cap(4);
            remarksfield = workMatcher1a.cap(6);
*/

        } else if (workMatcher2.indexIn(line) >= 0) {

            DEBUG << "matcher 2" << endl;
            titlefield = workMatcher2.cap(1);
            remarksfield = workMatcher2.cap(2);
            
        } else if (workMatcher3.indexIn(line) >= 0) {

            DEBUG << "matcher 3" << endl;
            titlefield = workMatcher3.cap(1);
            datefield = workMatcher3.cap(2);
            remarksfield = workMatcher3.cap(3);

        } else if (workMatcher4.indexIn(line) >= 0) {

            DEBUG << "matcher 4" << endl;
            titlefield = workMatcher4.cap(1);
            opfield = workMatcher4.cap(2);
            remarksfield = workMatcher4.cap(4);

        } else if (workMatcher5.indexIn(line) >= 0) {

            DEBUG << "matcher 5" << endl;
            opfield = workMatcher5.cap(1);
            titlefield = workMatcher5.cap(3);
            remarksfield = workMatcher5.cap(4);

        } else if (workMatcher6.indexIn(line) >= 0) {

            DEBUG << "matcher 6" << endl;
            opfield = workMatcher6.cap(2);
            titlefield = workMatcher6.cap(6);

        } else if (workMatcher7.indexIn(line) >= 0) {

            DEBUG << "matcher 7" << endl;
            titlefield = workMatcher7.cap(1);
            opfield = workMatcher7.cap(2);
            // Group 3 is the catalogue prefix nested inside the opus
            // group; the year (range) is group 5, inside the optional
            // parenthesised date of group 4.
            datefield = workMatcher7.cap(5);

        } else if (workMatcher8.indexIn(line) >= 0) {

            DEBUG << "matcher 8" << endl;
            titlefield = workMatcher8.cap(1);
            datefield = workMatcher8.cap(2);
            opfield = workMatcher8.cap(3);

        } else if (workMatcher9.indexIn(line) >= 0) {

            DEBUG << "matcher 9" << endl;
            opfield = workMatcher9.cap(1);
            titlefield = workMatcher9.cap(2);

        } else if (workMatcher10.indexIn(line) >= 0) {

            DEBUG << "matcher 10" << endl;
            titlefield = workMatcher10.cap(1);

        } else {
            if (line.startsWith("*") || line.startsWith(":")) {
                DEBUG << "Failed to match promising works list line: " << line << endl;
            }
            line = "";
            continue;
        }

        // No date captured by the matcher: look for a parenthesised year
        // in the title, splitting what follows it off as remarks
        if (titlefield != "" && datefield == "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = matcherDate.cap(2);
                titlefield = titlefield.left(dpos);
            }
        }

        // ... and failing that, in the remarks
        if (remarksfield != "" && datefield == "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(remarksfield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = remarksfield.left(dpos);
            }
        }

        main = makeWork(composerName, opfield, "", 0,
                        titlefield, datefield, "", remarksfield, 0);

        if (main) m_objects.push_back(main);

        line = "";

        // Consume the part lines that follow this work.  opfield and
        // datefield carry over from the work line (and datefield from
        // earlier parts), as before.
        while (!stream.atEnd()) {
            
            ++partNumber;
            line = stream.readLine();
            DEBUG << "line: " << line << endl;

            // Number and remarks belong to a single part: start each
            // part clean so that matchers which do not capture them
            // cannot pick up values left over from the work line or
            // from the previous part.
            numfield = "";
            remarksfield = "";
            
            if (partMatcher1.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 1" << endl;
                numfield = partMatcher1.cap(1);
                titlefield = partMatcher1.cap(3);

            } else if (partMatcher2.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 2" << endl;
                titlefield = partMatcher2.cap(1);
                remarksfield = partMatcher2.cap(2);

            } else if (partMatcher3.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 3" << endl;
                titlefield = partMatcher3.cap(1);
                remarksfield = partMatcher3.cap(2);

            } else if (partMatcher4.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 4" << endl;
                numfield = partMatcher4.cap(1);
                titlefield = partMatcher4.cap(2);
                remarksfield = partMatcher4.cap(3);

            } else if (partMatcher5.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 5" << endl;
                numfield = partMatcher5.cap(1);
                titlefield = partMatcher5.cap(2);
                remarksfield = partMatcher5.cap(3);

            } else if (partMatcher6.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 6" << endl;
                numfield = partMatcher6.cap(2);
                titlefield = partMatcher6.cap(3);

            } else if (partMatcher7.indexIn(line) >= 0) {
                
                DEBUG << "part matcher 7" << endl;
                titlefield = partMatcher7.cap(1);

            } else {
                if (line.startsWith("**") || line.startsWith("::")) {
                    DEBUG << "Failed to match promising part line: " << line << endl;
                }
                // Not a part: leave the line in place for the outer
                // loop to try as a work line
                break;
            }

            if (titlefield != "" && datefield == "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    remarksfield = matcherDate.cap(2);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *part = makeWork(composerName, opfield, numfield, partNumber,
                                  titlefield, datefield, "", remarksfield,
                                  main);

            if (part) m_objects.push_back(part);

            // This line has been fully consumed as a part; make sure the
            // outer loop does not see it again if the input ends here
            line = "";
        }
    }        

    DEBUG << "Found " << m_objects.size() << " things" << endl;
}


}