Mercurial > hg > classical
view import/ImportWikipediaWorks.cpp @ 53:bcea875d8d2f tip
More build fixes
author | Chris Cannam |
---|---|
date | Thu, 16 Oct 2014 19:03:51 +0100 |
parents | c8ef23d3888c |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaWorks.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source location and immediately run the import on it.
 */
void
WikipediaWorksImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaWorksImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Strip wiki markup and punctuation noise from a single text field
 * and return the cleaned text.
 *
 * Leading wiki links (optionally preceded by non-letter characters)
 * are unwrapped one at a time by recursion: [[target|label]] becomes
 * "label" and [[target]] becomes "target".  If linkText is empty on
 * entry to a step that unwraps a link, it is set to that link's
 * target; a non-empty linkText is never overwritten, so the first
 * link seen wins across successive calls sharing the same linkText.
 */
QString
sanitise(QString field, QString &linkText)
{
    int mp;

    // Normalise U+2012..U+2015 (figure dash, en dash, em dash,
    // horizontal bar), given here as UTF-8 octal escapes, to a plain
    // hyphen
    field.replace(QString::fromUtf8("\342\200\222"), "-");
    field.replace(QString::fromUtf8("\342\200\223"), "-");
    field.replace(QString::fromUtf8("\342\200\224"), "-");
    field.replace(QString::fromUtf8("\342\200\225"), "-");

    // Piped link at the start of the field: [[target|label]]
    QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
    if ((mp = link2.indexIn(field)) >= 0) {
        if (linkText == "") linkText = link2.cap(2);
        field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
        return sanitise(field, linkText);
    }

    // Plain link at the start of the field: [[target]]
    QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
    if ((mp = link1.indexIn(field)) >= 0) {
        if (linkText == "") linkText = link1.cap(2);
        field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
        return sanitise(field, linkText);
    }

    field = field.trimmed();
    field.replace("[", "");
    field.replace("]", "");
    // Drop {{...}} template invocations and any spaces following them
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
    // Wiki italics markup becomes a double quote
    field.replace("''", "\"");
    // NOTE(review): this literal is restored as the "&quot;" entity;
    // it had been entity-decoded into an invalid """ token.  Confirm
    // against the repository copy.
    field.replace("&quot;", "\"");
    // NOTE(review): the [^&] class suggests this pattern may likewise
    // have been "&lt;[^&]*&gt;" before entity decoding; left as found
    // -- confirm against the repository copy.
    field.replace(QRegExp("<[^&]*>"), "");
    // Remove leading list-bullet asterisks
    field.replace(QRegExp("^\\**"), "");

    while (field.endsWith(".") || field.endsWith(",")) {
        field = field.left(field.length()-1);
    }

    // Unwrap a field that is entirely enclosed in one pair of parentheses
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }
    field.replace(QRegExp("^\\**"), "");
    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");

    return field;
}

/**
 * Return the first run of four consecutive digits found in
 * datefield, or an empty string if there is none.
 */
QString
extractYear(QString datefield)
{
    QRegExp re("[0-9]{4}");
    if (re.indexIn(datefield) >= 0) {
        return re.cap(0);
    }
    return "";
}

/**
 * Return the musical key named in titlefield (text such as
 * "C major" or "B flat minor" following the word "in"), or an empty
 * string if no such text is found.
 */
QString
extractKey(QString titlefield)
{
    QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
    if (re.indexIn(titlefield) >= 0) {
        return re.cap(1);
    }
    return "";
}

/**
 * Build a new Work from the raw text fields of one catalogue entry
 * and return it.
 *
 * Each of the op, k, num, title, remarks and place fields is passed
 * through sanitise(); the first wiki link target found among them (in
 * that order, up to and including the title) is used to attach a
 * Wikipedia page Document to the work.  The year is taken from
 * datefield with extractYear() and the key from the unsanitised
 * titlefield with extractKey().
 *
 * If main is non-null the new work is registered as a part of it and
 * shares its composition, unless main has no composition or a
 * non-empty year differs from that composition's year, in which case
 * a new Composition is created for the work.  If main is null a new
 * Composition is always created.
 */
Work *
makeWork(QString composerName, QString opfield, QString kfield,
         QString numfield, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    QString linkText;

    Work *w = new Work;

    QString op = sanitise(opfield, linkText);
    if (op != "") {
        op.replace("Opus ", "");
        op.replace("Op. ", "");
        op.replace("Op ", "");
        w->setOpus(op);
    }

    QString k = sanitise(kfield, linkText);
    if (k != "") {
        w->setCatalogue(k);
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    }

    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    QString title = sanitise(titlefield, linkText);
    if (linkText != "") {
        // Turn the link target into an en.wikipedia.org page URL
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    // A quoted title followed by " - text": keep the quoted part as
    // the title and record the rest as an alias
    QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
    int pos;
    if ((pos = explicationRE.indexIn(title)) >= 0) {
        w->addAlias(explicationRE.cap(2));
        title = explicationRE.cap(1);
    }

    // If no remarks were supplied, try to split them off the title:
    // first a trailing "for ...", then anything after a comma
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
        if ((pos = remarksRE.indexIn(title)) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
        if ((pos = remarksRE.indexIn(title)) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    w->setName(title);

    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        // The condition below allows for main having no composition,
        // so don't dereference it here without checking
        if (main->composition()) {
            w->setComposition(main->composition());
            main->composition()->addWork(w);
        }
    }

    // NOTE(review): when a part has a year differing from its main
    // work's composition, it has already been added to that
    // composition above and is then also given its own -- confirm
    // that this is intended
    if (!main || !main->composition() ||
        (year != "" && (main->composition()->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}

/**
 * Read the wiki text found at the given (local file) URL and append
 * a Work to m_objects for every catalogue entry recognised in it.
 *
 * The composer name is guessed from the file name.  Three entry
 * layouts are then matched in turn, each matched entry being removed
 * from the text before matching continues: tabular K-number entries,
 * opus-numbered list entries (with nested numbered parts), and dated
 * list entries without opus (with nested parts).  Whatever text is
 * left over is written to the debug log.
 *
 * Throws std::runtime_error (a std::exception) if the file cannot be
 * opened.
 */
void
WikipediaWorksImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::runtime_error
            (std::string("WikipediaWorksImporter::import: failed to open ") +
             filename.toLocal8Bit().constData());
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");

    QString composerName;
    if (filename.contains("K%C3%B6chel")) {
        composerName = "Wolfgang Amadeus Mozart";
    } else if (filename.contains("/Schubert_")) {
        composerName = "Franz Schubert";
    } else {
        // Take the text between "by_" and a following "_by", or
        // failing that everything after "by_"
        QRegExp byby("by_(.*)_by");
        if (byby.indexIn(filename) >= 0) {
            composerName = byby.cap(1).replace('_', ' ');
        } else {
            QRegExp by("by_(.*)");
            if (by.indexIn(filename) >= 0) {
                composerName = by.cap(1).replace('_', ' ');
            }
        }
    }

    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());

    DEBUG << "composerName = " << composerName << endl;

    // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
    QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[K\\. *([0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");

    QString all = stream.readAll();

    DEBUG << "Read " << all.length() << " chars" << endl;

    // Discard everything up to and including a <page> tag
    all.replace(QRegExp("^.*<page>"), "");

    int pos = 0, count = 0;

    // Each match is cut out of the text, so the search resumes at
    // the same offset rather than advancing past the match
    while ((pos = matcherK.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherK.matchedLength(), "");
        ++count;

        QString kfield = matcherK.cap(1);
        QString titlefield = matcherK.cap(2);
        QString datefield = matcherK.cap(3);
        QString placefield = matcherK.cap(4);

        m_objects.push_back
            (makeWork(composerName, "K. " + kfield, kfield, "",
                      titlefield, datefield, placefield, "", 0));
    }

    // Opus in list form (as used for e.g. Beethoven's works)
    QRegExp matcherB("[\\*:] *'*((Opus|Op\\.|WoO|Anh|H|D) [0-9][^,:'{\n]*)'*[,:{] *([^\n]*)\n");

    // Part of an opus (e.g. op 18 no 1), intended to be anchored to
    // the point at which the last matcherB or matcherB2 match ended
    // (note caret)
    QRegExp matcherB2("^[\\*:]{2} *([A-Za-z ]*)((No\\.* +)?[0-9][^ :\n]*)[: ] *([^\n]*)\n");

    // Date and remarks within titlefield
    QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\)(.*)");

    pos = 0;

    while ((pos = matcherB.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherB.matchedLength(), "");
        ++count;

        QString opfield = matcherB.cap(1);
        QString titlefield = matcherB.cap(3);
        QString datefield, remarksfield;

        if (titlefield != "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = matcherDate.cap(2);
                titlefield = titlefield.left(dpos);
            }
        }

        Work *main = makeWork(composerName, opfield, "", "",
                              titlefield, datefield, "", remarksfield, 0);

        m_objects.push_back(main);

        // Collect any parts listed directly after the entry just
        // removed; CaretAtOffset makes "^" match at spos
        int spos = pos;

        while ((spos = matcherB2.indexIn(all, spos, QRegExp::CaretAtOffset))
               != -1) {

            all.replace(spos, matcherB2.matchedLength(), "");
            ++count;

            QString numfield = matcherB2.cap(2);
            titlefield = matcherB2.cap(4);

            if (matcherB2.cap(1).trimmed() != "") {
                titlefield = matcherB2.cap(1) + matcherB2.cap(2) + " "
                    + matcherB2.cap(4);
                DEBUG << "prefix to number = " << matcherB2.cap(1)
                      << ", so extending title from " << matcherB2.cap(4)
                      << " to " << titlefield << endl;
            }

            datefield = "";
            remarksfield = "";

            if (titlefield != "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    remarksfield = matcherDate.cap(2);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *sub = makeWork(composerName, opfield, "", numfield,
                                 titlefield, datefield, "", remarksfield,
                                 main);

            m_objects.push_back(sub);
        }
    }

    // Title with date but no opus in list form (as used for e.g. Copland)
    QRegExp matcherC("\\* *([^\n]*)\\([^\\)]*([0-9]{4})[^\\)]*\\) *\n");

    // Part of the above (e.g. song in cycle), intended to be anchored to
    // the point at which the last matcherC or matcherC2 match ended
    // (note caret)
    QRegExp matcherC2("^\\*\\* *([^\n]*)\n");

    pos = 0;

    while ((pos = matcherC.indexIn(all, pos)) != -1) {

        all.replace(pos, matcherC.matchedLength(), "");
        ++count;

        QString titlefield = matcherC.cap(1);
        QString datefield = matcherC.cap(2);

        Work *main = makeWork(composerName, "", "", "",
                              titlefield, datefield, "", "", 0);

        m_objects.push_back(main);

        int spos = pos;

        while ((spos = matcherC2.indexIn(all, spos, QRegExp::CaretAtOffset))
               != -1) {

            all.replace(spos, matcherC2.matchedLength(), "");
            ++count;

            titlefield = matcherC2.cap(1);
            datefield = "";

            if (titlefield != "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *sub = makeWork(composerName, "", "", "",
                                 titlefield, datefield, "", "", main);

            m_objects.push_back(sub);
        }
    }

    DEBUG << "Left over: " << all << endl;

    // Other forms:
    // *March No. 1 in F major for Military Band, WoO 18 (1808)

    DEBUG << "Found " << count << " things" << endl;
}

}