Mercurial > hg > classical
view import/ImportWikipediaWorksList.cpp @ 53:bcea875d8d2f tip
More build fixes
author | Chris Cannam |
---|---|
date | Thu, 16 Oct 2014 19:03:51 +0100 |
parents | c8ef23d3888c |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaWorksList.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Record the source location and immediately import from it.
 */
void
WikipediaWorksListImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaWorksListImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Clean up a fragment of wiki markup so it can be used as a plain
 * text field (title, opus, remarks, ...).
 *
 * Dashes and flat/sharp signs are replaced with ASCII equivalents,
 * wiki links ([[target|label]] and [[target]]) are replaced by their
 * visible text, and assorted markup and stray punctuation is removed.
 *
 * \param field     the raw text to clean (taken by value, modified
 *                  locally)
 * \param linkText  in/out: if empty on entry and a wiki link is found
 *                  (for the piped form, only when it starts within
 *                  the first four characters), it receives the link
 *                  target.  A non-empty value is never overwritten.
 * \return the cleaned, trimmed text
 */
static QString
sanitise(QString field, QString &linkText)
{
    int mp;

    // U+2012..U+2015 (the various dashes) -> plain hyphen
    field.replace(QString::fromUtf8("\342\200\222"), "-");
    field.replace(QString::fromUtf8("\342\200\223"), "-");
    field.replace(QString::fromUtf8("\342\200\224"), "-");
    field.replace(QString::fromUtf8("\342\200\225"), "-");

    // U+266D (flat sign) and U+266F (sharp sign)
    field.replace(QString::fromUtf8("\342\231\255"), "-flat");
    field.replace(QString::fromUtf8("\342\231\257"), "-sharp");

    // Piped link [[target|label]]: keep the label, then start over so
    // that any further links in the field are handled as well
    QRegExp link2("([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
    if ((mp = link2.indexIn(field)) >= 0) {
        if (linkText == "" && mp < 4) linkText = link2.cap(2);
        field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
        return sanitise(field, linkText);
    }

    // Plain link [[target]] at the start of the field (possibly
    // preceded by non-letters): keep the target text, then start over
    QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
    if ((mp = link1.indexIn(field)) >= 0) {
        if (linkText == "") linkText = link1.cap(2);
        field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
        return sanitise(field, linkText);
    }

    field = field.trimmed();

    field.replace("[", "");
    field.replace("]", "");
    // {{template}} invocations become a single space
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), " ");
    // wiki bold/italic quoting and escaped quotes all become a plain
    // double quote
    field.replace("'''", "\"");
    field.replace("''", "\"");
    // NOTE(review): the entity spellings in the next line and in the
    // escaped-tag pattern below were reconstructed from a listing in
    // which they had been decoded -- confirm against the repository
    field.replace("&quot;", "\"");
    field.replace("\"\"", "\"");
    field.replace(QRegExp("^[\'\"] (\")?"), "\"");
    // escaped tags (&lt;...&gt;)
    field.replace(QRegExp("&lt;[^&]*&gt;"), "");
    // leading bullet asterisks
    field.replace(QRegExp("^\\**"), "");

    if (field.endsWith("c.")) {
        // historical artifact from removal of Bruckner year indication (c. 1856)
        field = field.left(field.length()-2);
    }

    while (field.endsWith(".") || field.endsWith(",")) {
        field = field.left(field.length()-1);
    }

    if (field.startsWith(";") || field.startsWith(":") ||
        field.startsWith(",") || field.startsWith("-")) {
        field = field.right(field.length()-1);
    }

    // A field wholly wrapped in parentheses loses them
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }

    field.replace(QRegExp("^\\**"), "");

    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");
    // collapse doubled spaces left behind by the removals above
    field.replace("  ", " ");

    return field.trimmed();
}

/**
 * Return the first run of four digits found in datefield, or an
 * empty string if there is none.
 */
static QString
extractYear(QString datefield)
{
    QRegExp re("[0-9]{4}");
    if (re.indexIn(datefield) >= 0) {
        return re.cap(0);
    }
    return "";
}

/**
 * Return the key named in a title of the form "... in X major" /
 * "... in X-flat minor" (without the leading "in "), or an empty
 * string if the title does not contain one.
 */
static QString
extractKey(QString titlefield)
{
    QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
    if (re.indexIn(titlefield) >= 0) {
        return re.cap(1);
    }
    return "";
}

/**
 * Put a remark extracted from the title in front of any remarks
 * already present, separated by " - ".
 */
static QString
prependRemark(QString fromTitle, QString existing)
{
    if (existing != "") {
        return QString("%1 - %2").arg(fromTitle).arg(existing);
    }
    return fromTitle;
}

/**
 * Build a Work from the fields extracted from one line of a works
 * list.
 *
 * \param composerName  composer name set on any Composition created
 *                      here
 * \param opfield       raw opus/catalogue text; if empty, an opus
 *                      reference embedded in the title is used (and
 *                      removed from the title); failing that, one
 *                      embedded in the remarks replaces it
 * \param numfield      raw number text; for parts with no number, one
 *                      embedded in the title or remarks is used
 * \param partNumber    fallback number for the work if greater than
 *                      zero and no other number was found
 * \param titlefield    raw title text
 * \param datefield     raw date text; the first four-digit run is
 *                      taken as the year
 * \param placefield    raw place text
 * \param remarksfield  raw remarks text
 * \param main          the work this one is a part of, or 0 for a
 *                      top-level work
 * \return the new Work, or 0 if the line should be ignored (the title
 *         contains "List of " or "http:", or a top-level work ends up
 *         with an empty title)
 */
static Work *
makeWork(QString composerName, QString opfield, QString numfield,
         int partNumber, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    if (titlefield.contains("List of ") ||
        titlefield.contains("http:")) return 0;

    QString linkText;

    Work *w = new Work;

    QRegExp embeddedOpMatcher("([Oo]pus|[Oo]p.|WAB) (posth[a-z\\.]* *)?([0-9][^ ;:,]*)(,? *([Nn]umber|[Nn]o.|[Nn]r.) ([0-9][^ ;:,]*))?,?");
    if (embeddedOpMatcher.indexIn(titlefield) >= 0) {
        QString opf = embeddedOpMatcher.cap(0);
        if (opfield == "") opfield = opf;
        titlefield.replace(opf, "");
    } else if (embeddedOpMatcher.indexIn(remarksfield) >= 0) {
        opfield = embeddedOpMatcher.cap(0);
    }
    if (main && numfield == "") {
        QRegExp embeddedNumMatcher("(Number|No.|Nr.) ([0-9][^ ;:,]*)");
        if (embeddedNumMatcher.indexIn(titlefield) >= 0) {
            numfield = embeddedNumMatcher.cap(2);
        } else if (embeddedNumMatcher.indexIn(remarksfield) >= 0) {
            numfield = embeddedNumMatcher.cap(2);
        }
    }

    // Anything mentioning "op", or consisting of digits only, is
    // stored as an opus; everything else as a catalogue reference
    QString op = sanitise(opfield, linkText);
    if (op != "") {
        if (op.toLower().contains("op")) {
            op.replace("Opus ", "");
            op.replace("Op. ", "");
            op.replace("Op.", "");
            op.replace("Op ", "");
            op.replace("opus ", "");
            op.replace("op. ", "");
            op.replace("op.", "");
            op.replace("op ", "");
            w->setOpus(op);
        } else if (QRegExp("^[0-9]*$").indexIn(op) >= 0) {
            w->setOpus(op);
        } else {
            w->setCatalogue(op);
        }
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    } else if (partNumber > 0) {
        w->setNumber(QString("%1").arg(partNumber));
    }

    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    DEBUG << "title before sanitise: " << titlefield << endl;

    remarksfield = remarksfield.trimmed();

    QString title = sanitise(titlefield, linkText);
    title.replace(QRegExp(", which.*$"), "");

    if (linkText != "") {
        // If the title begins with the link target and there are no
        // remarks yet, use the target as the title and whatever
        // follows it as the remarks
        if (remarksfield == "" && title.startsWith(linkText)) {
            remarksfield = title.right(title.length() - linkText.length());
            title = linkText;
        }
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    DEBUG << "title after sanitise: " << title << ", link text " << linkText << ", remarks " << remarksfield << endl;

    // "Quoted title" - explanation: a capitalised explanation becomes
    // an alias, otherwise it becomes the remarks if there are none
    QRegExp explicationRE("^(\"[^-]+\") - (.+)$");
    int pos;
    if ((pos = explicationRE.indexIn(title)) >= 0) {
        QString part = explicationRE.cap(2);
        if (part[0].isUpper()) w->addAlias(explicationRE.cap(2));
        else if (remarksfield == "") remarksfield = explicationRE.cap(2);
        title = explicationRE.cap(1);
    }

    // "Quoted title" for ...: the "for ..." text moves to the remarks
    QRegExp remarksRE1("^(\"[^-]+\") (for .*)$");
    if ((pos = remarksRE1.indexIn(title)) >= 0) {
        remarksfield = prependRemark(remarksRE1.cap(2), remarksfield);
        title = remarksRE1.cap(1);
    }

    // "Quoted title", anything: the trailing text moves to the remarks
    QRegExp remarksRE2("^(\"[^\"]+\"), (.*)$");
    if ((pos = remarksRE2.indexIn(title)) >= 0) {
        remarksfield = prependRemark(remarksRE2.cap(2), remarksfield);
        title = remarksRE2.cap(1);
    }

    // ..." (alias) rest: the parenthesised text becomes an alias
    QRegExp explicationRE2("^([^\\(]*\") \\(([^\\)]*)\\)(.*)$");
    if ((pos = explicationRE2.indexIn(title)) >= 0) {
        w->addAlias(explicationRE2.cap(2));
        if (remarksfield == "") remarksfield = explicationRE2.cap(3);
        title = explicationRE2.cap(1);
    }

    // Strip a leading "Song " (leaving the opening quote in place)
    if (title.startsWith("Song \"")) {
        title = title.right(title.length() - 5);
        w->addForm(Form::getFormByName("song"));
    }
    if (!main && title.startsWith("Song cycle \"")) {
        title = title.right(title.length() - 11);
        w->addForm(Form::getFormByName("song cycle"));
    }
    if (main && main->forms().contains(Form::getFormByName("song cycle"))) {
        w->addForm(Form::getFormByName("song"));
    }

    if (title == "" && !main) {
        delete w;
        return 0;
    }

    w->setName(title);

    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    // A part initially shares its parent's composition.  The parent
    // may not have one, so check before dereferencing it.
    Composition *inherited = main ? main->composition() : 0;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        w->setComposition(inherited);
        if (inherited) {
            inherited->addWork(w);
        }
    }

    // Top-level works, parts whose parent has no composition, and
    // parts with a year of their own that differs from the parent's
    // get a composition of their own
    if (!inherited ||
        (year != "" && (inherited->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}

/**
 * Read a works list in wiki markup from the local file named by
 * source and append a Work to m_objects for every work and part
 * recognised.
 *
 * The composer name is derived from the file name.  Each line is
 * tried against a series of "work" patterns; the lines following a
 * recognised work are tried against the "part" patterns until one
 * fails to match, and that line is then tried as a work in turn.
 *
 * \throws std::runtime_error (a std::exception) if the file cannot
 *         be opened
 */
void
WikipediaWorksListImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::runtime_error
            (std::string("WikipediaWorksListImporter::import: "
                         "failed to open file ") +
             filename.toLocal8Bit().constData());
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");

    QString composerName;
    if (filename.contains("K%C3%B6chel")) {
        composerName = "Wolfgang Amadeus Mozart";
    } else if (filename.contains("/Schubert_")) {
        composerName = "Franz Schubert";
    } else {
        QRegExp byby("by_(.*)_by");
        if (byby.indexIn(filename) >= 0) {
            composerName = byby.cap(1).replace('_', ' ');
        } else {
            QRegExp bybr("by_(.*)_\\(");
            if (bybr.indexIn(filename) >= 0) {
                composerName = bybr.cap(1).replace('_', ' ');
            } else {
                QRegExp by("by_(.*)");
                if (by.indexIn(filename) >= 0) {
                    composerName = by.cap(1).replace('_', ' ');
                } else {
                    QRegExp of("of_([A-Z].*)");
                    if (of.indexIn(filename) >= 0) {
                        composerName = of.cap(1).replace('_', ' ');
                    }
                }
            }
        }
    }
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());

    DEBUG << "composerName = " << composerName << endl;

    // We try to keep these matchers specific enough that we can be
    // sure the title field will come out containing _at least_ the
    // title.  i.e. the title field should never end up with just the
    // opus number or date or whatever, even if the line is formatted
    // in a way we hadn't anticipated.  Thus it helps if the title is
    // bookended by '' or [[]], etc

    // e.g. Beethoven
    // *Opus 84: ''[[Egmont (Beethoven)|Egmont]]'', overture and incidental music (1810)
    // opus field - n/a - title - date - n/a - remarks
    QRegExp workMatcher1("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:{]*)[:,] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");

    // e.g. Tchaikovsky
    // *'''Op. 19''' 6 Pieces, for piano (1873)
    // or Ravel
    // * '''1''', Piano Sonata movement (1888), lost
/*
    // opus field - n/a - title - date - n/a - remarks
    QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G)? *[0-9][^ ,:'{]*)'''[:, ] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
*/
    // opus field - n/a - title
    QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|[A-Z]{1,2})?\\.? *[0-9][^ ,:'{]*),?'''[:, ] *(.*)$");

    // e.g. Copland
    // * ''Four Motets'' for mixed voices (1921)
    // title - date field
    // (no opus)
    QRegExp workMatcher2("^\\* *(''.*''\\)?) *(.*)$");
    workMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings

    // e.g. Copland
    // * Arrangement of ''Lincoln Portrait'' for concert band (1942)
    // or Mendelssohn
    // * [[Christe du Lamm Gottes]] (1827), SATB, strings
    // title - date field - remarks
    // (no opus)
    QRegExp workMatcher3("^\\* *([^\\*].*) *\\(([^\\)]*[0-9]{4}[^\\)]*)\\) *(.*)$");

    // e.g. Scriabin
    // *[[Sonata No. 2 (Scriabin)|Sonata No. 2 in G sharp minor]], Op. 19 (also known as ''Sonata-Fantasy'')"
    // title - opus field - n/a - remarks
    QRegExp workMatcher4("^\\* *(\\[\\[.*\\]\\]),* (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*) *(.*)$");

    // e.g. Scriabin
    // *Opus 35: [[Opus 35 (Scriabin)|Three Preludes]]
    // opus field - n/a - title - remarks
    QRegExp workMatcher5("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)[:,]* *([\\[']+.*[\\]']+) *(.*)$");

    // e.g. Boccherini
    // *G 1: Cello Sonata in F major
    // or weird Schubert layout
    // * D 505{{nbsp|4}}Adagio in D-flat for Piano
    // or Glazunov
    // :Op. 67: ''[[The Seasons (ballet)|The Seasons]]'', ballet in one act (1900)
    // or even
    // ::Op. 77: ''[[Symphony No. 7 (Glazunov)|Symphony No. 7]]'' "Pastorale" in F major (1902-1903)
    // This one is a real mess, for really messy pages.  Needs to go near
    // the end of the matchers in case it catches something it shouldn't
    // n/a - opus field - n/a - n/a - n/a - title
    QRegExp workMatcher6("^([\\*:]|::) *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)(([:,]| *\\{+[^\\}]+\\}+) *(.*))?$");

    // e.g. Bruch
    // * Adagio appassionato for violin and orchestra in C sharp minor, Op. 57
    // title - opus field - date field
    QRegExp workMatcher7("^\\* *(.*),? (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*|[Oo]p. posth[a-z.]*) *(\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\))? *$");

    // e.g. Bruckner
    // * Symphony No. 0 in D minor 1869 WAB 100
    // title - date field - opus field
    QRegExp workMatcher8("^\\* *(.*) ([0-9]{4}[0-9/-]*) *(WAB [0-9][^ ]*)$");

    // e.g. Bach
    // * BWV 506 ? Was bist du doch, o Seele, so betruebet
    // opus field - title
    QRegExp workMatcher9("^\\* *(BWV [^ ]+)(.*)$");

    // Catch-all for things that look at all promising (anything that
    // starts with ' or [ after bullet: take the whole as title)
    // NOTE(review): the quote entity in this pattern (and in
    // partMatcher6 below) was reconstructed from a listing in which
    // it had been decoded -- confirm against the repository
    QRegExp workMatcher10("^[\\*:] *((['\\[]|&quot;).*)$");

    // e.g. Beethoven
    // **No. 1: [[Piano Trio No. 1 (Beethoven)|Piano Trio No. 1]] in E-flat major
    // number field - n/a - title, remarks etc
    QRegExp partMatcher1("^[\\*:]{2} *((No\\.? *)?[0-9][^ ,:'{]*)[:, ] *(.*)$");

    // e.g. Copland
    // ** ''Help us, O Lord''
    // title - remarks
    QRegExp partMatcher2("^\\*\\* *(''.*'') *(.*)$");
    partMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings

    // e.g. Scriabin
    // **[[Mazurka Op. 40 No. 1 (Scriabin)|Mazurka in D flat major]]
    // title - remarks
    QRegExp partMatcher3("^\\*\\* *(\\[\\[.*\\]\\])(.*)$");

    // e.g. Berlioz
    // ** 1: ''Méditation religieuse''
    // number - title - remarks
    QRegExp partMatcher4("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *([\\[]*''.*''[\\]]*) *(.*)$");

    // e.g. Tchaikovsky
    // **4. Nocturne [???????] (C? minor)
    // number - title - remarks
    QRegExp partMatcher5("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *(.*\\[[^\\]]+\\])(.*)$");

    // e.g. Schubert
    // **2. "Wohin?"
    // n/a - number - title
    QRegExp partMatcher6("^\\*\\* *(([0-9][0-9a-z]*)[\\.:])? *((&quot;|'').*)$");

    // e.g. Mendelssohn
    // ** Notturno
    // title only
    QRegExp partMatcher7("^\\*\\* *(.*)$");

    // Date and remarks within titlefield or remarksfield
    QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\),?(.*)");

    Work *main = 0;
    int partNumber = 0;

    QString line;

    // True when `line` holds a line that the part loop read but could
    // not use.  It must still be tried as a work, even if it was the
    // last line of the stream.
    bool pendingLine = false;

    QString opfield, numfield, titlefield, remarksfield, datefield;

    while (pendingLine || !stream.atEnd()) {

        if (!pendingLine) {
            line = stream.readLine();
            DEBUG << "line: " << line << endl;
        }
        pendingLine = false;

        opfield = "";
        numfield = "";
        titlefield = "";
        datefield = "";
        remarksfield = "";
        partNumber = 0;

        if (workMatcher1.indexIn(line) >= 0) {

            DEBUG << "matcher 1" << endl;
            opfield = workMatcher1.cap(1);
            titlefield = workMatcher1.cap(3);
            datefield = workMatcher1.cap(4);
            remarksfield = workMatcher1.cap(6);

        } else if (workMatcher1a.indexIn(line) >= 0) {

            DEBUG << "matcher 1a" << endl;
            opfield = workMatcher1a.cap(1);
            titlefield = workMatcher1a.cap(3);
/*
            datefield = workMatcher1a.cap(4);
            remarksfield = workMatcher1a.cap(6);
*/

        } else if (workMatcher2.indexIn(line) >= 0) {

            DEBUG << "matcher 2" << endl;
            titlefield = workMatcher2.cap(1);
            remarksfield = workMatcher2.cap(2);

        } else if (workMatcher3.indexIn(line) >= 0) {

            DEBUG << "matcher 3" << endl;
            titlefield = workMatcher3.cap(1);
            datefield = workMatcher3.cap(2);
            remarksfield = workMatcher3.cap(3);

        } else if (workMatcher4.indexIn(line) >= 0) {

            DEBUG << "matcher 4" << endl;
            titlefield = workMatcher4.cap(1);
            opfield = workMatcher4.cap(2);
            remarksfield = workMatcher4.cap(4);

        } else if (workMatcher5.indexIn(line) >= 0) {

            DEBUG << "matcher 5" << endl;
            opfield = workMatcher5.cap(1);
            titlefield = workMatcher5.cap(3);
            remarksfield = workMatcher5.cap(4);

        } else if (workMatcher6.indexIn(line) >= 0) {

            DEBUG << "matcher 6" << endl;
            opfield = workMatcher6.cap(2);
            titlefield = workMatcher6.cap(6);

        } else if (workMatcher7.indexIn(line) >= 0) {

            DEBUG << "matcher 7" << endl;
            titlefield = workMatcher7.cap(1);
            opfield = workMatcher7.cap(2);
            datefield = workMatcher7.cap(3);

        } else if (workMatcher8.indexIn(line) >= 0) {

            DEBUG << "matcher 8" << endl;
            titlefield = workMatcher8.cap(1);
            datefield = workMatcher8.cap(2);
            opfield = workMatcher8.cap(3);

        } else if (workMatcher9.indexIn(line) >= 0) {

            DEBUG << "matcher 9" << endl;
            opfield = workMatcher9.cap(1);
            titlefield = workMatcher9.cap(2);

        } else if (workMatcher10.indexIn(line) >= 0) {

            DEBUG << "matcher 10" << endl;
            titlefield = workMatcher10.cap(1);

        } else {
            if (line.startsWith("*") || line.startsWith(":")) {
                DEBUG << "Failed to match promising works list line: " << line << endl;
            }
            continue;
        }

        // If no date was captured directly, look for a parenthesised
        // one in the title, then in the remarks
        if (titlefield != "" && datefield == "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = matcherDate.cap(2);
                titlefield = titlefield.left(dpos);
            }
        }

        if (remarksfield != "" && datefield == "") {
            int dpos;
            if ((dpos = matcherDate.indexIn(remarksfield)) != -1) {
                datefield = matcherDate.cap(1);
                remarksfield = remarksfield.left(dpos);
            }
        }

        main = makeWork(composerName, opfield, "", 0,
                        titlefield, datefield, "", remarksfield, 0);

        if (main) m_objects.push_back(main);

        // Parts inherit the main work's opus and date, but nothing
        // from their sibling parts
        const QString mainDatefield = datefield;

        while (!stream.atEnd()) {

            ++partNumber;
            line = stream.readLine();
            DEBUG << "line: " << line << endl;

            // Not every part matcher sets every field, so start each
            // part from a clean slate rather than with the previous
            // part's (or the main work's) number and remarks
            numfield = "";
            remarksfield = "";
            datefield = mainDatefield;

            if (partMatcher1.indexIn(line) >= 0) {

                DEBUG << "part matcher 1" << endl;
                numfield = partMatcher1.cap(1);
                titlefield = partMatcher1.cap(3);

            } else if (partMatcher2.indexIn(line) >= 0) {

                DEBUG << "part matcher 2" << endl;
                titlefield = partMatcher2.cap(1);
                remarksfield = partMatcher2.cap(2);

            } else if (partMatcher3.indexIn(line) >= 0) {

                DEBUG << "part matcher 3" << endl;
                titlefield = partMatcher3.cap(1);
                remarksfield = partMatcher3.cap(2);

            } else if (partMatcher4.indexIn(line) >= 0) {

                DEBUG << "part matcher 4" << endl;
                numfield = partMatcher4.cap(1);
                titlefield = partMatcher4.cap(2);
                remarksfield = partMatcher4.cap(3);

            } else if (partMatcher5.indexIn(line) >= 0) {

                DEBUG << "part matcher 5" << endl;
                numfield = partMatcher5.cap(1);
                titlefield = partMatcher5.cap(2);
                remarksfield = partMatcher5.cap(3);

            } else if (partMatcher6.indexIn(line) >= 0) {

                DEBUG << "part matcher 6" << endl;
                numfield = partMatcher6.cap(2);
                titlefield = partMatcher6.cap(3);

            } else if (partMatcher7.indexIn(line) >= 0) {

                DEBUG << "part matcher 7" << endl;
                titlefield = partMatcher7.cap(1);

            } else {
                if (line.startsWith("**") || line.startsWith("::")) {
                    DEBUG << "Failed to match promising part line: " << line << endl;
                }
                // Not a part: hand the line back to the outer loop
                pendingLine = true;
                break;
            }

            if (titlefield != "" && datefield == "") {
                int dpos;
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
                    datefield = matcherDate.cap(1);
                    remarksfield = matcherDate.cap(2);
                    titlefield = titlefield.left(dpos);
                }
            }

            Work *part = makeWork(composerName, opfield, numfield, partNumber,
                                  titlefield, datefield, "", remarksfield,
                                  main);

            if (part) m_objects.push_back(part);
        }
    }

    DEBUG << "Found " << m_objects.size() << " things" << endl;
}

}