Mercurial > hg > classical
view import/ImportWikipediaComposers.cpp @ 4:719a4f477098 classical-rdf
* Add Classical Archives composer list importer; run it
author | Chris Cannam |
---|---|
date | Thu, 10 Dec 2009 15:15:40 +0000 |
parents | e8f4c2b55fd8 |
children | c8ef23d3888c |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportWikipediaComposers.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source URL and immediately run the import on it.
 *
 * Any exception thrown by import() propagates to the caller.
 */
void
WikipediaComposersImporter::setSource(QUrl source)
{
    DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Build a Composer from the raw text fields captured from one line of a
 * Wikipedia composer list.
 *
 * All fields are whitespace-trimmed before use.
 *
 * @param namefield        Wiki link text for the composer, with or
 *                         without the surrounding [[ ]]; may have the
 *                         form "Page name|Display name".
 * @param birthfield       Birth year text; overwritten if a year can be
 *                         extracted from datesfield.
 * @param deathfield       Death year text; overwritten if a year can be
 *                         extracted from datesfield.
 * @param datesfield       Free-form date text such as "(c. 1500 - 1570)"
 *                         or "b. 1950"; may be empty.
 * @param nationalityfield Nationality text; added to the composer if
 *                         non-empty.
 * @param worksfield       Trimmed but otherwise unused by this function.
 * @param summaryfield     Free-form remarks; wiki markup is stripped
 *                         before it is stored on the composer.
 *
 * @return A newly allocated Composer, with a newly allocated Document
 *         for its Wikipedia page attached via addPage(). Both are
 *         created with plain new; nothing in this function deletes them.
 */
Composer *
addComposer(QString namefield,
            QString birthfield,
            QString deathfield,
            QString datesfield,
            QString nationalityfield,
            QString worksfield,
            QString summaryfield)
{
    namefield = namefield.trimmed();
    birthfield = birthfield.trimmed();
    deathfield = deathfield.trimmed();
    datesfield = datesfield.trimmed();
    nationalityfield = nationalityfield.trimmed();
    worksfield = worksfield.trimmed();
    summaryfield = summaryfield.trimmed();

    Composer *composer = new Composer();

    // Strip link brackets; "Page|Display" links give the page name for
    // the URL and the display name for the composer.
    QString name = namefield;
    name.replace("[[", "");
    name.replace("]]", "");
    QString pagename = name;
    if (name.contains('|')) {
        // contains('|') guarantees split() yields at least two parts
        QStringList bits = name.split('|');
        pagename = bits[0];
        name = bits[1];
    }
    composer->setName(name);

    pagename.replace(" ", "_");
    QUrl url;
    url.setScheme("http");
    url.setHost("en.wikipedia.org");
    url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));
    Document *d = new Document;
    d->setUri(url);
    d->setSiteName("Wikipedia");
    d->setTopic(composer);
    composer->addPage(d);

    if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment

    // Any hedging marker in the dates text flags both birth and death
    // as approximate.
    bool approx = (datesfield.contains("c.") || datesfield.contains("?")
                   || datesfield.contains("before")
                   || datesfield.contains("after"));

    if (datesfield != "") {
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Normalise: drop parentheses and spaces, and map the Unicode
        // dashes U+2012..U+2015 (given here as UTF-8 octal escapes) to
        // a plain hyphen.
        datesfield.replace("(", "");
        datesfield.replace(")", "");
        datesfield.replace(" ", "");
        datesfield.replace(QString::fromUtf8("\342\200\222"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\223"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\224"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\225"), "-");
        datesfield.replace("--", "-");
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Birth: "YYYY-" (optionally "YYYY/NN-" or "YYYYtoNN-"), else "b.YYYY"
        QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
        QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");
        if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
        else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);

        // Death: "-YYYY" (optionally with a qualifier before the year), else "d.YYYY"
        QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
        QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
        if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
        else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);

//        datesfield.replace(QRegExp("[^0-9]+"), "-");
/*
        QStringList list = datesfield.split('-');
        if (!list.empty()) {
            birthfield = list[0];
            if (list.size() > 1) {
                deathfield = list[1];
            }
        }
*/
        DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
    }

    // NOTE(review): toInt() yields 0 for text that is not purely
    // numeric (e.g. a table cell such as "c. 1500"); not handled here.
    if (birthfield != "") {
        Birth *e = new Birth(birthfield.toInt());
        e->setApproximate(approx);
        composer->setBirth(e);
    }

    if (deathfield != "") {
        Death *e = new Death(deathfield.toInt());
        e->setApproximate(approx);
        composer->setDeath(e);
    }

    if (nationalityfield != "") {
        composer->addNationality(nationalityfield);
    }

    if (summaryfield != "") {
        summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
        // Removing the "Composer, " prefix can leave nothing behind;
        // only capitalise when there is a first character to index.
        if (!summaryfield.isEmpty()) {
            summaryfield[0] = summaryfield[0].toUpper();
        }
        // Reduce "[[Page|Text]]" links to "Text", then drop remaining
        // link brackets.
        summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
        summaryfield.replace("[[", "");
        summaryfield.replace("]]", "");
        summaryfield.replace("''", "\"");
        // NOTE(review): this literal was unterminated in the reviewed
        // text; restored as the HTML entity on the assumption that it
        // was lost to entity decoding -- confirm against the repository.
        summaryfield.replace("&quot;", "'");
        // A summary consisting solely of one bracketed item is discarded
        summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
        summaryfield.replace("[", "");
        summaryfield.replace("]", "");
        composer->setRemarks(summaryfield);
    }

    return composer;
}

/**
 * Read a composer list from the local file that the given URL refers
 * to, and append a Composer to m_objects for every line that matches
 * one of the known table or list layouts.
 *
 * The file is read as UTF-8 text. A period name is extracted from the
 * URL text (e.g. from "List_of_..._era_..." or "..._composers") and,
 * if found, set on every composer created.
 *
 * @throws std::exception if the file cannot be opened for reading.
 */
void
WikipediaComposersImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");

    QString period;

    const QString sourceText = source.toString();
    DEBUG << "source = " << sourceText << endl;

    QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_");
    QRegExp pmatcher2("List_of_([^_-]+)[_-]era_");
    QRegExp pmatcher3("([^_-]+)_composers");
    if (pmatcher1.indexIn(sourceText) >= 0) period = pmatcher1.cap(1);
    else if (pmatcher2.indexIn(sourceText) >= 0) period = pmatcher2.cap(1);
    else if (pmatcher3.indexIn(sourceText) >= 0) period = pmatcher3.cap(1);

    DEBUG << "period = " << period << endl;

    int count = 0;

    // table form A (used of e.g. Romantic transitional composers)
    // | Name || birth || death || nationality || summary || flags
    // note: 5x ||
    QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");

    // table form B (used of e.g. 20th-century composers)
    // | Name || birth-[death] || nationality || notable works || remarks
    // Note name may contain a single | if in double-square brackets, hence 2a
    // note: 4x ||
    QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");

    // just in case the final column has been omitted completely (as happens).
    // this must be matched after matcher2
    QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");

    // list form
    // * [[Name]] [alias?] (stuff about dates)[,] notes
    QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");

    while (!stream.atEnd()) {

        QString line = stream.readLine();

        Composer *o = 0;

        // Order matters: the more specific layouts are tried first.
        if (matcher1.indexIn(line) >= 0) {

            o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3),
                            "", matcher1.cap(4), "", matcher1.cap(5));

        } else if (matcher2.indexIn(line) >= 0) {

            // NOTE(review): the sixth capture (remarks column) is not
            // passed on; the summary argument is left empty here.
            o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3),
                            "", matcher2.cap(4), matcher2.cap(5), "");

        } else if (matcher2a.indexIn(line) >= 0) {

            o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3),
                            "", matcher2a.cap(4), "", "");

        } else if (matcher3.indexIn(line) >= 0) {

            o = addComposer(matcher3.cap(1), "", "",
                            matcher3.cap(3), "", "", matcher3.cap(5));

        } else if (line.startsWith("* ") || line.startsWith("| ") ||
                   line.startsWith("*[") || line.startsWith("|[")) {

            // Looks like a list or table row but fits none of the layouts
            DEBUG << "Failed to match promising line: " << line << endl;
        }

        if (o) {
            if (period != "") o->setPeriod(period);
            m_objects.push_back(o);
            ++count;
        }
    }

    DEBUG << "Found " << count << " things" << endl;
}

}