Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@0: #include "ImportWikipediaComposers.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@0: WikipediaComposersImporter::setSource(QUrl source) Chris@0: { Chris@0: DEBUG << "WikipediaComposersImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@0: Composer * Chris@0: addComposer(QString namefield, QString birthfield, QString deathfield, Chris@0: QString datesfield, QString nationalityfield, QString worksfield, Chris@0: QString summaryfield) Chris@0: { Chris@0: namefield = namefield.trimmed(); Chris@0: birthfield = birthfield.trimmed(); Chris@0: deathfield = deathfield.trimmed(); Chris@0: datesfield = datesfield.trimmed(); Chris@0: nationalityfield = nationalityfield.trimmed(); Chris@0: worksfield = worksfield.trimmed(); Chris@0: summaryfield = summaryfield.trimmed(); Chris@0: Chris@0: Composer *composer = new Composer(); Chris@0: Chris@0: QString name = namefield; Chris@0: name.replace("[[", ""); Chris@0: name.replace("]]", ""); Chris@0: QString pagename = name; Chris@0: Chris@0: if (name.contains('|')) { Chris@0: QStringList bits = name.split('|'); Chris@0: pagename = bits[0]; Chris@0: name = bits[1]; Chris@0: } Chris@0: Chris@0: composer->setName(name); Chris@0: Chris@0: pagename.replace(" ", "_"); Chris@0: QUrl url; Chris@0: url.setScheme("http"); Chris@0: url.setHost("en.wikipedia.org"); Chris@0: Chris@0: url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename)); Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri(url)); Chris@0: d->setSiteName("Wikipedia"); Chris@0: d->setTopic(composer); Chris@0: composer->addPage(d); Chris@0: Chris@0: if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment Chris@0: Chris@0: bool approx = (datesfield.contains("c.") || datesfield.contains("?") Chris@0: || datesfield.contains("before") || datesfield.contains("after")); Chris@0: Chris@0: if (datesfield != "") { Chris@0: DEBUG << "dates for " << name << ": " << datesfield << endl; Chris@0: datesfield.replace("(", ""); Chris@0: datesfield.replace(")", ""); Chris@0: datesfield.replace(" ", ""); Chris@0: datesfield.replace(QString::fromUtf8("\342\200\222"), "-"); Chris@0: datesfield.replace(QString::fromUtf8("\342\200\223"), "-"); Chris@0: datesfield.replace(QString::fromUtf8("\342\200\224"), "-"); Chris@0: datesfield.replace(QString::fromUtf8("\342\200\225"), "-"); Chris@0: datesfield.replace("--", "-"); Chris@0: DEBUG << "dates for " << name << ": " << datesfield << endl; Chris@0: Chris@0: QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-"); Chris@0: QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?"); Chris@0: Chris@0: if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1); Chris@0: else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2); Chris@0: Chris@0: QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])"); Chris@0: QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])"); Chris@0: Chris@0: if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2); Chris@0: else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2); Chris@0: Chris@0: // datesfield.replace(QRegExp("[^0-9]+"), "-"); Chris@0: /* Chris@0: QStringList list = datesfield.split('-'); Chris@0: if (!list.empty()) { Chris@0: birthfield = list[0]; Chris@0: if (list.size() > 1) { Chris@0: deathfield = list[1]; Chris@0: } Chris@0: } Chris@0: */ Chris@0: DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl; Chris@0: } Chris@0: if (birthfield != "") { Chris@0: Birth *e = new Birth(birthfield.toInt()); Chris@0: e->setApproximate(approx); Chris@0: composer->setBirth(e); Chris@0: } Chris@0: if (deathfield != "") { Chris@0: Death *e = new Death(deathfield.toInt()); Chris@0: e->setApproximate(approx); Chris@0: composer->setDeath(e); Chris@0: } Chris@0: if (nationalityfield != "") { Chris@4: composer->addNationality(nationalityfield); Chris@0: } Chris@0: if (summaryfield != "") { Chris@0: summaryfield.replace(QRegExp("^[Cc]omposer, *"), ""); Chris@0: summaryfield[0] = summaryfield[0].toUpper(); Chris@0: summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[["); Chris@0: summaryfield.replace("[[", ""); Chris@0: summaryfield.replace("]]", ""); Chris@0: summaryfield.replace("''", "\""); Chris@0: summaryfield.replace(""", "'"); Chris@0: summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), ""); Chris@0: summaryfield.replace("[", ""); Chris@0: summaryfield.replace("]", ""); Chris@0: composer->setRemarks(summaryfield); Chris@0: } Chris@0: Chris@0: return composer; Chris@0: } Chris@0: Chris@0: void Chris@0: WikipediaComposersImporter::import(QUrl source) Chris@0: { Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: Chris@0: QString period; Chris@0: DEBUG << "source = " << source.toString() << endl; Chris@0: QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_"); Chris@0: QRegExp pmatcher2("List_of_([^_-]+)[_-]era_"); Chris@0: QRegExp pmatcher3("([^_-]+)_composers"); Chris@0: if (pmatcher1.indexIn(source.toString()) >= 0) period = pmatcher1.cap(1); Chris@0: else if (pmatcher2.indexIn(source.toString()) >= 0) period = pmatcher2.cap(1); Chris@0: else if (pmatcher3.indexIn(source.toString()) >= 0) period = pmatcher3.cap(1); Chris@0: DEBUG << "period = "<< period << endl; Chris@0: Chris@0: int count = 0; Chris@0: Chris@0: // table form A (used of e.g. Romantic transitional composers) Chris@0: // | Name || birth || death || nationality || summary || flags Chris@0: // note: 5x || Chris@0: QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|"); Chris@0: Chris@0: // table form B (used of e.g. 20th-century composers) Chris@0: // | Name || birth-[death] || nationality || notable works || remarks Chris@0: // Note name may contain a single | if in double-square brackets, hence 2a Chris@0: // note: 4x || Chris@0: QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)"); Chris@0: // just in case the final column has been omitted completely (as happens). Chris@0: // this must be matched after matcher2 Chris@0: QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)"); Chris@0: Chris@0: // list form Chris@0: // * [[Name]] [alias?] (stuff about dates)[,] notes Chris@0: QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)"); Chris@0: Chris@0: while (!stream.atEnd()) { Chris@0: QString line = stream.readLine(); Chris@0: Chris@0: Composer *o = 0; Chris@0: Chris@0: if (matcher1.indexIn(line) >= 0) { Chris@0: Chris@0: o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3), Chris@0: "", matcher1.cap(4), "", matcher1.cap(5)); Chris@0: Chris@0: } else if (matcher2.indexIn(line) >= 0) { Chris@0: Chris@0: o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3), "", Chris@0: matcher2.cap(4), matcher2.cap(5), ""); Chris@0: Chris@0: } else if (matcher2a.indexIn(line) >= 0) { Chris@0: Chris@0: o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3), "", Chris@0: matcher2a.cap(4), "", ""); Chris@0: Chris@0: } else if (matcher3.indexIn(line) >= 0) { Chris@0: Chris@0: o = addComposer(matcher3.cap(1), "", "", matcher3.cap(3), Chris@0: "", "", matcher3.cap(5)); Chris@0: Chris@0: } else if (line.startsWith("* ") || line.startsWith("| ") || Chris@0: line.startsWith("*[") || line.startsWith("|[")) { Chris@0: DEBUG << "Failed to match promising line: " << line << endl; Chris@0: } Chris@0: Chris@0: if (o) { Chris@0: if (period != "") o->setPeriod(period); Chris@0: m_objects.push_back(o); Chris@0: ++count; Chris@0: } Chris@0: Chris@0: } Chris@0: Chris@0: DEBUG << "Found " << count << " things" << endl; Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: Chris@0: