Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@0: #include "ImportWikipediaWorksK.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@0: WikipediaWorksKImporter::setSource(QUrl source) Chris@0: { Chris@0: DEBUG << "WikipediaWorksKImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@0: static QString Chris@0: sanitise(QString field, QString &linkText) Chris@0: { Chris@0: int mp; Chris@0: Chris@0: field.replace(QString::fromUtf8("\342\200\222"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\223"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\224"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\225"), "-"); Chris@0: Chris@0: QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]"); Chris@0: if ((mp = link2.indexIn(field)) >= 0) { Chris@0: if (linkText == "") linkText = link2.cap(2); Chris@0: field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3)); Chris@0: return sanitise(field, linkText); Chris@0: } Chris@0: Chris@0: QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]"); Chris@0: if ((mp = link1.indexIn(field)) >= 0) { Chris@0: if (linkText == "") linkText = link1.cap(2); Chris@0: field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2)); Chris@0: return sanitise(field, linkText); Chris@0: } Chris@0: Chris@0: field = field.trimmed(); Chris@0: Chris@0: field.replace("[", ""); Chris@0: field.replace("]", ""); Chris@0: field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), ""); Chris@0: field.replace("''", "\""); Chris@0: field.replace(""", "\""); Chris@0: field.replace(QRegExp("<[^&]*>"), ""); Chris@0: field.replace(QRegExp("^\\**"), ""); Chris@0: Chris@0: while (field.endsWith(".") || field.endsWith(",")) { Chris@0: field = field.left(field.length()-1); Chris@0: } Chris@0: Chris@0: if (field.startsWith("(") && field.endsWith(")")) { Chris@0: DEBUG << "before: " << field; Chris@0: field = field.mid(1, field.length()-2); Chris@0: DEBUG << "after: " << field; Chris@0: } Chris@0: field.replace(QRegExp("^\\**"), ""); Chris@0: if (field == ")" || field == "(") { Chris@0: field = ""; Chris@0: } Chris@0: Chris@0: field.replace(" - ,", ","); Chris@0: Chris@0: return field; Chris@0: } Chris@0: Chris@0: static QString Chris@0: extractYear(QString datefield) Chris@0: { Chris@0: QRegExp re("[0-9]{4}"); Chris@0: if (re.indexIn(datefield) >= 0) { Chris@0: return re.cap(0); Chris@0: } Chris@0: return ""; Chris@0: } Chris@0: Chris@0: static QString Chris@0: extractKey(QString titlefield) Chris@0: { Chris@0: QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))"); Chris@0: if (re.indexIn(titlefield) >= 0) { Chris@0: return re.cap(1); Chris@0: } Chris@0: return ""; Chris@0: } Chris@0: Chris@0: static Work * Chris@0: makeWork(QString composerName, QString opfield, QString kfield, Chris@0: QString numfield, QString titlefield, QString datefield, Chris@0: QString placefield, QString remarksfield, Work *main) Chris@0: { Chris@0: QString linkText; Chris@0: Chris@0: Work *w = new Work; Chris@0: Chris@0: QString op = sanitise(opfield, linkText); Chris@0: if (op != "") { Chris@0: op.replace("Opus ", ""); Chris@0: op.replace("Op. ", ""); Chris@0: op.replace("Op ", ""); Chris@0: w->setOpus(op); Chris@0: } Chris@0: Chris@0: QString k = sanitise(kfield, linkText); Chris@0: if (k != "") { Chris@0: k.replace("K. ", "K "); Chris@0: w->setCatalogue(k); Chris@0: } Chris@0: Chris@0: QString num = sanitise(numfield, linkText); Chris@0: if (num != "") { Chris@0: num.replace("No. ", ""); Chris@0: num.replace("No ", ""); Chris@0: w->setNumber(num); Chris@0: } Chris@0: Chris@0: QString key = extractKey(titlefield); Chris@0: if (key != "") { Chris@0: w->setKey(key); Chris@0: } Chris@0: Chris@0: QString title = sanitise(titlefield, linkText); Chris@0: if (linkText != "") { Chris@0: linkText.replace(" ", "_"); Chris@0: QUrl url; Chris@0: url.setScheme("http"); Chris@0: url.setHost("en.wikipedia.org"); Chris@0: url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText)); Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri(url)); Chris@0: d->setSiteName("Wikipedia"); Chris@0: d->setTopic(w); Chris@0: w->addPage(d); Chris@0: } Chris@0: Chris@0: QRegExp explicationRE("^(\"[^-]+\") - (.*)$"); Chris@0: int pos; Chris@0: if ((pos = explicationRE.indexIn(title)) >= 0) { Chris@0: w->addAlias(explicationRE.cap(2)); Chris@0: title = explicationRE.cap(1); Chris@0: } Chris@0: Chris@0: if (remarksfield == "") { Chris@0: QRegExp remarksRE("^(\"[^-]+\") (for .*)$"); Chris@0: if ((pos = remarksRE.indexIn(title)) >= 0) { Chris@0: remarksfield = remarksRE.cap(2); Chris@0: title = remarksRE.cap(1); Chris@0: } Chris@0: } Chris@0: Chris@0: if (remarksfield == "") { Chris@0: QRegExp remarksRE("^(\"[^-]+\"), (.*)$"); Chris@0: if ((pos = remarksRE.indexIn(title)) >= 0) { Chris@0: remarksfield = remarksRE.cap(2); Chris@0: title = remarksRE.cap(1); Chris@0: } Chris@0: } Chris@0: Chris@0: w->setName(title); Chris@0: Chris@0: QString remarks = sanitise(remarksfield, linkText); Chris@0: if (remarks != "") { Chris@0: w->setRemarks(remarks); Chris@0: } Chris@0: Chris@0: QString year = extractYear(datefield); Chris@0: QString place = sanitise(placefield, linkText); Chris@0: Chris@0: DEBUG << "title = " << title << endl; Chris@0: Chris@0: if (main) { Chris@0: main->addPart(w); Chris@0: w->setPartOf(main); Chris@0: w->setComposition(main->composition()); Chris@0: main->composition()->addWork(w); Chris@0: } Chris@0: Chris@0: if (!main || !main->composition() || Chris@0: (year != "" && (main->composition()->year() != year.toInt()))) { Chris@0: Composition *c = new Composition; Chris@0: c->setComposerName(composerName); Chris@0: c->addWork(w); Chris@0: c->setYear(year.toInt()); Chris@0: c->setPlace(place); Chris@0: w->setComposition(c); Chris@0: } Chris@0: Chris@0: return w; Chris@0: } Chris@0: Chris@0: Chris@0: void Chris@0: WikipediaWorksKImporter::import(QUrl source) Chris@0: { Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: Chris@0: QString composerName; Chris@0: if (filename.contains("K%C3%B6chel")) { Chris@0: composerName = "Wolfgang Amadeus Mozart"; Chris@0: } else { Chris@0: QRegExp byby("by_(.*)_by"); Chris@0: if (byby.indexIn(filename) >= 0) { Chris@0: composerName = byby.cap(1).replace('_', ' '); Chris@0: } else { Chris@0: QRegExp by("by_(.*)"); Chris@0: if (by.indexIn(filename) >= 0) { Chris@0: composerName = by.cap(1).replace('_', ' '); Chris@0: } Chris@0: } Chris@0: } Chris@0: composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit()); Chris@0: Chris@0: DEBUG << "composerName = " << composerName << endl; Chris@0: Chris@0: // K numbers in tabular form (as found in "Köchel Catalogue" WP page) Chris@0: QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[(K\\.? *[0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n"); Chris@0: Chris@0: QString all = stream.readAll(); Chris@0: Chris@0: DEBUG << "Read " << all.length() << " chars" << endl; Chris@0: Chris@0: all.replace(QRegExp("^.*"), ""); Chris@0: Chris@0: int pos = 0, count = 0; Chris@0: Chris@0: while ((pos = matcherK.indexIn(all, pos)) != -1) { Chris@0: Chris@0: all.replace(pos, matcherK.matchedLength(), ""); Chris@0: ++count; Chris@0: Chris@0: QString kfield = matcherK.cap(1); Chris@0: QString titlefield = matcherK.cap(2); Chris@0: QString datefield = matcherK.cap(3); Chris@0: QString placefield = matcherK.cap(4); Chris@0: Chris@0: m_objects.push_back Chris@0: (makeWork(composerName, "", kfield, "", Chris@0: titlefield, datefield, placefield, "", 0)); Chris@0: } Chris@0: Chris@0: DEBUG << "Left over: " << all << endl; Chris@0: Chris@0: DEBUG << "Found " << count << " things" << endl; Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: Chris@0: