annotate import/ImportWikipediaWorks.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportWikipediaWorks.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 WikipediaWorksImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "WikipediaWorksImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 QString
Chris@0 27 sanitise(QString field, QString &linkText)
Chris@0 28 {
Chris@0 29 int mp;
Chris@0 30
Chris@0 31 field.replace(QString::fromUtf8("\342\200\222"), "-");
Chris@0 32 field.replace(QString::fromUtf8("\342\200\223"), "-");
Chris@0 33 field.replace(QString::fromUtf8("\342\200\224"), "-");
Chris@0 34 field.replace(QString::fromUtf8("\342\200\225"), "-");
Chris@0 35
Chris@0 36 QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
Chris@0 37 if ((mp = link2.indexIn(field)) >= 0) {
Chris@0 38 if (linkText == "") linkText = link2.cap(2);
Chris@0 39 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
Chris@0 40 return sanitise(field, linkText);
Chris@0 41 }
Chris@0 42
Chris@0 43 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
Chris@0 44 if ((mp = link1.indexIn(field)) >= 0) {
Chris@0 45 if (linkText == "") linkText = link1.cap(2);
Chris@0 46 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
Chris@0 47 return sanitise(field, linkText);
Chris@0 48 }
Chris@0 49
Chris@0 50 field = field.trimmed();
Chris@0 51
Chris@0 52 field.replace("[", "");
Chris@0 53 field.replace("]", "");
Chris@0 54 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
Chris@0 55 field.replace("''", "\"");
Chris@0 56 field.replace("&quot;", "\"");
Chris@0 57 field.replace(QRegExp("&lt;[^&]*&gt;"), "");
Chris@0 58 field.replace(QRegExp("^\\**"), "");
Chris@0 59
Chris@0 60 while (field.endsWith(".") || field.endsWith(",")) {
Chris@0 61 field = field.left(field.length()-1);
Chris@0 62 }
Chris@0 63
Chris@0 64 if (field.startsWith("(") && field.endsWith(")")) {
Chris@0 65 DEBUG << "before: " << field;
Chris@0 66 field = field.mid(1, field.length()-2);
Chris@0 67 DEBUG << "after: " << field;
Chris@0 68 }
Chris@0 69 field.replace(QRegExp("^\\**"), "");
Chris@0 70 if (field == ")" || field == "(") {
Chris@0 71 field = "";
Chris@0 72 }
Chris@0 73
Chris@0 74 field.replace(" - ,", ",");
Chris@0 75
Chris@0 76 return field;
Chris@0 77 }
Chris@0 78
Chris@0 79 QString
Chris@0 80 extractYear(QString datefield)
Chris@0 81 {
Chris@0 82 QRegExp re("[0-9]{4}");
Chris@0 83 if (re.indexIn(datefield) >= 0) {
Chris@0 84 return re.cap(0);
Chris@0 85 }
Chris@0 86 return "";
Chris@0 87 }
Chris@0 88
Chris@0 89 QString
Chris@0 90 extractKey(QString titlefield)
Chris@0 91 {
Chris@0 92 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
Chris@0 93 if (re.indexIn(titlefield) >= 0) {
Chris@0 94 return re.cap(1);
Chris@0 95 }
Chris@0 96 return "";
Chris@0 97 }
Chris@0 98
Chris@0 99 Work *
Chris@0 100 makeWork(QString composerName, QString opfield, QString kfield,
Chris@0 101 QString numfield, QString titlefield, QString datefield,
Chris@0 102 QString placefield, QString remarksfield, Work *main)
Chris@0 103 {
Chris@0 104 QString linkText;
Chris@0 105
Chris@0 106 Work *w = new Work;
Chris@0 107
Chris@0 108 QString op = sanitise(opfield, linkText);
Chris@0 109 if (op != "") {
Chris@0 110 op.replace("Opus ", "");
Chris@0 111 op.replace("Op. ", "");
Chris@0 112 op.replace("Op ", "");
Chris@0 113 w->setOpus(op);
Chris@0 114 }
Chris@0 115
Chris@0 116 QString k = sanitise(kfield, linkText);
Chris@0 117 if (k != "") {
Chris@0 118 w->setCatalogue(k);
Chris@0 119 }
Chris@0 120
Chris@0 121 QString num = sanitise(numfield, linkText);
Chris@0 122 if (num != "") {
Chris@0 123 num.replace("No. ", "");
Chris@0 124 num.replace("No ", "");
Chris@0 125 w->setNumber(num);
Chris@0 126 }
Chris@0 127
Chris@0 128 QString key = extractKey(titlefield);
Chris@0 129 if (key != "") {
Chris@0 130 w->setKey(key);
Chris@0 131 }
Chris@0 132
Chris@0 133 QString title = sanitise(titlefield, linkText);
Chris@0 134 if (linkText != "") {
Chris@0 135 linkText.replace(" ", "_");
Chris@0 136 QUrl url;
Chris@0 137 url.setScheme("http");
Chris@0 138 url.setHost("en.wikipedia.org");
Chris@0 139 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
Chris@0 140 Document *d = new Document;
Chris@18 141 d->setUri(Uri(url));
Chris@0 142 d->setSiteName("Wikipedia");
Chris@0 143 d->setTopic(w);
Chris@0 144 w->addPage(d);
Chris@0 145 }
Chris@0 146
Chris@0 147 QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
Chris@0 148 int pos;
Chris@0 149 if ((pos = explicationRE.indexIn(title)) >= 0) {
Chris@0 150 w->addAlias(explicationRE.cap(2));
Chris@0 151 title = explicationRE.cap(1);
Chris@0 152 }
Chris@0 153
Chris@0 154 if (remarksfield == "") {
Chris@0 155 QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
Chris@0 156 if ((pos = remarksRE.indexIn(title)) >= 0) {
Chris@0 157 remarksfield = remarksRE.cap(2);
Chris@0 158 title = remarksRE.cap(1);
Chris@0 159 }
Chris@0 160 }
Chris@0 161
Chris@0 162 if (remarksfield == "") {
Chris@0 163 QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
Chris@0 164 if ((pos = remarksRE.indexIn(title)) >= 0) {
Chris@0 165 remarksfield = remarksRE.cap(2);
Chris@0 166 title = remarksRE.cap(1);
Chris@0 167 }
Chris@0 168 }
Chris@0 169
Chris@0 170 w->setName(title);
Chris@0 171
Chris@0 172 QString remarks = sanitise(remarksfield, linkText);
Chris@0 173 if (remarks != "") {
Chris@0 174 w->setRemarks(remarks);
Chris@0 175 }
Chris@0 176
Chris@0 177 QString year = extractYear(datefield);
Chris@0 178 QString place = sanitise(placefield, linkText);
Chris@0 179
Chris@0 180 DEBUG << "title = " << title << endl;
Chris@0 181
Chris@0 182 if (main) {
Chris@0 183 main->addPart(w);
Chris@0 184 w->setPartOf(main);
Chris@0 185 w->setComposition(main->composition());
Chris@0 186 main->composition()->addWork(w);
Chris@0 187 }
Chris@0 188
Chris@0 189 if (!main || !main->composition() ||
Chris@0 190 (year != "" && (main->composition()->year() != year.toInt()))) {
Chris@0 191 Composition *c = new Composition;
Chris@0 192 c->setComposerName(composerName);
Chris@0 193 c->addWork(w);
Chris@0 194 c->setYear(year.toInt());
Chris@0 195 c->setPlace(place);
Chris@0 196 w->setComposition(c);
Chris@0 197 }
Chris@0 198
Chris@0 199 return w;
Chris@0 200 }
Chris@0 201
Chris@0 202
Chris@0 203 void
Chris@0 204 WikipediaWorksImporter::import(QUrl source)
Chris@0 205 {
Chris@0 206 //!!! for now
Chris@0 207 QString filename = source.toLocalFile();
Chris@0 208
Chris@0 209 QFile file(filename);
Chris@0 210 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 211 throw std::exception();
Chris@0 212 }
Chris@0 213
Chris@0 214 QTextStream stream(&file);
Chris@0 215 stream.setCodec("UTF-8");
Chris@0 216
Chris@0 217 QString composerName;
Chris@0 218 if (filename.contains("K%C3%B6chel")) {
Chris@0 219 composerName = "Wolfgang Amadeus Mozart";
Chris@0 220 } else if (filename.contains("/Schubert_")) {
Chris@0 221 composerName = "Franz Schubert";
Chris@0 222 } else {
Chris@0 223 QRegExp byby("by_(.*)_by");
Chris@0 224 if (byby.indexIn(filename) >= 0) {
Chris@0 225 composerName = byby.cap(1).replace('_', ' ');
Chris@0 226 } else {
Chris@0 227 QRegExp by("by_(.*)");
Chris@0 228 if (by.indexIn(filename) >= 0) {
Chris@0 229 composerName = by.cap(1).replace('_', ' ');
Chris@0 230 }
Chris@0 231 }
Chris@0 232 }
Chris@0 233 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
Chris@0 234
Chris@0 235 DEBUG << "composerName = " << composerName << endl;
Chris@0 236
Chris@0 237 // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
Chris@0 238 QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[K\\. *([0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
Chris@0 239
Chris@0 240 QString all = stream.readAll();
Chris@0 241
Chris@0 242 DEBUG << "Read " << all.length() << " chars" << endl;
Chris@0 243
Chris@0 244 all.replace(QRegExp("^.*<page>"), "");
Chris@0 245
Chris@0 246 int pos = 0, count = 0;
Chris@0 247
Chris@0 248 while ((pos = matcherK.indexIn(all, pos)) != -1) {
Chris@0 249
Chris@0 250 all.replace(pos, matcherK.matchedLength(), "");
Chris@0 251 ++count;
Chris@0 252
Chris@0 253 QString kfield = matcherK.cap(1);
Chris@0 254 QString titlefield = matcherK.cap(2);
Chris@0 255 QString datefield = matcherK.cap(3);
Chris@0 256 QString placefield = matcherK.cap(4);
Chris@0 257
Chris@0 258 m_objects.push_back
Chris@0 259 (makeWork(composerName, "K. " + kfield, kfield, "",
Chris@0 260 titlefield, datefield, placefield, "", 0));
Chris@0 261 }
Chris@0 262
Chris@0 263 // Opus in list form (as used for e.g. Beethoven's works)
Chris@0 264 QRegExp matcherB("[\\*:] *'*((Opus|Op\\.|WoO|Anh|H|D) [0-9][^,:'{\n]*)'*[,:{] *([^\n]*)\n");
Chris@0 265
Chris@0 266 // Part of an opus (e.g. op 18 no 1), intended to be anchored to
Chris@0 267 // the point at which the last matcherB or matcherB2 match ended
Chris@0 268 // (note caret)
Chris@0 269 QRegExp matcherB2("^[\\*:]{2} *([A-Za-z ]*)((No\\.* +)?[0-9][^ :\n]*)[: ] *([^\n]*)\n");
Chris@0 270
Chris@0 271 // Date and remarks within titlefield
Chris@0 272 QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\)(.*)");
Chris@0 273
Chris@0 274 pos = 0;
Chris@0 275
Chris@0 276 while ((pos = matcherB.indexIn(all, pos)) != -1) {
Chris@0 277
Chris@0 278 all.replace(pos, matcherB.matchedLength(), "");
Chris@0 279 ++count;
Chris@0 280
Chris@0 281 QString opfield = matcherB.cap(1);
Chris@0 282 QString titlefield = matcherB.cap(3);
Chris@0 283
Chris@0 284 QString datefield, remarksfield;
Chris@0 285
Chris@0 286 if (titlefield != "") {
Chris@0 287 int dpos;
Chris@0 288 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
Chris@0 289 datefield = matcherDate.cap(1);
Chris@0 290 remarksfield = matcherDate.cap(2);
Chris@0 291 titlefield = titlefield.left(dpos);
Chris@0 292 }
Chris@0 293 }
Chris@0 294
Chris@0 295 Work *main = makeWork(composerName, opfield, "", "",
Chris@0 296 titlefield, datefield, "", remarksfield, 0);
Chris@0 297
Chris@0 298 m_objects.push_back(main);
Chris@0 299
Chris@0 300 int spos = pos;
Chris@0 301
Chris@0 302 while ((spos = matcherB2.indexIn(all, spos, QRegExp::CaretAtOffset))
Chris@0 303 != -1) {
Chris@0 304
Chris@0 305 all.replace(spos, matcherB2.matchedLength(), "");
Chris@0 306 ++count;
Chris@0 307
Chris@0 308 QString numfield = matcherB2.cap(2);
Chris@0 309
Chris@0 310 titlefield = matcherB2.cap(4);
Chris@0 311
Chris@0 312 if (matcherB2.cap(1).trimmed() != "") {
Chris@0 313 titlefield = matcherB2.cap(1) + matcherB2.cap(2) + " "
Chris@0 314 + matcherB2.cap(4);
Chris@0 315 DEBUG << "prefix to number = " << matcherB2.cap(1) << ", so extending title from " << matcherB2.cap(4) << " to " << titlefield << endl;
Chris@0 316 }
Chris@0 317
Chris@0 318 datefield = "";
Chris@0 319 remarksfield = "";
Chris@0 320
Chris@0 321 if (titlefield != "") {
Chris@0 322 int dpos;
Chris@0 323 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
Chris@0 324 datefield = matcherDate.cap(1);
Chris@0 325 remarksfield = matcherDate.cap(2);
Chris@0 326 titlefield = titlefield.left(dpos);
Chris@0 327 }
Chris@0 328 }
Chris@0 329
Chris@0 330 Work *sub = makeWork(composerName, opfield, "", numfield,
Chris@0 331 titlefield, datefield, "", remarksfield, main);
Chris@0 332
Chris@0 333 m_objects.push_back(sub);
Chris@0 334 }
Chris@0 335 }
Chris@0 336
Chris@0 337 // Title with date but no opus in list form (as used for e.g. Copland)
Chris@0 338 QRegExp matcherC("\\* *([^\n]*)\\([^\\)]*([0-9]{4})[^\\)]*\\) *\n");
Chris@0 339
Chris@0 340 // Part of the above (e.g. song in cycle), intended to be anchored to
Chris@0 341 // the point at which the last matcherC or matcherC2 match ended
Chris@0 342 // (note caret)
Chris@0 343 QRegExp matcherC2("^\\*\\* *([^\n]*)\n");
Chris@0 344
Chris@0 345 pos = 0;
Chris@0 346
Chris@0 347 while ((pos = matcherC.indexIn(all, pos)) != -1) {
Chris@0 348
Chris@0 349 all.replace(pos, matcherC.matchedLength(), "");
Chris@0 350 ++count;
Chris@0 351
Chris@0 352 QString titlefield = matcherC.cap(1);
Chris@0 353 QString datefield = matcherC.cap(2);
Chris@0 354
Chris@0 355 Work *main = makeWork(composerName, "", "", "",
Chris@0 356 titlefield, datefield, "", "", 0);
Chris@0 357
Chris@0 358 m_objects.push_back(main);
Chris@0 359
Chris@0 360 int spos = pos;
Chris@0 361
Chris@0 362 while ((spos = matcherC2.indexIn(all, spos, QRegExp::CaretAtOffset))
Chris@0 363 != -1) {
Chris@0 364
Chris@0 365 all.replace(spos, matcherC2.matchedLength(), "");
Chris@0 366 ++count;
Chris@0 367
Chris@0 368 titlefield = matcherC2.cap(1);
Chris@0 369
Chris@0 370 datefield = "";
Chris@0 371
Chris@0 372 if (titlefield != "") {
Chris@0 373 int dpos;
Chris@0 374 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
Chris@0 375 datefield = matcherDate.cap(1);
Chris@0 376 titlefield = titlefield.left(dpos);
Chris@0 377 }
Chris@0 378 }
Chris@0 379
Chris@0 380 Work *sub = makeWork(composerName, "", "", "",
Chris@0 381 titlefield, datefield, "", "", main);
Chris@0 382
Chris@0 383 m_objects.push_back(sub);
Chris@0 384 }
Chris@0 385 }
Chris@0 386
Chris@0 387
Chris@0 388
Chris@0 389 DEBUG << "Left over: " << all << endl;
Chris@0 390
Chris@0 391 // Other forms:
Chris@0 392 // *March No. 1 in F major for Military Band, WoO 18 (1808)
Chris@0 393
Chris@0 394
Chris@0 395 DEBUG << "Found " << count << " things" << endl;
Chris@0 396 }
Chris@0 397
Chris@0 398
Chris@0 399 }
Chris@0 400
Chris@0 401