annotate import/ImportWikipediaWorksList.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportWikipediaWorksList.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 WikipediaWorksListImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "WikipediaWorksListImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 static QString
Chris@0 27 sanitise(QString field, QString &linkText)
Chris@0 28 {
Chris@0 29 int mp;
Chris@0 30
Chris@0 31 field.replace(QString::fromUtf8("\342\200\222"), "-");
Chris@0 32 field.replace(QString::fromUtf8("\342\200\223"), "-");
Chris@0 33 field.replace(QString::fromUtf8("\342\200\224"), "-");
Chris@0 34 field.replace(QString::fromUtf8("\342\200\225"), "-");
Chris@0 35
Chris@0 36 field.replace(QString::fromUtf8("\342\231\255"), "-flat");
Chris@0 37 field.replace(QString::fromUtf8("\342\231\257"), "-sharp");
Chris@0 38
Chris@0 39 QRegExp link2("([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
Chris@0 40 if ((mp = link2.indexIn(field)) >= 0) {
Chris@0 41 if (linkText == "" && mp < 4) linkText = link2.cap(2);
Chris@0 42 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
Chris@0 43 return sanitise(field, linkText);
Chris@0 44 }
Chris@0 45
Chris@0 46 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
Chris@0 47 if ((mp = link1.indexIn(field)) >= 0) {
Chris@0 48 if (linkText == "") linkText = link1.cap(2);
Chris@0 49 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
Chris@0 50 return sanitise(field, linkText);
Chris@0 51 }
Chris@0 52
Chris@0 53 field = field.trimmed();
Chris@0 54
Chris@0 55 field.replace("[", "");
Chris@0 56 field.replace("]", "");
Chris@0 57 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), " ");
Chris@0 58 field.replace("'''", "\"");
Chris@0 59 field.replace("''", "\"");
Chris@0 60 field.replace("&quot;", "\"");
Chris@0 61 field.replace("\"\"", "\"");
Chris@0 62 field.replace(QRegExp("^[\'\"] (\")?"), "\"");
Chris@0 63 field.replace(QRegExp("&lt;[^&]*&gt;"), "");
Chris@0 64 field.replace(QRegExp("^\\**"), "");
Chris@0 65
Chris@0 66 if (field.endsWith("c.")) {
Chris@0 67 // historical artifact from removal of Bruckner year indication (c. 1856)
Chris@0 68 field = field.left(field.length()-2);
Chris@0 69 }
Chris@0 70
Chris@0 71 while (field.endsWith(".") || field.endsWith(",")) {
Chris@0 72 field = field.left(field.length()-1);
Chris@0 73 }
Chris@0 74
Chris@0 75 if (field.startsWith(";") || field.startsWith(":") || field.startsWith(",")
Chris@0 76 || field.startsWith("-")) {
Chris@0 77 field = field.right(field.length()-1);
Chris@0 78 }
Chris@0 79
Chris@0 80 if (field.startsWith("(") && field.endsWith(")")) {
Chris@0 81 DEBUG << "before: " << field;
Chris@0 82 field = field.mid(1, field.length()-2);
Chris@0 83 DEBUG << "after: " << field;
Chris@0 84 }
Chris@0 85
Chris@0 86 field.replace(QRegExp("^\\**"), "");
Chris@0 87 if (field == ")" || field == "(") {
Chris@0 88 field = "";
Chris@0 89 }
Chris@0 90
Chris@0 91 field.replace(" - ,", ",");
Chris@0 92 field.replace(" ", " ");
Chris@0 93
Chris@0 94 return field.trimmed();
Chris@0 95 }
Chris@0 96
Chris@0 97 static QString
Chris@0 98 extractYear(QString datefield)
Chris@0 99 {
Chris@0 100 QRegExp re("[0-9]{4}");
Chris@0 101 if (re.indexIn(datefield) >= 0) {
Chris@0 102 return re.cap(0);
Chris@0 103 }
Chris@0 104 return "";
Chris@0 105 }
Chris@0 106
Chris@0 107 static QString
Chris@0 108 extractKey(QString titlefield)
Chris@0 109 {
Chris@0 110 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
Chris@0 111 if (re.indexIn(titlefield) >= 0) {
Chris@0 112 return re.cap(1);
Chris@0 113 }
Chris@0 114 return "";
Chris@0 115 }
Chris@0 116
Chris@0 117 static Work *
Chris@0 118 makeWork(QString composerName, QString opfield, QString numfield,
Chris@0 119 int partNumber, QString titlefield, QString datefield,
Chris@0 120 QString placefield, QString remarksfield, Work *main)
Chris@0 121 {
Chris@0 122 if (titlefield.contains("List of ") || titlefield.contains("http:")) return 0;
Chris@0 123
Chris@0 124 QString linkText;
Chris@0 125
Chris@0 126 Work *w = new Work;
Chris@0 127
Chris@0 128 QRegExp embeddedOpMatcher("([Oo]pus|[Oo]p.|WAB) (posth[a-z\\.]* *)?([0-9][^ ;:,]*)(,? *([Nn]umber|[Nn]o.|[Nn]r.) ([0-9][^ ;:,]*))?,?");
Chris@0 129 if (embeddedOpMatcher.indexIn(titlefield) >= 0) {
Chris@0 130 QString opf = embeddedOpMatcher.cap(0);
Chris@0 131 if (opfield == "") opfield = opf;
Chris@0 132 titlefield.replace(opf, "");
Chris@0 133 } else if (embeddedOpMatcher.indexIn(remarksfield) >= 0) {
Chris@0 134 opfield = embeddedOpMatcher.cap(0);
Chris@0 135 }
Chris@0 136 if (main && numfield == "") {
Chris@0 137 QRegExp embeddedNumMatcher("(Number|No.|Nr.) ([0-9][^ ;:,]*)");
Chris@0 138 if (embeddedNumMatcher.indexIn(titlefield) >= 0) {
Chris@0 139 numfield = embeddedNumMatcher.cap(2);
Chris@0 140 } else if (embeddedNumMatcher.indexIn(remarksfield) >= 0) {
Chris@0 141 numfield = embeddedNumMatcher.cap(2);
Chris@0 142 }
Chris@0 143 }
Chris@0 144
Chris@0 145 QString op = sanitise(opfield, linkText);
Chris@0 146 if (op != "") {
Chris@0 147 if (op.toLower().contains("op")) {
Chris@0 148 op.replace("Opus ", "");
Chris@0 149 op.replace("Op. ", "");
Chris@0 150 op.replace("Op.", "");
Chris@0 151 op.replace("Op ", "");
Chris@0 152 op.replace("opus ", "");
Chris@0 153 op.replace("op. ", "");
Chris@0 154 op.replace("op.", "");
Chris@0 155 op.replace("op ", "");
Chris@0 156 w->setOpus(op);
Chris@0 157 } else if (QRegExp("^[0-9]*$").indexIn(op) >= 0) {
Chris@0 158 w->setOpus(op);
Chris@0 159 } else {
Chris@0 160 w->setCatalogue(op);
Chris@0 161 }
Chris@0 162 }
Chris@0 163
Chris@0 164 QString num = sanitise(numfield, linkText);
Chris@0 165 if (num != "") {
Chris@0 166 num.replace("No. ", "");
Chris@0 167 num.replace("No ", "");
Chris@0 168 w->setNumber(num);
Chris@0 169 } else if (partNumber > 0) {
Chris@0 170 w->setNumber(QString("%1").arg(partNumber));
Chris@0 171 }
Chris@0 172
Chris@0 173 QString key = extractKey(titlefield);
Chris@0 174 if (key != "") {
Chris@0 175 w->setKey(key);
Chris@0 176 }
Chris@0 177
Chris@0 178 DEBUG << "title before sanitise: " << titlefield << endl;
Chris@0 179
Chris@0 180 remarksfield = remarksfield.trimmed();
Chris@0 181
Chris@0 182 QString title = sanitise(titlefield, linkText);
Chris@0 183 title.replace(QRegExp(", which.*$"), "");
Chris@0 184 if (linkText != "") {
Chris@0 185 if (remarksfield == "" && title.startsWith(linkText)) {
Chris@0 186 remarksfield = title.right(title.length() - linkText.length());
Chris@0 187 title = linkText;
Chris@0 188 }
Chris@0 189 linkText.replace(" ", "_");
Chris@0 190 QUrl url;
Chris@0 191 url.setScheme("http");
Chris@0 192 url.setHost("en.wikipedia.org");
Chris@0 193 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
Chris@0 194 Document *d = new Document;
Chris@18 195 d->setUri(Uri(url));
Chris@0 196 d->setSiteName("Wikipedia");
Chris@0 197 d->setTopic(w);
Chris@0 198 w->addPage(d);
Chris@0 199 }
Chris@0 200
Chris@0 201 DEBUG << "title after sanitise: " << title << ", link text " << linkText << ", remarks " << remarksfield << endl;
Chris@0 202
Chris@0 203 QRegExp explicationRE("^(\"[^-]+\") - (.+)$");
Chris@0 204 int pos;
Chris@0 205 if ((pos = explicationRE.indexIn(title)) >= 0) {
Chris@0 206 QString part = explicationRE.cap(2);
Chris@0 207 if (part[0].isUpper()) w->addAlias(explicationRE.cap(2));
Chris@0 208 else if (remarksfield == "") remarksfield = explicationRE.cap(2);
Chris@0 209 title = explicationRE.cap(1);
Chris@0 210 }
Chris@0 211
Chris@0 212 QRegExp remarksRE1("^(\"[^-]+\") (for .*)$");
Chris@0 213 if ((pos = remarksRE1.indexIn(title)) >= 0) {
Chris@0 214 if (remarksfield != "") {
Chris@0 215 remarksfield = QString("%1 - %2")
Chris@0 216 .arg(remarksRE1.cap(2)).arg(remarksfield);
Chris@0 217 } else {
Chris@0 218 remarksfield = remarksRE1.cap(2);
Chris@0 219 }
Chris@0 220 title = remarksRE1.cap(1);
Chris@0 221 }
Chris@0 222
Chris@0 223 QRegExp remarksRE2("^(\"[^\"]+\"), (.*)$");
Chris@0 224 if ((pos = remarksRE2.indexIn(title)) >= 0) {
Chris@0 225 if (remarksfield != "") {
Chris@0 226 remarksfield = QString("%1 - %2")
Chris@0 227 .arg(remarksRE2.cap(2)).arg(remarksfield);
Chris@0 228 } else {
Chris@0 229 remarksfield = remarksRE2.cap(2);
Chris@0 230 }
Chris@0 231 title = remarksRE2.cap(1);
Chris@0 232 }
Chris@0 233
Chris@0 234 QRegExp explicationRE2("^([^\\(]*\") \\(([^\\)]*)\\)(.*)$");
Chris@0 235 if ((pos = explicationRE2.indexIn(title)) >= 0) {
Chris@0 236 w->addAlias(explicationRE2.cap(2));
Chris@0 237 if (remarksfield == "") remarksfield = explicationRE2.cap(3);
Chris@0 238 title = explicationRE2.cap(1);
Chris@0 239 }
Chris@0 240
Chris@0 241 if (title.startsWith("Song \"")) {
Chris@0 242 title = title.right(title.length() - 5);
Chris@0 243 w->addForm(Form::getFormByName("song"));
Chris@0 244 }
Chris@0 245 if (!main && title.startsWith("Song cycle \"")) {
Chris@0 246 title = title.right(title.length() - 11);
Chris@0 247 w->addForm(Form::getFormByName("song cycle"));
Chris@0 248 }
Chris@0 249 if (main && main->forms().contains(Form::getFormByName("song cycle"))) {
Chris@0 250 w->addForm(Form::getFormByName("song"));
Chris@0 251 }
Chris@0 252
Chris@0 253 if (title == "" && !main) {
Chris@0 254 delete w;
Chris@0 255 return 0;
Chris@0 256 }
Chris@0 257
Chris@0 258 w->setName(title);
Chris@0 259
Chris@0 260 QString remarks = sanitise(remarksfield, linkText);
Chris@0 261 if (remarks != "") {
Chris@0 262 w->setRemarks(remarks);
Chris@0 263 }
Chris@0 264
Chris@0 265 QString year = extractYear(datefield);
Chris@0 266 QString place = sanitise(placefield, linkText);
Chris@0 267
Chris@0 268 DEBUG << "title = " << title << endl;
Chris@0 269
Chris@0 270 if (main) {
Chris@0 271 main->addPart(w);
Chris@0 272 w->setPartOf(main);
Chris@0 273 w->setComposition(main->composition());
Chris@0 274 main->composition()->addWork(w);
Chris@0 275 }
Chris@0 276
Chris@0 277 if (!main || !main->composition() ||
Chris@0 278 (year != "" && (main->composition()->year() != year.toInt()))) {
Chris@0 279 Composition *c = new Composition;
Chris@0 280 c->setComposerName(composerName);
Chris@0 281 c->addWork(w);
Chris@0 282 c->setYear(year.toInt());
Chris@0 283 c->setPlace(place);
Chris@0 284 w->setComposition(c);
Chris@0 285 }
Chris@0 286
Chris@0 287 return w;
Chris@0 288 }
Chris@0 289
Chris@0 290
Chris@0 291 void
Chris@0 292 WikipediaWorksListImporter::import(QUrl source)
Chris@0 293 {
Chris@0 294 //!!! for now
Chris@0 295 QString filename = source.toLocalFile();
Chris@0 296
Chris@0 297 QFile file(filename);
Chris@0 298 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 299 throw std::exception();
Chris@0 300 }
Chris@0 301
Chris@0 302 QTextStream stream(&file);
Chris@0 303 stream.setCodec("UTF-8");
Chris@0 304
Chris@0 305 QString composerName;
Chris@0 306 if (filename.contains("K%C3%B6chel")) {
Chris@0 307 composerName = "Wolfgang Amadeus Mozart";
Chris@0 308 } else if (filename.contains("/Schubert_")) {
Chris@0 309 composerName = "Franz Schubert";
Chris@0 310 } else {
Chris@0 311 QRegExp byby("by_(.*)_by");
Chris@0 312 if (byby.indexIn(filename) >= 0) {
Chris@0 313 composerName = byby.cap(1).replace('_', ' ');
Chris@0 314 } else {
Chris@0 315 QRegExp bybr("by_(.*)_\\(");
Chris@0 316 if (bybr.indexIn(filename) >= 0) {
Chris@0 317 composerName = bybr.cap(1).replace('_', ' ');
Chris@0 318 } else {
Chris@0 319 QRegExp by("by_(.*)");
Chris@0 320 if (by.indexIn(filename) >= 0) {
Chris@0 321 composerName = by.cap(1).replace('_', ' ');
Chris@0 322 } else {
Chris@0 323 QRegExp of("of_([A-Z].*)");
Chris@0 324 if (of.indexIn(filename) >= 0) {
Chris@0 325 composerName = of.cap(1).replace('_', ' ');
Chris@0 326 }
Chris@0 327 }
Chris@0 328 }
Chris@0 329 }
Chris@0 330 }
Chris@0 331 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
Chris@0 332
Chris@0 333 DEBUG << "composerName = " << composerName << endl;
Chris@0 334
Chris@0 335
Chris@0 336 // We try to keep these matchers specific enough that we can be
Chris@0 337 // sure the title field will come out containing _at least_ the
Chris@0 338 // title. i.e. the title field should never end up with just the
Chris@0 339 // opus number or date or whatever, even if the line is formatted
Chris@0 340 // in a way we hadn't anticipated. Thus it helps if the title is
Chris@0 341 // bookended by '' or [[]], etc
Chris@0 342
Chris@0 343 // e.g. Beethoven
Chris@0 344 // *Opus 84: ''[[Egmont (Beethoven)|Egmont]]'', overture and incidental music (1810)
Chris@0 345 // opus field - n/a - title - date - n/a - remarks
Chris@0 346 QRegExp workMatcher1("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:{]*)[:,] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
Chris@0 347
Chris@0 348 // e.g. Tchaikovsky
Chris@0 349 // *'''Op. 19''' 6 Pieces, for piano (1873)
Chris@0 350 // or Ravel
Chris@0 351 // * '''1''', Piano Sonata movement (1888), lost
Chris@0 352
Chris@0 353 /*
Chris@0 354 // opus field - n/a - title - date - n/a - remarks
Chris@0 355 QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G)? *[0-9][^ ,:'{]*)'''[:, ] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
Chris@0 356 */
Chris@0 357 // opus field - n/a - title
Chris@0 358 QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|[A-Z]{1,2})?\\.? *[0-9][^ ,:'{]*),?'''[:, ] *(.*)$");
Chris@0 359
Chris@0 360 // e.g. Copland
Chris@0 361 // * ''Four Motets'' for mixed voices (1921)
Chris@0 362 // title - date field
Chris@0 363 // (no opus)
Chris@0 364 QRegExp workMatcher2("^\\* *(''.*''\\)?) *(.*)$");
Chris@0 365 workMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings
Chris@0 366
Chris@0 367 // e.g. Copland
Chris@0 368 // * Arrangement of ''Lincoln Portrait'' for concert band (1942)
Chris@0 369 // or Mendelssohn
Chris@0 370 // * [[Christe du Lamm Gottes]] (1827), SATB, strings
Chris@0 371 // title - date field - remarks
Chris@0 372 // (no opus)
Chris@0 373 QRegExp workMatcher3("^\\* *([^\\*].*) *\\(([^\\)]*[0-9]{4}[^\\)]*)\\) *(.*)$");
Chris@0 374
Chris@0 375 // e.g. Scriabin
Chris@0 376 // *[[Sonata No. 2 (Scriabin)|Sonata No. 2 in G sharp minor]], Op. 19 (also known as ''Sonata-Fantasy'')"
Chris@0 377 // title - opus field - n/a - remarks
Chris@0 378 QRegExp workMatcher4("^\\* *(\\[\\[.*\\]\\]),* (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*) *(.*)$");
Chris@0 379
Chris@0 380 // e.g. Scriabin
Chris@0 381 // *Opus 35: [[Opus 35 (Scriabin)|Three Preludes]]
Chris@0 382 // opus field - n/a - title - remarks
Chris@0 383 QRegExp workMatcher5("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)[:,]* *([\\[']+.*[\\]']+) *(.*)$");
Chris@0 384
Chris@0 385 // e.g. Boccherini
Chris@0 386 // *G 1: Cello Sonata in F major
Chris@0 387 // or weird Schubert layout
Chris@0 388 // * D 505{{nbsp|4}}Adagio in D-flat for Piano
Chris@0 389 // or Glazunov
Chris@0 390 // :Op. 67: ''[[The Seasons (ballet)|The Seasons]]'', ballet in one act (1900)
Chris@0 391 // or even
Chris@0 392 // ::Op. 77: ''[[Symphony No. 7 (Glazunov)|Symphony No. 7]]'' &quot;Pastorale&quot; in F major (1902-1903)
Chris@0 393 // This one is a real mess, for really messy pages. Needs to go near
Chris@0 394 // the end of the matchers in case it catches something it shouldn't
Chris@0 395 // n/a - opus field - n/a - n/a - n/a - title
Chris@0 396 QRegExp workMatcher6("^([\\*:]|::) *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)(([:,]| *\\{+[^\\}]+\\}+) *(.*))?$");
Chris@0 397
Chris@0 398 // e.g. Bruch
Chris@0 399 // * Adagio appassionato for violin and orchestra in C sharp minor, Op. 57
Chris@0 400 // title - opus field - date field
Chris@0 401 QRegExp workMatcher7("^\\* *(.*),? (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*|[Oo]p. posth[a-z.]*) *(\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\))? *$");
Chris@0 402
Chris@0 403 // e.g. Bruckner
Chris@0 404 // * Symphony No. 0 in D minor 1869 WAB 100
Chris@0 405 // title - date field - opus field
Chris@0 406 QRegExp workMatcher8("^\\* *(.*) ([0-9]{4}[0-9/-]*) *(WAB [0-9][^ ]*)$");
Chris@0 407
Chris@0 408 // e.g. Bach
Chris@0 409 // * BWV 506 ? Was bist du doch, o Seele, so betruebet
Chris@0 410 // opus field - title
Chris@0 411 QRegExp workMatcher9("^\\* *(BWV [^ ]+)(.*)$");
Chris@0 412
Chris@0 413 // Catch-all for things that look at all promising (anything that
Chris@0 414 // starts with ' or [ after bullet: take the whole as title)
Chris@0 415 QRegExp workMatcher10("^[\\*:] *((['\\[]|&quot;).*)$");
Chris@0 416
Chris@0 417
Chris@0 418
Chris@0 419 // e.g. Beethoven
Chris@0 420 // **No. 1: [[Piano Trio No. 1 (Beethoven)|Piano Trio No. 1]] in E-flat major
Chris@0 421 // number field - n/a - title, remarks etc
Chris@0 422 QRegExp partMatcher1("^[\\*:]{2} *((No\\.? *)?[0-9][^ ,:'{]*)[:, ] *(.*)$");
Chris@0 423
Chris@0 424 // e.g. Copland
Chris@0 425 // ** ''Help us, O Lord''
Chris@0 426 // title - remarks
Chris@0 427 QRegExp partMatcher2("^\\*\\* *(''.*'') *(.*)$");
Chris@0 428 partMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings
Chris@0 429
Chris@0 430 // e.g. Scriabin
Chris@0 431 // **[[Mazurka Op. 40 No. 1 (Scriabin)|Mazurka in D flat major]]
Chris@0 432 // title - remarks
Chris@0 433 QRegExp partMatcher3("^\\*\\* *(\\[\\[.*\\]\\])(.*)$");
Chris@0 434
Chris@0 435 // e.g. Berlioz
Chris@0 436 // ** 1: ''Méditation religieuse''
Chris@0 437 // number - title - remarks
Chris@0 438 QRegExp partMatcher4("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *([\\[]*''.*''[\\]]*) *(.*)$");
Chris@0 439
Chris@0 440 // e.g. Tchaikovsky
Chris@0 441 // **4. Nocturne [???????] (C? minor)
Chris@0 442 // number - title - remarks
Chris@0 443 QRegExp partMatcher5("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *(.*\\[[^\\]]+\\])(.*)$");
Chris@0 444
Chris@0 445 // e.g. Schubert
Chris@0 446 // **2. &quot;Wohin?&quot;
Chris@0 447 // n/a - number - title
Chris@0 448 QRegExp partMatcher6("^\\*\\* *(([0-9][0-9a-z]*)[\\.:])? *((&quot;|'').*)$");
Chris@0 449
Chris@0 450 // e.g. Mendelssohn
Chris@0 451 // ** Notturno
Chris@0 452 // title only
Chris@0 453 QRegExp partMatcher7("^\\*\\* *(.*)$");
Chris@0 454
Chris@0 455
Chris@0 456 // Date and remarks within titlefield or remarksfield
Chris@0 457 QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\),?(.*)");
Chris@0 458
Chris@0 459
Chris@0 460 Work *main = 0;
Chris@0 461 int partNumber = 0;
Chris@0 462
Chris@0 463 QString line;
Chris@0 464 QString opfield, numfield, titlefield, remarksfield, datefield;
Chris@0 465
Chris@0 466 while (!stream.atEnd()) {
Chris@0 467
Chris@0 468 if (line == "") {
Chris@0 469 line = stream.readLine();
Chris@0 470 DEBUG << "line: " << line << endl;
Chris@0 471 }
Chris@0 472
Chris@0 473 opfield = "";
Chris@0 474 numfield = "";
Chris@0 475 titlefield = "";
Chris@0 476 datefield = "";
Chris@0 477 remarksfield = "";
Chris@0 478 partNumber = 0;
Chris@0 479
Chris@0 480 if (workMatcher1.indexIn(line) >= 0) {
Chris@0 481
Chris@0 482 DEBUG << "matcher 1" << endl;
Chris@0 483 opfield = workMatcher1.cap(1);
Chris@0 484 titlefield = workMatcher1.cap(3);
Chris@0 485 datefield = workMatcher1.cap(4);
Chris@0 486 remarksfield = workMatcher1.cap(6);
Chris@0 487
Chris@0 488 } else if (workMatcher1a.indexIn(line) >= 0) {
Chris@0 489
Chris@0 490 DEBUG << "matcher 1a" << endl;
Chris@0 491 opfield = workMatcher1a.cap(1);
Chris@0 492 titlefield = workMatcher1a.cap(3);
Chris@0 493 /*
Chris@0 494 datefield = workMatcher1a.cap(4);
Chris@0 495 remarksfield = workMatcher1a.cap(6);
Chris@0 496 */
Chris@0 497
Chris@0 498 } else if (workMatcher2.indexIn(line) >= 0) {
Chris@0 499
Chris@0 500 DEBUG << "matcher 2" << endl;
Chris@0 501 titlefield = workMatcher2.cap(1);
Chris@0 502 remarksfield = workMatcher2.cap(2);
Chris@0 503
Chris@0 504 } else if (workMatcher3.indexIn(line) >= 0) {
Chris@0 505
Chris@0 506 DEBUG << "matcher 3" << endl;
Chris@0 507 titlefield = workMatcher3.cap(1);
Chris@0 508 datefield = workMatcher3.cap(2);
Chris@0 509 remarksfield = workMatcher3.cap(3);
Chris@0 510
Chris@0 511 } else if (workMatcher4.indexIn(line) >= 0) {
Chris@0 512
Chris@0 513 DEBUG << "matcher 4" << endl;
Chris@0 514 titlefield = workMatcher4.cap(1);
Chris@0 515 opfield = workMatcher4.cap(2);
Chris@0 516 remarksfield = workMatcher4.cap(4);
Chris@0 517
Chris@0 518 } else if (workMatcher5.indexIn(line) >= 0) {
Chris@0 519
Chris@0 520 DEBUG << "matcher 5" << endl;
Chris@0 521 opfield = workMatcher5.cap(1);
Chris@0 522 titlefield = workMatcher5.cap(3);
Chris@0 523 remarksfield = workMatcher5.cap(4);
Chris@0 524
Chris@0 525 } else if (workMatcher6.indexIn(line) >= 0) {
Chris@0 526
Chris@0 527 DEBUG << "matcher 6" << endl;
Chris@0 528 opfield = workMatcher6.cap(2);
Chris@0 529 titlefield = workMatcher6.cap(6);
Chris@0 530
Chris@0 531 } else if (workMatcher7.indexIn(line) >= 0) {
Chris@0 532
Chris@0 533 DEBUG << "matcher 7" << endl;
Chris@0 534 titlefield = workMatcher7.cap(1);
Chris@0 535 opfield = workMatcher7.cap(2);
Chris@0 536 datefield = workMatcher7.cap(3);
Chris@0 537
Chris@0 538 } else if (workMatcher8.indexIn(line) >= 0) {
Chris@0 539
Chris@0 540 DEBUG << "matcher 8" << endl;
Chris@0 541 titlefield = workMatcher8.cap(1);
Chris@0 542 datefield = workMatcher8.cap(2);
Chris@0 543 opfield = workMatcher8.cap(3);
Chris@0 544
Chris@0 545 } else if (workMatcher9.indexIn(line) >= 0) {
Chris@0 546
Chris@0 547 DEBUG << "matcher 9" << endl;
Chris@0 548 opfield = workMatcher9.cap(1);
Chris@0 549 titlefield = workMatcher9.cap(2);
Chris@0 550
Chris@0 551 } else if (workMatcher10.indexIn(line) >= 0) {
Chris@0 552
Chris@0 553 DEBUG << "matcher 10" << endl;
Chris@0 554 titlefield = workMatcher10.cap(1);
Chris@0 555
Chris@0 556 } else {
Chris@0 557 if (line.startsWith("*") || line.startsWith(":")) {
Chris@0 558 DEBUG << "Failed to match promising works list line: " << line << endl;
Chris@0 559 }
Chris@0 560 line = "";
Chris@0 561 continue;
Chris@0 562 }
Chris@0 563
Chris@0 564 if (titlefield != "" && datefield == "") {
Chris@0 565 int dpos;
Chris@0 566 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
Chris@0 567 datefield = matcherDate.cap(1);
Chris@0 568 remarksfield = matcherDate.cap(2);
Chris@0 569 titlefield = titlefield.left(dpos);
Chris@0 570 }
Chris@0 571 }
Chris@0 572
Chris@0 573 if (remarksfield != "" && datefield == "") {
Chris@0 574 int dpos;
Chris@0 575 if ((dpos = matcherDate.indexIn(remarksfield)) != -1) {
Chris@0 576 datefield = matcherDate.cap(1);
Chris@0 577 remarksfield = remarksfield.left(dpos);
Chris@0 578 }
Chris@0 579 }
Chris@0 580
Chris@0 581 main = makeWork(composerName, opfield, "", 0,
Chris@0 582 titlefield, datefield, "", remarksfield, 0);
Chris@0 583
Chris@0 584 if (main) m_objects.push_back(main);
Chris@0 585
Chris@0 586 line = "";
Chris@0 587
Chris@0 588 while (!stream.atEnd()) {
Chris@0 589
Chris@0 590 ++partNumber;
Chris@0 591 line = stream.readLine();
Chris@0 592 DEBUG << "line: " << line << endl;
Chris@0 593
Chris@0 594 if (partMatcher1.indexIn(line) >= 0) {
Chris@0 595
Chris@0 596 DEBUG << "part matcher 1" << endl;
Chris@0 597 numfield = partMatcher1.cap(1);
Chris@0 598 titlefield = partMatcher1.cap(3);
Chris@0 599 remarksfield = "";
Chris@0 600
Chris@0 601 } else if (partMatcher2.indexIn(line) >= 0) {
Chris@0 602
Chris@0 603 DEBUG << "part matcher 2" << endl;
Chris@0 604 titlefield = partMatcher2.cap(1);
Chris@0 605 remarksfield = partMatcher2.cap(2);
Chris@0 606
Chris@0 607 } else if (partMatcher3.indexIn(line) >= 0) {
Chris@0 608
Chris@0 609 DEBUG << "part matcher 3" << endl;
Chris@0 610 titlefield = partMatcher3.cap(1);
Chris@0 611 remarksfield = partMatcher3.cap(2);
Chris@0 612
Chris@0 613 } else if (partMatcher4.indexIn(line) >= 0) {
Chris@0 614
Chris@0 615 DEBUG << "part matcher 4" << endl;
Chris@0 616 numfield = partMatcher4.cap(1);
Chris@0 617 titlefield = partMatcher4.cap(2);
Chris@0 618 remarksfield = partMatcher4.cap(3);
Chris@0 619
Chris@0 620 } else if (partMatcher5.indexIn(line) >= 0) {
Chris@0 621
Chris@0 622 DEBUG << "part matcher 5" << endl;
Chris@0 623 numfield = partMatcher5.cap(1);
Chris@0 624 titlefield = partMatcher5.cap(2);
Chris@0 625 remarksfield = partMatcher5.cap(3);
Chris@0 626
Chris@0 627 } else if (partMatcher6.indexIn(line) >= 0) {
Chris@0 628
Chris@0 629 DEBUG << "part matcher 6" << endl;
Chris@0 630 numfield = partMatcher6.cap(2);
Chris@0 631 titlefield = partMatcher6.cap(3);
Chris@0 632
Chris@0 633 } else if (partMatcher7.indexIn(line) >= 0) {
Chris@0 634
Chris@0 635 DEBUG << "part matcher 7" << endl;
Chris@0 636 titlefield = partMatcher7.cap(1);
Chris@0 637
Chris@0 638 } else {
Chris@0 639 if (line.startsWith("**") || line.startsWith("::")) {
Chris@0 640 DEBUG << "Failed to match promising part line: " << line << endl;
Chris@0 641 }
Chris@0 642 break;
Chris@0 643 }
Chris@0 644
Chris@0 645 if (titlefield != "" && datefield == "") {
Chris@0 646 int dpos;
Chris@0 647 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
Chris@0 648 datefield = matcherDate.cap(1);
Chris@0 649 remarksfield = matcherDate.cap(2);
Chris@0 650 titlefield = titlefield.left(dpos);
Chris@0 651 }
Chris@0 652 }
Chris@0 653
Chris@0 654 Work *part = makeWork(composerName, opfield, numfield, partNumber,
Chris@0 655 titlefield, datefield, "", remarksfield,
Chris@0 656 main);
Chris@0 657
Chris@0 658 if (part) m_objects.push_back(part);
Chris@0 659 }
Chris@0 660 }
Chris@0 661
Chris@0 662 DEBUG << "Found " << m_objects.size() << " things" << endl;
Chris@0 663 }
Chris@0 664
Chris@0 665
Chris@0 666 }
Chris@0 667
Chris@0 668