Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@0: #include "ImportWikipediaWorksList.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@0: WikipediaWorksListImporter::setSource(QUrl source) Chris@0: { Chris@0: DEBUG << "WikipediaWorksListImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@0: static QString Chris@0: sanitise(QString field, QString &linkText) Chris@0: { Chris@0: int mp; Chris@0: Chris@0: field.replace(QString::fromUtf8("\342\200\222"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\223"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\224"), "-"); Chris@0: field.replace(QString::fromUtf8("\342\200\225"), "-"); Chris@0: Chris@0: field.replace(QString::fromUtf8("\342\231\255"), "-flat"); Chris@0: field.replace(QString::fromUtf8("\342\231\257"), "-sharp"); Chris@0: Chris@0: QRegExp link2("([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]"); Chris@0: if ((mp = link2.indexIn(field)) >= 0) { Chris@0: if (linkText == "" && mp < 4) linkText = link2.cap(2); Chris@0: field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3)); Chris@0: return sanitise(field, linkText); Chris@0: } Chris@0: Chris@0: QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]"); Chris@0: if ((mp = link1.indexIn(field)) >= 0) { Chris@0: if (linkText == "") linkText = link1.cap(2); Chris@0: field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2)); Chris@0: return sanitise(field, linkText); Chris@0: } Chris@0: Chris@0: field = field.trimmed(); Chris@0: Chris@0: field.replace("[", ""); Chris@0: field.replace("]", ""); Chris@0: field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), " "); Chris@0: field.replace("'''", "\""); Chris@0: field.replace("''", "\""); Chris@0: field.replace(""", "\""); Chris@0: field.replace("\"\"", "\""); Chris@0: field.replace(QRegExp("^[\'\"] (\")?"), "\""); Chris@0: field.replace(QRegExp("<[^&]*>"), ""); Chris@0: field.replace(QRegExp("^\\**"), ""); Chris@0: Chris@0: if (field.endsWith("c.")) { Chris@0: // historical artifact from removal of Bruckner year indication (c. 1856) Chris@0: field = field.left(field.length()-2); Chris@0: } Chris@0: Chris@0: while (field.endsWith(".") || field.endsWith(",")) { Chris@0: field = field.left(field.length()-1); Chris@0: } Chris@0: Chris@0: if (field.startsWith(";") || field.startsWith(":") || field.startsWith(",") Chris@0: || field.startsWith("-")) { Chris@0: field = field.right(field.length()-1); Chris@0: } Chris@0: Chris@0: if (field.startsWith("(") && field.endsWith(")")) { Chris@0: DEBUG << "before: " << field; Chris@0: field = field.mid(1, field.length()-2); Chris@0: DEBUG << "after: " << field; Chris@0: } Chris@0: Chris@0: field.replace(QRegExp("^\\**"), ""); Chris@0: if (field == ")" || field == "(") { Chris@0: field = ""; Chris@0: } Chris@0: Chris@0: field.replace(" - ,", ","); Chris@0: field.replace(" ", " "); Chris@0: Chris@0: return field.trimmed(); Chris@0: } Chris@0: Chris@0: static QString Chris@0: extractYear(QString datefield) Chris@0: { Chris@0: QRegExp re("[0-9]{4}"); Chris@0: if (re.indexIn(datefield) >= 0) { Chris@0: return re.cap(0); Chris@0: } Chris@0: return ""; Chris@0: } Chris@0: Chris@0: static QString Chris@0: extractKey(QString titlefield) Chris@0: { Chris@0: QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))"); Chris@0: if (re.indexIn(titlefield) >= 0) { Chris@0: return re.cap(1); Chris@0: } Chris@0: return ""; Chris@0: } Chris@0: Chris@0: static Work * Chris@0: makeWork(QString composerName, QString opfield, QString numfield, Chris@0: int partNumber, QString titlefield, QString datefield, Chris@0: QString placefield, QString remarksfield, Work *main) Chris@0: { Chris@0: if (titlefield.contains("List of ") || titlefield.contains("http:")) return 0; Chris@0: Chris@0: QString linkText; Chris@0: Chris@0: Work *w = new Work; Chris@0: Chris@0: QRegExp embeddedOpMatcher("([Oo]pus|[Oo]p.|WAB) (posth[a-z\\.]* *)?([0-9][^ ;:,]*)(,? *([Nn]umber|[Nn]o.|[Nn]r.) ([0-9][^ ;:,]*))?,?"); Chris@0: if (embeddedOpMatcher.indexIn(titlefield) >= 0) { Chris@0: QString opf = embeddedOpMatcher.cap(0); Chris@0: if (opfield == "") opfield = opf; Chris@0: titlefield.replace(opf, ""); Chris@0: } else if (embeddedOpMatcher.indexIn(remarksfield) >= 0) { Chris@0: opfield = embeddedOpMatcher.cap(0); Chris@0: } Chris@0: if (main && numfield == "") { Chris@0: QRegExp embeddedNumMatcher("(Number|No.|Nr.) ([0-9][^ ;:,]*)"); Chris@0: if (embeddedNumMatcher.indexIn(titlefield) >= 0) { Chris@0: numfield = embeddedNumMatcher.cap(2); Chris@0: } else if (embeddedNumMatcher.indexIn(remarksfield) >= 0) { Chris@0: numfield = embeddedNumMatcher.cap(2); Chris@0: } Chris@0: } Chris@0: Chris@0: QString op = sanitise(opfield, linkText); Chris@0: if (op != "") { Chris@0: if (op.toLower().contains("op")) { Chris@0: op.replace("Opus ", ""); Chris@0: op.replace("Op. ", ""); Chris@0: op.replace("Op.", ""); Chris@0: op.replace("Op ", ""); Chris@0: op.replace("opus ", ""); Chris@0: op.replace("op. ", ""); Chris@0: op.replace("op.", ""); Chris@0: op.replace("op ", ""); Chris@0: w->setOpus(op); Chris@0: } else if (QRegExp("^[0-9]*$").indexIn(op) >= 0) { Chris@0: w->setOpus(op); Chris@0: } else { Chris@0: w->setCatalogue(op); Chris@0: } Chris@0: } Chris@0: Chris@0: QString num = sanitise(numfield, linkText); Chris@0: if (num != "") { Chris@0: num.replace("No. ", ""); Chris@0: num.replace("No ", ""); Chris@0: w->setNumber(num); Chris@0: } else if (partNumber > 0) { Chris@0: w->setNumber(QString("%1").arg(partNumber)); Chris@0: } Chris@0: Chris@0: QString key = extractKey(titlefield); Chris@0: if (key != "") { Chris@0: w->setKey(key); Chris@0: } Chris@0: Chris@0: DEBUG << "title before sanitise: " << titlefield << endl; Chris@0: Chris@0: remarksfield = remarksfield.trimmed(); Chris@0: Chris@0: QString title = sanitise(titlefield, linkText); Chris@0: title.replace(QRegExp(", which.*$"), ""); Chris@0: if (linkText != "") { Chris@0: if (remarksfield == "" && title.startsWith(linkText)) { Chris@0: remarksfield = title.right(title.length() - linkText.length()); Chris@0: title = linkText; Chris@0: } Chris@0: linkText.replace(" ", "_"); Chris@0: QUrl url; Chris@0: url.setScheme("http"); Chris@0: url.setHost("en.wikipedia.org"); Chris@0: url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText)); Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri(url)); Chris@0: d->setSiteName("Wikipedia"); Chris@0: d->setTopic(w); Chris@0: w->addPage(d); Chris@0: } Chris@0: Chris@0: DEBUG << "title after sanitise: " << title << ", link text " << linkText << ", remarks " << remarksfield << endl; Chris@0: Chris@0: QRegExp explicationRE("^(\"[^-]+\") - (.+)$"); Chris@0: int pos; Chris@0: if ((pos = explicationRE.indexIn(title)) >= 0) { Chris@0: QString part = explicationRE.cap(2); Chris@0: if (part[0].isUpper()) w->addAlias(explicationRE.cap(2)); Chris@0: else if (remarksfield == "") remarksfield = explicationRE.cap(2); Chris@0: title = explicationRE.cap(1); Chris@0: } Chris@0: Chris@0: QRegExp remarksRE1("^(\"[^-]+\") (for .*)$"); Chris@0: if ((pos = remarksRE1.indexIn(title)) >= 0) { Chris@0: if (remarksfield != "") { Chris@0: remarksfield = QString("%1 - %2") Chris@0: .arg(remarksRE1.cap(2)).arg(remarksfield); Chris@0: } else { Chris@0: remarksfield = remarksRE1.cap(2); Chris@0: } Chris@0: title = remarksRE1.cap(1); Chris@0: } Chris@0: Chris@0: QRegExp remarksRE2("^(\"[^\"]+\"), (.*)$"); Chris@0: if ((pos = remarksRE2.indexIn(title)) >= 0) { Chris@0: if (remarksfield != "") { Chris@0: remarksfield = QString("%1 - %2") Chris@0: .arg(remarksRE2.cap(2)).arg(remarksfield); Chris@0: } else { Chris@0: remarksfield = remarksRE2.cap(2); Chris@0: } Chris@0: title = remarksRE2.cap(1); Chris@0: } Chris@0: Chris@0: QRegExp explicationRE2("^([^\\(]*\") \\(([^\\)]*)\\)(.*)$"); Chris@0: if ((pos = explicationRE2.indexIn(title)) >= 0) { Chris@0: w->addAlias(explicationRE2.cap(2)); Chris@0: if (remarksfield == "") remarksfield = explicationRE2.cap(3); Chris@0: title = explicationRE2.cap(1); Chris@0: } Chris@0: Chris@0: if (title.startsWith("Song \"")) { Chris@0: title = title.right(title.length() - 5); Chris@0: w->addForm(Form::getFormByName("song")); Chris@0: } Chris@0: if (!main && title.startsWith("Song cycle \"")) { Chris@0: title = title.right(title.length() - 11); Chris@0: w->addForm(Form::getFormByName("song cycle")); Chris@0: } Chris@0: if (main && main->forms().contains(Form::getFormByName("song cycle"))) { Chris@0: w->addForm(Form::getFormByName("song")); Chris@0: } Chris@0: Chris@0: if (title == "" && !main) { Chris@0: delete w; Chris@0: return 0; Chris@0: } Chris@0: Chris@0: w->setName(title); Chris@0: Chris@0: QString remarks = sanitise(remarksfield, linkText); Chris@0: if (remarks != "") { Chris@0: w->setRemarks(remarks); Chris@0: } Chris@0: Chris@0: QString year = extractYear(datefield); Chris@0: QString place = sanitise(placefield, linkText); Chris@0: Chris@0: DEBUG << "title = " << title << endl; Chris@0: Chris@0: if (main) { Chris@0: main->addPart(w); Chris@0: w->setPartOf(main); Chris@0: w->setComposition(main->composition()); Chris@0: main->composition()->addWork(w); Chris@0: } Chris@0: Chris@0: if (!main || !main->composition() || Chris@0: (year != "" && (main->composition()->year() != year.toInt()))) { Chris@0: Composition *c = new Composition; Chris@0: c->setComposerName(composerName); Chris@0: c->addWork(w); Chris@0: c->setYear(year.toInt()); Chris@0: c->setPlace(place); Chris@0: w->setComposition(c); Chris@0: } Chris@0: Chris@0: return w; Chris@0: } Chris@0: Chris@0: Chris@0: void Chris@0: WikipediaWorksListImporter::import(QUrl source) Chris@0: { Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: Chris@0: QString composerName; Chris@0: if (filename.contains("K%C3%B6chel")) { Chris@0: composerName = "Wolfgang Amadeus Mozart"; Chris@0: } else if (filename.contains("/Schubert_")) { Chris@0: composerName = "Franz Schubert"; Chris@0: } else { Chris@0: QRegExp byby("by_(.*)_by"); Chris@0: if (byby.indexIn(filename) >= 0) { Chris@0: composerName = byby.cap(1).replace('_', ' '); Chris@0: } else { Chris@0: QRegExp bybr("by_(.*)_\\("); Chris@0: if (bybr.indexIn(filename) >= 0) { Chris@0: composerName = bybr.cap(1).replace('_', ' '); Chris@0: } else { Chris@0: QRegExp by("by_(.*)"); Chris@0: if (by.indexIn(filename) >= 0) { Chris@0: composerName = by.cap(1).replace('_', ' '); Chris@0: } else { Chris@0: QRegExp of("of_([A-Z].*)"); Chris@0: if (of.indexIn(filename) >= 0) { Chris@0: composerName = of.cap(1).replace('_', ' '); Chris@0: } Chris@0: } Chris@0: } Chris@0: } Chris@0: } Chris@0: composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit()); Chris@0: Chris@0: DEBUG << "composerName = " << composerName << endl; Chris@0: Chris@0: Chris@0: // We try to keep these matchers specific enough that we can be Chris@0: // sure the title field will come out containing _at least_ the Chris@0: // title. i.e. the title field should never end up with just the Chris@0: // opus number or date or whatever, even if the line is formatted Chris@0: // in a way we hadn't anticipated. Thus it helps if the title is Chris@0: // bookended by '' or [[]], etc Chris@0: Chris@0: // e.g. Beethoven Chris@0: // *Opus 84: ''[[Egmont (Beethoven)|Egmont]]'', overture and incidental music (1810) Chris@0: // opus field - n/a - title - date - n/a - remarks Chris@0: QRegExp workMatcher1("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:{]*)[:,] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$"); Chris@0: Chris@0: // e.g. Tchaikovsky Chris@0: // *'''Op. 19''' 6 Pieces, for piano (1873) Chris@0: // or Ravel Chris@0: // * '''1''', Piano Sonata movement (1888), lost Chris@0: Chris@0: /* Chris@0: // opus field - n/a - title - date - n/a - remarks Chris@0: QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G)? *[0-9][^ ,:'{]*)'''[:, ] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$"); Chris@0: */ Chris@0: // opus field - n/a - title Chris@0: QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|[A-Z]{1,2})?\\.? *[0-9][^ ,:'{]*),?'''[:, ] *(.*)$"); Chris@0: Chris@0: // e.g. Copland Chris@0: // * ''Four Motets'' for mixed voices (1921) Chris@0: // title - date field Chris@0: // (no opus) Chris@0: QRegExp workMatcher2("^\\* *(''.*''\\)?) *(.*)$"); Chris@0: workMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings Chris@0: Chris@0: // e.g. Copland Chris@0: // * Arrangement of ''Lincoln Portrait'' for concert band (1942) Chris@0: // or Mendelssohn Chris@0: // * [[Christe du Lamm Gottes]] (1827), SATB, strings Chris@0: // title - date field - remarks Chris@0: // (no opus) Chris@0: QRegExp workMatcher3("^\\* *([^\\*].*) *\\(([^\\)]*[0-9]{4}[^\\)]*)\\) *(.*)$"); Chris@0: Chris@0: // e.g. Scriabin Chris@0: // *[[Sonata No. 2 (Scriabin)|Sonata No. 2 in G sharp minor]], Op. 19 (also known as ''Sonata-Fantasy'')" Chris@0: // title - opus field - n/a - remarks Chris@0: QRegExp workMatcher4("^\\* *(\\[\\[.*\\]\\]),* (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*) *(.*)$"); Chris@0: Chris@0: // e.g. Scriabin Chris@0: // *Opus 35: [[Opus 35 (Scriabin)|Three Preludes]] Chris@0: // opus field - n/a - title - remarks Chris@0: QRegExp workMatcher5("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)[:,]* *([\\[']+.*[\\]']+) *(.*)$"); Chris@0: Chris@0: // e.g. Boccherini Chris@0: // *G 1: Cello Sonata in F major Chris@0: // or weird Schubert layout Chris@0: // * D 505{{nbsp|4}}Adagio in D-flat for Piano Chris@0: // or Glazunov Chris@0: // :Op. 67: ''[[The Seasons (ballet)|The Seasons]]'', ballet in one act (1900) Chris@0: // or even Chris@0: // ::Op. 77: ''[[Symphony No. 7 (Glazunov)|Symphony No. 7]]'' "Pastorale" in F major (1902-1903) Chris@0: // This one is a real mess, for really messy pages. Needs to go near Chris@0: // the end of the matchers in case it catches something it shouldn't Chris@0: // n/a - opus field - n/a - n/a - n/a - title Chris@0: QRegExp workMatcher6("^([\\*:]|::) *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)(([:,]| *\\{+[^\\}]+\\}+) *(.*))?$"); Chris@0: Chris@0: // e.g. Bruch Chris@0: // * Adagio appassionato for violin and orchestra in C sharp minor, Op. 57 Chris@0: // title - opus field - date field Chris@0: QRegExp workMatcher7("^\\* *(.*),? (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*|[Oo]p. posth[a-z.]*) *(\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\))? *$"); Chris@0: Chris@0: // e.g. Bruckner Chris@0: // * Symphony No. 0 in D minor 1869 WAB 100 Chris@0: // title - date field - opus field Chris@0: QRegExp workMatcher8("^\\* *(.*) ([0-9]{4}[0-9/-]*) *(WAB [0-9][^ ]*)$"); Chris@0: Chris@0: // e.g. Bach Chris@0: // * BWV 506 ? Was bist du doch, o Seele, so betruebet Chris@0: // opus field - title Chris@0: QRegExp workMatcher9("^\\* *(BWV [^ ]+)(.*)$"); Chris@0: Chris@0: // Catch-all for things that look at all promising (anything that Chris@0: // starts with ' or [ after bullet: take the whole as title) Chris@0: QRegExp workMatcher10("^[\\*:] *((['\\[]|").*)$"); Chris@0: Chris@0: Chris@0: Chris@0: // e.g. Beethoven Chris@0: // **No. 1: [[Piano Trio No. 1 (Beethoven)|Piano Trio No. 1]] in E-flat major Chris@0: // number field - n/a - title, remarks etc Chris@0: QRegExp partMatcher1("^[\\*:]{2} *((No\\.? *)?[0-9][^ ,:'{]*)[:, ] *(.*)$"); Chris@0: Chris@0: // e.g. Copland Chris@0: // ** ''Help us, O Lord'' Chris@0: // title - remarks Chris@0: QRegExp partMatcher2("^\\*\\* *(''.*'') *(.*)$"); Chris@0: partMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings Chris@0: Chris@0: // e.g. Scriabin Chris@0: // **[[Mazurka Op. 40 No. 1 (Scriabin)|Mazurka in D flat major]] Chris@0: // title - remarks Chris@0: QRegExp partMatcher3("^\\*\\* *(\\[\\[.*\\]\\])(.*)$"); Chris@0: Chris@0: // e.g. Berlioz Chris@0: // ** 1: ''Méditation religieuse'' Chris@0: // number - title - remarks Chris@0: QRegExp partMatcher4("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *([\\[]*''.*''[\\]]*) *(.*)$"); Chris@0: Chris@0: // e.g. Tchaikovsky Chris@0: // **4. Nocturne [???????] (C? minor) Chris@0: // number - title - remarks Chris@0: QRegExp partMatcher5("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *(.*\\[[^\\]]+\\])(.*)$"); Chris@0: Chris@0: // e.g. Schubert Chris@0: // **2. "Wohin?" Chris@0: // n/a - number - title Chris@0: QRegExp partMatcher6("^\\*\\* *(([0-9][0-9a-z]*)[\\.:])? *(("|'').*)$"); Chris@0: Chris@0: // e.g. Mendelssohn Chris@0: // ** Notturno Chris@0: // title only Chris@0: QRegExp partMatcher7("^\\*\\* *(.*)$"); Chris@0: Chris@0: Chris@0: // Date and remarks within titlefield or remarksfield Chris@0: QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\),?(.*)"); Chris@0: Chris@0: Chris@0: Work *main = 0; Chris@0: int partNumber = 0; Chris@0: Chris@0: QString line; Chris@0: QString opfield, numfield, titlefield, remarksfield, datefield; Chris@0: Chris@0: while (!stream.atEnd()) { Chris@0: Chris@0: if (line == "") { Chris@0: line = stream.readLine(); Chris@0: DEBUG << "line: " << line << endl; Chris@0: } Chris@0: Chris@0: opfield = ""; Chris@0: numfield = ""; Chris@0: titlefield = ""; Chris@0: datefield = ""; Chris@0: remarksfield = ""; Chris@0: partNumber = 0; Chris@0: Chris@0: if (workMatcher1.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 1" << endl; Chris@0: opfield = workMatcher1.cap(1); Chris@0: titlefield = workMatcher1.cap(3); Chris@0: datefield = workMatcher1.cap(4); Chris@0: remarksfield = workMatcher1.cap(6); Chris@0: Chris@0: } else if (workMatcher1a.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 1a" << endl; Chris@0: opfield = workMatcher1a.cap(1); Chris@0: titlefield = workMatcher1a.cap(3); Chris@0: /* Chris@0: datefield = workMatcher1a.cap(4); Chris@0: remarksfield = workMatcher1a.cap(6); Chris@0: */ Chris@0: Chris@0: } else if (workMatcher2.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 2" << endl; Chris@0: titlefield = workMatcher2.cap(1); Chris@0: remarksfield = workMatcher2.cap(2); Chris@0: Chris@0: } else if (workMatcher3.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 3" << endl; Chris@0: titlefield = workMatcher3.cap(1); Chris@0: datefield = workMatcher3.cap(2); Chris@0: remarksfield = workMatcher3.cap(3); Chris@0: Chris@0: } else if (workMatcher4.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 4" << endl; Chris@0: titlefield = workMatcher4.cap(1); Chris@0: opfield = workMatcher4.cap(2); Chris@0: remarksfield = workMatcher4.cap(4); Chris@0: Chris@0: } else if (workMatcher5.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 5" << endl; Chris@0: opfield = workMatcher5.cap(1); Chris@0: titlefield = workMatcher5.cap(3); Chris@0: remarksfield = workMatcher5.cap(4); Chris@0: Chris@0: } else if (workMatcher6.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 6" << endl; Chris@0: opfield = workMatcher6.cap(2); Chris@0: titlefield = workMatcher6.cap(6); Chris@0: Chris@0: } else if (workMatcher7.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 7" << endl; Chris@0: titlefield = workMatcher7.cap(1); Chris@0: opfield = workMatcher7.cap(2); Chris@0: datefield = workMatcher7.cap(3); Chris@0: Chris@0: } else if (workMatcher8.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 8" << endl; Chris@0: titlefield = workMatcher8.cap(1); Chris@0: datefield = workMatcher8.cap(2); Chris@0: opfield = workMatcher8.cap(3); Chris@0: Chris@0: } else if (workMatcher9.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 9" << endl; Chris@0: opfield = workMatcher9.cap(1); Chris@0: titlefield = workMatcher9.cap(2); Chris@0: Chris@0: } else if (workMatcher10.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "matcher 10" << endl; Chris@0: titlefield = workMatcher10.cap(1); Chris@0: Chris@0: } else { Chris@0: if (line.startsWith("*") || line.startsWith(":")) { Chris@0: DEBUG << "Failed to match promising works list line: " << line << endl; Chris@0: } Chris@0: line = ""; Chris@0: continue; Chris@0: } Chris@0: Chris@0: if (titlefield != "" && datefield == "") { Chris@0: int dpos; Chris@0: if ((dpos = matcherDate.indexIn(titlefield)) != -1) { Chris@0: datefield = matcherDate.cap(1); Chris@0: remarksfield = matcherDate.cap(2); Chris@0: titlefield = titlefield.left(dpos); Chris@0: } Chris@0: } Chris@0: Chris@0: if (remarksfield != "" && datefield == "") { Chris@0: int dpos; Chris@0: if ((dpos = matcherDate.indexIn(remarksfield)) != -1) { Chris@0: datefield = matcherDate.cap(1); Chris@0: remarksfield = remarksfield.left(dpos); Chris@0: } Chris@0: } Chris@0: Chris@0: main = makeWork(composerName, opfield, "", 0, Chris@0: titlefield, datefield, "", remarksfield, 0); Chris@0: Chris@0: if (main) m_objects.push_back(main); Chris@0: Chris@0: line = ""; Chris@0: Chris@0: while (!stream.atEnd()) { Chris@0: Chris@0: ++partNumber; Chris@0: line = stream.readLine(); Chris@0: DEBUG << "line: " << line << endl; Chris@0: Chris@0: if (partMatcher1.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 1" << endl; Chris@0: numfield = partMatcher1.cap(1); Chris@0: titlefield = partMatcher1.cap(3); Chris@0: remarksfield = ""; Chris@0: Chris@0: } else if (partMatcher2.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 2" << endl; Chris@0: titlefield = partMatcher2.cap(1); Chris@0: remarksfield = partMatcher2.cap(2); Chris@0: Chris@0: } else if (partMatcher3.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 3" << endl; Chris@0: titlefield = partMatcher3.cap(1); Chris@0: remarksfield = partMatcher3.cap(2); Chris@0: Chris@0: } else if (partMatcher4.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 4" << endl; Chris@0: numfield = partMatcher4.cap(1); Chris@0: titlefield = partMatcher4.cap(2); Chris@0: remarksfield = partMatcher4.cap(3); Chris@0: Chris@0: } else if (partMatcher5.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 5" << endl; Chris@0: numfield = partMatcher5.cap(1); Chris@0: titlefield = partMatcher5.cap(2); Chris@0: remarksfield = partMatcher5.cap(3); Chris@0: Chris@0: } else if (partMatcher6.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 6" << endl; Chris@0: numfield = partMatcher6.cap(2); Chris@0: titlefield = partMatcher6.cap(3); Chris@0: Chris@0: } else if (partMatcher7.indexIn(line) >= 0) { Chris@0: Chris@0: DEBUG << "part matcher 7" << endl; Chris@0: titlefield = partMatcher7.cap(1); Chris@0: Chris@0: } else { Chris@0: if (line.startsWith("**") || line.startsWith("::")) { Chris@0: DEBUG << "Failed to match promising part line: " << line << endl; Chris@0: } Chris@0: break; Chris@0: } Chris@0: Chris@0: if (titlefield != "" && datefield == "") { Chris@0: int dpos; Chris@0: if ((dpos = matcherDate.indexIn(titlefield)) != -1) { Chris@0: datefield = matcherDate.cap(1); Chris@0: remarksfield = matcherDate.cap(2); Chris@0: titlefield = titlefield.left(dpos); Chris@0: } Chris@0: } Chris@0: Chris@0: Work *part = makeWork(composerName, opfield, numfield, partNumber, Chris@0: titlefield, datefield, "", remarksfield, Chris@0: main); Chris@0: Chris@0: if (part) m_objects.push_back(part); Chris@0: } Chris@0: } Chris@0: Chris@0: DEBUG << "Found " << m_objects.size() << " things" << endl; Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: Chris@0: