Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportWikipediaWorksList.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 WikipediaWorksListImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "WikipediaWorksListImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 static QString
|
Chris@0
|
27 sanitise(QString field, QString &linkText)
|
Chris@0
|
28 {
|
Chris@0
|
29 int mp;
|
Chris@0
|
30
|
Chris@0
|
31 field.replace(QString::fromUtf8("\342\200\222"), "-");
|
Chris@0
|
32 field.replace(QString::fromUtf8("\342\200\223"), "-");
|
Chris@0
|
33 field.replace(QString::fromUtf8("\342\200\224"), "-");
|
Chris@0
|
34 field.replace(QString::fromUtf8("\342\200\225"), "-");
|
Chris@0
|
35
|
Chris@0
|
36 field.replace(QString::fromUtf8("\342\231\255"), "-flat");
|
Chris@0
|
37 field.replace(QString::fromUtf8("\342\231\257"), "-sharp");
|
Chris@0
|
38
|
Chris@0
|
39 QRegExp link2("([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
|
Chris@0
|
40 if ((mp = link2.indexIn(field)) >= 0) {
|
Chris@0
|
41 if (linkText == "" && mp < 4) linkText = link2.cap(2);
|
Chris@0
|
42 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
|
Chris@0
|
43 return sanitise(field, linkText);
|
Chris@0
|
44 }
|
Chris@0
|
45
|
Chris@0
|
46 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
|
Chris@0
|
47 if ((mp = link1.indexIn(field)) >= 0) {
|
Chris@0
|
48 if (linkText == "") linkText = link1.cap(2);
|
Chris@0
|
49 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
|
Chris@0
|
50 return sanitise(field, linkText);
|
Chris@0
|
51 }
|
Chris@0
|
52
|
Chris@0
|
53 field = field.trimmed();
|
Chris@0
|
54
|
Chris@0
|
55 field.replace("[", "");
|
Chris@0
|
56 field.replace("]", "");
|
Chris@0
|
57 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), " ");
|
Chris@0
|
58 field.replace("'''", "\"");
|
Chris@0
|
59 field.replace("''", "\"");
|
Chris@0
|
60 field.replace(""", "\"");
|
Chris@0
|
61 field.replace("\"\"", "\"");
|
Chris@0
|
62 field.replace(QRegExp("^[\'\"] (\")?"), "\"");
|
Chris@0
|
63 field.replace(QRegExp("<[^&]*>"), "");
|
Chris@0
|
64 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
65
|
Chris@0
|
66 if (field.endsWith("c.")) {
|
Chris@0
|
67 // historical artifact from removal of Bruckner year indication (c. 1856)
|
Chris@0
|
68 field = field.left(field.length()-2);
|
Chris@0
|
69 }
|
Chris@0
|
70
|
Chris@0
|
71 while (field.endsWith(".") || field.endsWith(",")) {
|
Chris@0
|
72 field = field.left(field.length()-1);
|
Chris@0
|
73 }
|
Chris@0
|
74
|
Chris@0
|
75 if (field.startsWith(";") || field.startsWith(":") || field.startsWith(",")
|
Chris@0
|
76 || field.startsWith("-")) {
|
Chris@0
|
77 field = field.right(field.length()-1);
|
Chris@0
|
78 }
|
Chris@0
|
79
|
Chris@0
|
80 if (field.startsWith("(") && field.endsWith(")")) {
|
Chris@0
|
81 DEBUG << "before: " << field;
|
Chris@0
|
82 field = field.mid(1, field.length()-2);
|
Chris@0
|
83 DEBUG << "after: " << field;
|
Chris@0
|
84 }
|
Chris@0
|
85
|
Chris@0
|
86 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
87 if (field == ")" || field == "(") {
|
Chris@0
|
88 field = "";
|
Chris@0
|
89 }
|
Chris@0
|
90
|
Chris@0
|
91 field.replace(" - ,", ",");
|
Chris@0
|
92 field.replace(" ", " ");
|
Chris@0
|
93
|
Chris@0
|
94 return field.trimmed();
|
Chris@0
|
95 }
|
Chris@0
|
96
|
Chris@0
|
97 static QString
|
Chris@0
|
98 extractYear(QString datefield)
|
Chris@0
|
99 {
|
Chris@0
|
100 QRegExp re("[0-9]{4}");
|
Chris@0
|
101 if (re.indexIn(datefield) >= 0) {
|
Chris@0
|
102 return re.cap(0);
|
Chris@0
|
103 }
|
Chris@0
|
104 return "";
|
Chris@0
|
105 }
|
Chris@0
|
106
|
Chris@0
|
107 static QString
|
Chris@0
|
108 extractKey(QString titlefield)
|
Chris@0
|
109 {
|
Chris@0
|
110 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
|
Chris@0
|
111 if (re.indexIn(titlefield) >= 0) {
|
Chris@0
|
112 return re.cap(1);
|
Chris@0
|
113 }
|
Chris@0
|
114 return "";
|
Chris@0
|
115 }
|
Chris@0
|
116
|
Chris@0
|
117 static Work *
|
Chris@0
|
118 makeWork(QString composerName, QString opfield, QString numfield,
|
Chris@0
|
119 int partNumber, QString titlefield, QString datefield,
|
Chris@0
|
120 QString placefield, QString remarksfield, Work *main)
|
Chris@0
|
121 {
|
Chris@0
|
122 if (titlefield.contains("List of ") || titlefield.contains("http:")) return 0;
|
Chris@0
|
123
|
Chris@0
|
124 QString linkText;
|
Chris@0
|
125
|
Chris@0
|
126 Work *w = new Work;
|
Chris@0
|
127
|
Chris@0
|
128 QRegExp embeddedOpMatcher("([Oo]pus|[Oo]p.|WAB) (posth[a-z\\.]* *)?([0-9][^ ;:,]*)(,? *([Nn]umber|[Nn]o.|[Nn]r.) ([0-9][^ ;:,]*))?,?");
|
Chris@0
|
129 if (embeddedOpMatcher.indexIn(titlefield) >= 0) {
|
Chris@0
|
130 QString opf = embeddedOpMatcher.cap(0);
|
Chris@0
|
131 if (opfield == "") opfield = opf;
|
Chris@0
|
132 titlefield.replace(opf, "");
|
Chris@0
|
133 } else if (embeddedOpMatcher.indexIn(remarksfield) >= 0) {
|
Chris@0
|
134 opfield = embeddedOpMatcher.cap(0);
|
Chris@0
|
135 }
|
Chris@0
|
136 if (main && numfield == "") {
|
Chris@0
|
137 QRegExp embeddedNumMatcher("(Number|No.|Nr.) ([0-9][^ ;:,]*)");
|
Chris@0
|
138 if (embeddedNumMatcher.indexIn(titlefield) >= 0) {
|
Chris@0
|
139 numfield = embeddedNumMatcher.cap(2);
|
Chris@0
|
140 } else if (embeddedNumMatcher.indexIn(remarksfield) >= 0) {
|
Chris@0
|
141 numfield = embeddedNumMatcher.cap(2);
|
Chris@0
|
142 }
|
Chris@0
|
143 }
|
Chris@0
|
144
|
Chris@0
|
145 QString op = sanitise(opfield, linkText);
|
Chris@0
|
146 if (op != "") {
|
Chris@0
|
147 if (op.toLower().contains("op")) {
|
Chris@0
|
148 op.replace("Opus ", "");
|
Chris@0
|
149 op.replace("Op. ", "");
|
Chris@0
|
150 op.replace("Op.", "");
|
Chris@0
|
151 op.replace("Op ", "");
|
Chris@0
|
152 op.replace("opus ", "");
|
Chris@0
|
153 op.replace("op. ", "");
|
Chris@0
|
154 op.replace("op.", "");
|
Chris@0
|
155 op.replace("op ", "");
|
Chris@0
|
156 w->setOpus(op);
|
Chris@0
|
157 } else if (QRegExp("^[0-9]*$").indexIn(op) >= 0) {
|
Chris@0
|
158 w->setOpus(op);
|
Chris@0
|
159 } else {
|
Chris@0
|
160 w->setCatalogue(op);
|
Chris@0
|
161 }
|
Chris@0
|
162 }
|
Chris@0
|
163
|
Chris@0
|
164 QString num = sanitise(numfield, linkText);
|
Chris@0
|
165 if (num != "") {
|
Chris@0
|
166 num.replace("No. ", "");
|
Chris@0
|
167 num.replace("No ", "");
|
Chris@0
|
168 w->setNumber(num);
|
Chris@0
|
169 } else if (partNumber > 0) {
|
Chris@0
|
170 w->setNumber(QString("%1").arg(partNumber));
|
Chris@0
|
171 }
|
Chris@0
|
172
|
Chris@0
|
173 QString key = extractKey(titlefield);
|
Chris@0
|
174 if (key != "") {
|
Chris@0
|
175 w->setKey(key);
|
Chris@0
|
176 }
|
Chris@0
|
177
|
Chris@0
|
178 DEBUG << "title before sanitise: " << titlefield << endl;
|
Chris@0
|
179
|
Chris@0
|
180 remarksfield = remarksfield.trimmed();
|
Chris@0
|
181
|
Chris@0
|
182 QString title = sanitise(titlefield, linkText);
|
Chris@0
|
183 title.replace(QRegExp(", which.*$"), "");
|
Chris@0
|
184 if (linkText != "") {
|
Chris@0
|
185 if (remarksfield == "" && title.startsWith(linkText)) {
|
Chris@0
|
186 remarksfield = title.right(title.length() - linkText.length());
|
Chris@0
|
187 title = linkText;
|
Chris@0
|
188 }
|
Chris@0
|
189 linkText.replace(" ", "_");
|
Chris@0
|
190 QUrl url;
|
Chris@0
|
191 url.setScheme("http");
|
Chris@0
|
192 url.setHost("en.wikipedia.org");
|
Chris@0
|
193 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
|
Chris@0
|
194 Document *d = new Document;
|
Chris@18
|
195 d->setUri(Uri(url));
|
Chris@0
|
196 d->setSiteName("Wikipedia");
|
Chris@0
|
197 d->setTopic(w);
|
Chris@0
|
198 w->addPage(d);
|
Chris@0
|
199 }
|
Chris@0
|
200
|
Chris@0
|
201 DEBUG << "title after sanitise: " << title << ", link text " << linkText << ", remarks " << remarksfield << endl;
|
Chris@0
|
202
|
Chris@0
|
203 QRegExp explicationRE("^(\"[^-]+\") - (.+)$");
|
Chris@0
|
204 int pos;
|
Chris@0
|
205 if ((pos = explicationRE.indexIn(title)) >= 0) {
|
Chris@0
|
206 QString part = explicationRE.cap(2);
|
Chris@0
|
207 if (part[0].isUpper()) w->addAlias(explicationRE.cap(2));
|
Chris@0
|
208 else if (remarksfield == "") remarksfield = explicationRE.cap(2);
|
Chris@0
|
209 title = explicationRE.cap(1);
|
Chris@0
|
210 }
|
Chris@0
|
211
|
Chris@0
|
212 QRegExp remarksRE1("^(\"[^-]+\") (for .*)$");
|
Chris@0
|
213 if ((pos = remarksRE1.indexIn(title)) >= 0) {
|
Chris@0
|
214 if (remarksfield != "") {
|
Chris@0
|
215 remarksfield = QString("%1 - %2")
|
Chris@0
|
216 .arg(remarksRE1.cap(2)).arg(remarksfield);
|
Chris@0
|
217 } else {
|
Chris@0
|
218 remarksfield = remarksRE1.cap(2);
|
Chris@0
|
219 }
|
Chris@0
|
220 title = remarksRE1.cap(1);
|
Chris@0
|
221 }
|
Chris@0
|
222
|
Chris@0
|
223 QRegExp remarksRE2("^(\"[^\"]+\"), (.*)$");
|
Chris@0
|
224 if ((pos = remarksRE2.indexIn(title)) >= 0) {
|
Chris@0
|
225 if (remarksfield != "") {
|
Chris@0
|
226 remarksfield = QString("%1 - %2")
|
Chris@0
|
227 .arg(remarksRE2.cap(2)).arg(remarksfield);
|
Chris@0
|
228 } else {
|
Chris@0
|
229 remarksfield = remarksRE2.cap(2);
|
Chris@0
|
230 }
|
Chris@0
|
231 title = remarksRE2.cap(1);
|
Chris@0
|
232 }
|
Chris@0
|
233
|
Chris@0
|
234 QRegExp explicationRE2("^([^\\(]*\") \\(([^\\)]*)\\)(.*)$");
|
Chris@0
|
235 if ((pos = explicationRE2.indexIn(title)) >= 0) {
|
Chris@0
|
236 w->addAlias(explicationRE2.cap(2));
|
Chris@0
|
237 if (remarksfield == "") remarksfield = explicationRE2.cap(3);
|
Chris@0
|
238 title = explicationRE2.cap(1);
|
Chris@0
|
239 }
|
Chris@0
|
240
|
Chris@0
|
241 if (title.startsWith("Song \"")) {
|
Chris@0
|
242 title = title.right(title.length() - 5);
|
Chris@0
|
243 w->addForm(Form::getFormByName("song"));
|
Chris@0
|
244 }
|
Chris@0
|
245 if (!main && title.startsWith("Song cycle \"")) {
|
Chris@0
|
246 title = title.right(title.length() - 11);
|
Chris@0
|
247 w->addForm(Form::getFormByName("song cycle"));
|
Chris@0
|
248 }
|
Chris@0
|
249 if (main && main->forms().contains(Form::getFormByName("song cycle"))) {
|
Chris@0
|
250 w->addForm(Form::getFormByName("song"));
|
Chris@0
|
251 }
|
Chris@0
|
252
|
Chris@0
|
253 if (title == "" && !main) {
|
Chris@0
|
254 delete w;
|
Chris@0
|
255 return 0;
|
Chris@0
|
256 }
|
Chris@0
|
257
|
Chris@0
|
258 w->setName(title);
|
Chris@0
|
259
|
Chris@0
|
260 QString remarks = sanitise(remarksfield, linkText);
|
Chris@0
|
261 if (remarks != "") {
|
Chris@0
|
262 w->setRemarks(remarks);
|
Chris@0
|
263 }
|
Chris@0
|
264
|
Chris@0
|
265 QString year = extractYear(datefield);
|
Chris@0
|
266 QString place = sanitise(placefield, linkText);
|
Chris@0
|
267
|
Chris@0
|
268 DEBUG << "title = " << title << endl;
|
Chris@0
|
269
|
Chris@0
|
270 if (main) {
|
Chris@0
|
271 main->addPart(w);
|
Chris@0
|
272 w->setPartOf(main);
|
Chris@0
|
273 w->setComposition(main->composition());
|
Chris@0
|
274 main->composition()->addWork(w);
|
Chris@0
|
275 }
|
Chris@0
|
276
|
Chris@0
|
277 if (!main || !main->composition() ||
|
Chris@0
|
278 (year != "" && (main->composition()->year() != year.toInt()))) {
|
Chris@0
|
279 Composition *c = new Composition;
|
Chris@0
|
280 c->setComposerName(composerName);
|
Chris@0
|
281 c->addWork(w);
|
Chris@0
|
282 c->setYear(year.toInt());
|
Chris@0
|
283 c->setPlace(place);
|
Chris@0
|
284 w->setComposition(c);
|
Chris@0
|
285 }
|
Chris@0
|
286
|
Chris@0
|
287 return w;
|
Chris@0
|
288 }
|
Chris@0
|
289
|
Chris@0
|
290
|
Chris@0
|
291 void
|
Chris@0
|
292 WikipediaWorksListImporter::import(QUrl source)
|
Chris@0
|
293 {
|
Chris@0
|
294 //!!! for now
|
Chris@0
|
295 QString filename = source.toLocalFile();
|
Chris@0
|
296
|
Chris@0
|
297 QFile file(filename);
|
Chris@0
|
298 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
299 throw std::exception();
|
Chris@0
|
300 }
|
Chris@0
|
301
|
Chris@0
|
302 QTextStream stream(&file);
|
Chris@0
|
303 stream.setCodec("UTF-8");
|
Chris@0
|
304
|
Chris@0
|
305 QString composerName;
|
Chris@0
|
306 if (filename.contains("K%C3%B6chel")) {
|
Chris@0
|
307 composerName = "Wolfgang Amadeus Mozart";
|
Chris@0
|
308 } else if (filename.contains("/Schubert_")) {
|
Chris@0
|
309 composerName = "Franz Schubert";
|
Chris@0
|
310 } else {
|
Chris@0
|
311 QRegExp byby("by_(.*)_by");
|
Chris@0
|
312 if (byby.indexIn(filename) >= 0) {
|
Chris@0
|
313 composerName = byby.cap(1).replace('_', ' ');
|
Chris@0
|
314 } else {
|
Chris@0
|
315 QRegExp bybr("by_(.*)_\\(");
|
Chris@0
|
316 if (bybr.indexIn(filename) >= 0) {
|
Chris@0
|
317 composerName = bybr.cap(1).replace('_', ' ');
|
Chris@0
|
318 } else {
|
Chris@0
|
319 QRegExp by("by_(.*)");
|
Chris@0
|
320 if (by.indexIn(filename) >= 0) {
|
Chris@0
|
321 composerName = by.cap(1).replace('_', ' ');
|
Chris@0
|
322 } else {
|
Chris@0
|
323 QRegExp of("of_([A-Z].*)");
|
Chris@0
|
324 if (of.indexIn(filename) >= 0) {
|
Chris@0
|
325 composerName = of.cap(1).replace('_', ' ');
|
Chris@0
|
326 }
|
Chris@0
|
327 }
|
Chris@0
|
328 }
|
Chris@0
|
329 }
|
Chris@0
|
330 }
|
Chris@0
|
331 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
|
Chris@0
|
332
|
Chris@0
|
333 DEBUG << "composerName = " << composerName << endl;
|
Chris@0
|
334
|
Chris@0
|
335
|
Chris@0
|
336 // We try to keep these matchers specific enough that we can be
|
Chris@0
|
337 // sure the title field will come out containing _at least_ the
|
Chris@0
|
338 // title. i.e. the title field should never end up with just the
|
Chris@0
|
339 // opus number or date or whatever, even if the line is formatted
|
Chris@0
|
340 // in a way we hadn't anticipated. Thus it helps if the title is
|
Chris@0
|
341 // bookended by '' or [[]], etc
|
Chris@0
|
342
|
Chris@0
|
343 // e.g. Beethoven
|
Chris@0
|
344 // *Opus 84: ''[[Egmont (Beethoven)|Egmont]]'', overture and incidental music (1810)
|
Chris@0
|
345 // opus field - n/a - title - date - n/a - remarks
|
Chris@0
|
346 QRegExp workMatcher1("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:{]*)[:,] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
|
Chris@0
|
347
|
Chris@0
|
348 // e.g. Tchaikovsky
|
Chris@0
|
349 // *'''Op. 19''' 6 Pieces, for piano (1873)
|
Chris@0
|
350 // or Ravel
|
Chris@0
|
351 // * '''1''', Piano Sonata movement (1888), lost
|
Chris@0
|
352
|
Chris@0
|
353 /*
|
Chris@0
|
354 // opus field - n/a - title - date - n/a - remarks
|
Chris@0
|
355 QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G)? *[0-9][^ ,:'{]*)'''[:, ] *(.*) *\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\) *(.*)$");
|
Chris@0
|
356 */
|
Chris@0
|
357 // opus field - n/a - title
|
Chris@0
|
358 QRegExp workMatcher1a("^\\* *'''(([Oo]pus|[Oo]p\\.|WoO|Anh|[A-Z]{1,2})?\\.? *[0-9][^ ,:'{]*),?'''[:, ] *(.*)$");
|
Chris@0
|
359
|
Chris@0
|
360 // e.g. Copland
|
Chris@0
|
361 // * ''Four Motets'' for mixed voices (1921)
|
Chris@0
|
362 // title - date field
|
Chris@0
|
363 // (no opus)
|
Chris@0
|
364 QRegExp workMatcher2("^\\* *(''.*''\\)?) *(.*)$");
|
Chris@0
|
365 workMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings
|
Chris@0
|
366
|
Chris@0
|
367 // e.g. Copland
|
Chris@0
|
368 // * Arrangement of ''Lincoln Portrait'' for concert band (1942)
|
Chris@0
|
369 // or Mendelssohn
|
Chris@0
|
370 // * [[Christe du Lamm Gottes]] (1827), SATB, strings
|
Chris@0
|
371 // title - date field - remarks
|
Chris@0
|
372 // (no opus)
|
Chris@0
|
373 QRegExp workMatcher3("^\\* *([^\\*].*) *\\(([^\\)]*[0-9]{4}[^\\)]*)\\) *(.*)$");
|
Chris@0
|
374
|
Chris@0
|
375 // e.g. Scriabin
|
Chris@0
|
376 // *[[Sonata No. 2 (Scriabin)|Sonata No. 2 in G sharp minor]], Op. 19 (also known as ''Sonata-Fantasy'')"
|
Chris@0
|
377 // title - opus field - n/a - remarks
|
Chris@0
|
378 QRegExp workMatcher4("^\\* *(\\[\\[.*\\]\\]),* (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*) *(.*)$");
|
Chris@0
|
379
|
Chris@0
|
380 // e.g. Scriabin
|
Chris@0
|
381 // *Opus 35: [[Opus 35 (Scriabin)|Three Preludes]]
|
Chris@0
|
382 // opus field - n/a - title - remarks
|
Chris@0
|
383 QRegExp workMatcher5("^\\* *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)[:,]* *([\\[']+.*[\\]']+) *(.*)$");
|
Chris@0
|
384
|
Chris@0
|
385 // e.g. Boccherini
|
Chris@0
|
386 // *G 1: Cello Sonata in F major
|
Chris@0
|
387 // or weird Schubert layout
|
Chris@0
|
388 // * D 505{{nbsp|4}}Adagio in D-flat for Piano
|
Chris@0
|
389 // or Glazunov
|
Chris@0
|
390 // :Op. 67: ''[[The Seasons (ballet)|The Seasons]]'', ballet in one act (1900)
|
Chris@0
|
391 // or even
|
Chris@0
|
392 // ::Op. 77: ''[[Symphony No. 7 (Glazunov)|Symphony No. 7]]'' "Pastorale" in F major (1902-1903)
|
Chris@0
|
393 // This one is a real mess, for really messy pages. Needs to go near
|
Chris@0
|
394 // the end of the matchers in case it catches something it shouldn't
|
Chris@0
|
395 // n/a - opus field - n/a - n/a - n/a - title
|
Chris@0
|
396 QRegExp workMatcher6("^([\\*:]|::) *(([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*)(([:,]| *\\{+[^\\}]+\\}+) *(.*))?$");
|
Chris@0
|
397
|
Chris@0
|
398 // e.g. Bruch
|
Chris@0
|
399 // * Adagio appassionato for violin and orchestra in C sharp minor, Op. 57
|
Chris@0
|
400 // title - opus field - date field
|
Chris@0
|
401 QRegExp workMatcher7("^\\* *(.*),? (([Oo]pus|[Oo]p\\.|WoO|Anh|H|D|G) *[0-9][^ ,:'{]*|[Oo]p. posth[a-z.]*) *(\\([^\\)]*([0-9]{4}(-[0-9]+)*)[^0-9\\)]*\\))? *$");
|
Chris@0
|
402
|
Chris@0
|
403 // e.g. Bruckner
|
Chris@0
|
404 // * Symphony No. 0 in D minor 1869 WAB 100
|
Chris@0
|
405 // title - date field - opus field
|
Chris@0
|
406 QRegExp workMatcher8("^\\* *(.*) ([0-9]{4}[0-9/-]*) *(WAB [0-9][^ ]*)$");
|
Chris@0
|
407
|
Chris@0
|
408 // e.g. Bach
|
Chris@0
|
409 // * BWV 506 ? Was bist du doch, o Seele, so betruebet
|
Chris@0
|
410 // opus field - title
|
Chris@0
|
411 QRegExp workMatcher9("^\\* *(BWV [^ ]+)(.*)$");
|
Chris@0
|
412
|
Chris@0
|
413 // Catch-all for things that look at all promising (anything that
|
Chris@0
|
414 // starts with ' or [ after bullet: take the whole as title)
|
Chris@0
|
415 QRegExp workMatcher10("^[\\*:] *((['\\[]|").*)$");
|
Chris@0
|
416
|
Chris@0
|
417
|
Chris@0
|
418
|
Chris@0
|
419 // e.g. Beethoven
|
Chris@0
|
420 // **No. 1: [[Piano Trio No. 1 (Beethoven)|Piano Trio No. 1]] in E-flat major
|
Chris@0
|
421 // number field - n/a - title, remarks etc
|
Chris@0
|
422 QRegExp partMatcher1("^[\\*:]{2} *((No\\.? *)?[0-9][^ ,:'{]*)[:, ] *(.*)$");
|
Chris@0
|
423
|
Chris@0
|
424 // e.g. Copland
|
Chris@0
|
425 // ** ''Help us, O Lord''
|
Chris@0
|
426 // title - remarks
|
Chris@0
|
427 QRegExp partMatcher2("^\\*\\* *(''.*'') *(.*)$");
|
Chris@0
|
428 partMatcher2.setMinimal(true); // avoid matching multiple ''...'' substrings
|
Chris@0
|
429
|
Chris@0
|
430 // e.g. Scriabin
|
Chris@0
|
431 // **[[Mazurka Op. 40 No. 1 (Scriabin)|Mazurka in D flat major]]
|
Chris@0
|
432 // title - remarks
|
Chris@0
|
433 QRegExp partMatcher3("^\\*\\* *(\\[\\[.*\\]\\])(.*)$");
|
Chris@0
|
434
|
Chris@0
|
435 // e.g. Berlioz
|
Chris@0
|
436 // ** 1: ''Méditation religieuse''
|
Chris@0
|
437 // number - title - remarks
|
Chris@0
|
438 QRegExp partMatcher4("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *([\\[]*''.*''[\\]]*) *(.*)$");
|
Chris@0
|
439
|
Chris@0
|
440 // e.g. Tchaikovsky
|
Chris@0
|
441 // **4. Nocturne [???????] (C? minor)
|
Chris@0
|
442 // number - title - remarks
|
Chris@0
|
443 QRegExp partMatcher5("^\\*\\* *([0-9][0-9a-z]*)[\\.: ] *(.*\\[[^\\]]+\\])(.*)$");
|
Chris@0
|
444
|
Chris@0
|
445 // e.g. Schubert
|
Chris@0
|
446 // **2. "Wohin?"
|
Chris@0
|
447 // n/a - number - title
|
Chris@0
|
448 QRegExp partMatcher6("^\\*\\* *(([0-9][0-9a-z]*)[\\.:])? *(("|'').*)$");
|
Chris@0
|
449
|
Chris@0
|
450 // e.g. Mendelssohn
|
Chris@0
|
451 // ** Notturno
|
Chris@0
|
452 // title only
|
Chris@0
|
453 QRegExp partMatcher7("^\\*\\* *(.*)$");
|
Chris@0
|
454
|
Chris@0
|
455
|
Chris@0
|
456 // Date and remarks within titlefield or remarksfield
|
Chris@0
|
457 QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\),?(.*)");
|
Chris@0
|
458
|
Chris@0
|
459
|
Chris@0
|
460 Work *main = 0;
|
Chris@0
|
461 int partNumber = 0;
|
Chris@0
|
462
|
Chris@0
|
463 QString line;
|
Chris@0
|
464 QString opfield, numfield, titlefield, remarksfield, datefield;
|
Chris@0
|
465
|
Chris@0
|
466 while (!stream.atEnd()) {
|
Chris@0
|
467
|
Chris@0
|
468 if (line == "") {
|
Chris@0
|
469 line = stream.readLine();
|
Chris@0
|
470 DEBUG << "line: " << line << endl;
|
Chris@0
|
471 }
|
Chris@0
|
472
|
Chris@0
|
473 opfield = "";
|
Chris@0
|
474 numfield = "";
|
Chris@0
|
475 titlefield = "";
|
Chris@0
|
476 datefield = "";
|
Chris@0
|
477 remarksfield = "";
|
Chris@0
|
478 partNumber = 0;
|
Chris@0
|
479
|
Chris@0
|
480 if (workMatcher1.indexIn(line) >= 0) {
|
Chris@0
|
481
|
Chris@0
|
482 DEBUG << "matcher 1" << endl;
|
Chris@0
|
483 opfield = workMatcher1.cap(1);
|
Chris@0
|
484 titlefield = workMatcher1.cap(3);
|
Chris@0
|
485 datefield = workMatcher1.cap(4);
|
Chris@0
|
486 remarksfield = workMatcher1.cap(6);
|
Chris@0
|
487
|
Chris@0
|
488 } else if (workMatcher1a.indexIn(line) >= 0) {
|
Chris@0
|
489
|
Chris@0
|
490 DEBUG << "matcher 1a" << endl;
|
Chris@0
|
491 opfield = workMatcher1a.cap(1);
|
Chris@0
|
492 titlefield = workMatcher1a.cap(3);
|
Chris@0
|
493 /*
|
Chris@0
|
494 datefield = workMatcher1a.cap(4);
|
Chris@0
|
495 remarksfield = workMatcher1a.cap(6);
|
Chris@0
|
496 */
|
Chris@0
|
497
|
Chris@0
|
498 } else if (workMatcher2.indexIn(line) >= 0) {
|
Chris@0
|
499
|
Chris@0
|
500 DEBUG << "matcher 2" << endl;
|
Chris@0
|
501 titlefield = workMatcher2.cap(1);
|
Chris@0
|
502 remarksfield = workMatcher2.cap(2);
|
Chris@0
|
503
|
Chris@0
|
504 } else if (workMatcher3.indexIn(line) >= 0) {
|
Chris@0
|
505
|
Chris@0
|
506 DEBUG << "matcher 3" << endl;
|
Chris@0
|
507 titlefield = workMatcher3.cap(1);
|
Chris@0
|
508 datefield = workMatcher3.cap(2);
|
Chris@0
|
509 remarksfield = workMatcher3.cap(3);
|
Chris@0
|
510
|
Chris@0
|
511 } else if (workMatcher4.indexIn(line) >= 0) {
|
Chris@0
|
512
|
Chris@0
|
513 DEBUG << "matcher 4" << endl;
|
Chris@0
|
514 titlefield = workMatcher4.cap(1);
|
Chris@0
|
515 opfield = workMatcher4.cap(2);
|
Chris@0
|
516 remarksfield = workMatcher4.cap(4);
|
Chris@0
|
517
|
Chris@0
|
518 } else if (workMatcher5.indexIn(line) >= 0) {
|
Chris@0
|
519
|
Chris@0
|
520 DEBUG << "matcher 5" << endl;
|
Chris@0
|
521 opfield = workMatcher5.cap(1);
|
Chris@0
|
522 titlefield = workMatcher5.cap(3);
|
Chris@0
|
523 remarksfield = workMatcher5.cap(4);
|
Chris@0
|
524
|
Chris@0
|
525 } else if (workMatcher6.indexIn(line) >= 0) {
|
Chris@0
|
526
|
Chris@0
|
527 DEBUG << "matcher 6" << endl;
|
Chris@0
|
528 opfield = workMatcher6.cap(2);
|
Chris@0
|
529 titlefield = workMatcher6.cap(6);
|
Chris@0
|
530
|
Chris@0
|
531 } else if (workMatcher7.indexIn(line) >= 0) {
|
Chris@0
|
532
|
Chris@0
|
533 DEBUG << "matcher 7" << endl;
|
Chris@0
|
534 titlefield = workMatcher7.cap(1);
|
Chris@0
|
535 opfield = workMatcher7.cap(2);
|
Chris@0
|
536 datefield = workMatcher7.cap(3);
|
Chris@0
|
537
|
Chris@0
|
538 } else if (workMatcher8.indexIn(line) >= 0) {
|
Chris@0
|
539
|
Chris@0
|
540 DEBUG << "matcher 8" << endl;
|
Chris@0
|
541 titlefield = workMatcher8.cap(1);
|
Chris@0
|
542 datefield = workMatcher8.cap(2);
|
Chris@0
|
543 opfield = workMatcher8.cap(3);
|
Chris@0
|
544
|
Chris@0
|
545 } else if (workMatcher9.indexIn(line) >= 0) {
|
Chris@0
|
546
|
Chris@0
|
547 DEBUG << "matcher 9" << endl;
|
Chris@0
|
548 opfield = workMatcher9.cap(1);
|
Chris@0
|
549 titlefield = workMatcher9.cap(2);
|
Chris@0
|
550
|
Chris@0
|
551 } else if (workMatcher10.indexIn(line) >= 0) {
|
Chris@0
|
552
|
Chris@0
|
553 DEBUG << "matcher 10" << endl;
|
Chris@0
|
554 titlefield = workMatcher10.cap(1);
|
Chris@0
|
555
|
Chris@0
|
556 } else {
|
Chris@0
|
557 if (line.startsWith("*") || line.startsWith(":")) {
|
Chris@0
|
558 DEBUG << "Failed to match promising works list line: " << line << endl;
|
Chris@0
|
559 }
|
Chris@0
|
560 line = "";
|
Chris@0
|
561 continue;
|
Chris@0
|
562 }
|
Chris@0
|
563
|
Chris@0
|
564 if (titlefield != "" && datefield == "") {
|
Chris@0
|
565 int dpos;
|
Chris@0
|
566 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
|
Chris@0
|
567 datefield = matcherDate.cap(1);
|
Chris@0
|
568 remarksfield = matcherDate.cap(2);
|
Chris@0
|
569 titlefield = titlefield.left(dpos);
|
Chris@0
|
570 }
|
Chris@0
|
571 }
|
Chris@0
|
572
|
Chris@0
|
573 if (remarksfield != "" && datefield == "") {
|
Chris@0
|
574 int dpos;
|
Chris@0
|
575 if ((dpos = matcherDate.indexIn(remarksfield)) != -1) {
|
Chris@0
|
576 datefield = matcherDate.cap(1);
|
Chris@0
|
577 remarksfield = remarksfield.left(dpos);
|
Chris@0
|
578 }
|
Chris@0
|
579 }
|
Chris@0
|
580
|
Chris@0
|
581 main = makeWork(composerName, opfield, "", 0,
|
Chris@0
|
582 titlefield, datefield, "", remarksfield, 0);
|
Chris@0
|
583
|
Chris@0
|
584 if (main) m_objects.push_back(main);
|
Chris@0
|
585
|
Chris@0
|
586 line = "";
|
Chris@0
|
587
|
Chris@0
|
588 while (!stream.atEnd()) {
|
Chris@0
|
589
|
Chris@0
|
590 ++partNumber;
|
Chris@0
|
591 line = stream.readLine();
|
Chris@0
|
592 DEBUG << "line: " << line << endl;
|
Chris@0
|
593
|
Chris@0
|
594 if (partMatcher1.indexIn(line) >= 0) {
|
Chris@0
|
595
|
Chris@0
|
596 DEBUG << "part matcher 1" << endl;
|
Chris@0
|
597 numfield = partMatcher1.cap(1);
|
Chris@0
|
598 titlefield = partMatcher1.cap(3);
|
Chris@0
|
599 remarksfield = "";
|
Chris@0
|
600
|
Chris@0
|
601 } else if (partMatcher2.indexIn(line) >= 0) {
|
Chris@0
|
602
|
Chris@0
|
603 DEBUG << "part matcher 2" << endl;
|
Chris@0
|
604 titlefield = partMatcher2.cap(1);
|
Chris@0
|
605 remarksfield = partMatcher2.cap(2);
|
Chris@0
|
606
|
Chris@0
|
607 } else if (partMatcher3.indexIn(line) >= 0) {
|
Chris@0
|
608
|
Chris@0
|
609 DEBUG << "part matcher 3" << endl;
|
Chris@0
|
610 titlefield = partMatcher3.cap(1);
|
Chris@0
|
611 remarksfield = partMatcher3.cap(2);
|
Chris@0
|
612
|
Chris@0
|
613 } else if (partMatcher4.indexIn(line) >= 0) {
|
Chris@0
|
614
|
Chris@0
|
615 DEBUG << "part matcher 4" << endl;
|
Chris@0
|
616 numfield = partMatcher4.cap(1);
|
Chris@0
|
617 titlefield = partMatcher4.cap(2);
|
Chris@0
|
618 remarksfield = partMatcher4.cap(3);
|
Chris@0
|
619
|
Chris@0
|
620 } else if (partMatcher5.indexIn(line) >= 0) {
|
Chris@0
|
621
|
Chris@0
|
622 DEBUG << "part matcher 5" << endl;
|
Chris@0
|
623 numfield = partMatcher5.cap(1);
|
Chris@0
|
624 titlefield = partMatcher5.cap(2);
|
Chris@0
|
625 remarksfield = partMatcher5.cap(3);
|
Chris@0
|
626
|
Chris@0
|
627 } else if (partMatcher6.indexIn(line) >= 0) {
|
Chris@0
|
628
|
Chris@0
|
629 DEBUG << "part matcher 6" << endl;
|
Chris@0
|
630 numfield = partMatcher6.cap(2);
|
Chris@0
|
631 titlefield = partMatcher6.cap(3);
|
Chris@0
|
632
|
Chris@0
|
633 } else if (partMatcher7.indexIn(line) >= 0) {
|
Chris@0
|
634
|
Chris@0
|
635 DEBUG << "part matcher 7" << endl;
|
Chris@0
|
636 titlefield = partMatcher7.cap(1);
|
Chris@0
|
637
|
Chris@0
|
638 } else {
|
Chris@0
|
639 if (line.startsWith("**") || line.startsWith("::")) {
|
Chris@0
|
640 DEBUG << "Failed to match promising part line: " << line << endl;
|
Chris@0
|
641 }
|
Chris@0
|
642 break;
|
Chris@0
|
643 }
|
Chris@0
|
644
|
Chris@0
|
645 if (titlefield != "" && datefield == "") {
|
Chris@0
|
646 int dpos;
|
Chris@0
|
647 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
|
Chris@0
|
648 datefield = matcherDate.cap(1);
|
Chris@0
|
649 remarksfield = matcherDate.cap(2);
|
Chris@0
|
650 titlefield = titlefield.left(dpos);
|
Chris@0
|
651 }
|
Chris@0
|
652 }
|
Chris@0
|
653
|
Chris@0
|
654 Work *part = makeWork(composerName, opfield, numfield, partNumber,
|
Chris@0
|
655 titlefield, datefield, "", remarksfield,
|
Chris@0
|
656 main);
|
Chris@0
|
657
|
Chris@0
|
658 if (part) m_objects.push_back(part);
|
Chris@0
|
659 }
|
Chris@0
|
660 }
|
Chris@0
|
661
|
Chris@0
|
662 DEBUG << "Found " << m_objects.size() << " things" << endl;
|
Chris@0
|
663 }
|
Chris@0
|
664
|
Chris@0
|
665
|
Chris@0
|
666 }
|
Chris@0
|
667
|
Chris@0
|
668
|