Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportWikipediaWorks.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 WikipediaWorksImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "WikipediaWorksImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 QString
|
Chris@0
|
27 sanitise(QString field, QString &linkText)
|
Chris@0
|
28 {
|
Chris@0
|
29 int mp;
|
Chris@0
|
30
|
Chris@0
|
31 field.replace(QString::fromUtf8("\342\200\222"), "-");
|
Chris@0
|
32 field.replace(QString::fromUtf8("\342\200\223"), "-");
|
Chris@0
|
33 field.replace(QString::fromUtf8("\342\200\224"), "-");
|
Chris@0
|
34 field.replace(QString::fromUtf8("\342\200\225"), "-");
|
Chris@0
|
35
|
Chris@0
|
36 QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
|
Chris@0
|
37 if ((mp = link2.indexIn(field)) >= 0) {
|
Chris@0
|
38 if (linkText == "") linkText = link2.cap(2);
|
Chris@0
|
39 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
|
Chris@0
|
40 return sanitise(field, linkText);
|
Chris@0
|
41 }
|
Chris@0
|
42
|
Chris@0
|
43 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
|
Chris@0
|
44 if ((mp = link1.indexIn(field)) >= 0) {
|
Chris@0
|
45 if (linkText == "") linkText = link1.cap(2);
|
Chris@0
|
46 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
|
Chris@0
|
47 return sanitise(field, linkText);
|
Chris@0
|
48 }
|
Chris@0
|
49
|
Chris@0
|
50 field = field.trimmed();
|
Chris@0
|
51
|
Chris@0
|
52 field.replace("[", "");
|
Chris@0
|
53 field.replace("]", "");
|
Chris@0
|
54 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
|
Chris@0
|
55 field.replace("''", "\"");
|
Chris@0
|
56 field.replace(""", "\"");
|
Chris@0
|
57 field.replace(QRegExp("<[^&]*>"), "");
|
Chris@0
|
58 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
59
|
Chris@0
|
60 while (field.endsWith(".") || field.endsWith(",")) {
|
Chris@0
|
61 field = field.left(field.length()-1);
|
Chris@0
|
62 }
|
Chris@0
|
63
|
Chris@0
|
64 if (field.startsWith("(") && field.endsWith(")")) {
|
Chris@0
|
65 DEBUG << "before: " << field;
|
Chris@0
|
66 field = field.mid(1, field.length()-2);
|
Chris@0
|
67 DEBUG << "after: " << field;
|
Chris@0
|
68 }
|
Chris@0
|
69 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
70 if (field == ")" || field == "(") {
|
Chris@0
|
71 field = "";
|
Chris@0
|
72 }
|
Chris@0
|
73
|
Chris@0
|
74 field.replace(" - ,", ",");
|
Chris@0
|
75
|
Chris@0
|
76 return field;
|
Chris@0
|
77 }
|
Chris@0
|
78
|
Chris@0
|
79 QString
|
Chris@0
|
80 extractYear(QString datefield)
|
Chris@0
|
81 {
|
Chris@0
|
82 QRegExp re("[0-9]{4}");
|
Chris@0
|
83 if (re.indexIn(datefield) >= 0) {
|
Chris@0
|
84 return re.cap(0);
|
Chris@0
|
85 }
|
Chris@0
|
86 return "";
|
Chris@0
|
87 }
|
Chris@0
|
88
|
Chris@0
|
89 QString
|
Chris@0
|
90 extractKey(QString titlefield)
|
Chris@0
|
91 {
|
Chris@0
|
92 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
|
Chris@0
|
93 if (re.indexIn(titlefield) >= 0) {
|
Chris@0
|
94 return re.cap(1);
|
Chris@0
|
95 }
|
Chris@0
|
96 return "";
|
Chris@0
|
97 }
|
Chris@0
|
98
|
Chris@0
|
99 Work *
|
Chris@0
|
100 makeWork(QString composerName, QString opfield, QString kfield,
|
Chris@0
|
101 QString numfield, QString titlefield, QString datefield,
|
Chris@0
|
102 QString placefield, QString remarksfield, Work *main)
|
Chris@0
|
103 {
|
Chris@0
|
104 QString linkText;
|
Chris@0
|
105
|
Chris@0
|
106 Work *w = new Work;
|
Chris@0
|
107
|
Chris@0
|
108 QString op = sanitise(opfield, linkText);
|
Chris@0
|
109 if (op != "") {
|
Chris@0
|
110 op.replace("Opus ", "");
|
Chris@0
|
111 op.replace("Op. ", "");
|
Chris@0
|
112 op.replace("Op ", "");
|
Chris@0
|
113 w->setOpus(op);
|
Chris@0
|
114 }
|
Chris@0
|
115
|
Chris@0
|
116 QString k = sanitise(kfield, linkText);
|
Chris@0
|
117 if (k != "") {
|
Chris@0
|
118 w->setCatalogue(k);
|
Chris@0
|
119 }
|
Chris@0
|
120
|
Chris@0
|
121 QString num = sanitise(numfield, linkText);
|
Chris@0
|
122 if (num != "") {
|
Chris@0
|
123 num.replace("No. ", "");
|
Chris@0
|
124 num.replace("No ", "");
|
Chris@0
|
125 w->setNumber(num);
|
Chris@0
|
126 }
|
Chris@0
|
127
|
Chris@0
|
128 QString key = extractKey(titlefield);
|
Chris@0
|
129 if (key != "") {
|
Chris@0
|
130 w->setKey(key);
|
Chris@0
|
131 }
|
Chris@0
|
132
|
Chris@0
|
133 QString title = sanitise(titlefield, linkText);
|
Chris@0
|
134 if (linkText != "") {
|
Chris@0
|
135 linkText.replace(" ", "_");
|
Chris@0
|
136 QUrl url;
|
Chris@0
|
137 url.setScheme("http");
|
Chris@0
|
138 url.setHost("en.wikipedia.org");
|
Chris@0
|
139 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
|
Chris@0
|
140 Document *d = new Document;
|
Chris@18
|
141 d->setUri(Uri(url));
|
Chris@0
|
142 d->setSiteName("Wikipedia");
|
Chris@0
|
143 d->setTopic(w);
|
Chris@0
|
144 w->addPage(d);
|
Chris@0
|
145 }
|
Chris@0
|
146
|
Chris@0
|
147 QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
|
Chris@0
|
148 int pos;
|
Chris@0
|
149 if ((pos = explicationRE.indexIn(title)) >= 0) {
|
Chris@0
|
150 w->addAlias(explicationRE.cap(2));
|
Chris@0
|
151 title = explicationRE.cap(1);
|
Chris@0
|
152 }
|
Chris@0
|
153
|
Chris@0
|
154 if (remarksfield == "") {
|
Chris@0
|
155 QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
|
Chris@0
|
156 if ((pos = remarksRE.indexIn(title)) >= 0) {
|
Chris@0
|
157 remarksfield = remarksRE.cap(2);
|
Chris@0
|
158 title = remarksRE.cap(1);
|
Chris@0
|
159 }
|
Chris@0
|
160 }
|
Chris@0
|
161
|
Chris@0
|
162 if (remarksfield == "") {
|
Chris@0
|
163 QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
|
Chris@0
|
164 if ((pos = remarksRE.indexIn(title)) >= 0) {
|
Chris@0
|
165 remarksfield = remarksRE.cap(2);
|
Chris@0
|
166 title = remarksRE.cap(1);
|
Chris@0
|
167 }
|
Chris@0
|
168 }
|
Chris@0
|
169
|
Chris@0
|
170 w->setName(title);
|
Chris@0
|
171
|
Chris@0
|
172 QString remarks = sanitise(remarksfield, linkText);
|
Chris@0
|
173 if (remarks != "") {
|
Chris@0
|
174 w->setRemarks(remarks);
|
Chris@0
|
175 }
|
Chris@0
|
176
|
Chris@0
|
177 QString year = extractYear(datefield);
|
Chris@0
|
178 QString place = sanitise(placefield, linkText);
|
Chris@0
|
179
|
Chris@0
|
180 DEBUG << "title = " << title << endl;
|
Chris@0
|
181
|
Chris@0
|
182 if (main) {
|
Chris@0
|
183 main->addPart(w);
|
Chris@0
|
184 w->setPartOf(main);
|
Chris@0
|
185 w->setComposition(main->composition());
|
Chris@0
|
186 main->composition()->addWork(w);
|
Chris@0
|
187 }
|
Chris@0
|
188
|
Chris@0
|
189 if (!main || !main->composition() ||
|
Chris@0
|
190 (year != "" && (main->composition()->year() != year.toInt()))) {
|
Chris@0
|
191 Composition *c = new Composition;
|
Chris@0
|
192 c->setComposerName(composerName);
|
Chris@0
|
193 c->addWork(w);
|
Chris@0
|
194 c->setYear(year.toInt());
|
Chris@0
|
195 c->setPlace(place);
|
Chris@0
|
196 w->setComposition(c);
|
Chris@0
|
197 }
|
Chris@0
|
198
|
Chris@0
|
199 return w;
|
Chris@0
|
200 }
|
Chris@0
|
201
|
Chris@0
|
202
|
Chris@0
|
203 void
|
Chris@0
|
204 WikipediaWorksImporter::import(QUrl source)
|
Chris@0
|
205 {
|
Chris@0
|
206 //!!! for now
|
Chris@0
|
207 QString filename = source.toLocalFile();
|
Chris@0
|
208
|
Chris@0
|
209 QFile file(filename);
|
Chris@0
|
210 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
211 throw std::exception();
|
Chris@0
|
212 }
|
Chris@0
|
213
|
Chris@0
|
214 QTextStream stream(&file);
|
Chris@0
|
215 stream.setCodec("UTF-8");
|
Chris@0
|
216
|
Chris@0
|
217 QString composerName;
|
Chris@0
|
218 if (filename.contains("K%C3%B6chel")) {
|
Chris@0
|
219 composerName = "Wolfgang Amadeus Mozart";
|
Chris@0
|
220 } else if (filename.contains("/Schubert_")) {
|
Chris@0
|
221 composerName = "Franz Schubert";
|
Chris@0
|
222 } else {
|
Chris@0
|
223 QRegExp byby("by_(.*)_by");
|
Chris@0
|
224 if (byby.indexIn(filename) >= 0) {
|
Chris@0
|
225 composerName = byby.cap(1).replace('_', ' ');
|
Chris@0
|
226 } else {
|
Chris@0
|
227 QRegExp by("by_(.*)");
|
Chris@0
|
228 if (by.indexIn(filename) >= 0) {
|
Chris@0
|
229 composerName = by.cap(1).replace('_', ' ');
|
Chris@0
|
230 }
|
Chris@0
|
231 }
|
Chris@0
|
232 }
|
Chris@0
|
233 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
|
Chris@0
|
234
|
Chris@0
|
235 DEBUG << "composerName = " << composerName << endl;
|
Chris@0
|
236
|
Chris@0
|
237 // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
|
Chris@0
|
238 QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[K\\. *([0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
|
Chris@0
|
239
|
Chris@0
|
240 QString all = stream.readAll();
|
Chris@0
|
241
|
Chris@0
|
242 DEBUG << "Read " << all.length() << " chars" << endl;
|
Chris@0
|
243
|
Chris@0
|
244 all.replace(QRegExp("^.*<page>"), "");
|
Chris@0
|
245
|
Chris@0
|
246 int pos = 0, count = 0;
|
Chris@0
|
247
|
Chris@0
|
248 while ((pos = matcherK.indexIn(all, pos)) != -1) {
|
Chris@0
|
249
|
Chris@0
|
250 all.replace(pos, matcherK.matchedLength(), "");
|
Chris@0
|
251 ++count;
|
Chris@0
|
252
|
Chris@0
|
253 QString kfield = matcherK.cap(1);
|
Chris@0
|
254 QString titlefield = matcherK.cap(2);
|
Chris@0
|
255 QString datefield = matcherK.cap(3);
|
Chris@0
|
256 QString placefield = matcherK.cap(4);
|
Chris@0
|
257
|
Chris@0
|
258 m_objects.push_back
|
Chris@0
|
259 (makeWork(composerName, "K. " + kfield, kfield, "",
|
Chris@0
|
260 titlefield, datefield, placefield, "", 0));
|
Chris@0
|
261 }
|
Chris@0
|
262
|
Chris@0
|
263 // Opus in list form (as used for e.g. Beethoven's works)
|
Chris@0
|
264 QRegExp matcherB("[\\*:] *'*((Opus|Op\\.|WoO|Anh|H|D) [0-9][^,:'{\n]*)'*[,:{] *([^\n]*)\n");
|
Chris@0
|
265
|
Chris@0
|
266 // Part of an opus (e.g. op 18 no 1), intended to be anchored to
|
Chris@0
|
267 // the point at which the last matcherB or matcherB2 match ended
|
Chris@0
|
268 // (note caret)
|
Chris@0
|
269 QRegExp matcherB2("^[\\*:]{2} *([A-Za-z ]*)((No\\.* +)?[0-9][^ :\n]*)[: ] *([^\n]*)\n");
|
Chris@0
|
270
|
Chris@0
|
271 // Date and remarks within titlefield
|
Chris@0
|
272 QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\)(.*)");
|
Chris@0
|
273
|
Chris@0
|
274 pos = 0;
|
Chris@0
|
275
|
Chris@0
|
276 while ((pos = matcherB.indexIn(all, pos)) != -1) {
|
Chris@0
|
277
|
Chris@0
|
278 all.replace(pos, matcherB.matchedLength(), "");
|
Chris@0
|
279 ++count;
|
Chris@0
|
280
|
Chris@0
|
281 QString opfield = matcherB.cap(1);
|
Chris@0
|
282 QString titlefield = matcherB.cap(3);
|
Chris@0
|
283
|
Chris@0
|
284 QString datefield, remarksfield;
|
Chris@0
|
285
|
Chris@0
|
286 if (titlefield != "") {
|
Chris@0
|
287 int dpos;
|
Chris@0
|
288 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
|
Chris@0
|
289 datefield = matcherDate.cap(1);
|
Chris@0
|
290 remarksfield = matcherDate.cap(2);
|
Chris@0
|
291 titlefield = titlefield.left(dpos);
|
Chris@0
|
292 }
|
Chris@0
|
293 }
|
Chris@0
|
294
|
Chris@0
|
295 Work *main = makeWork(composerName, opfield, "", "",
|
Chris@0
|
296 titlefield, datefield, "", remarksfield, 0);
|
Chris@0
|
297
|
Chris@0
|
298 m_objects.push_back(main);
|
Chris@0
|
299
|
Chris@0
|
300 int spos = pos;
|
Chris@0
|
301
|
Chris@0
|
302 while ((spos = matcherB2.indexIn(all, spos, QRegExp::CaretAtOffset))
|
Chris@0
|
303 != -1) {
|
Chris@0
|
304
|
Chris@0
|
305 all.replace(spos, matcherB2.matchedLength(), "");
|
Chris@0
|
306 ++count;
|
Chris@0
|
307
|
Chris@0
|
308 QString numfield = matcherB2.cap(2);
|
Chris@0
|
309
|
Chris@0
|
310 titlefield = matcherB2.cap(4);
|
Chris@0
|
311
|
Chris@0
|
312 if (matcherB2.cap(1).trimmed() != "") {
|
Chris@0
|
313 titlefield = matcherB2.cap(1) + matcherB2.cap(2) + " "
|
Chris@0
|
314 + matcherB2.cap(4);
|
Chris@0
|
315 DEBUG << "prefix to number = " << matcherB2.cap(1) << ", so extending title from " << matcherB2.cap(4) << " to " << titlefield << endl;
|
Chris@0
|
316 }
|
Chris@0
|
317
|
Chris@0
|
318 datefield = "";
|
Chris@0
|
319 remarksfield = "";
|
Chris@0
|
320
|
Chris@0
|
321 if (titlefield != "") {
|
Chris@0
|
322 int dpos;
|
Chris@0
|
323 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
|
Chris@0
|
324 datefield = matcherDate.cap(1);
|
Chris@0
|
325 remarksfield = matcherDate.cap(2);
|
Chris@0
|
326 titlefield = titlefield.left(dpos);
|
Chris@0
|
327 }
|
Chris@0
|
328 }
|
Chris@0
|
329
|
Chris@0
|
330 Work *sub = makeWork(composerName, opfield, "", numfield,
|
Chris@0
|
331 titlefield, datefield, "", remarksfield, main);
|
Chris@0
|
332
|
Chris@0
|
333 m_objects.push_back(sub);
|
Chris@0
|
334 }
|
Chris@0
|
335 }
|
Chris@0
|
336
|
Chris@0
|
337 // Title with date but no opus in list form (as used for e.g. Copland)
|
Chris@0
|
338 QRegExp matcherC("\\* *([^\n]*)\\([^\\)]*([0-9]{4})[^\\)]*\\) *\n");
|
Chris@0
|
339
|
Chris@0
|
340 // Part of the above (e.g. song in cycle), intended to be anchored to
|
Chris@0
|
341 // the point at which the last matcherC or matcherC2 match ended
|
Chris@0
|
342 // (note caret)
|
Chris@0
|
343 QRegExp matcherC2("^\\*\\* *([^\n]*)\n");
|
Chris@0
|
344
|
Chris@0
|
345 pos = 0;
|
Chris@0
|
346
|
Chris@0
|
347 while ((pos = matcherC.indexIn(all, pos)) != -1) {
|
Chris@0
|
348
|
Chris@0
|
349 all.replace(pos, matcherC.matchedLength(), "");
|
Chris@0
|
350 ++count;
|
Chris@0
|
351
|
Chris@0
|
352 QString titlefield = matcherC.cap(1);
|
Chris@0
|
353 QString datefield = matcherC.cap(2);
|
Chris@0
|
354
|
Chris@0
|
355 Work *main = makeWork(composerName, "", "", "",
|
Chris@0
|
356 titlefield, datefield, "", "", 0);
|
Chris@0
|
357
|
Chris@0
|
358 m_objects.push_back(main);
|
Chris@0
|
359
|
Chris@0
|
360 int spos = pos;
|
Chris@0
|
361
|
Chris@0
|
362 while ((spos = matcherC2.indexIn(all, spos, QRegExp::CaretAtOffset))
|
Chris@0
|
363 != -1) {
|
Chris@0
|
364
|
Chris@0
|
365 all.replace(spos, matcherC2.matchedLength(), "");
|
Chris@0
|
366 ++count;
|
Chris@0
|
367
|
Chris@0
|
368 titlefield = matcherC2.cap(1);
|
Chris@0
|
369
|
Chris@0
|
370 datefield = "";
|
Chris@0
|
371
|
Chris@0
|
372 if (titlefield != "") {
|
Chris@0
|
373 int dpos;
|
Chris@0
|
374 if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
|
Chris@0
|
375 datefield = matcherDate.cap(1);
|
Chris@0
|
376 titlefield = titlefield.left(dpos);
|
Chris@0
|
377 }
|
Chris@0
|
378 }
|
Chris@0
|
379
|
Chris@0
|
380 Work *sub = makeWork(composerName, "", "", "",
|
Chris@0
|
381 titlefield, datefield, "", "", main);
|
Chris@0
|
382
|
Chris@0
|
383 m_objects.push_back(sub);
|
Chris@0
|
384 }
|
Chris@0
|
385 }
|
Chris@0
|
386
|
Chris@0
|
387
|
Chris@0
|
388
|
Chris@0
|
389 DEBUG << "Left over: " << all << endl;
|
Chris@0
|
390
|
Chris@0
|
391 // Other forms:
|
Chris@0
|
392 // *March No. 1 in F major for Military Band, WoO 18 (1808)
|
Chris@0
|
393
|
Chris@0
|
394
|
Chris@0
|
395 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
396 }
|
Chris@0
|
397
|
Chris@0
|
398
|
Chris@0
|
399 }
|
Chris@0
|
400
|
Chris@0
|
401
|