Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportClassicalComposersOrg.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 ClassicalComposersOrgImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 typedef QMap<QString, int> NameMap;
|
Chris@0
|
27
|
Chris@0
|
28 void
|
Chris@0
|
29 parseNames(QString field, NameMap &names, int score = 0)
|
Chris@0
|
30 {
|
Chris@0
|
31 QString a(field), b(field);
|
Chris@0
|
32
|
Chris@0
|
33 int mp;
|
Chris@0
|
34 QRegExp re;
|
Chris@0
|
35
|
Chris@0
|
36 /* classical-composers.org uses quite a few (not always
|
Chris@0
|
37 * consistent) ways to indicate alternatives in composer
|
Chris@0
|
38 * names. Not all of them are distinguishable.
|
Chris@0
|
39 * Examples:
|
Chris@0
|
40 *
|
Chris@0
|
41 * Pipe used to separate sorted surname from alternative for whole:
|
Chris@0
|
42 * Hardin | Moondog, Louis Thomas
|
Chris@0
|
43 * -> "Louis Thomas Hardin", "Moondog"
|
Chris@0
|
44 * Barron | Charlotte May Wind, Bebe
|
Chris@0
|
45 * -> "Bebe Barron", "Charlotte May Wind"
|
Chris@0
|
46 *
|
Chris@0
|
47 * Pipe used to separate alternatives for surname only (seems
|
Chris@0
|
48 * slightly more common than the previous one; if there is only
|
Chris@0
|
49 * one word between the pipe and a following comma, I'd be
|
Chris@0
|
50 * inclined to assume this case, Moondog notwithstanding):
|
Chris@0
|
51 * Mendelssohn | Hensel, Fanny Cécile
|
Chris@0
|
52 * -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
|
Chris@0
|
53 * Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander
|
Chris@0
|
54 * -> "Thomas Alexander Erskine, 6th Earl of Kellie",
|
Chris@0
|
55 * "Thomas Alexander Kelly"
|
Chris@0
|
56 *
|
Chris@0
|
57 * Round brackets used to indicate one or more alternatives for
|
Chris@0
|
58 * prior word; slash for alternation:
|
Chris@0
|
59 * Edelmann, Jean-Frédéric (Johann-Friedrich)
|
Chris@0
|
60 * -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
|
Chris@0
|
61 * Eberwein, Max (Traugott Maximilian)
|
Chris@0
|
62 * -> "Max Eberwein", "Traugott Maximilian Eberwein"
|
Chris@0
|
63 * Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
|
Chris@0
|
64 * -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
|
Chris@0
|
65 * "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
|
Chris@0
|
66 * "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
|
Chris@0
|
67 * "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
|
Chris@0
|
68 *
|
Chris@0
|
69 * Round brackets used to indicate alternative to prior
|
Chris@0
|
70 * names, with some meaning left implicit:
|
Chris@0
|
71 * Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich)
|
Chris@0
|
72 * -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
|
Chris@0
|
73 * perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
|
Chris@0
|
74 * Kaan-Albest")
|
Chris@0
|
75 *
|
Chris@0
|
76 * Round brackets used to augment rather than
|
Chris@0
|
77 * alternate. Probably can't identify this reliably, though
|
Chris@0
|
78 * round brackets used somewhere other than at end of line
|
Chris@0
|
79 * are relatively likely to be this form (?):
|
Chris@0
|
80 * Linley (the elder), Thomas
|
Chris@0
|
81 * -> "Thomas Linley", "Thomas Linley the elder"
|
Chris@0
|
82 * Keys | Keyes, Ivor (Christopher Banfield)
|
Chris@0
|
83 * -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
|
Chris@0
|
84 * "Ivor Christopher Banfield Keyes"
|
Chris@0
|
85 *
|
Chris@0
|
86 * Square brackets used to indicate alternative for all
|
Chris@0
|
87 * forenames:
|
Chris@0
|
88 * Moller | Möller, John Christopher [Johann Christoph]
|
Chris@0
|
89 * -> "John Christopher Moller", "John Christopher Möller",
|
Chris@0
|
90 * "Johann Christoph Moller", "Johann Christoph Möller"
|
Chris@0
|
91 *
|
Chris@0
|
92 * Complicated examples:
|
Chris@0
|
93 * Mayr | Mayer, (Johann) Simon [Giovanni Simone]
|
Chris@0
|
94 * -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
|
Chris@0
|
95 * "Johann Simon Mayer", "Giovanni Simone Mayr",
|
Chris@0
|
96 * "Geovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
|
Chris@0
|
97 * Frauenlob | Heinrich von Meissen
|
Chris@0
|
98 * -> "Heinrich Frauenlob", "Heinrich von Meissen", or
|
Chris@0
|
99 * perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
|
Chris@0
|
100 */
|
Chris@0
|
101
|
Chris@0
|
102 // DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;
|
Chris@0
|
103
|
Chris@0
|
104 // round brackets used for augmentation right at the start
|
Chris@0
|
105 re = QRegExp("\\(([^\\)]+)\\) ");
|
Chris@0
|
106 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
107 int ml = re.matchedLength();
|
Chris@0
|
108 QString c(re.cap(1));
|
Chris@0
|
109 a.replace(mp, ml, "");
|
Chris@0
|
110 b.replace(mp, ml, QString("%1 ").arg(c));
|
Chris@0
|
111 parseNames(a, names, score);
|
Chris@0
|
112 parseNames(b, names, score+1);
|
Chris@0
|
113 return;
|
Chris@0
|
114 }
|
Chris@0
|
115
|
Chris@0
|
116 // round brackets used for augmentation directly after the comma
|
Chris@0
|
117 re = QRegExp(", \\(([^\\)]+)\\)");
|
Chris@0
|
118 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
119 int ml = re.matchedLength();
|
Chris@0
|
120 QString c(re.cap(1));
|
Chris@0
|
121 a.replace(mp, ml, ",");
|
Chris@0
|
122 b.replace(mp, ml, QString(", %1").arg(c));
|
Chris@0
|
123 parseNames(a, names, score);
|
Chris@0
|
124 parseNames(b, names, score+1);
|
Chris@0
|
125 return;
|
Chris@0
|
126 }
|
Chris@0
|
127
|
Chris@0
|
128 // round brackets used for augmentation directly before the comma
|
Chris@0
|
129 re = QRegExp(" \\(([^\\)]+)\\),");
|
Chris@0
|
130 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
131 int ml = re.matchedLength();
|
Chris@0
|
132 QString c(re.cap(1));
|
Chris@0
|
133 a.replace(mp, ml, ",");
|
Chris@0
|
134 b.replace(mp, ml, QString(" %1,").arg(c));
|
Chris@0
|
135 parseNames(a, names, score);
|
Chris@0
|
136 parseNames(b, names, score+1);
|
Chris@0
|
137 return;
|
Chris@0
|
138 }
|
Chris@0
|
139
|
Chris@0
|
140 // round brackets for alternation of single name, anywhere
|
Chris@0
|
141 re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
|
Chris@0
|
142 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
143 int ml = re.matchedLength();
|
Chris@0
|
144 QString c(re.cap(1));
|
Chris@0
|
145 QString d(re.cap(2));
|
Chris@0
|
146 a.replace(mp, ml, c);
|
Chris@0
|
147 b.replace(mp, ml, d);
|
Chris@0
|
148 parseNames(a, names, score);
|
Chris@0
|
149 parseNames(b, names, score+1);
|
Chris@0
|
150 return;
|
Chris@0
|
151 }
|
Chris@0
|
152
|
Chris@0
|
153 // square brackets for alternation of a series of names, at end or after pipe
|
Chris@0
|
154 re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
|
Chris@0
|
155 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
156 int ml = re.matchedLength();
|
Chris@0
|
157 QString p(re.cap(1));
|
Chris@0
|
158 QString c(re.cap(2));
|
Chris@0
|
159 QString d(re.cap(3));
|
Chris@0
|
160 a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
|
Chris@0
|
161 b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
|
Chris@0
|
162 parseNames(a, names, score);
|
Chris@0
|
163 parseNames(b, names, score+1);
|
Chris@0
|
164 return;
|
Chris@0
|
165 }
|
Chris@0
|
166
|
Chris@0
|
167 // square brackets for alternation of a series of names, at start
|
Chris@0
|
168 re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
|
Chris@0
|
169 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
170 int ml = re.matchedLength();
|
Chris@0
|
171 QString c(re.cap(1));
|
Chris@0
|
172 QString d(re.cap(2));
|
Chris@0
|
173 a.replace(mp, ml, c);
|
Chris@0
|
174 b.replace(mp, ml, d);
|
Chris@0
|
175 parseNames(a, names, score);
|
Chris@0
|
176 parseNames(b, names, score+1);
|
Chris@0
|
177 return;
|
Chris@0
|
178 }
|
Chris@0
|
179
|
Chris@0
|
180 // slash for alternation of word
|
Chris@0
|
181 re = QRegExp("([^ ,|]+)/([^ ,|]+)");
|
Chris@0
|
182 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
183 int ml = re.matchedLength();
|
Chris@0
|
184 QString c(re.cap(1));
|
Chris@0
|
185 QString d(re.cap(2));
|
Chris@0
|
186 a.replace(mp, ml, c);
|
Chris@0
|
187 b.replace(mp, ml, d);
|
Chris@0
|
188 parseNames(a, names, score);
|
Chris@0
|
189 parseNames(b, names, score+1);
|
Chris@0
|
190 return;
|
Chris@0
|
191 }
|
Chris@0
|
192
|
Chris@0
|
193 // pipe for alternation of surname
|
Chris@0
|
194 re = QRegExp("^(.*) \\| ([^|, ]+),");
|
Chris@0
|
195 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
196 int ml = re.matchedLength();
|
Chris@0
|
197 QString c(re.cap(1));
|
Chris@0
|
198 QString d(re.cap(2));
|
Chris@0
|
199 a.replace(mp, ml, c + ",");
|
Chris@0
|
200 b.replace(mp, ml, d + ",");
|
Chris@0
|
201 parseNames(a, names, score);
|
Chris@0
|
202 parseNames(b, names, score+1);
|
Chris@0
|
203 return;
|
Chris@0
|
204 }
|
Chris@0
|
205
|
Chris@0
|
206 // pipe for alternation of whole (before comma)
|
Chris@0
|
207 re = QRegExp("^(.*) \\| ([^|,]+),");
|
Chris@0
|
208 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
209 int ml = re.matchedLength();
|
Chris@0
|
210 QString c(re.cap(1));
|
Chris@0
|
211 QString d(re.cap(2));
|
Chris@0
|
212 a.replace(mp, ml, c + ",");
|
Chris@0
|
213 b = d;
|
Chris@0
|
214 parseNames(a, names, score);
|
Chris@0
|
215 parseNames(b, names, score+1);
|
Chris@0
|
216 return;
|
Chris@0
|
217 }
|
Chris@0
|
218
|
Chris@0
|
219 // pipe for alternation of whole (at end)
|
Chris@0
|
220 re = QRegExp("^(.*) \\| ([^|,]+)$");
|
Chris@0
|
221 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
222 int ml = re.matchedLength();
|
Chris@0
|
223 QString c(re.cap(1));
|
Chris@0
|
224 QString d(re.cap(2));
|
Chris@0
|
225 a.replace(mp, ml, c);
|
Chris@0
|
226 b.replace(mp, ml, d);
|
Chris@0
|
227 parseNames(a, names, score);
|
Chris@0
|
228 parseNames(b, names, score+1);
|
Chris@0
|
229 return;
|
Chris@0
|
230 }
|
Chris@0
|
231
|
Chris@0
|
232 // comma
|
Chris@0
|
233 re = QRegExp("^(.+), ([^,]+)$");
|
Chris@0
|
234 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
235 QString c(re.cap(1));
|
Chris@0
|
236 QString d(re.cap(2));
|
Chris@0
|
237 parseNames(d + " " + c, names, score+1);
|
Chris@0
|
238 // fall through to add
|
Chris@0
|
239 }
|
Chris@0
|
240
|
Chris@4
|
241 field.replace("(", "");
|
Chris@4
|
242 field.replace(")", "");
|
Chris@4
|
243
|
Chris@0
|
244 names[field] = score;
|
Chris@0
|
245 }
|
Chris@0
|
246
|
Chris@0
|
247 void
|
Chris@0
|
248 ClassicalComposersOrgImporter::import(QUrl source)
|
Chris@0
|
249 {
|
Chris@0
|
250 int i = 0;
|
Chris@0
|
251
|
Chris@0
|
252 //!!! for now
|
Chris@0
|
253 QString filename = source.toLocalFile();
|
Chris@0
|
254
|
Chris@0
|
255
|
Chris@0
|
256 QFile file(filename);
|
Chris@0
|
257 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
258 throw std::exception();
|
Chris@0
|
259 }
|
Chris@0
|
260
|
Chris@0
|
261 QTextStream stream(&file);
|
Chris@0
|
262 stream.setCodec("UTF-8");
|
Chris@0
|
263 QString all = stream.readAll();
|
Chris@0
|
264
|
Chris@0
|
265 all.replace(QRegExp("^.*<div id=\"main\">"), "");
|
Chris@0
|
266
|
Chris@0
|
267 QRegExp matcher
|
Chris@1
|
268 (QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));
|
Chris@0
|
269
|
Chris@0
|
270 int pos = 0, count = 0;
|
Chris@0
|
271 while ((pos = matcher.indexIn(all, pos)) != -1) {
|
Chris@0
|
272
|
Chris@0
|
273 pos += matcher.matchedLength();
|
Chris@0
|
274 ++count;
|
Chris@0
|
275
|
Chris@0
|
276 QString page = matcher.cap(1);
|
Chris@0
|
277 QString name = matcher.cap(2);
|
Chris@1
|
278 QString star = matcher.cap(5);
|
Chris@0
|
279 QString birth = matcher.cap(6);
|
Chris@1
|
280 QString dagger = matcher.cap(7);
|
Chris@1
|
281 QString death = matcher.cap(8);
|
Chris@1
|
282 QString female = matcher.cap(9);
|
Chris@0
|
283
|
Chris@0
|
284 DEBUG << "Item " << count
|
Chris@0
|
285 << ": page = " << page
|
Chris@0
|
286 << ", name = " << name
|
Chris@0
|
287 << ", birth = " << birth
|
Chris@0
|
288 << ", death = " << death
|
Chris@0
|
289 << ", female = " << female;
|
Chris@0
|
290
|
Chris@0
|
291 QString namefield = name.trimmed();
|
Chris@0
|
292 NameMap names;
|
Chris@0
|
293
|
Chris@4
|
294 if (namefield.contains("P.D.Q.")) { // lose this joke
|
Chris@4
|
295 continue;
|
Chris@4
|
296 }
|
Chris@4
|
297
|
Chris@0
|
298 parseNames(namefield, names);
|
Chris@0
|
299
|
Chris@0
|
300 i = 0;
|
Chris@0
|
301 QString preferred;
|
Chris@0
|
302 foreach (QString n, names.keys()) {
|
Chris@0
|
303 if (preferred == "" || names[n] == 0) preferred = n;
|
Chris@0
|
304 DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl;
|
Chris@0
|
305 ++i;
|
Chris@0
|
306 }
|
Chris@0
|
307
|
Chris@0
|
308 if (names.empty()) {
|
Chris@0
|
309 DEBUG << "No name!" << endl;
|
Chris@0
|
310 continue;
|
Chris@0
|
311 }
|
Chris@0
|
312
|
Chris@0
|
313 Composer *composer = new Composer();
|
Chris@0
|
314 composer->setName(preferred);
|
Chris@0
|
315 foreach (QString n, names.keys()) {
|
Chris@0
|
316 if (n != preferred) composer->addAlias(n);
|
Chris@0
|
317 }
|
Chris@0
|
318
|
Chris@0
|
319 if (page != "") {
|
Chris@0
|
320 Document *d = new Document;
|
Chris@18
|
321 d->setUri(Uri("http://www.classical-composers.org" + page));
|
Chris@0
|
322 d->setTopic(composer);
|
Chris@0
|
323 d->setSiteName("Classical Composers Database");
|
Chris@0
|
324 composer->addPage(d);
|
Chris@0
|
325 }
|
Chris@1
|
326
|
Chris@1
|
327 if (birth != "" && death == "") {
|
Chris@1
|
328 if (star == "" && dagger != QString::fromUtf8("\342\200\240")) {
|
Chris@1
|
329 DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
|
Chris@1
|
330 birth = "";
|
Chris@1
|
331 }
|
Chris@1
|
332 if (star == "" && dagger == "") {
|
Chris@1
|
333 DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl;
|
Chris@1
|
334 birth = "";
|
Chris@1
|
335 } else if (star != "" && dagger != "") {
|
Chris@1
|
336 DEBUG << "Date range features both star and dagger -- ignoring" << endl;
|
Chris@1
|
337 birth = "";
|
Chris@1
|
338 } else if (dagger != "") {
|
Chris@1
|
339 DEBUG << "dagger found: setting death to " << birth << endl;
|
Chris@1
|
340 death = birth;
|
Chris@1
|
341 birth = "";
|
Chris@1
|
342 }
|
Chris@1
|
343 }
|
Chris@1
|
344
|
Chris@0
|
345 if (birth != "") {
|
Chris@0
|
346 Birth *e = new Birth(birth.toInt());
|
Chris@0
|
347 composer->setBirth(e);
|
Chris@0
|
348 }
|
Chris@0
|
349 if (death != "") {
|
Chris@0
|
350 composer->setDeath(new Death(death.toInt()));
|
Chris@0
|
351 }
|
Chris@0
|
352 if (female != "") {
|
Chris@0
|
353 composer->setGender("female");
|
Chris@20
|
354 } else {
|
Chris@20
|
355 composer->setGender("male");
|
Chris@20
|
356 }
|
Chris@0
|
357
|
Chris@0
|
358 m_objects.push_back(composer);
|
Chris@0
|
359 }
|
Chris@0
|
360
|
Chris@0
|
361 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
362
|
Chris@0
|
363 }
|
Chris@0
|
364
|
Chris@0
|
365
|
Chris@0
|
366 }
|
Chris@0
|
367
|