Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@4
|
3 #include "ImportClassicalArchives.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@4
|
20 ClassicalArchivesImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@4
|
22 DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@4
|
/*
 * Lookup table for the location codes that appear after the "; " in a
 * composer listing.  The table is a flat array read four entries at a
 * time; the lookup functions in this file index it as
 *
 *   [row*4 + 0]  location code as it appears in the listing
 *   [row*4 + 1]  nationality adjective
 *   [row*4 + 2]  place name
 *   [row*4 + 3]  numeric id appended to http://sws.geonames.org/
 *
 * Keep the number of entries a multiple of four: a trailing partial
 * row is silently ignored by the lookups.
 */
static const char *locmap[] = {
    /* code   nationality               place                        geonames id */
    "ARG",    "Argentinian",            "Argentina",                 "3865483",
    "ARM",    "Armenian",               "Armenia",                   "174982",
    "AUS",    "Australian",             "Australia",                 "2077456",
    "AUT",    "Austrian",               "Austria",                   "2782113",
    "AZE",    "Azeri",                  "Azerbaijan",                "587116",
    "BEL",    "Belgian",                "Belgium",                   "2802361",
    "BGR",    "Bulgarian",              "Bulgaria",                  "732800",
    "BLR",    "Belarusian",             "Belarus",                   "630336",
    "BOH",    "Bohemian",               "Bohemia",                   "3074194",
    "BRA",    "Brazilian",              "Brazil",                    "3469058",
    "BSQ",    "Basque",                 "Basque country",            "3104499",
    "CAN",    "Canadian",               "Canada",                    "6251999",
    "CHE",    "Swiss",                  "Switzerland",               "2658434",
    "CHL",    "Chilean",                "Chile",                     "3895114",
    "CHN",    "Chinese",                "China",                     "1814991",
    "CRI",    "Costa Rican",            "Costa Rica",                "3624060",
    "CTN",    "Catalonian",             "Catalonia",                 "3108286",
    "CUB",    "Cuban",                  "Cuba",                      "3562981",
    "CZE",    "Czech",                  "Czech Republic",            "3077311",
    "DEU",    "German",                 "Germany",                   "2921044",
    "DNK",    "Danish",                 "Denmark",                   "2623032",
    "ECU",    "Ecuadorian",             "Ecuador",                   "3658394",
    "EGY",    "Egyptian",               "Egypt",                     "357994",
    "ENG",    "English",                "England",                   "2635167",
    // pardon? (meaning of this code is unclear; it is mapped like DEU)
    "EPR",    "German",                 "Germany",                   "2921044",
    "ESP",    "Spanish",                "Spain",                     "2510769",
    "EST",    "Estonian",               "Estonia",                   "453733",
    "ETH",    "Ethiopian",              "Ethiopia",                  "337996",
    "FIN",    "Finnish",                "Finland",                   "660013",
    "FLM",    "Flemish",                "Flanders",                  "3337388",
    "FRA",    "French",                 "France",                    "3017382",
    "GBR",    "British",                "Britain",                   "4839292",
    "GEO",    "Georgian",               "Georgia",                   "614540",
    "GRC",    "Greek",                  "Greece",                    "390903",
    "GTM",    "Guatemalan",             "Guatemala",                 "3595528",
    "HKG",    "Hong Kong Chinese",      "Hong Kong",                 "1819729",
    "HOL",    "Dutch",                  "Holland",                   "2750405",
    "HRV",    "Croatian",               "Croatia",                   "3202326",
    "HUN",    "Hungarian",              "Hungary",                   "719819",
    "IND",    "Indian",                 "India",                     "1269750",
    "IRL",    "Irish",                  "Ireland",                   "2963597",
    "IRN",    "Iranian",                "Iran",                      "130758",
    "ISL",    "Icelandic",              "Iceland",                   "2629691",
    "ISR",    "Israeli",                "Israel",                    "294640",
    "ITA",    "Italian",                "Italy",                     "3175395",
    "JPN",    "Japanese",               "Japan",                     "1861060",
    "KAZ",    "Kazakh",                 "Kazakhstan",                "1522867",
    "KOR",    "Korean",                 "Korea",                     "1835841",
    "LBN",    "Lebanese",               "Lebanon",                   "272103",
    "LTU",    "Lithuanian",             "Lithuania",                 "597427",
    "LVA",    "Latvian",                "Latvia",                    "458258",
    "MAR",    "Moroccan",               "Morocco",                   "2542007",
    "MEX",    "Mexican",                "Mexico",                    "3996063",
    "MKD",    "Macedonian",             "Macedonia",                 "718075",
    "MOR",    "Moravian",               "Moravia",                   "3078610",
    "MYS",    "Malaysian",              "Malaysia",                  "1733045",
    "NAI",    "North American Indian",  "United States of America",  "6252001",
    "NLD",    "Dutch",                  "Netherlands",               "2750405",
    "NOR",    "Norwegian",              "Norway",                    "3144096",
    "NZL",    "New Zealander",          "New Zealand",               "2186224",
    "PER",    "Peruvian",               "Peru",                      "3932488",
    "PHL",    "Filipino",               "Philippines",               "1694008",
    "POL",    "Polish",                 "Poland",                    "798544",
    "PRT",    "Portuguese",             "Portugal",                  "2264397",
    "PRU",    "Prussian",               "Prussia",                   "772636",
    "PRY",    "Paraguayan",             "Paraguay",                  "3437598",
    "ROU",    "Romanian",               "Romania",                   "798549",
    "RUS",    "Russian",                "Russia",                    "2017370",
    "SCG",    "Serbian-Montenegran",    "Serbia-Montenegro",         "3202468",
    "SCO",    "Scottish",               "Scotland",                  "2638360",
    "SGP",    "Singaporean",            "Singapore",                 "1880251",
    "SVK",    "Slovakian",              "Slovakia",                  "3057568",
    "SVN",    "Slovenian",              "Slovenia",                  "3190538",
    "SWE",    "Swedish",                "Sweden",                    "2661886",
    "TKM",    "Turkmen",                "Turkmenistan",              "1218197",
    "TSL",    "Transylvanian",          "Transylvania",              "4495544",
    "TSM",    "Tasmanian",              "Tasmania",                  "2147291",
    "TUR",    "Turkish",                "Turkey",                    "298795",
    "UKR",    "Ukrainian",              "Ukraine",                   "690791",
    "URY",    "Uruguayan",              "Uruguay",                   "3439705",
    "USA",    "American",               "United States of America",  "6252001",
    "VEN",    "Venezuelan",             "Venezuela",                 "3625428",
    "VNM",    "Vietnamese",             "Vietnam",                   "1562822",
    // NOTE(review): "WLS" sits alongside ENG and SCO and may be intended
    // to mean Wales rather than Samoa -- confirm against the source data
    // before relying on this row.
    "WLS",    "Samoan",                 "Samoa",                     "4034894",
    "ZAF",    "South African",          "South Africa",              "953987",
};
|
Chris@4
|
113
|
Chris@4
|
114 QSet<QString>
|
Chris@4
|
115 locationToNationality(QString location)
|
Chris@4
|
116 {
|
Chris@4
|
117 QSet<QString> nationalities;
|
Chris@4
|
118 QStringList locations = location.split('/');
|
Chris@4
|
119 foreach (location, locations) {
|
Chris@4
|
120 int cols = 4;
|
Chris@4
|
121 for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) {
|
Chris@4
|
122 if (location == locmap[i*cols]) {
|
Chris@4
|
123 nationalities.insert(locmap[i*cols+1]);
|
Chris@4
|
124 }
|
Chris@4
|
125 }
|
Chris@4
|
126 }
|
Chris@4
|
127 return nationalities;
|
Chris@4
|
128 }
|
Chris@4
|
129
|
Chris@18
|
130 QSet<Uri>
|
Chris@4
|
131 locationToGeonameURIs(QString location)
|
Chris@4
|
132 {
|
Chris@18
|
133 QSet<Uri> uris;
|
Chris@4
|
134 QStringList locations = location.split('/');
|
Chris@4
|
135 foreach (location, locations) {
|
Chris@4
|
136 int cols = 4;
|
Chris@4
|
137 for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) {
|
Chris@4
|
138 if (location == locmap[i*cols]) {
|
Chris@18
|
139 uris.insert(Uri(QString("http://sws.geonames.org/")
|
Chris@18
|
140 + locmap[i*cols+3] + "/"));
|
Chris@4
|
141 }
|
Chris@4
|
142 }
|
Chris@4
|
143 }
|
Chris@4
|
144 return uris;
|
Chris@4
|
145 }
|
Chris@4
|
146
|
Chris@0
|
147 void
|
Chris@4
|
148 parseNames(QString field, QStringList &names, int &birth, int &death,
|
Chris@4
|
149 bool &approx, QString &location)
|
Chris@0
|
150 {
|
Chris@4
|
151 field.replace(QRegExp("<[^>]*>"), "");
|
Chris@0
|
152
|
Chris@4
|
153 QRegExp locre("; (.*)$");
|
Chris@4
|
154 int pos;
|
Chris@4
|
155 if ((pos = locre.indexIn(field)) >= 0) {
|
Chris@4
|
156 location = locre.cap(1);
|
Chris@4
|
157 field.replace(pos, locre.matchedLength(), "");
|
Chris@0
|
158 }
|
Chris@0
|
159
|
Chris@4
|
160 QRegExp datere("\\(([^\\)]+)\\) *$");
|
Chris@4
|
161 if ((pos = datere.indexIn(field)) >= 0) {
|
Chris@4
|
162 QString contents = datere.cap(1);
|
Chris@4
|
163 if (contents.startsWith("c.")) {
|
Chris@4
|
164 approx = true;
|
Chris@4
|
165 contents = contents.replace("c.", "");
|
Chris@4
|
166 contents = contents.trimmed();
|
Chris@4
|
167 }
|
Chris@4
|
168 if (QRegExp("\\d{4}").indexIn(contents) >= 0) {
|
Chris@4
|
169 QStringList bits = contents.split("-");
|
Chris@4
|
170 if (!bits.empty()) {
|
Chris@4
|
171 QString f1 = bits[0];
|
Chris@4
|
172 QString f2;
|
Chris@4
|
173 if (bits.size() > 1) f2 = bits[1];
|
Chris@4
|
174 if (f1.startsWith("b")) {
|
Chris@4
|
175 f1.replace(QRegExp("b[^0-9]*"), "");
|
Chris@4
|
176 birth = f1.toInt();
|
Chris@4
|
177 } else if (f1.startsWith("d")) {
|
Chris@4
|
178 f1.replace(QRegExp("d[^0-9]*"), "");
|
Chris@4
|
179 death = f1.toInt();
|
Chris@4
|
180 } else if (f2 != "") {
|
Chris@4
|
181 birth = f1.toInt();
|
Chris@4
|
182 }
|
Chris@4
|
183 if (f2 != "") {
|
Chris@4
|
184 death = f2.toInt();
|
Chris@4
|
185 }
|
Chris@4
|
186 }
|
Chris@4
|
187 }
|
Chris@4
|
188 field.replace(pos, datere.matchedLength(), "");
|
Chris@0
|
189 }
|
Chris@0
|
190
|
Chris@4
|
191 // we don't properly handle their slash alternatives syntax
|
Chris@4
|
192 field = field.replace(QRegExp("/[^/,]*"), "");
|
Chris@4
|
193
|
Chris@4
|
194 // nor these
|
Chris@4
|
195 field.replace(QRegExp("\\[[^\\]]*\\]"), "");
|
Chris@4
|
196
|
Chris@4
|
197 // nor these
|
Chris@4
|
198 field.replace(QRegExp("\\([^\\)]*\\)"), "");
|
Chris@4
|
199
|
Chris@4
|
200 field.replace(QRegExp(" +"), " ");
|
Chris@4
|
201
|
Chris@4
|
202 // and let's be picky -- we don't like names with just initials,
|
Chris@4
|
203 // can't properly match them
|
Chris@4
|
204 if (QRegExp(",.*\\.").indexIn(field) >= 0) {
|
Chris@4
|
205 return;
|
Chris@4
|
206 }
|
Chris@4
|
207
|
Chris@4
|
208 // and, from this particular source, I'm suspicious of single-word
|
Chris@4
|
209 // names (sorry)
|
Chris@4
|
210 if (!field.contains(",")) return;
|
Chris@4
|
211
|
Chris@5
|
212 field.replace(QRegExp(" +,"), ",");
|
Chris@0
|
213 field = field.trimmed();
|
Chris@0
|
214 names.push_back(field);
|
Chris@0
|
215
|
Chris@0
|
216 // comma
|
Chris@4
|
217 QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
|
Chris@4
|
218 if ((pos = commare.indexIn(field)) >= 0) {
|
Chris@4
|
219 QString c(commare.cap(1));
|
Chris@4
|
220 QString d(commare.cap(2));
|
Chris@4
|
221 names.push_back(QString(d + " " + c).trimmed());
|
Chris@0
|
222 }
|
Chris@0
|
223 }
|
Chris@0
|
224
|
Chris@0
|
225 void
|
Chris@4
|
226 ClassicalArchivesImporter::import(QUrl source)
|
Chris@0
|
227 {
|
Chris@0
|
228 //!!! for now
|
Chris@0
|
229 QString filename = source.toLocalFile();
|
Chris@0
|
230
|
Chris@0
|
231 QFile file(filename);
|
Chris@0
|
232 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
233 throw std::exception();
|
Chris@0
|
234 }
|
Chris@0
|
235
|
Chris@0
|
236 QTextStream stream(&file);
|
Chris@0
|
237 stream.setCodec("UTF-8");
|
Chris@0
|
238 QString all = stream.readAll();
|
Chris@0
|
239
|
Chris@4
|
240 QRegExp matcher
|
Chris@4
|
241 ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");
|
Chris@0
|
242
|
Chris@4
|
243 DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl;
|
Chris@4
|
244
|
Chris@0
|
245 int pos = 0, count = 0;
|
Chris@0
|
246 while ((pos = matcher.indexIn(all, pos)) != -1) {
|
Chris@0
|
247 pos += matcher.matchedLength();
|
Chris@0
|
248 ++count;
|
Chris@0
|
249
|
Chris@0
|
250 QString namefield = matcher.cap(2);
|
Chris@0
|
251 QStringList names;
|
Chris@0
|
252
|
Chris@4
|
253 int birth = 0, death = 0;
|
Chris@4
|
254 bool approx = false;
|
Chris@4
|
255 QString location;
|
Chris@4
|
256
|
Chris@4
|
257 parseNames(namefield, names, birth, death, approx, location);
|
Chris@4
|
258
|
Chris@0
|
259 if (names.empty()) {
|
Chris@0
|
260 DEBUG << "No name!" << endl;
|
Chris@0
|
261 continue;
|
Chris@0
|
262 }
|
Chris@0
|
263
|
Chris@4
|
264 DEBUG << "Item " << count
|
Chris@4
|
265 << ": page = " << matcher.cap(1)
|
Chris@4
|
266 << ", name = " << names[0]
|
Chris@4
|
267 << ", birth = " << birth << ", death = " << death
|
Chris@4
|
268 << ", loc " << location << endl;
|
Chris@4
|
269
|
Chris@4
|
270 if (names[0].contains("Anonymous") ||
|
Chris@4
|
271 names[0].contains("Traditional")) {
|
Chris@0
|
272 continue;
|
Chris@0
|
273 }
|
Chris@0
|
274
|
Chris@0
|
275 Composer *composer = new Composer();
|
Chris@0
|
276 composer->setName(names[0]);
|
Chris@0
|
277 for (int i = 1; i < names.size(); ++i) {
|
Chris@0
|
278 composer->addAlias(names[i]);
|
Chris@0
|
279 }
|
Chris@4
|
280
|
Chris@4
|
281 if (birth != 0) {
|
Chris@4
|
282 Birth *e = new Birth(birth);
|
Chris@4
|
283 if (approx) e->setApproximate(true);
|
Chris@4
|
284 composer->setBirth(e);
|
Chris@4
|
285 }
|
Chris@4
|
286
|
Chris@4
|
287 if (death != 0) {
|
Chris@4
|
288 Death *e = new Death(death);
|
Chris@4
|
289 if (approx) e->setApproximate(true);
|
Chris@4
|
290 composer->setDeath(e);
|
Chris@4
|
291 }
|
Chris@4
|
292
|
Chris@4
|
293 if (location != "") {
|
Chris@4
|
294 composer->setNationality(locationToNationality(location));
|
Chris@4
|
295 composer->setGeonameURIs(locationToGeonameURIs(location));
|
Chris@4
|
296 }
|
Chris@0
|
297
|
Chris@0
|
298 if (matcher.cap(1) != "") {
|
Chris@0
|
299 QString url = matcher.cap(1);
|
Chris@0
|
300 Document *d = new Document;
|
Chris@18
|
301 d->setUri(Uri("http://www.classicalarchives.com" + url));
|
Chris@0
|
302 d->setTopic(composer);
|
Chris@4
|
303 d->setSiteName("Classical Archives");
|
Chris@0
|
304 composer->addPage(d);
|
Chris@0
|
305 }
|
Chris@0
|
306
|
Chris@0
|
307 m_objects.push_back(composer);
|
Chris@0
|
308 }
|
Chris@0
|
309
|
Chris@0
|
310
|
Chris@0
|
311 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
312 }
|
Chris@0
|
313
|
Chris@0
|
314
|
Chris@0
|
315 }
|
Chris@0
|
316
|
Chris@0
|
317
|