annotate import/ImportClassicalArchives.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@4 3 #include "ImportClassicalArchives.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@4 20 ClassicalArchivesImporter::setSource(QUrl source)
Chris@0 21 {
Chris@4 22 DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
// Flat lookup table relating the location codes found in the source
// data to nationalities and places. It is laid out as consecutive
// rows of four strings each:
//
//   [0] location code as it appears in the source data
//   [1] nationality adjective
//   [2] country or region name
//   [3] numeric id appended to http://sws.geonames.org/ to form a URI
//
// Lookups index the table as locmap[row * 4 + column], so every row
// must keep exactly four entries.
static const char *locmap[] = {
    // code  nationality              country / region            geonames id
    "ARG", "Argentinian",           "Argentina",                "3865483",
    "ARM", "Armenian",              "Armenia",                  "174982",
    "AUS", "Australian",            "Australia",                "2077456",
    "AUT", "Austrian",              "Austria",                  "2782113",
    "AZE", "Azeri",                 "Azerbaijan",               "587116",
    "BEL", "Belgian",               "Belgium",                  "2802361",
    "BGR", "Bulgarian",             "Bulgaria",                 "732800",
    "BLR", "Belarusian",            "Belarus",                  "630336",
    "BOH", "Bohemian",              "Bohemia",                  "3074194",
    "BRA", "Brazilian",             "Brazil",                   "3469058",
    "BSQ", "Basque",                "Basque country",           "3104499",
    "CAN", "Canadian",              "Canada",                   "6251999",
    "CHE", "Swiss",                 "Switzerland",              "2658434",
    "CHL", "Chilean",               "Chile",                    "3895114",
    "CHN", "Chinese",               "China",                    "1814991",
    "CRI", "Costa Rican",           "Costa Rica",               "3624060",
    "CTN", "Catalonian",            "Catalonia",                "3108286",
    "CUB", "Cuban",                 "Cuba",                     "3562981",
    "CZE", "Czech",                 "Czech Republic",           "3077311",
    "DEU", "German",                "Germany",                  "2921044",
    "DNK", "Danish",                "Denmark",                  "2623032",
    "ECU", "Ecuadorian",            "Ecuador",                  "3658394",
    "EGY", "Egyptian",              "Egypt",                    "357994",
    "ENG", "English",               "England",                  "2635167",
    // NOTE(review): meaning of the EPR code is unclear; it is treated
    // the same as DEU here -- confirm against the source data
    "EPR", "German",                "Germany",                  "2921044",
    "ESP", "Spanish",               "Spain",                    "2510769",
    "EST", "Estonian",              "Estonia",                  "453733",
    "ETH", "Ethiopian",             "Ethiopia",                 "337996",
    "FIN", "Finnish",               "Finland",                  "660013",
    "FLM", "Flemish",               "Flanders",                 "3337388",
    "FRA", "French",                "France",                   "3017382",
    "GBR", "British",               "Britain",                  "4839292",
    "GEO", "Georgian",              "Georgia",                  "614540",
    "GRC", "Greek",                 "Greece",                   "390903",
    "GTM", "Guatemalan",            "Guatemala",                "3595528",
    "HKG", "Hong Kong Chinese",     "Hong Kong",                "1819729",
    "HOL", "Dutch",                 "Holland",                  "2750405",
    "HRV", "Croatian",              "Croatia",                  "3202326",
    "HUN", "Hungarian",             "Hungary",                  "719819",
    "IND", "Indian",                "India",                    "1269750",
    "IRL", "Irish",                 "Ireland",                  "2963597",
    "IRN", "Iranian",               "Iran",                     "130758",
    "ISL", "Icelandic",             "Iceland",                  "2629691",
    "ISR", "Israeli",               "Israel",                   "294640",
    "ITA", "Italian",               "Italy",                    "3175395",
    "JPN", "Japanese",              "Japan",                    "1861060",
    "KAZ", "Kazakh",                "Kazakhstan",               "1522867",
    "KOR", "Korean",                "Korea",                    "1835841",
    "LBN", "Lebanese",              "Lebanon",                  "272103",
    "LTU", "Lithuanian",            "Lithuania",                "597427",
    "LVA", "Latvian",               "Latvia",                   "458258",
    "MAR", "Moroccan",              "Morocco",                  "2542007",
    "MEX", "Mexican",               "Mexico",                   "3996063",
    "MKD", "Macedonian",            "Macedonia",                "718075",
    "MOR", "Moravian",              "Moravia",                  "3078610",
    "MYS", "Malaysian",             "Malaysia",                 "1733045",
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch",                 "Netherlands",              "2750405",
    "NOR", "Norwegian",             "Norway",                   "3144096",
    "NZL", "New Zealander",         "New Zealand",              "2186224",
    "PER", "Peruvian",              "Peru",                     "3932488",
    "PHL", "Filipino",              "Philippines",              "1694008",
    "POL", "Polish",                "Poland",                   "798544",
    "PRT", "Portuguese",            "Portugal",                 "2264397",
    "PRU", "Prussian",              "Prussia",                  "772636",
    "PRY", "Paraguayan",            "Paraguay",                 "3437598",
    "ROU", "Romanian",              "Romania",                  "798549",
    "RUS", "Russian",               "Russia",                   "2017370",
    "SCG", "Serbian-Montenegran",   "Serbia-Montenegro",        "3202468",
    "SCO", "Scottish",              "Scotland",                 "2638360",
    "SGP", "Singaporean",           "Singapore",                "1880251",
    "SVK", "Slovakian",             "Slovakia",                 "3057568",
    "SVN", "Slovenian",             "Slovenia",                 "3190538",
    "SWE", "Swedish",               "Sweden",                   "2661886",
    "TKM", "Turkmen",               "Turkmenistan",             "1218197",
    "TSL", "Transylvanian",         "Transylvania",             "4495544",
    "TSM", "Tasmanian",             "Tasmania",                 "2147291",
    "TUR", "Turkish",               "Turkey",                   "298795",
    "UKR", "Ukrainian",             "Ukraine",                  "690791",
    "URY", "Uruguayan",             "Uruguay",                  "3439705",
    "USA", "American",              "United States of America", "6252001",
    "VEN", "Venezuelan",            "Venezuela",                "3625428",
    "VNM", "Vietnamese",            "Vietnam",                  "1562822",
    // NOTE(review): WLS may have been intended to mean Wales rather
    // than Samoa -- confirm against the source data before changing
    "WLS", "Samoan",                "Samoa",                    "4034894",
    "ZAF", "South African",         "South Africa",             "953987",
};
Chris@4 113
Chris@4 114 QSet<QString>
Chris@4 115 locationToNationality(QString location)
Chris@4 116 {
Chris@4 117 QSet<QString> nationalities;
Chris@4 118 QStringList locations = location.split('/');
Chris@4 119 foreach (location, locations) {
Chris@4 120 int cols = 4;
Chris@4 121 for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) {
Chris@4 122 if (location == locmap[i*cols]) {
Chris@4 123 nationalities.insert(locmap[i*cols+1]);
Chris@4 124 }
Chris@4 125 }
Chris@4 126 }
Chris@4 127 return nationalities;
Chris@4 128 }
Chris@4 129
Chris@18 130 QSet<Uri>
Chris@4 131 locationToGeonameURIs(QString location)
Chris@4 132 {
Chris@18 133 QSet<Uri> uris;
Chris@4 134 QStringList locations = location.split('/');
Chris@4 135 foreach (location, locations) {
Chris@4 136 int cols = 4;
Chris@4 137 for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) {
Chris@4 138 if (location == locmap[i*cols]) {
Chris@18 139 uris.insert(Uri(QString("http://sws.geonames.org/")
Chris@18 140 + locmap[i*cols+3] + "/"));
Chris@4 141 }
Chris@4 142 }
Chris@4 143 }
Chris@4 144 return uris;
Chris@4 145 }
Chris@4 146
Chris@0 147 void
Chris@4 148 parseNames(QString field, QStringList &names, int &birth, int &death,
Chris@4 149 bool &approx, QString &location)
Chris@0 150 {
Chris@4 151 field.replace(QRegExp("<[^>]*>"), "");
Chris@0 152
Chris@4 153 QRegExp locre("; (.*)$");
Chris@4 154 int pos;
Chris@4 155 if ((pos = locre.indexIn(field)) >= 0) {
Chris@4 156 location = locre.cap(1);
Chris@4 157 field.replace(pos, locre.matchedLength(), "");
Chris@0 158 }
Chris@0 159
Chris@4 160 QRegExp datere("\\(([^\\)]+)\\) *$");
Chris@4 161 if ((pos = datere.indexIn(field)) >= 0) {
Chris@4 162 QString contents = datere.cap(1);
Chris@4 163 if (contents.startsWith("c.")) {
Chris@4 164 approx = true;
Chris@4 165 contents = contents.replace("c.", "");
Chris@4 166 contents = contents.trimmed();
Chris@4 167 }
Chris@4 168 if (QRegExp("\\d{4}").indexIn(contents) >= 0) {
Chris@4 169 QStringList bits = contents.split("-");
Chris@4 170 if (!bits.empty()) {
Chris@4 171 QString f1 = bits[0];
Chris@4 172 QString f2;
Chris@4 173 if (bits.size() > 1) f2 = bits[1];
Chris@4 174 if (f1.startsWith("b")) {
Chris@4 175 f1.replace(QRegExp("b[^0-9]*"), "");
Chris@4 176 birth = f1.toInt();
Chris@4 177 } else if (f1.startsWith("d")) {
Chris@4 178 f1.replace(QRegExp("d[^0-9]*"), "");
Chris@4 179 death = f1.toInt();
Chris@4 180 } else if (f2 != "") {
Chris@4 181 birth = f1.toInt();
Chris@4 182 }
Chris@4 183 if (f2 != "") {
Chris@4 184 death = f2.toInt();
Chris@4 185 }
Chris@4 186 }
Chris@4 187 }
Chris@4 188 field.replace(pos, datere.matchedLength(), "");
Chris@0 189 }
Chris@0 190
Chris@4 191 // we don't properly handle their slash alternatives syntax
Chris@4 192 field = field.replace(QRegExp("/[^/,]*"), "");
Chris@4 193
Chris@4 194 // nor these
Chris@4 195 field.replace(QRegExp("\\[[^\\]]*\\]"), "");
Chris@4 196
Chris@4 197 // nor these
Chris@4 198 field.replace(QRegExp("\\([^\\)]*\\)"), "");
Chris@4 199
Chris@4 200 field.replace(QRegExp(" +"), " ");
Chris@4 201
Chris@4 202 // and let's be picky -- we don't like names with just initials,
Chris@4 203 // can't properly match them
Chris@4 204 if (QRegExp(",.*\\.").indexIn(field) >= 0) {
Chris@4 205 return;
Chris@4 206 }
Chris@4 207
Chris@4 208 // and, from this particular source, I'm suspicious of single-word
Chris@4 209 // names (sorry)
Chris@4 210 if (!field.contains(",")) return;
Chris@4 211
Chris@5 212 field.replace(QRegExp(" +,"), ",");
Chris@0 213 field = field.trimmed();
Chris@0 214 names.push_back(field);
Chris@0 215
Chris@0 216 // comma
Chris@4 217 QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
Chris@4 218 if ((pos = commare.indexIn(field)) >= 0) {
Chris@4 219 QString c(commare.cap(1));
Chris@4 220 QString d(commare.cap(2));
Chris@4 221 names.push_back(QString(d + " " + c).trimmed());
Chris@0 222 }
Chris@0 223 }
Chris@0 224
Chris@0 225 void
Chris@4 226 ClassicalArchivesImporter::import(QUrl source)
Chris@0 227 {
Chris@0 228 //!!! for now
Chris@0 229 QString filename = source.toLocalFile();
Chris@0 230
Chris@0 231 QFile file(filename);
Chris@0 232 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 233 throw std::exception();
Chris@0 234 }
Chris@0 235
Chris@0 236 QTextStream stream(&file);
Chris@0 237 stream.setCodec("UTF-8");
Chris@0 238 QString all = stream.readAll();
Chris@0 239
Chris@4 240 QRegExp matcher
Chris@4 241 ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");
Chris@0 242
Chris@4 243 DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl;
Chris@4 244
Chris@0 245 int pos = 0, count = 0;
Chris@0 246 while ((pos = matcher.indexIn(all, pos)) != -1) {
Chris@0 247 pos += matcher.matchedLength();
Chris@0 248 ++count;
Chris@0 249
Chris@0 250 QString namefield = matcher.cap(2);
Chris@0 251 QStringList names;
Chris@0 252
Chris@4 253 int birth = 0, death = 0;
Chris@4 254 bool approx = false;
Chris@4 255 QString location;
Chris@4 256
Chris@4 257 parseNames(namefield, names, birth, death, approx, location);
Chris@4 258
Chris@0 259 if (names.empty()) {
Chris@0 260 DEBUG << "No name!" << endl;
Chris@0 261 continue;
Chris@0 262 }
Chris@0 263
Chris@4 264 DEBUG << "Item " << count
Chris@4 265 << ": page = " << matcher.cap(1)
Chris@4 266 << ", name = " << names[0]
Chris@4 267 << ", birth = " << birth << ", death = " << death
Chris@4 268 << ", loc " << location << endl;
Chris@4 269
Chris@4 270 if (names[0].contains("Anonymous") ||
Chris@4 271 names[0].contains("Traditional")) {
Chris@0 272 continue;
Chris@0 273 }
Chris@0 274
Chris@0 275 Composer *composer = new Composer();
Chris@0 276 composer->setName(names[0]);
Chris@0 277 for (int i = 1; i < names.size(); ++i) {
Chris@0 278 composer->addAlias(names[i]);
Chris@0 279 }
Chris@4 280
Chris@4 281 if (birth != 0) {
Chris@4 282 Birth *e = new Birth(birth);
Chris@4 283 if (approx) e->setApproximate(true);
Chris@4 284 composer->setBirth(e);
Chris@4 285 }
Chris@4 286
Chris@4 287 if (death != 0) {
Chris@4 288 Death *e = new Death(death);
Chris@4 289 if (approx) e->setApproximate(true);
Chris@4 290 composer->setDeath(e);
Chris@4 291 }
Chris@4 292
Chris@4 293 if (location != "") {
Chris@4 294 composer->setNationality(locationToNationality(location));
Chris@4 295 composer->setGeonameURIs(locationToGeonameURIs(location));
Chris@4 296 }
Chris@0 297
Chris@0 298 if (matcher.cap(1) != "") {
Chris@0 299 QString url = matcher.cap(1);
Chris@0 300 Document *d = new Document;
Chris@18 301 d->setUri(Uri("http://www.classicalarchives.com" + url));
Chris@0 302 d->setTopic(composer);
Chris@4 303 d->setSiteName("Classical Archives");
Chris@0 304 composer->addPage(d);
Chris@0 305 }
Chris@0 306
Chris@0 307 m_objects.push_back(composer);
Chris@0 308 }
Chris@0 309
Chris@0 310
Chris@0 311 DEBUG << "Found " << count << " things" << endl;
Chris@0 312 }
Chris@0 313
Chris@0 314
Chris@0 315 }
Chris@0 316
Chris@0 317