To check out this repository, please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.
root / import / ImportClassicalArchives.cpp
History | View | Annotate | Download (9.89 KB)
| 1 |
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
|---|---|
| 2 |
|
| 3 |
#include "ImportClassicalArchives.h" |
| 4 |
|
| 5 |
#include <dataquay/Debug.h> |
| 6 |
|
| 7 |
#include <QFile> |
| 8 |
#include <QFileInfo> |
| 9 |
#include <QTextStream> |
| 10 |
#include <QRegExp> |
| 11 |
#include <QVariant> |
| 12 |
|
| 13 |
#include <exception> |
| 14 |
|
| 15 |
using namespace Dataquay; |
| 16 |
|
| 17 |
namespace ClassicalData {
|
| 18 |
|
| 19 |
void
|
| 20 |
ClassicalArchivesImporter::setSource(QUrl source) |
| 21 |
{
|
| 22 |
DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
|
| 23 |
import(source); |
| 24 |
} |
| 25 |
|
| 26 |
// Flat lookup table for the location codes used in the source page.
// Every record occupies four consecutive strings:
//
//   [0] three-letter location code as it appears in the source
//   [1] nationality adjective
//   [2] place name
//   [3] numeric ID used to build a http://sws.geonames.org/<id>/ URI
//
// The lookup functions below index this array as locmap[row * 4 + col],
// so every record must have exactly four fields.  The table is read-only,
// hence both the characters and the pointers are const.
//
// NOTE(review): the IDs have not been verified here; it would be worth
// spot-checking entries such as GBR, TSL and WLS against geonames.org.
static const char *const locmap[] = {
    "ARG", "Argentinian", "Argentina", "3865483",
    "ARM", "Armenian", "Armenia", "174982",
    "AUS", "Australian", "Australia", "2077456",
    "AUT", "Austrian", "Austria", "2782113",
    "AZE", "Azeri", "Azerbaijan", "587116",
    "BEL", "Belgian", "Belgium", "2802361",
    "BGR", "Bulgarian", "Bulgaria", "732800",
    "BLR", "Belarusian", "Belarus", "630336",
    "BOH", "Bohemian", "Bohemia", "3074194",
    "BRA", "Brazilian", "Brazil", "3469058",
    "BSQ", "Basque", "Basque country", "3104499",
    "CAN", "Canadian", "Canada", "6251999",
    "CHE", "Swiss", "Switzerland", "2658434",
    "CHL", "Chilean", "Chile", "3895114",
    "CHN", "Chinese", "China", "1814991",
    "CRI", "Costa Rican", "Costa Rica", "3624060",
    "CTN", "Catalonian", "Catalonia", "3108286",
    "CUB", "Cuban", "Cuba", "3562981",
    "CZE", "Czech", "Czech Republic", "3077311",
    "DEU", "German", "Germany", "2921044",
    "DNK", "Danish", "Denmark", "2623032",
    "ECU", "Ecuadorian", "Ecuador", "3658394",
    "EGY", "Egyptian", "Egypt", "357994",
    "ENG", "English", "England", "2635167",
    "EPR", "German", "Germany", "2921044", // pardon?
    "ESP", "Spanish", "Spain", "2510769",
    "EST", "Estonian", "Estonia", "453733",
    "ETH", "Ethiopian", "Ethiopia", "337996",
    "FIN", "Finnish", "Finland", "660013",
    "FLM", "Flemish", "Flanders", "3337388",
    "FRA", "French", "France", "3017382",
    "GBR", "British", "Britain", "4839292",
    "GEO", "Georgian", "Georgia", "614540",
    "GRC", "Greek", "Greece", "390903",
    "GTM", "Guatemalan", "Guatemala", "3595528",
    "HKG", "Hong Kong Chinese", "Hong Kong", "1819729",
    "HOL", "Dutch", "Holland", "2750405",
    "HRV", "Croatian", "Croatia", "3202326",
    "HUN", "Hungarian", "Hungary", "719819",
    "IND", "Indian", "India", "1269750",
    "IRL", "Irish", "Ireland", "2963597",
    "IRN", "Iranian", "Iran", "130758",
    "ISL", "Icelandic", "Iceland", "2629691",
    "ISR", "Israeli", "Israel", "294640",
    "ITA", "Italian", "Italy", "3175395",
    "JPN", "Japanese", "Japan", "1861060",
    "KAZ", "Kazakh", "Kazakhstan", "1522867",
    "KOR", "Korean", "Korea", "1835841",
    "LBN", "Lebanese", "Lebanon", "272103",
    "LTU", "Lithuanian", "Lithuania", "597427",
    "LVA", "Latvian", "Latvia", "458258",
    "MAR", "Moroccan", "Morocco", "2542007",
    "MEX", "Mexican", "Mexico", "3996063",
    "MKD", "Macedonian", "Macedonia", "718075",
    "MOR", "Moravian", "Moravia", "3078610",
    "MYS", "Malaysian", "Malaysia", "1733045",
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch", "Netherlands", "2750405",
    "NOR", "Norwegian", "Norway", "3144096",
    "NZL", "New Zealander", "New Zealand", "2186224",
    "PER", "Peruvian", "Peru", "3932488",
    "PHL", "Filipino", "Philippines", "1694008",
    "POL", "Polish", "Poland", "798544",
    "PRT", "Portuguese", "Portugal", "2264397",
    "PRU", "Prussian", "Prussia", "772636",
    "PRY", "Paraguayan", "Paraguay", "3437598",
    "ROU", "Romanian", "Romania", "798549",
    "RUS", "Russian", "Russia", "2017370",
    "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468",
    "SCO", "Scottish", "Scotland", "2638360",
    "SGP", "Singaporean", "Singapore", "1880251",
    "SVK", "Slovakian", "Slovakia", "3057568",
    "SVN", "Slovenian", "Slovenia", "3190538",
    "SWE", "Swedish", "Sweden", "2661886",
    "TKM", "Turkmen", "Turkmenistan", "1218197",
    "TSL", "Transylvanian", "Transylvania", "4495544",
    "TSM", "Tasmanian", "Tasmania", "2147291",
    "TUR", "Turkish", "Turkey", "298795",
    "UKR", "Ukrainian", "Ukraine", "690791",
    "URY", "Uruguayan", "Uruguay", "3439705",
    "USA", "American", "United States of America", "6252001",
    "VEN", "Venezuelan", "Venezuela", "3625428",
    "VNM", "Vietnamese", "Vietnam", "1562822",
    "WLS", "Samoan", "Samoa", "4034894",
    "ZAF", "South African", "South Africa", "953987",
};
| 113 |
|
| 114 |
// Translate a location string into the set of nationality adjectives
// listed for it in locmap.
//
// The string is split on '/' and each piece is compared, exactly as it
// stands, against the code column of locmap.  Pieces with no matching
// code contribute nothing, so the result may be empty.
QSet<QString>
locationToNationality(QString location)
{
    // Each locmap record is four strings wide; only whole records are
    // considered.
    const size_t stride = 4;
    const size_t total = sizeof(locmap) / sizeof(locmap[0]);

    QSet<QString> result;
    const QStringList codes = location.split('/');

    for (int c = 0; c < codes.size(); ++c) {
        const QString &code = codes.at(c);
        for (size_t base = 0; base + stride <= total; base += stride) {
            if (code == locmap[base]) {
                // second column holds the nationality adjective
                result.insert(locmap[base + 1]);
            }
        }
    }

    return result;
}
| 129 |
|
| 130 |
// Translate a location string into the set of geonames URIs listed for
// it in locmap.
//
// The string is split on '/' and each piece is compared, exactly as it
// stands, against the code column of locmap.  For every match, a URI of
// the form http://sws.geonames.org/<id>/ is built from the record's
// fourth column.  Pieces with no matching code contribute nothing.
QSet<Uri>
locationToGeonameURIs(QString location)
{
    // Each locmap record is four strings wide; only whole records are
    // considered.
    const size_t stride = 4;
    const size_t total = sizeof(locmap) / sizeof(locmap[0]);

    QSet<Uri> result;
    const QStringList codes = location.split('/');

    for (int c = 0; c < codes.size(); ++c) {
        const QString &code = codes.at(c);
        for (size_t base = 0; base + stride <= total; base += stride) {
            if (code == locmap[base]) {
                // fourth column holds the numeric geonames ID
                const QString id = locmap[base + 3];
                result.insert(Uri(QString("http://sws.geonames.org/")
                                  + id + "/"));
            }
        }
    }

    return result;
}
| 146 |
|
| 147 |
void
|
| 148 |
parseNames(QString field, QStringList &names, int &birth, int &death, |
| 149 |
bool &approx, QString &location)
|
| 150 |
{
|
| 151 |
field.replace(QRegExp("<[^>]*>"), ""); |
| 152 |
|
| 153 |
QRegExp locre("; (.*)$");
|
| 154 |
int pos;
|
| 155 |
if ((pos = locre.indexIn(field)) >= 0) { |
| 156 |
location = locre.cap(1);
|
| 157 |
field.replace(pos, locre.matchedLength(), "");
|
| 158 |
} |
| 159 |
|
| 160 |
QRegExp datere("\\(([^\\)]+)\\) *$");
|
| 161 |
if ((pos = datere.indexIn(field)) >= 0) { |
| 162 |
QString contents = datere.cap(1);
|
| 163 |
if (contents.startsWith("c.")) { |
| 164 |
approx = true;
|
| 165 |
contents = contents.replace("c.", ""); |
| 166 |
contents = contents.trimmed(); |
| 167 |
} |
| 168 |
if (QRegExp("\\d{4}").indexIn(contents) >= 0) { |
| 169 |
QStringList bits = contents.split("-");
|
| 170 |
if (!bits.empty()) {
|
| 171 |
QString f1 = bits[0];
|
| 172 |
QString f2; |
| 173 |
if (bits.size() > 1) f2 = bits[1]; |
| 174 |
if (f1.startsWith("b")) { |
| 175 |
f1.replace(QRegExp("b[^0-9]*"), ""); |
| 176 |
birth = f1.toInt(); |
| 177 |
} else if (f1.startsWith("d")) { |
| 178 |
f1.replace(QRegExp("d[^0-9]*"), ""); |
| 179 |
death = f1.toInt(); |
| 180 |
} else if (f2 != "") { |
| 181 |
birth = f1.toInt(); |
| 182 |
} |
| 183 |
if (f2 != "") { |
| 184 |
death = f2.toInt(); |
| 185 |
} |
| 186 |
} |
| 187 |
} |
| 188 |
field.replace(pos, datere.matchedLength(), "");
|
| 189 |
} |
| 190 |
|
| 191 |
// we don't properly handle their slash alternatives syntax
|
| 192 |
field = field.replace(QRegExp("/[^/,]*"), ""); |
| 193 |
|
| 194 |
// nor these
|
| 195 |
field.replace(QRegExp("\\[[^\\]]*\\]"), ""); |
| 196 |
|
| 197 |
// nor these
|
| 198 |
field.replace(QRegExp("\\([^\\)]*\\)"), ""); |
| 199 |
|
| 200 |
field.replace(QRegExp(" +"), " "); |
| 201 |
|
| 202 |
// and let's be picky -- we don't like names with just initials,
|
| 203 |
// can't properly match them
|
| 204 |
if (QRegExp(",.*\\.").indexIn(field) >= 0) { |
| 205 |
return;
|
| 206 |
} |
| 207 |
|
| 208 |
// and, from this particular source, I'm suspicious of single-word
|
| 209 |
// names (sorry)
|
| 210 |
if (!field.contains(",")) return; |
| 211 |
|
| 212 |
field.replace(QRegExp(" +,"), ","); |
| 213 |
field = field.trimmed(); |
| 214 |
names.push_back(field); |
| 215 |
|
| 216 |
// comma
|
| 217 |
QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
|
| 218 |
if ((pos = commare.indexIn(field)) >= 0) { |
| 219 |
QString c(commare.cap(1));
|
| 220 |
QString d(commare.cap(2));
|
| 221 |
names.push_back(QString(d + " " + c).trimmed());
|
| 222 |
} |
| 223 |
} |
| 224 |
|
| 225 |
void
|
| 226 |
ClassicalArchivesImporter::import(QUrl source) |
| 227 |
{
|
| 228 |
//!!! for now
|
| 229 |
QString filename = source.toLocalFile(); |
| 230 |
|
| 231 |
QFile file(filename); |
| 232 |
if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
| 233 |
throw std::exception();
|
| 234 |
} |
| 235 |
|
| 236 |
QTextStream stream(&file); |
| 237 |
stream.setCodec("UTF-8");
|
| 238 |
QString all = stream.readAll(); |
| 239 |
|
| 240 |
QRegExp matcher |
| 241 |
("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");
|
| 242 |
|
| 243 |
DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl; |
| 244 |
|
| 245 |
int pos = 0, count = 0; |
| 246 |
while ((pos = matcher.indexIn(all, pos)) != -1) { |
| 247 |
pos += matcher.matchedLength(); |
| 248 |
++count; |
| 249 |
|
| 250 |
QString namefield = matcher.cap(2);
|
| 251 |
QStringList names; |
| 252 |
|
| 253 |
int birth = 0, death = 0; |
| 254 |
bool approx = false; |
| 255 |
QString location; |
| 256 |
|
| 257 |
parseNames(namefield, names, birth, death, approx, location); |
| 258 |
|
| 259 |
if (names.empty()) {
|
| 260 |
DEBUG << "No name!" << endl;
|
| 261 |
continue;
|
| 262 |
} |
| 263 |
|
| 264 |
DEBUG << "Item " << count
|
| 265 |
<< ": page = " << matcher.cap(1) |
| 266 |
<< ", name = " << names[0] |
| 267 |
<< ", birth = " << birth << ", death = " << death |
| 268 |
<< ", loc " << location << endl;
|
| 269 |
|
| 270 |
if (names[0].contains("Anonymous") || |
| 271 |
names[0].contains("Traditional")) { |
| 272 |
continue;
|
| 273 |
} |
| 274 |
|
| 275 |
Composer *composer = new Composer();
|
| 276 |
composer->setName(names[0]);
|
| 277 |
for (int i = 1; i < names.size(); ++i) { |
| 278 |
composer->addAlias(names[i]); |
| 279 |
} |
| 280 |
|
| 281 |
if (birth != 0) { |
| 282 |
Birth *e = new Birth(birth);
|
| 283 |
if (approx) e->setApproximate(true); |
| 284 |
composer->setBirth(e); |
| 285 |
} |
| 286 |
|
| 287 |
if (death != 0) { |
| 288 |
Death *e = new Death(death);
|
| 289 |
if (approx) e->setApproximate(true); |
| 290 |
composer->setDeath(e); |
| 291 |
} |
| 292 |
|
| 293 |
if (location != "") { |
| 294 |
composer->setNationality(locationToNationality(location)); |
| 295 |
composer->setGeonameURIs(locationToGeonameURIs(location)); |
| 296 |
} |
| 297 |
|
| 298 |
if (matcher.cap(1) != "") { |
| 299 |
QString url = matcher.cap(1);
|
| 300 |
Document *d = new Document;
|
| 301 |
d->setUri(Uri("http://www.classicalarchives.com" + url));
|
| 302 |
d->setTopic(composer); |
| 303 |
d->setSiteName("Classical Archives");
|
| 304 |
composer->addPage(d); |
| 305 |
} |
| 306 |
|
| 307 |
m_objects.push_back(composer); |
| 308 |
} |
| 309 |
|
| 310 |
|
| 311 |
DEBUG << "Found " << count << " things" << endl; |
| 312 |
} |
| 313 |
|
| 314 |
|
| 315 |
} |
| 316 |
|
| 317 |
|