Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@4: #include "ImportClassicalArchives.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@4: ClassicalArchivesImporter::setSource(QUrl source) Chris@0: { Chris@4: DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@4: static const char *locmap[] = { Chris@4: "ARG", "Argentinian", "Argentina", "3865483", Chris@4: "ARM", "Armenian", "Armenia", "174982", Chris@4: "AUS", "Australian", "Australia", "2077456", Chris@4: "AUT", "Austrian", "Austria", "2782113", Chris@4: "AZE", "Azeri", "Azerbaijan", "587116", Chris@4: "BEL", "Belgian", "Belgium", "2802361", Chris@4: "BGR", "Bulgarian", "Bulgaria", "732800", Chris@4: "BLR", "Belarusian", "Belarus", "630336", Chris@4: "BOH", "Bohemian", "Bohemia", "3074194", Chris@4: "BRA", "Brazilian", "Brazil", "3469058", Chris@4: "BSQ", "Basque", "Basque country", "3104499", Chris@4: "CAN", "Canadian", "Canada", "6251999", Chris@4: "CHE", "Swiss", "Switzerland", "2658434", Chris@4: "CHL", "Chilean", "Chile", "3895114", Chris@4: "CHN", "Chinese", "China", "1814991", Chris@4: "CRI", "Costa Rican", "Costa Rica", "3624060", Chris@4: "CTN", "Catalonian", "Catalonia", "3108286", Chris@4: "CUB", "Cuban", "Cuba", "3562981", Chris@4: "CZE", "Czech", "Czech Republic", "3077311", Chris@4: "DEU", "German", "Germany", "2921044", Chris@4: "DNK", "Danish", "Denmark", "2623032", Chris@4: "ECU", "Ecuadorian", "Ecuador", "3658394", Chris@4: "EGY", "Egyptian", "Egypt", "357994", Chris@4: "ENG", "English", "England", "2635167", Chris@4: "EPR", "German", "Germany", "2921044", // pardon? Chris@4: "ESP", "Spanish", "Spain", "2510769", Chris@4: "EST", "Estonian", "Estonia", "453733", Chris@4: "ETH", "Ethiopian", "Ethiopia", "337996", Chris@4: "FIN", "Finnish", "Finland", "660013", Chris@4: "FLM", "Flemish", "Flanders", "3337388", Chris@4: "FRA", "French", "France", "3017382", Chris@4: "GBR", "British", "Britain", "4839292", Chris@4: "GEO", "Georgian", "Georgia", "614540", Chris@4: "GRC", "Greek", "Greece", "390903", Chris@4: "GTM", "Guatemalan", "Guatemala", "3595528", Chris@4: "HKG", "Hong Kong Chinese", "Hong Kong", "1819729", Chris@4: "HOL", "Dutch", "Holland", "2750405", Chris@4: "HRV", "Croatian", "Croatia", "3202326", Chris@4: "HUN", "Hungarian", "Hungary", "719819", Chris@4: "IND", "Indian", "India", "1269750", Chris@4: "IRL", "Irish", "Ireland", "2963597", Chris@4: "IRN", "Iranian", "Iran", "130758", Chris@4: "ISL", "Icelandic", "Iceland", "2629691", Chris@4: "ISR", "Israeli", "Israel", "294640", Chris@4: "ITA", "Italian", "Italy", "3175395", Chris@4: "JPN", "Japanese", "Japan", "1861060", Chris@4: "KAZ", "Kazakh", "Kazakhstan", "1522867", Chris@4: "KOR", "Korean", "Korea", "1835841", Chris@4: "LBN", "Lebanese", "Lebanon", "272103", Chris@4: "LTU", "Lithuanian", "Lithuania", "597427", Chris@4: "LVA", "Latvian", "Latvia", "458258", Chris@4: "MAR", "Moroccan", "Morocco", "2542007", Chris@4: "MEX", "Mexican", "Mexico", "3996063", Chris@4: "MKD", "Macedonian", "Macedonia", "718075", Chris@4: "MOR", "Moravian", "Moravia", "3078610", Chris@4: "MYS", "Malaysian", "Malaysia", "1733045", Chris@4: "NAI", "North American Indian", "United States of America", "6252001", Chris@4: "NLD", "Dutch", "Netherlands", "2750405", Chris@4: "NOR", "Norwegian", "Norway", "3144096", Chris@4: "NZL", "New Zealander", "New Zealand", "2186224", Chris@4: "PER", "Peruvian", "Peru", "3932488", Chris@4: "PHL", "Filipino", "Philippines", "1694008", Chris@4: "POL", "Polish", "Poland", "798544", Chris@4: "PRT", "Portuguese", "Portugal", "2264397", Chris@4: "PRU", "Prussian", "Prussia", "772636", Chris@4: "PRY", "Paraguayan", "Paraguay", "3437598", Chris@4: "ROU", "Romanian", "Romania", "798549", Chris@4: "RUS", "Russian", "Russia", "2017370", Chris@4: "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468", Chris@4: "SCO", "Scottish", "Scotland", "2638360", Chris@4: "SGP", "Singaporean", "Singapore", "1880251", Chris@4: "SVK", "Slovakian", "Slovakia", "3057568", Chris@4: "SVN", "Slovenian", "Slovenia", "3190538", Chris@4: "SWE", "Swedish", "Sweden", "2661886", Chris@4: "TKM", "Turkmen", "Turkmenistan", "1218197", Chris@4: "TSL", "Transylvanian", "Transylvania", "4495544", Chris@4: "TSM", "Tasmanian", "Tasmania", "2147291", Chris@4: "TUR", "Turkish", "Turkey", "298795", Chris@4: "UKR", "Ukrainian", "Ukraine", "690791", Chris@4: "URY", "Uruguayan", "Uruguay", "3439705", Chris@4: "USA", "American", "United States of America", "6252001", Chris@4: "VEN", "Venezuelan", "Venezuela", "3625428", Chris@4: "VNM", "Vietnamese", "Vietnam", "1562822", Chris@4: "WLS", "Samoan", "Samoa", "4034894", Chris@4: "ZAF", "South African", "South Africa", "953987", Chris@4: }; Chris@4: Chris@4: QSet Chris@4: locationToNationality(QString location) Chris@4: { Chris@4: QSet nationalities; Chris@4: QStringList locations = location.split('/'); Chris@4: foreach (location, locations) { Chris@4: int cols = 4; Chris@4: for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) { Chris@4: if (location == locmap[i*cols]) { Chris@4: nationalities.insert(locmap[i*cols+1]); Chris@4: } Chris@4: } Chris@4: } Chris@4: return nationalities; Chris@4: } Chris@4: Chris@18: QSet Chris@4: locationToGeonameURIs(QString location) Chris@4: { Chris@18: QSet uris; Chris@4: QStringList locations = location.split('/'); Chris@4: foreach (location, locations) { Chris@4: int cols = 4; Chris@4: for (size_t i = 0; i < (sizeof(locmap)/sizeof(locmap[0])) / cols; ++i) { Chris@4: if (location == locmap[i*cols]) { Chris@18: uris.insert(Uri(QString("http://sws.geonames.org/") Chris@18: + locmap[i*cols+3] + "/")); Chris@4: } Chris@4: } Chris@4: } Chris@4: return uris; Chris@4: } Chris@4: Chris@0: void Chris@4: parseNames(QString field, QStringList &names, int &birth, int &death, Chris@4: bool &approx, QString &location) Chris@0: { Chris@4: field.replace(QRegExp("<[^>]*>"), ""); Chris@0: Chris@4: QRegExp locre("; (.*)$"); Chris@4: int pos; Chris@4: if ((pos = locre.indexIn(field)) >= 0) { Chris@4: location = locre.cap(1); Chris@4: field.replace(pos, locre.matchedLength(), ""); Chris@0: } Chris@0: Chris@4: QRegExp datere("\\(([^\\)]+)\\) *$"); Chris@4: if ((pos = datere.indexIn(field)) >= 0) { Chris@4: QString contents = datere.cap(1); Chris@4: if (contents.startsWith("c.")) { Chris@4: approx = true; Chris@4: contents = contents.replace("c.", ""); Chris@4: contents = contents.trimmed(); Chris@4: } Chris@4: if (QRegExp("\\d{4}").indexIn(contents) >= 0) { Chris@4: QStringList bits = contents.split("-"); Chris@4: if (!bits.empty()) { Chris@4: QString f1 = bits[0]; Chris@4: QString f2; Chris@4: if (bits.size() > 1) f2 = bits[1]; Chris@4: if (f1.startsWith("b")) { Chris@4: f1.replace(QRegExp("b[^0-9]*"), ""); Chris@4: birth = f1.toInt(); Chris@4: } else if (f1.startsWith("d")) { Chris@4: f1.replace(QRegExp("d[^0-9]*"), ""); Chris@4: death = f1.toInt(); Chris@4: } else if (f2 != "") { Chris@4: birth = f1.toInt(); Chris@4: } Chris@4: if (f2 != "") { Chris@4: death = f2.toInt(); Chris@4: } Chris@4: } Chris@4: } Chris@4: field.replace(pos, datere.matchedLength(), ""); Chris@0: } Chris@0: Chris@4: // we don't properly handle their slash alternatives syntax Chris@4: field = field.replace(QRegExp("/[^/,]*"), ""); Chris@4: Chris@4: // nor these Chris@4: field.replace(QRegExp("\\[[^\\]]*\\]"), ""); Chris@4: Chris@4: // nor these Chris@4: field.replace(QRegExp("\\([^\\)]*\\)"), ""); Chris@4: Chris@4: field.replace(QRegExp(" +"), " "); Chris@4: Chris@4: // and let's be picky -- we don't like names with just initials, Chris@4: // can't properly match them Chris@4: if (QRegExp(",.*\\.").indexIn(field) >= 0) { Chris@4: return; Chris@4: } Chris@4: Chris@4: // and, from this particular source, I'm suspicious of single-word Chris@4: // names (sorry) Chris@4: if (!field.contains(",")) return; Chris@4: Chris@5: field.replace(QRegExp(" +,"), ","); Chris@0: field = field.trimmed(); Chris@0: names.push_back(field); Chris@0: Chris@0: // comma Chris@4: QRegExp commare = QRegExp("^([^,]+), *([^,]+)$"); Chris@4: if ((pos = commare.indexIn(field)) >= 0) { Chris@4: QString c(commare.cap(1)); Chris@4: QString d(commare.cap(2)); Chris@4: names.push_back(QString(d + " " + c).trimmed()); Chris@0: } Chris@0: } Chris@0: Chris@0: void Chris@4: ClassicalArchivesImporter::import(QUrl source) Chris@0: { Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: QString all = stream.readAll(); Chris@0: Chris@4: QRegExp matcher Chris@4: ("]*>([^\n]+)"); Chris@0: Chris@4: DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl; Chris@4: Chris@0: int pos = 0, count = 0; Chris@0: while ((pos = matcher.indexIn(all, pos)) != -1) { Chris@0: pos += matcher.matchedLength(); Chris@0: ++count; Chris@0: Chris@0: QString namefield = matcher.cap(2); Chris@0: QStringList names; Chris@0: Chris@4: int birth = 0, death = 0; Chris@4: bool approx = false; Chris@4: QString location; Chris@4: Chris@4: parseNames(namefield, names, birth, death, approx, location); Chris@4: Chris@0: if (names.empty()) { Chris@0: DEBUG << "No name!" << endl; Chris@0: continue; Chris@0: } Chris@0: Chris@4: DEBUG << "Item " << count Chris@4: << ": page = " << matcher.cap(1) Chris@4: << ", name = " << names[0] Chris@4: << ", birth = " << birth << ", death = " << death Chris@4: << ", loc " << location << endl; Chris@4: Chris@4: if (names[0].contains("Anonymous") || Chris@4: names[0].contains("Traditional")) { Chris@0: continue; Chris@0: } Chris@0: Chris@0: Composer *composer = new Composer(); Chris@0: composer->setName(names[0]); Chris@0: for (int i = 1; i < names.size(); ++i) { Chris@0: composer->addAlias(names[i]); Chris@0: } Chris@4: Chris@4: if (birth != 0) { Chris@4: Birth *e = new Birth(birth); Chris@4: if (approx) e->setApproximate(true); Chris@4: composer->setBirth(e); Chris@4: } Chris@4: Chris@4: if (death != 0) { Chris@4: Death *e = new Death(death); Chris@4: if (approx) e->setApproximate(true); Chris@4: composer->setDeath(e); Chris@4: } Chris@4: Chris@4: if (location != "") { Chris@4: composer->setNationality(locationToNationality(location)); Chris@4: composer->setGeonameURIs(locationToGeonameURIs(location)); Chris@4: } Chris@0: Chris@0: if (matcher.cap(1) != "") { Chris@0: QString url = matcher.cap(1); Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri("http://www.classicalarchives.com" + url)); Chris@0: d->setTopic(composer); Chris@4: d->setSiteName("Classical Archives"); Chris@0: composer->addPage(d); Chris@0: } Chris@0: Chris@0: m_objects.push_back(composer); Chris@0: } Chris@0: Chris@0: Chris@0: DEBUG << "Found " << count << " things" << endl; Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: Chris@0: