Mercurial > hg > classical
view import/ImportClassicalArchives.cpp @ 20:c4cb65c436ef classical-rdf
* Simple query utility
author | Chris Cannam |
---|---|
date | Tue, 23 Feb 2010 16:37:49 +0000 |
parents | c8ef23d3888c |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalArchives.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QRegExp>
#include <QSet>
#include <QString>
#include <QStringList>
#include <QTextStream>
#include <QUrl>
#include <QVariant>

#include <cstddef>
#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Record the source to import from.  The source is logged and then
 * imported immediately by calling import().
 */
void
ClassicalArchivesImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
    import(source);
}

/**
 * Flat lookup table of location codes.  Every record occupies four
 * consecutive entries:
 *
 *   [0] location code as it appears in the source data
 *   [1] nationality adjective
 *   [2] place name (not read by the code in this file)
 *   [3] numeric id appended to http://sws.geonames.org/ to form a URI
 */
static const char *locmap[] = {
    "ARG", "Argentinian", "Argentina", "3865483",
    "ARM", "Armenian", "Armenia", "174982",
    "AUS", "Australian", "Australia", "2077456",
    "AUT", "Austrian", "Austria", "2782113",
    "AZE", "Azeri", "Azerbaijan", "587116",
    "BEL", "Belgian", "Belgium", "2802361",
    "BGR", "Bulgarian", "Bulgaria", "732800",
    "BLR", "Belarusian", "Belarus", "630336",
    "BOH", "Bohemian", "Bohemia", "3074194",
    "BRA", "Brazilian", "Brazil", "3469058",
    "BSQ", "Basque", "Basque country", "3104499",
    "CAN", "Canadian", "Canada", "6251999",
    "CHE", "Swiss", "Switzerland", "2658434",
    "CHL", "Chilean", "Chile", "3895114",
    "CHN", "Chinese", "China", "1814991",
    "CRI", "Costa Rican", "Costa Rica", "3624060",
    "CTN", "Catalonian", "Catalonia", "3108286",
    "CUB", "Cuban", "Cuba", "3562981",
    "CZE", "Czech", "Czech Republic", "3077311",
    "DEU", "German", "Germany", "2921044",
    "DNK", "Danish", "Denmark", "2623032",
    "ECU", "Ecuadorian", "Ecuador", "3658394",
    "EGY", "Egyptian", "Egypt", "357994",
    "ENG", "English", "England", "2635167",
    "EPR", "German", "Germany", "2921044", // pardon?
    "ESP", "Spanish", "Spain", "2510769",
    "EST", "Estonian", "Estonia", "453733",
    "ETH", "Ethiopian", "Ethiopia", "337996",
    "FIN", "Finnish", "Finland", "660013",
    "FLM", "Flemish", "Flanders", "3337388",
    "FRA", "French", "France", "3017382",
    "GBR", "British", "Britain", "4839292",
    "GEO", "Georgian", "Georgia", "614540",
    "GRC", "Greek", "Greece", "390903",
    "GTM", "Guatemalan", "Guatemala", "3595528",
    "HKG", "Hong Kong Chinese", "Hong Kong", "1819729",
    "HOL", "Dutch", "Holland", "2750405",
    "HRV", "Croatian", "Croatia", "3202326",
    "HUN", "Hungarian", "Hungary", "719819",
    "IND", "Indian", "India", "1269750",
    "IRL", "Irish", "Ireland", "2963597",
    "IRN", "Iranian", "Iran", "130758",
    "ISL", "Icelandic", "Iceland", "2629691",
    "ISR", "Israeli", "Israel", "294640",
    "ITA", "Italian", "Italy", "3175395",
    "JPN", "Japanese", "Japan", "1861060",
    "KAZ", "Kazakh", "Kazakhstan", "1522867",
    "KOR", "Korean", "Korea", "1835841",
    "LBN", "Lebanese", "Lebanon", "272103",
    "LTU", "Lithuanian", "Lithuania", "597427",
    "LVA", "Latvian", "Latvia", "458258",
    "MAR", "Moroccan", "Morocco", "2542007",
    "MEX", "Mexican", "Mexico", "3996063",
    "MKD", "Macedonian", "Macedonia", "718075",
    "MOR", "Moravian", "Moravia", "3078610",
    "MYS", "Malaysian", "Malaysia", "1733045",
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch", "Netherlands", "2750405",
    "NOR", "Norwegian", "Norway", "3144096",
    "NZL", "New Zealander", "New Zealand", "2186224",
    "PER", "Peruvian", "Peru", "3932488",
    "PHL", "Filipino", "Philippines", "1694008",
    "POL", "Polish", "Poland", "798544",
    "PRT", "Portuguese", "Portugal", "2264397",
    "PRU", "Prussian", "Prussia", "772636",
    "PRY", "Paraguayan", "Paraguay", "3437598",
    "ROU", "Romanian", "Romania", "798549",
    "RUS", "Russian", "Russia", "2017370",
    "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468",
    "SCO", "Scottish", "Scotland", "2638360",
    "SGP", "Singaporean", "Singapore", "1880251",
    "SVK", "Slovakian", "Slovakia", "3057568",
    "SVN", "Slovenian", "Slovenia", "3190538",
    "SWE", "Swedish", "Sweden", "2661886",
    "TKM", "Turkmen", "Turkmenistan", "1218197",
    "TSL", "Transylvanian", "Transylvania", "4495544",
    "TSM", "Tasmanian", "Tasmania", "2147291",
    "TUR", "Turkish", "Turkey", "298795",
    "UKR", "Ukrainian", "Ukraine", "690791",
    "URY", "Uruguayan", "Uruguay", "3439705",
    "USA", "American", "United States of America", "6252001",
    "VEN", "Venezuelan", "Venezuela", "3625428",
    "VNM", "Vietnamese", "Vietnam", "1562822",
    // NOTE(review): "WLS" is mapped to Samoa here -- confirm against
    // the source data that this code does not actually mean Wales.
    "WLS", "Samoan", "Samoa", "4034894",
    "ZAF", "South African", "South Africa", "953987",
};

/** Number of consecutive locmap entries that make up one record. */
static const size_t locmapCols = 4;

/** Number of complete records in locmap. */
static const size_t locmapRows =
    (sizeof(locmap) / sizeof(locmap[0])) / locmapCols;

/**
 * Translate a location field into nationality adjectives.
 *
 * The field is split on '/', each part is trimmed of surrounding
 * whitespace, and every locmap record whose code equals the part
 * contributes its nationality.  Parts with no matching code are
 * ignored, so the result may be empty.
 */
QSet<QString>
locationToNationality(QString location)
{
    QSet<QString> nationalities;

    foreach (const QString &part, location.split('/')) {
        const QString code = part.trimmed();
        // Scan the whole table rather than stopping at the first hit,
        // so that any duplicated code contributes all of its records.
        for (size_t row = 0; row < locmapRows; ++row) {
            if (code == locmap[row * locmapCols]) {
                nationalities.insert(locmap[row * locmapCols + 1]);
            }
        }
    }

    return nationalities;
}

/**
 * Translate a location field into Geonames URIs.
 *
 * The field is split on '/', each part is trimmed of surrounding
 * whitespace, and every locmap record whose code equals the part
 * contributes the URI "http://sws.geonames.org/<id>/".  Parts with no
 * matching code are ignored, so the result may be empty.
 */
QSet<Uri>
locationToGeonameURIs(QString location)
{
    QSet<Uri> uris;

    foreach (const QString &part, location.split('/')) {
        const QString code = part.trimmed();
        for (size_t row = 0; row < locmapRows; ++row) {
            if (code == locmap[row * locmapCols]) {
                uris.insert(Uri(QString("http://sws.geonames.org/") +
                                locmap[row * locmapCols + 3] + "/"));
            }
        }
    }

    return uris;
}

/**
 * Pick apart one composer name field.
 *
 * Processing order:
 *  1. anything that looks like a markup tag (<...>) is removed;
 *  2. a trailing "; text" suffix is removed and stored in location;
 *  3. a trailing parenthesised group is removed and, if it contains a
 *     four-digit number, read as dates: "b..." sets birth, "d..." sets
 *     death, "X-Y" sets both; a leading "c." sets approx;
 *  4. slash alternatives, bracketed and parenthesised asides are
 *     dropped and runs of spaces collapsed;
 *  5. fields containing a '.' after a comma (initials) and fields with
 *     no comma at all are rejected, leaving names untouched;
 *  6. otherwise the cleaned field is appended to names, followed by a
 *     "Forename Surname" variant when the field has exactly one comma.
 *
 * The output arguments are only written when the corresponding piece
 * is found; the caller must initialise them beforehand.
 *
 * @param field    raw text of the name field (taken by value)
 * @param names    receives zero, one or two name forms
 * @param birth    receives the birth year, if present
 * @param death    receives the death year, if present
 * @param approx   set to true if the dates are prefixed with "c."
 * @param location receives the text following "; ", if present
 */
void
parseNames(QString field, QStringList &names,
           int &birth, int &death, bool &approx, QString &location)
{
    field.replace(QRegExp("<[^>]*>"), "");

    int pos;

    QRegExp locre("; (.*)$");
    if ((pos = locre.indexIn(field)) >= 0) {
        location = locre.cap(1);
        field.replace(pos, locre.matchedLength(), "");
    }

    QRegExp datere("\\(([^\\)]+)\\) *$");
    if ((pos = datere.indexIn(field)) >= 0) {

        QString contents = datere.cap(1);

        if (contents.startsWith("c.")) {
            approx = true;
            contents = contents.replace("c.", "");
            contents = contents.trimmed();
        }

        // Only treat the group as dates if it holds something that
        // looks like a year
        if (QRegExp("\\d{4}").indexIn(contents) >= 0) {
            QStringList bits = contents.split("-");
            if (!bits.empty()) {
                QString f1 = bits[0];
                QString f2;
                if (bits.size() > 1) f2 = bits[1];

                if (f1.startsWith("b")) {
                    f1.replace(QRegExp("b[^0-9]*"), "");
                    birth = f1.toInt();
                } else if (f1.startsWith("d")) {
                    f1.replace(QRegExp("d[^0-9]*"), "");
                    death = f1.toInt();
                } else if (f2 != "") {
                    // a bare single year is ambiguous, so the first
                    // number only counts as a birth year in a range
                    birth = f1.toInt();
                }

                if (f2 != "") {
                    death = f2.toInt();
                }
            }
        }

        field.replace(pos, datere.matchedLength(), "");
    }

    // we don't properly handle their slash alternatives syntax
    field = field.replace(QRegExp("/[^/,]*"), "");

    // nor these
    field.replace(QRegExp("\\[[^\\]]*\\]"), "");

    // nor these
    field.replace(QRegExp("\\([^\\)]*\\)"), "");

    field.replace(QRegExp(" +"), " ");

    // and let's be picky -- we don't like names with just initials,
    // can't properly match them
    if (QRegExp(",.*\\.").indexIn(field) >= 0) {
        return;
    }

    // and, from this particular source, I'm suspicious of single-word
    // names (sorry)
    if (!field.contains(",")) return;

    field.replace(QRegExp(" +,"), ",");
    field = field.trimmed();
    names.push_back(field);

    // comma
    QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
    if ((pos = commare.indexIn(field)) >= 0) {
        QString c(commare.cap(1));
        QString d(commare.cap(2));
        names.push_back(QString(d + " " + c).trimmed());
    }
}

/**
 * Read the local file named by source as UTF-8 text, find every
 * composer link in it, and append a Composer object for each usable
 * entry to m_objects.
 *
 * Entries for which parseNames() yields no name, and entries whose
 * primary name contains "Anonymous" or "Traditional", are skipped.
 *
 * @throws std::runtime_error (a std::exception) if the file cannot be
 *         opened for reading.
 */
void
ClassicalArchivesImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        // Derived from std::exception so existing handlers still
        // catch it, but now says which file was at fault
        throw std::runtime_error
            (std::string("ClassicalArchivesImporter::import: "
                         "failed to open file: ") +
             filename.toLocal8Bit().constData());
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // cap(1) is the site-relative composer page, cap(2) the name field
    QRegExp matcher
        ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");

    DEBUG << "ClassicalArchivesImporter: Have " << all.length()
          << " chars" << endl;

    int pos = 0, count = 0;

    while ((pos = matcher.indexIn(all, pos)) != -1) {

        pos += matcher.matchedLength();
        ++count;

        const QString page = matcher.cap(1);
        const QString namefield = matcher.cap(2);

        QStringList names;
        int birth = 0, death = 0;
        bool approx = false;
        QString location;

        parseNames(namefield, names, birth, death, approx, location);

        if (names.isEmpty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        DEBUG << "Item " << count
              << ": page = " << page
              << ", name = " << names[0]
              << ", birth = " << birth
              << ", death = " << death
              << ", loc " << location << endl;

        if (names[0].contains("Anonymous") ||
            names[0].contains("Traditional")) {
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(names[0]);
        for (int i = 1; i < names.size(); ++i) {
            composer->addAlias(names[i]);
        }

        // A year of 0 means parseNames() found no such date
        if (birth != 0) {
            Birth *e = new Birth(birth);
            if (approx) e->setApproximate(true);
            composer->setBirth(e);
        }

        if (death != 0) {
            Death *e = new Death(death);
            if (approx) e->setApproximate(true);
            composer->setDeath(e);
        }

        if (!location.isEmpty()) {
            composer->setNationality(locationToNationality(location));
            composer->setGeonameURIs(locationToGeonameURIs(location));
        }

        if (!page.isEmpty()) {
            Document *d = new Document;
            d->setUri(Uri("http://www.classicalarchives.com" + page));
            d->setTopic(composer);
            d->setSiteName("Classical Archives");
            composer->addPage(d);
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}

}