Mercurial > hg > classical
view import/ImportClassicalArchives.cpp @ 5:d23a4c935a22 classical-rdf
* Update CMN and mbz mappings for new classical archives import
author | Chris Cannam |
---|---|
date | Fri, 11 Dec 2009 16:10:29 +0000 |
parents | 719a4f477098 |
children | c8ef23d3888c |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalArchives.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Log the given source URL and import from it straight away.
 */
void
ClassicalArchivesImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
    import(source);
}

/*
 * Flat lookup table, four strings per row:
 *
 *   [0] location code as it appears in the scraped name field
 *   [1] nationality adjective (returned by locationToNationality)
 *   [2] place name (not read by any code in this file)
 *   [3] numeric id appended to http://sws.geonames.org/ by
 *       locationToGeonameURIs
 */
static const char *locmap[] = {
    "ARG", "Argentinian", "Argentina", "3865483",
    "ARM", "Armenian", "Armenia", "174982",
    "AUS", "Australian", "Australia", "2077456",
    "AUT", "Austrian", "Austria", "2782113",
    "AZE", "Azeri", "Azerbaijan", "587116",
    "BEL", "Belgian", "Belgium", "2802361",
    "BGR", "Bulgarian", "Bulgaria", "732800",
    "BLR", "Belarusian", "Belarus", "630336",
    "BOH", "Bohemian", "Bohemia", "3074194",
    "BRA", "Brazilian", "Brazil", "3469058",
    "BSQ", "Basque", "Basque country", "3104499",
    "CAN", "Canadian", "Canada", "6251999",
    "CHE", "Swiss", "Switzerland", "2658434",
    "CHL", "Chilean", "Chile", "3895114",
    "CHN", "Chinese", "China", "1814991",
    "CRI", "Costa Rican", "Costa Rica", "3624060",
    "CTN", "Catalonian", "Catalonia", "3108286",
    "CUB", "Cuban", "Cuba", "3562981",
    "CZE", "Czech", "Czech Republic", "3077311",
    "DEU", "German", "Germany", "2921044",
    "DNK", "Danish", "Denmark", "2623032",
    "ECU", "Ecuadorian", "Ecuador", "3658394",
    "EGY", "Egyptian", "Egypt", "357994",
    "ENG", "English", "England", "2635167",
    "EPR", "German", "Germany", "2921044", // pardon?
    "ESP", "Spanish", "Spain", "2510769",
    "EST", "Estonian", "Estonia", "453733",
    "ETH", "Ethiopian", "Ethiopia", "337996",
    "FIN", "Finnish", "Finland", "660013",
    "FLM", "Flemish", "Flanders", "3337388",
    "FRA", "French", "France", "3017382",
    "GBR", "British", "Britain", "4839292",
    "GEO", "Georgian", "Georgia", "614540",
    "GRC", "Greek", "Greece", "390903",
    "GTM", "Guatemalan", "Guatemala", "3595528",
    "HKG", "Hong Kong Chinese", "Hong Kong", "1819729",
    "HOL", "Dutch", "Holland", "2750405",
    "HRV", "Croatian", "Croatia", "3202326",
    "HUN", "Hungarian", "Hungary", "719819",
    "IND", "Indian", "India", "1269750",
    "IRL", "Irish", "Ireland", "2963597",
    "IRN", "Iranian", "Iran", "130758",
    "ISL", "Icelandic", "Iceland", "2629691",
    "ISR", "Israeli", "Israel", "294640",
    "ITA", "Italian", "Italy", "3175395",
    "JPN", "Japanese", "Japan", "1861060",
    "KAZ", "Kazakh", "Kazakhstan", "1522867",
    "KOR", "Korean", "Korea", "1835841",
    "LBN", "Lebanese", "Lebanon", "272103",
    "LTU", "Lithuanian", "Lithuania", "597427",
    "LVA", "Latvian", "Latvia", "458258",
    "MAR", "Moroccan", "Morocco", "2542007",
    "MEX", "Mexican", "Mexico", "3996063",
    "MKD", "Macedonian", "Macedonia", "718075",
    "MOR", "Moravian", "Moravia", "3078610",
    "MYS", "Malaysian", "Malaysia", "1733045",
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch", "Netherlands", "2750405",
    "NOR", "Norwegian", "Norway", "3144096",
    "NZL", "New Zealander", "New Zealand", "2186224",
    "PER", "Peruvian", "Peru", "3932488",
    "PHL", "Filipino", "Philippines", "1694008",
    "POL", "Polish", "Poland", "798544",
    "PRT", "Portuguese", "Portugal", "2264397",
    "PRU", "Prussian", "Prussia", "772636",
    "PRY", "Paraguayan", "Paraguay", "3437598",
    "ROU", "Romanian", "Romania", "798549",
    "RUS", "Russian", "Russia", "2017370",
    "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468",
    "SCO", "Scottish", "Scotland", "2638360",
    "SGP", "Singaporean", "Singapore", "1880251",
    "SVK", "Slovakian", "Slovakia", "3057568",
    "SVN", "Slovenian", "Slovenia", "3190538",
    "SWE", "Swedish", "Sweden", "2661886",
    "TKM", "Turkmen", "Turkmenistan", "1218197",
    "TSL", "Transylvanian", "Transylvania", "4495544",
    "TSM", "Tasmanian", "Tasmania", "2147291",
    "TUR", "Turkish", "Turkey", "298795",
    "UKR", "Ukrainian", "Ukraine", "690791",
    "URY", "Uruguayan", "Uruguay", "3439705",
    "USA", "American", "United States of America", "6252001",
    "VEN", "Venezuelan", "Venezuela", "3625428",
    "VNM", "Vietnamese", "Vietnam", "1562822",
    // NOTE(review): "WLS" is mapped to Samoa here; confirm the source
    // does not use this code for Wales instead.
    "WLS", "Samoan", "Samoa", "4034894",
    "ZAF", "South African", "South Africa", "953987",
};

/**
 * Translate a '/'-separated list of location codes into the set of
 * matching nationality adjectives from locmap.  Each code must match
 * a table code exactly; codes not present in the table contribute
 * nothing to the result.
 */
QSet<QString>
locationToNationality(QString location)
{
    const size_t cols = 4;
    const size_t rows = (sizeof(locmap)/sizeof(locmap[0])) / cols;

    QSet<QString> nationalities;

    const QStringList codes = location.split('/');
    foreach (const QString &code, codes) {
        for (size_t r = 0; r < rows; ++r) {
            const char *const *row = locmap + r * cols;
            if (code == row[0]) {
                nationalities.insert(row[1]);
            }
        }
    }

    return nationalities;
}

/**
 * Translate a '/'-separated list of location codes into the set of
 * http://sws.geonames.org/<id>/ URLs built from the id column of
 * locmap.  Codes not present in the table contribute nothing to the
 * result.
 */
QSet<QUrl>
locationToGeonameURIs(QString location)
{
    const size_t cols = 4;
    const size_t rows = (sizeof(locmap)/sizeof(locmap[0])) / cols;

    QSet<QUrl> uris;

    const QStringList codes = location.split('/');
    foreach (const QString &code, codes) {
        for (size_t r = 0; r < rows; ++r) {
            const char *const *row = locmap + r * cols;
            if (code == row[0]) {
                const QString uri =
                    QString("http://sws.geonames.org/") + row[3] + "/";
                uris.insert(QUrl(uri));
            }
        }
    }

    return uris;
}

/*
 * Interpret the text found inside the trailing parentheses of a name
 * field (e.g. "1685-1750", "c.1400-1474", "b.1950", "d.1521").
 *
 * A leading "c." sets approx and is stripped.  Nothing further is
 * done unless the text contains a run of four digits.  The text is
 * then split on '-': a first part starting with 'b' sets birth, one
 * starting with 'd' sets death, and otherwise the first part is taken
 * as birth only when a non-empty second part exists.  A non-empty
 * second part always sets death.  Outputs that are not set are left
 * untouched.
 */
static void
parseLifespan(QString contents, int &birth, int &death, bool &approx)
{
    if (contents.startsWith("c.")) {
        approx = true;
        contents = contents.replace("c.", "").trimmed();
    }

    if (QRegExp("\\d{4}").indexIn(contents) < 0) {
        return;
    }

    const QStringList parts = contents.split("-");
    if (parts.empty()) {
        return;
    }

    QString first = parts[0];
    const QString second = (parts.size() > 1 ? parts[1] : QString());
    const bool haveSecond = !second.isEmpty();

    if (first.startsWith("b")) {
        first.replace(QRegExp("b[^0-9]*"), "");
        birth = first.toInt();
    } else if (first.startsWith("d")) {
        first.replace(QRegExp("d[^0-9]*"), "");
        death = first.toInt();
    } else if (haveSecond) {
        birth = first.toInt();
    }

    if (haveSecond) {
        death = second.toInt();
    }
}

/**
 * Pick apart one scraped composer name field.
 *
 * Markup tags are removed first.  Text following "; " is stored in
 * location, and a trailing parenthesised group is interpreted as
 * dates (see parseLifespan) and may set birth, death and approx.
 * These outputs are written only when the corresponding text is
 * found, so the caller must initialise them beforehand.
 *
 * What remains is cleaned of slash alternatives, bracketed and
 * parenthesised text.  If it then contains a '.' anywhere after a
 * comma (taken to indicate initials), or contains no comma at all,
 * nothing is added to names.  Otherwise the cleaned field is appended
 * to names, followed by a "Forename Surname" rearrangement when the
 * field has the form "Surname, Forename".
 */
void
parseNames(QString field, QStringList &names,
           int &birth, int &death, bool &approx, QString &location)
{
    field.replace(QRegExp("<[^>]*>"), "");

    QRegExp locre("; (.*)$");
    const int locAt = locre.indexIn(field);
    if (locAt >= 0) {
        location = locre.cap(1);
        field.replace(locAt, locre.matchedLength(), "");
    }

    QRegExp datere("\\(([^\\)]+)\\) *$");
    const int dateAt = datere.indexIn(field);
    if (dateAt >= 0) {
        parseLifespan(datere.cap(1), birth, death, approx);
        field.replace(dateAt, datere.matchedLength(), "");
    }

    // we don't properly handle their slash alternatives syntax
    field = field.replace(QRegExp("/[^/,]*"), "");

    // nor these
    field.replace(QRegExp("\\[[^\\]]*\\]"), "");

    // nor these
    field.replace(QRegExp("\\([^\\)]*\\)"), "");

    field.replace(QRegExp(" +"), " ");

    // and let's be picky -- we don't like names with just initials,
    // can't properly match them
    if (QRegExp(",.*\\.").indexIn(field) >= 0) {
        return;
    }

    // and, from this particular source, I'm suspicious of single-word
    // names (sorry)
    if (!field.contains(",")) {
        return;
    }

    field.replace(QRegExp(" +,"), ",");
    field = field.trimmed();
    names.push_back(field);

    // "Surname, Forename" -> also offer "Forename Surname"
    QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
    if (commare.indexIn(field) >= 0) {
        const QString surname(commare.cap(1));
        const QString forename(commare.cap(2));
        names.push_back(QString(forename + " " + surname).trimmed());
    }
}

/**
 * Read the local file named by source as UTF-8 text and create a
 * Composer for each composer link matched in it, appending each one
 * to m_objects.
 *
 * Entries for which parseNames yields no name, or whose primary name
 * contains "Anonymous" or "Traditional", are skipped.  Note that the
 * final "Found N things" count is the number of links matched,
 * including skipped ones.
 *
 * Throws std::exception if the file cannot be opened.
 */
void
ClassicalArchivesImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    QRegExp matcher
        ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");

    DEBUG << "ClassicalArchivesImporter: Have " << all.length()
          << " chars" << endl;

    int offset = 0;
    int count = 0;

    for (;;) {

        offset = matcher.indexIn(all, offset);
        if (offset == -1) {
            break;
        }
        offset += matcher.matchedLength();
        ++count;

        // Captures stay valid until the next indexIn() on matcher
        const QString page = matcher.cap(1);
        const QString namefield = matcher.cap(2);

        QStringList names;
        int birth = 0;
        int death = 0;
        bool approx = false;
        QString location;

        parseNames(namefield, names, birth, death, approx, location);

        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        const QString primary = names.at(0);

        DEBUG << "Item " << count
              << ": page = " << page
              << ", name = " << primary
              << ", birth = " << birth
              << ", death = " << death
              << ", loc " << location << endl;

        if (primary.contains("Anonymous") ||
            primary.contains("Traditional")) {
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(primary);

        // Any further names are recorded as aliases
        for (int i = 1; i < names.size(); ++i) {
            composer->addAlias(names.at(i));
        }

        // A year of 0 means parseNames found no such date; the single
        // approx flag applies to both events
        if (birth != 0) {
            Birth *b = new Birth(birth);
            if (approx) b->setApproximate(true);
            composer->setBirth(b);
        }

        if (death != 0) {
            Death *d = new Death(death);
            if (approx) d->setApproximate(true);
            composer->setDeath(d);
        }

        if (!location.isEmpty()) {
            composer->setNationality(locationToNationality(location));
            composer->setGeonameURIs(locationToGeonameURIs(location));
        }

        if (!page.isEmpty()) {
            Document *doc = new Document;
            doc->setUri(QUrl("http://www.classicalarchives.com" + page));
            doc->setTopic(composer);
            doc->setSiteName("Classical Archives");
            composer->addPage(doc);
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}

}