// Source: import/ImportClassicalArchives.cpp @ 45:0033259c6772
//
// Commit message: More Music Ontology-like structure for audio files & signals
// Author: Chris Cannam
// Date: Fri, 14 May 2010 17:58:04 +0100
// Parents: c8ef23d3888c
// Children: (none listed)
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalArchives.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source to import from and run the import immediately.
 *
 * The URL is written to the debug stream and then passed straight to
 * import(); nothing is caught here, so anything import() throws
 * reaches the caller.
 */
void
ClassicalArchivesImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
    import(source);
}

// Lookup table for the location codes that follow the "; " in a
// composer name field.  It is a flat array that the lookup functions
// read in rows of four strings:
//   [0] location code, compared against each '/'-separated part of
//       the location string
//   [1] nationality adjective (returned by locationToNationality)
//   [2] place name (not read by any code visible in this file)
//   [3] numeric ID appended to http://sws.geonames.org/ by
//       locationToGeonameURIs
// Every row must have exactly four entries: the lookup functions
// compute the number of rows as (array length / 4).
static const char *locmap[] = {
    "ARG", "Argentinian", "Argentina", "3865483",
    "ARM", "Armenian", "Armenia", "174982",
    "AUS", "Australian", "Australia", "2077456",
    "AUT", "Austrian", "Austria", "2782113",
    "AZE", "Azeri", "Azerbaijan", "587116",
    "BEL", "Belgian", "Belgium", "2802361",
    "BGR", "Bulgarian", "Bulgaria", "732800",
    "BLR", "Belarusian", "Belarus", "630336",
    "BOH", "Bohemian", "Bohemia", "3074194",
    "BRA", "Brazilian", "Brazil", "3469058",
    "BSQ", "Basque", "Basque country", "3104499",
    "CAN", "Canadian", "Canada", "6251999",
    "CHE", "Swiss", "Switzerland", "2658434",
    "CHL", "Chilean", "Chile", "3895114",
    "CHN", "Chinese", "China", "1814991",
    "CRI", "Costa Rican", "Costa Rica", "3624060",
    "CTN", "Catalonian", "Catalonia", "3108286",
    "CUB", "Cuban", "Cuba", "3562981",
    "CZE", "Czech", "Czech Republic", "3077311",
    "DEU", "German", "Germany", "2921044",
    "DNK", "Danish", "Denmark", "2623032",
    "ECU", "Ecuadorian", "Ecuador", "3658394",
    "EGY", "Egyptian", "Egypt", "357994",
    "ENG", "English", "England", "2635167",
    // NOTE(review): it is unclear what "EPR" stands for; this row
    // currently carries the same values as "DEU" -- confirm.
    "EPR", "German", "Germany", "2921044", // pardon?
    "ESP", "Spanish", "Spain", "2510769",
    "EST", "Estonian", "Estonia", "453733",
    "ETH", "Ethiopian", "Ethiopia", "337996",
    "FIN", "Finnish", "Finland", "660013",
    "FLM", "Flemish", "Flanders", "3337388",
    "FRA", "French", "France", "3017382",
    "GBR", "British", "Britain", "4839292",
    "GEO", "Georgian", "Georgia", "614540",
    "GRC", "Greek", "Greece", "390903",
    "GTM", "Guatemalan", "Guatemala", "3595528",
    "HKG", "Hong Kong Chinese", "Hong Kong", "1819729",
    // "HOL" and "NLD" (below) share the same adjective and ID
    "HOL", "Dutch", "Holland", "2750405",
    "HRV", "Croatian", "Croatia", "3202326",
    "HUN", "Hungarian", "Hungary", "719819",
    "IND", "Indian", "India", "1269750",
    "IRL", "Irish", "Ireland", "2963597",
    "IRN", "Iranian", "Iran", "130758",
    "ISL", "Icelandic", "Iceland", "2629691",
    "ISR", "Israeli", "Israel", "294640",
    "ITA", "Italian", "Italy", "3175395",
    "JPN", "Japanese", "Japan", "1861060",
    "KAZ", "Kazakh", "Kazakhstan", "1522867",
    "KOR", "Korean", "Korea", "1835841",
    "LBN", "Lebanese", "Lebanon", "272103",
    "LTU", "Lithuanian", "Lithuania", "597427",
    "LVA", "Latvian", "Latvia", "458258",
    "MAR", "Moroccan", "Morocco", "2542007",
    "MEX", "Mexican", "Mexico", "3996063",
    "MKD", "Macedonian", "Macedonia", "718075",
    "MOR", "Moravian", "Moravia", "3078610",
    "MYS", "Malaysian", "Malaysia", "1733045",
    // "NAI" shares its place name and ID with "USA" (below)
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch", "Netherlands", "2750405",
    "NOR", "Norwegian", "Norway", "3144096",
    "NZL", "New Zealander", "New Zealand", "2186224",
    "PER", "Peruvian", "Peru", "3932488",
    "PHL", "Filipino", "Philippines", "1694008",
    "POL", "Polish", "Poland", "798544",
    "PRT", "Portuguese", "Portugal", "2264397",
    "PRU", "Prussian", "Prussia", "772636",
    "PRY", "Paraguayan", "Paraguay", "3437598",
    "ROU", "Romanian", "Romania", "798549",
    "RUS", "Russian", "Russia", "2017370",
    "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468",
    "SCO", "Scottish", "Scotland", "2638360",
    "SGP", "Singaporean", "Singapore", "1880251",
    "SVK", "Slovakian", "Slovakia", "3057568",
    "SVN", "Slovenian", "Slovenia", "3190538",
    "SWE", "Swedish", "Sweden", "2661886",
    "TKM", "Turkmen", "Turkmenistan", "1218197",
    "TSL", "Transylvanian", "Transylvania", "4495544",
    "TSM", "Tasmanian", "Tasmania", "2147291",
    "TUR", "Turkish", "Turkey", "298795",
    "UKR", "Ukrainian", "Ukraine", "690791",
    "URY", "Uruguayan", "Uruguay", "3439705",
    "USA", "American", "United States of America", "6252001",
    "VEN", "Venezuelan", "Venezuela", "3625428",
    "VNM", "Vietnamese", "Vietnam", "1562822",
    // NOTE(review): "WLS" looks like it may have been meant for
    // Wales/Welsh rather than Samoa -- verify against the source
    // data and the geonames ID before relying on this row.
    "WLS", "Samoan", "Samoa", "4034894",
    "ZAF", "South African", "South Africa", "953987",
};

/**
 * Translate a location string into nationality adjectives.
 *
 * The string is split on '/' and each part is compared against the
 * code column of locmap.  The adjective from every matching row is
 * added to the result; parts with no matching row contribute
 * nothing, so the returned set may be empty.
 */
QSet<QString>
locationToNationality(QString location)
{
    const int cols = 4;
    const size_t rows = (sizeof(locmap) / sizeof(locmap[0])) / cols;

    QSet<QString> nationalities;
    const QStringList codes = location.split('/');

    for (QStringList::const_iterator ci = codes.begin();
         ci != codes.end(); ++ci) {
        for (size_t row = 0; row < rows; ++row) {
            // entry[0] is the code, entry[1] the adjective
            const char *const *entry = locmap + row * cols;
            if (*ci == entry[0]) {
                nationalities.insert(entry[1]);
            }
        }
    }

    return nationalities;
}

/**
 * Translate a location string into geonames URIs.
 *
 * The string is split on '/' and each part is compared against the
 * code column of locmap.  For every matching row a URI of the form
 * http://sws.geonames.org/<id>/ is added to the result, where <id>
 * is the fourth column of that row.  Parts with no matching row
 * contribute nothing, so the returned set may be empty.
 */
QSet<Uri>
locationToGeonameURIs(QString location)
{
    const int cols = 4;
    const size_t rows = (sizeof(locmap) / sizeof(locmap[0])) / cols;
    const QString prefix("http://sws.geonames.org/");

    QSet<Uri> uris;
    const QStringList codes = location.split('/');

    for (QStringList::const_iterator ci = codes.begin();
         ci != codes.end(); ++ci) {
        for (size_t row = 0; row < rows; ++row) {
            // entry[0] is the code, entry[3] the geonames ID
            const char *const *entry = locmap + row * cols;
            if (*ci == entry[0]) {
                uris.insert(Uri(prefix + entry[3] + "/"));
            }
        }
    }

    return uris;
}

/**
 * Pick apart the text of one composer entry.
 *
 * Working from the end of the field, this removes and interprets a
 * trailing "; <location>" part, then a trailing parenthesised date
 * part, and treats whatever is left as the name.
 *
 * @param field     Entry text; any <...> tags in it are discarded.
 * @param names     Receives the cleaned name in its "A, B" form and,
 *                  when it contains exactly one comma, the reordered
 *                  "B A" form after it.  Nothing is added if the name
 *                  is rejected (no comma, or a '.' somewhere after a
 *                  comma).
 * @param birth     Assigned the birth year if one is found.
 * @param death     Assigned the death year if one is found.
 * @param approx    Assigned true if the date part starts with "c.".
 * @param location  Assigned the text following "; " if present.
 *
 * The output arguments are only assigned when the corresponding item
 * is found, so callers should initialise them first.  Note that the
 * date and location outputs may be assigned even when the name is
 * subsequently rejected.
 */
void
parseNames(QString field, QStringList &names, int &birth, int &death,
           bool &approx, QString &location)
{
    // Drop anything enclosed in angle brackets
    field.replace(QRegExp("<[^>]*>"), "");

    // Everything after the first "; " is the location
    QRegExp locationRe("; (.*)$");
    int at = locationRe.indexIn(field);
    if (at >= 0) {
        location = locationRe.cap(1);
        field.replace(at, locationRe.matchedLength(), "");
    }

    // A parenthesised group at the end of what remains holds the dates
    QRegExp dateRe("\\(([^\\)]+)\\) *$");
    at = dateRe.indexIn(field);
    if (at >= 0) {
        QString dates = dateRe.cap(1);
        if (dates.startsWith("c.")) {
            approx = true;
            dates.replace("c.", "");
            dates = dates.trimmed();
        }

        // Only interpret the group if it contains a four-digit run
        if (QRegExp("\\d{4}").indexIn(dates) >= 0) {
            QStringList parts = dates.split("-");
            if (!parts.empty()) {
                QString first = parts[0];
                QString second = (parts.size() > 1 ? parts[1] : QString());
                const bool haveSecond = !second.isEmpty();

                if (first.startsWith("b")) {
                    // "b..." prefix: a birth year
                    first.replace(QRegExp("b[^0-9]*"), "");
                    birth = first.toInt();
                } else if (first.startsWith("d")) {
                    // "d..." prefix: a death year
                    first.replace(QRegExp("d[^0-9]*"), "");
                    death = first.toInt();
                } else if (haveSecond) {
                    // plain "birth-death" range
                    birth = first.toInt();
                }

                if (haveSecond) {
                    death = second.toInt();
                }
            }
        }

        // The group is removed whether or not it was interpreted
        field.replace(at, dateRe.matchedLength(), "");
    }

    // Slash alternatives are not handled properly: cut each one out
    field = field.replace(QRegExp("/[^/,]*"), "");

    // Likewise anything in square brackets...
    field.replace(QRegExp("\\[[^\\]]*\\]"), "");

    // ...and any remaining parenthesised text
    field.replace(QRegExp("\\([^\\)]*\\)"), "");

    // Collapse runs of spaces
    field.replace(QRegExp(" +"), " ");

    // Be picky: names with a '.' after the comma (initials) cannot be
    // matched properly, and single-word names (no comma at all) from
    // this particular source are treated as suspect
    if (QRegExp(",.*\\.").indexIn(field) >= 0 || !field.contains(",")) {
        return;
    }

    field.replace(QRegExp(" +,"), ",");
    field = field.trimmed();
    names.append(field);

    // With exactly one comma, also offer the reordered "B A" form
    QRegExp commaRe("^([^,]+), *([^,]+)$");
    if (commaRe.indexIn(field) >= 0) {
        const QString beforeComma(commaRe.cap(1));
        const QString afterComma(commaRe.cap(2));
        names.append((afterComma + " " + beforeComma).trimmed());
    }
}

/**
 * Read a saved composer listing page and append a Composer to
 * m_objects for each usable entry found in it.
 *
 * The file is read as UTF-8 and scanned for links of the form
 * <a href="/composer/..."><div ...>TEXT</div></a>.  TEXT is handed
 * to parseNames(); entries that yield no name, or whose first name
 * contains "Anonymous" or "Traditional", are skipped.  Each remaining
 * entry becomes a Composer with aliases, optional birth/death events,
 * nationality and geonames URIs derived from the location string,
 * and a Document pointing at the composer's page on
 * http://www.classicalarchives.com.
 *
 * @param source  URL of the listing; it is converted with
 *                toLocalFile(), so only local files work at present.
 *
 * @throws std::runtime_error (a std::exception) naming the file and
 *         the reason if it cannot be opened for reading.
 */
void
ClassicalArchivesImporter::import(QUrl source)
{
    //!!! for now, only local files are handled
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        // Report which file failed and why.  std::runtime_error
        // derives from std::exception, so handlers written for the
        // previous bare std::exception still catch this.
        const QString message =
            QString("ClassicalArchivesImporter::import: failed to open \"%1\": %2")
            .arg(filename, file.errorString());
        throw std::runtime_error(message.toLocal8Bit().constData());
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // cap(1) is the site-relative page path, cap(2) the entry text
    QRegExp matcher
        ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");

    DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl;

    int pos = 0, count = 0;
    while ((pos = matcher.indexIn(all, pos)) != -1) {
        // Resume the next search after this match
        pos += matcher.matchedLength();
        ++count;

        const QString page = matcher.cap(1);
        QString namefield = matcher.cap(2);
        QStringList names;

        // parseNames only assigns what it finds, so start from
        // "nothing known"
        int birth = 0, death = 0;
        bool approx = false;
        QString location;

        parseNames(namefield, names, birth, death, approx, location);

        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        DEBUG << "Item " << count
              << ": page = " << page
              << ", name = " << names[0]
              << ", birth = " << birth << ", death = " << death
              << ", loc " << location << endl;

        if (names[0].contains("Anonymous") ||
            names[0].contains("Traditional")) {
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(names[0]);
        for (int i = 1; i < names.size(); ++i) {
            composer->addAlias(names[i]);
        }

        // A year of 0 means parseNames found no such date.  The
        // single approx flag is applied to both events.
        if (birth != 0) {
            Birth *e = new Birth(birth);
            if (approx) e->setApproximate(true);
            composer->setBirth(e);
        }

        if (death != 0) {
            Death *e = new Death(death);
            if (approx) e->setApproximate(true);
            composer->setDeath(e);
        }

        if (location != "") {
            composer->setNationality(locationToNationality(location));
            composer->setGeonameURIs(locationToGeonameURIs(location));
        }

        if (page != "") {
            Document *d = new Document;
            d->setUri(Uri("http://www.classicalarchives.com" + page));
            d->setTopic(composer);
            d->setSiteName("Classical Archives");
            composer->addPage(d);
        }

        m_objects.push_back(composer);
    }

    // count includes entries that were matched but then skipped
    DEBUG << "Found " << count << " things" << endl;
}


}