view import/ImportClassicalComposersOrg.cpp @ 18:c8ef23d3888c classical-rdf

* Update for new Dataquay::Uri in preference to QUrl
author Chris Cannam
date Mon, 22 Feb 2010 14:18:30 +0000
parents 719a4f477098
children c4cb65c436ef
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalComposersOrg.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Record the location to import from.  In this importer the import is
 * carried out straight away: the given URL is logged and then handed
 * directly to import().
 */
void
ClassicalComposersOrgImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;

    // There is no deferred loading step; setting the source triggers
    // the import immediately.
    this->import(source);
}

// Map from a candidate composer name to the score assigned to it by
// parseNames().  The importer treats the name with score 0 as the
// preferred one; names with higher scores become aliases.
typedef QMap<QString, int> NameMap;

/**
 * Expand a name field as written on classical-composers.org into the
 * individual name variants it stands for, and record them in names.
 *
 * The field is tested against a fixed sequence of patterns (brackets,
 * slashes, pipes, comma).  The first pattern that matches rewrites the
 * field into two alternatives and recurses on both: the first
 * alternative keeps the current score, the second gets score + 1.
 * When no alternation pattern matches, the field itself is stored
 * (with any remaining round brackets stripped); if it has the form
 * "X, Y" then "Y X" is additionally expanded with score + 1.
 *
 * \param field  the name text to expand
 * \param names  map receiving every variant found, keyed by name; if
 *               the same variant is reached more than once, its
 *               lowest score is kept
 * \param score  score to assign to the first alternative of this
 *               field; 0 for the top-level call
 */
void
parseNames(QString field, NameMap &names, int score = 0)
{
    QString a(field), b(field);

    int mp;
    QRegExp re;

    /* classical-composers.org uses quite a few (not always
     * consistent) ways to indicate alternatives in composer
     * names.  Not all of them are distinguishable.
     * Examples:
     *
     * Pipe used to separate sorted surname from alternative for whole:
     * Hardin | Moondog, Louis Thomas
     * -> "Louis Thomas Hardin", "Moondog"
     * Barron | Charlotte May Wind, Bebe
     * -> "Bebe Barron", "Charlotte May Wind"
     *
     * Pipe used to separate alternatives for surname only (seems
     * slightly more common than the previous one; if there is only
     * one word between the pipe and a following comma, I'd be
     * inclined to assume this case, Moondog notwithstanding):
     * Mendelssohn | Hensel, Fanny Cécile
     * -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
     * Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander
     * -> "Thomas Alexander Erskine, 6th Earl of Kellie",
     *    "Thomas Alexander Kelly"
     *
     * Round brackets used to indicate one or more alternatives for
     * prior word; slash for alternation:
     * Edelmann, Jean-Frédéric (Johann-Friedrich)
     * -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
     * Eberwein, Max (Traugott Maximilian)
     * -> "Max Eberwein", "Traugott Maximilian Eberwein"
     * Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
     * -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
     *    "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
     *    "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
     *    "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
     *
     * Round brackets used to indicate alternative to prior
     * names, with some meaning left implicit:
     * Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich)
     * -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
     *    perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
     *    Kaan-Albest")
     *
     * Round brackets used to augment rather than
     * alternate. Probably can't identify this reliably, though
     * round brackets used somewhere other than at end of line
     * are relatively likely to be this form (?):
     * Linley (the elder), Thomas
     * -> "Thomas Linley", "Thomas Linley the elder"
     * Keys | Keyes, Ivor (Christopher Banfield)
     * -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
     *    "Ivor Christopher Banfield Keyes"
     *
     * Square brackets used to indicate alternative for all
     * forenames:
     * Moller | Möller, John Christopher [Johann Christoph]
     * -> "John Christopher Moller", "John Christopher Möller",
     *    "Johann Christoph Moller", "Johann Christoph Möller"
     *
     * Complicated examples:
     * Mayr | Mayer, (Johann) Simon [Giovanni Simone]
     * -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
     *    "Johann Simon Mayer", "Giovanni Simone Mayr",
     *    "Giovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
     * Frauenlob | Heinrich von Meissen
     * -> "Heinrich Frauenlob", "Heinrich von Meissen", or
     *    perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
     */

//    DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;

    // The order of the tests below matters: only the first matching
    // pattern is applied at each level of recursion, and the
    // remaining constructs are handled by the recursive calls.

    // round brackets used for augmentation right at the start
    // (bracketed text followed by a space): with and without the text
    re = QRegExp("\\(([^\\)]+)\\) ");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, "");
        b.replace(mp, ml, QString("%1 ").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets used for augmentation directly after the comma
    re = QRegExp(", \\(([^\\)]+)\\)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, ",");
        b.replace(mp, ml, QString(", %1").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets used for augmentation directly before the comma
    re = QRegExp(" \\(([^\\)]+)\\),");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, ",");
        b.replace(mp, ml, QString(" %1,").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets for alternation of single name, anywhere
    re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // square brackets for alternation of a series of names, at end or after pipe
    re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString p(re.cap(1));
        QString c(re.cap(2));
        QString d(re.cap(3));
        a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
        b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // square brackets for alternation of a series of names, at start
    re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // slash for alternation of word
    re = QRegExp("([^ ,|]+)/([^ ,|]+)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of surname (a single word between the
    // last pipe and the comma)
    re = QRegExp("^(.*) \\| ([^|, ]+),");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c + ",");
        b.replace(mp, ml, d + ",");
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of whole (before comma): the text after
    // the pipe is taken as a complete name in its own right, so the
    // second alternative drops everything else in the field
    re = QRegExp("^(.*) \\| ([^|,]+),");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c + ",");
        b = d;
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of whole (at end)
    re = QRegExp("^(.*) \\| ([^|,]+)$");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // comma: "X, Y" also yields "Y X" as a less preferred variant
    re = QRegExp("^(.+), ([^,]+)$");
    if ((mp = re.indexIn(field)) >= 0) {
        QString c(re.cap(1));
        QString d(re.cap(2));
        parseNames(d + " " + c, names, score+1);
        // fall through to add
    }

    // drop any round brackets that none of the patterns consumed
    field.replace("(", "");
    field.replace(")", "");

    // The same variant can be reached through more than one
    // expansion path.  Keep the lowest score seen for it, so that a
    // later, less preferred path cannot overwrite a better score (in
    // particular the score of 0 that marks the preferred name).
    NameMap::const_iterator existing = names.constFind(field);
    if (existing == names.constEnd() || score < existing.value()) {
        names[field] = score;
    }
}

/**
 * Import composers from a saved HTML listing.
 *
 * The source URL is converted to a local file name and that file is
 * read in full as UTF-8.  Every list item matching the expected
 * "<li><a href=...>name</a> (dates) marker</li>" layout produces one
 * Composer, which is appended to m_objects.  For each item:
 *
 *  - the name text is expanded with parseNames(); the variant with
 *    score 0 becomes the composer's name and the rest become aliases
 *  - the link target, if any, becomes a Document page for the composer
 *  - the years in brackets become Birth and Death events
 *  - a non-space character between the closing bracket and "</li>"
 *    sets the gender to "female"
 *
 * \param source  URL of the listing; only its local file path is used
 * \throws std::exception if the file cannot be opened for reading
 */
void
ClassicalComposersOrgImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        // Say which file failed before throwing: the exception
        // itself carries no information
        DEBUG << "ClassicalComposersOrgImporter::import: Failed to open \""
              << filename << "\" for reading" << endl;
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // Discard everything up to and including the opening tag of the
    // main content div
    all.replace(QRegExp("^.*<div id=\"main\">"), "");

    // Captures: 1 = link target, 2 = name text, 3/4 = optional
    // <small> text (unused), 5 = optional star, 6 = first year,
    // 7 = text between the years, 8 = optional second year,
    // 9 = optional trailing marker character
    QRegExp matcher
        (QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));

    const QString daggerChar = QString::fromUtf8("\342\200\240");

    int pos = 0, count = 0;
    while ((pos = matcher.indexIn(all, pos)) != -1) {

        pos += matcher.matchedLength();
        ++count;

        QString page = matcher.cap(1);
        QString name = matcher.cap(2);
        QString star = matcher.cap(5);
        QString birth = matcher.cap(6);
        QString dagger = matcher.cap(7);
        QString death = matcher.cap(8);
        QString female = matcher.cap(9);

        DEBUG << "Item " << count
              << ": page = " << page
              << ", name = " << name
              << ", birth = " << birth
              << ", death = " << death
              << ", female = " << female;

        QString namefield = name.trimmed();
        NameMap names;

        if (namefield.contains("P.D.Q.")) { // lose this joke
            continue;
        }

        // parseNames always records at least one entry, even for an
        // empty field, so an entry without a name has to be caught
        // here rather than by testing the map afterwards
        if (namefield.isEmpty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        parseNames(namefield, names);

        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        // The variant with score 0 is the preferred name; failing
        // that, the first variant in key order is used
        int i = 0;
        QString preferred;
        for (NameMap::const_iterator ni = names.constBegin();
             ni != names.constEnd(); ++ni) {
            if (preferred == "" || ni.value() == 0) preferred = ni.key();
            DEBUG << "Name " << i << ": " << ni.key() << " score " << ni.value() << endl;
            ++i;
        }

        Composer *composer = new Composer();
        composer->setName(preferred);
        for (NameMap::const_iterator ni = names.constBegin();
             ni != names.constEnd(); ++ni) {
            if (ni.key() != preferred) composer->addAlias(ni.key());
        }

        if (page != "") {
            Document *d = new Document;
            d->setUri(Uri("http://www.classical-composers.org" + page));
            d->setTopic(composer);
            d->setSiteName("Classical Composers Database");
            composer->addPage(d);
        }

        if (birth != "" && death == "") {
            // Only one year was given, so the star and dagger decide
            // whether it is a birth or a death year.  Exactly one of
            // the cases below applies, and exactly one message is
            // logged for it.
            if (star != "" && dagger != "") {
                DEBUG << "Date range features both star and dagger -- ignoring" << endl;
                birth = "";
            } else if (star != "") {
                // star only: the year is a birth year, keep it as it is
            } else if (dagger == "") {
                DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl;
                birth = "";
            } else if (dagger != daggerChar) {
                DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
                birth = "";
            } else {
                DEBUG << "dagger found: setting death to " << birth << endl;
                death = birth;
                birth = "";
            }
        }

        if (birth != "") {
            Birth *e = new Birth(birth.toInt());
            composer->setBirth(e);
        }
        if (death != "") {
            composer->setDeath(new Death(death.toInt()));
        }
        if (female != "") {
            composer->setGender("female");
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}


}