view import/ImportClassicalDotNet.cpp @ 20:c4cb65c436ef classical-rdf

* Simple query utility
author Chris Cannam
date Tue, 23 Feb 2010 16:37:49 +0000
parents c8ef23d3888c
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalDotNet.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the location to import from.  The source is logged and then
 * handed straight to import(), so the import runs as part of this
 * call.
 */
void
ClassicalDotNetImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;

    // No state is kept here: importing happens immediately
    this->import(source);
}

/**
 * Convert a raw name field taken from the page markup into one or
 * more names, appending them to \a names.
 *
 * The field is processed as follows:
 *
 *  - a fixed set of named HTML entities, and any numeric character
 *    reference (decimal "&#NNN;" or hexadecimal "&#xHHH;") for a
 *    non-surrogate code point up to U+FFFF, is replaced by the
 *    character it stands for; anything still resembling an entity
 *    afterwards is reported through DEBUG and left in place
 *
 *  - the first run of text that looks like it is written in capitals
 *    is converted so that only the first letter of each
 *    space-separated word stays in upper case
 *
 *  - the trimmed result is appended to \a names
 *
 *  - if the result has the form "A, B" (exactly one comma followed
 *    by a space), "B A" is appended as well
 *
 * \param field  the raw text; taken by value and edited locally
 * \param names  list that receives the resulting name(s); always
 *               grows by at least one entry
 */
void
parseNames(QString field, QStringList &names)
{
    // Named entities.  Matching is case-sensitive, so e.g. &Eacute;
    // and &eacute; are distinct.
    field.replace("&aacute;", QChar(0x00E1));
    field.replace("&Aacute;", QChar(0x00C1));
    field.replace("&ccedil;", QChar(0x00E7));
    field.replace("&eacute;", QChar(0x00E9));
    field.replace("&Eacute;", QChar(0x00C9));
    field.replace("&Egrave;", QChar(0x00C8));
    field.replace("&Euml;", QChar(0x00CB));
    field.replace("&iacute;", QChar(0x00ED));
    field.replace("&Iuml;", QChar(0x00CF));
    field.replace("&Ntilde;", QChar(0x00D1));
    field.replace("&Oacute;", QChar(0x00D3));
    field.replace("&Ocirc;", QChar(0x00D4));
    field.replace("&ograve;", QChar(0x00F2));
    field.replace("&ouml;", QChar(0x00F6));
    field.replace("&Yuml;", QChar(0x0178));

    // Numeric character references.  This covers &#196; (U+00C4),
    // &#322; (U+0142) and &#344; (U+0158), which used to be listed
    // individually, along with any other code point that fits in a
    // single QChar.  It runs after the named entities so that a
    // decoded '&' cannot combine with following text into something
    // the replacements above would then decode a second time.
    QRegExp numeric("&#([xX]?)([0-9A-Fa-f]+);");
    int at = 0;
    while ((at = numeric.indexIn(field, at)) >= 0) {
        const int base = numeric.cap(1).isEmpty() ? 10 : 16;
        bool ok = false;
        const uint code = numeric.cap(2).toUInt(&ok, base);
        const bool surrogate = (code >= 0xD800 && code <= 0xDFFF);
        if (ok && code > 0 && code <= 0xFFFF && !surrogate) {
            field.replace(at, numeric.matchedLength(),
                          QChar(static_cast<ushort>(code)));
            ++at; // carry on after the single character just written
        } else {
            // Not representable as one QChar (or not a valid number
            // in this base): leave it for the warning below
            at += numeric.matchedLength();
        }
    }

    if (field.contains(QRegExp("&[^ ]+;"))) {
        DEBUG << "Failed to handle entity in " << field << endl;
    }

    // all-caps -> titlecase
    //
    // The pattern looks for a word containing at least two ASCII
    // capitals and extends the match up to the next comma (or the end
    // of the field), so any following words are covered as well.
    //
    // NOTE(review): a mixed-case name with two capitals in one word,
    // such as "McDowell", also matches and comes out as "Mcdowell";
    // only the first match in the field is processed, and letters
    // after a hyphen or apostrophe are lower-cased.  Confirm against
    // the source data whether any of these matter.
    QRegExp capsRun("[A-Z][^ ,]*[A-Z][^,]+");
    const int start = capsRun.indexIn(field);
    if (start >= 0) {
        const int length = capsRun.matchedLength();
        bool initial = true; // next character is the first of a word
        for (int i = 0; i < length; ++i) {
            if (initial) {
                // leave the leading character of each word untouched
                initial = false;
                continue;
            }
            if (field[start + i].isUpper()) {
                field[start + i] = field[start + i].toLower();
            } else if (field[start + i].isSpace()) {
                initial = true;
            }
        }
    }

    field = field.trimmed();
    names.push_back(field);

    // comma: "Surname, Forenames" also yields "Forenames Surname"
    QRegExp inverted("^([^,]+), ([^,]+)$");
    if (inverted.indexIn(field) >= 0) {
        names.push_back(inverted.cap(2) + " " + inverted.cap(1));
    }
}

/**
 * Read the HTML file named by \a source and create one Composer for
 * each list-item link ("<li><a href=...>name</a></li>") found after
 * the <div id="center"> marker.  Each Composer is given the names
 * produced by parseNames (the first as its name, the rest as
 * aliases) and a Document page whose URI is built from the link
 * target, and is then appended to m_objects.
 *
 * Entries whose primary name contains " Collections" are skipped.
 *
 * \param source  location of the page; at present it is converted
 *                with QUrl::toLocalFile(), so only local files work
 *
 * \throws std::runtime_error (a std::exception) if the file cannot
 *         be opened for reading
 */
void
ClassicalDotNetImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        // std::runtime_error derives from std::exception, so existing
        // handlers still catch this, but the message now says which
        // file could not be opened
        throw std::runtime_error
            (std::string("ClassicalDotNetImporter::import: failed to open \"") +
             filename.toLocal8Bit().constData() + "\" for reading");
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // Drop everything up to and including the content marker
    all.replace(QRegExp("^.*<div id=\"center\">"), "");

    // cap(1) is the link target, cap(2) the link text
    QRegExp matcher
        ("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");

    // Prefix of relative link targets; built once rather than once
    // per item
    QRegExp relativePrefix("^\\.\\./");

    int pos = 0, count = 0;
    while ((pos = matcher.indexIn(all, pos)) != -1) {
        pos += matcher.matchedLength();
        ++count; // counts every matched item, including skipped ones

        DEBUG << "Item " << count
              << ": page = " << matcher.cap(1)
              << ", name = " << matcher.cap(2) << endl;

        QString namefield = matcher.cap(2);
        QStringList names;

        parseNames(namefield, names);
        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        if (names[0].contains(" Collections")) {
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(names[0]);
        for (int i = 1; i < names.size(); ++i) {
            composer->addAlias(names[i]);
        }

        if (matcher.cap(1) != "") {
            QString url = matcher.cap(1);
            // Turn a "../" relative target into a site-absolute path
            url.replace(relativePrefix, "/music/");
            Document *d = new Document;
            d->setUri(Uri("http://www.classical.net" + url));
            d->setTopic(composer);
            d->setSiteName("Classical Net");
            composer->addPage(d);
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}


}