Mercurial > hg > classical
view import/ImportClassicalDotNet.cpp @ 20:c4cb65c436ef classical-rdf
* Simple query utility
author | Chris Cannam |
---|---|
date | Tue, 23 Feb 2010 16:37:49 +0000 |
parents | c8ef23d3888c |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalDotNet.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source URL and immediately run the import on it.
 */
void
ClassicalDotNetImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;
    import(source);
}

/* One named HTML character reference and the UTF-16 code unit it
 * stands for. */
struct EntityMapping {
    const char *entity;
    unsigned short code;
};

/* The named entities that parseNames knows how to decode.  Anything
 * not listed here is reported (but left untouched) by parseNames. */
static const EntityMapping entityTable[] = {
    { "&Auml;",   0x00C4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS
    { "&lstrok;", 0x0142 }, // LATIN SMALL LETTER L WITH STROKE
    { "&Rcaron;", 0x0158 }, // LATIN CAPITAL LETTER R WITH CARON
    { "&aacute;", 0x00E1 },
    { "&Aacute;", 0x00C1 },
    { "&ccedil;", 0x00E7 },
    { "&eacute;", 0x00E9 },
    { "&Eacute;", 0x00C9 },
    { "&Egrave;", 0x00C8 },
    { "&Euml;",   0x00CB },
    { "&iacute;", 0x00ED },
    { "&Iuml;",   0x00CF },
    { "&Ntilde;", 0x00D1 },
    { "&Oacute;", 0x00D3 },
    { "&Ocirc;",  0x00D4 },
    { "&ograve;", 0x00F2 },
    { "&ouml;",   0x00F6 },
    { "&Yuml;",   0x0178 }
};

/* Replace every entity listed in entityTable with its character.
 * Matching is case-sensitive, so e.g. &eacute; and &Eacute; are
 * decoded independently. */
static void
decodeEntities(QString &field)
{
    const int n = int(sizeof(entityTable) / sizeof(entityTable[0]));
    for (int i = 0; i < n; ++i) {
        field.replace(QString::fromLatin1(entityTable[i].entity),
                      QString(QChar(entityTable[i].code)));
    }
}

/* Convert the first all-caps run in the field to titlecase, in place.
 * The run starts at a capital letter that is followed (before any
 * space or comma) by another capital, and extends up to the next
 * comma or the end of the field.  The first character of the run, and
 * the character following each whitespace character, are left as
 * they are; every other upper-case character is lower-cased. */
static void
titlecaseAllCaps(QString &field)
{
    QRegExp capsRun("[A-Z][^ ,]*[A-Z][^,]+");
    const int start = capsRun.indexIn(field);
    if (start < 0) return;

    const int end = start + capsRun.matchedLength();
    bool initial = true;

    for (int i = start; i < end; ++i) {
        if (initial) {
            // Word-initial character: keep its case
            initial = false;
            continue;
        }
        const QChar c = field[i];
        if (c.isUpper()) {
            field[i] = c.toLower();
        } else if (c.isSpace()) {
            initial = true;
        }
    }
}

/**
 * Derive one or two display names from a raw name field and append
 * them to names.
 *
 * The field has its known HTML entities decoded and its first
 * all-caps run converted to titlecase, and is then trimmed and
 * appended.  If the result has the form "A, B" (exactly one comma
 * followed by a space), "B A" is appended as well.
 */
void
parseNames(QString field, QStringList &names)
{
    // Entities must be decoded before the titlecase pass so that
    // accented capitals are seen as letters rather than as markup
    decodeEntities(field);

    if (field.contains(QRegExp("&[^ ]+;"))) {
        DEBUG << "Failed to handle entity in " << field << endl;
    }

    // all-caps -> titlecase
    titlecaseAllCaps(field);

    field = field.trimmed();
    names.push_back(field);

    // comma
    QRegExp surnameFirst("^([^,]+), ([^,]+)$");
    if (surnameFirst.indexIn(field) >= 0) {
        const QString before(surnameFirst.cap(1));
        const QString after(surnameFirst.cap(2));
        names.push_back(after + " " + before);
    }
}

/**
 * Read the file that source refers to, and create a Composer for each
 * list item of the form <li><a href="PAGE">NAME</a></li> found after
 * the <div id="center"> marker.
 *
 * Each composer is given the names produced by parseNames (the first
 * as its name, any others as aliases) and a Document for its page,
 * and is appended to m_objects.  Items whose name contains
 * " Collections" are skipped.
 *
 * Throws std::runtime_error (a std::exception) if the file cannot be
 * opened for reading.
 */
void
ClassicalDotNetImporter::import(QUrl source)
{
    //!!! for now: the source is only ever read as a local file
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        const QString message =
            QString("Failed to open \"%1\" for reading").arg(filename);
        throw std::runtime_error
            (std::string(message.toLocal8Bit().constData()));
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // Discard everything up to and including the marker (the match is
    // greedy, so this is the last occurrence of the marker)
    all.replace(QRegExp("^.*<div id=\"center\">"), "");

    QRegExp matcher
        ("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");

    int pos = 0, count = 0;

    while ((pos = matcher.indexIn(all, pos)) != -1) {

        pos += matcher.matchedLength();
        ++count; // counts every matched item, including skipped ones

        DEBUG << "Item " << count
              << ": page = " << matcher.cap(1)
              << ", name = " << matcher.cap(2);

        QString namefield = matcher.cap(2);
        QStringList names;

        parseNames(namefield, names);
        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        if (names[0].contains(" Collections")) {
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(names[0]);
        for (int i = 1; i < names.size(); ++i) {
            composer->addAlias(names[i]);
        }

        if (!matcher.cap(1).isEmpty()) {
            QString url = matcher.cap(1);
            // Links beginning "../" are rewritten to sit under /music/
            url.replace(QRegExp("^\\.\\./"), "/music/");
            Document *d = new Document;
            d->setUri(Uri("http://www.classical.net" + url));
            d->setTopic(composer);
            d->setSiteName("Classical Net");
            composer->addPage(d);
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}

}