To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.
root / import / ImportClassicalDotNet.cpp
History | View | Annotate | Download (3.77 KB)
| 1 |
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
|---|---|
| 2 |
|
| 3 |
#include "ImportClassicalDotNet.h" |
| 4 |
|
| 5 |
#include <dataquay/Debug.h> |
| 6 |
|
| 7 |
#include <QFile> |
| 8 |
#include <QFileInfo> |
| 9 |
#include <QTextStream> |
| 10 |
#include <QRegExp> |
| 11 |
#include <QVariant> |
| 12 |
|
| 13 |
#include <exception> |
| 14 |
|
| 15 |
using namespace Dataquay; |
| 16 |
|
| 17 |
namespace ClassicalData {
|
| 18 |
|
| 19 |
void
|
| 20 |
ClassicalDotNetImporter::setSource(QUrl source) |
| 21 |
{
|
| 22 |
DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;
|
| 23 |
import(source); |
| 24 |
} |
| 25 |
|
| 26 |
void
|
| 27 |
parseNames(QString field, QStringList &names) |
| 28 |
{
|
| 29 |
field.replace("Ä", QChar(0x00C4)); // LATIN CAPITAL LETTER A WITH DIAERESIS |
| 30 |
field.replace("ł", QChar(0x0142)); // LATIN SMALL LETTER L WITH STROKE |
| 31 |
field.replace("Ř", QChar(0x0158)); // LATIN CAPITAL LETTER R WITH CARON |
| 32 |
|
| 33 |
field.replace("á", QChar(0x00E1)); |
| 34 |
field.replace("Á", QChar(0x00C1)); |
| 35 |
field.replace("ç", QChar(0x00E7)); |
| 36 |
field.replace("é", QChar(0x00E9)); |
| 37 |
field.replace("É", QChar(0x00C9)); |
| 38 |
field.replace("È", QChar(0x00C8)); |
| 39 |
field.replace("Ë", QChar(0x00CB)); |
| 40 |
field.replace("í", QChar(0x00ED)); |
| 41 |
field.replace("Ï", QChar(0x00CF)); |
| 42 |
field.replace("Ñ", QChar(0x00D1)); |
| 43 |
field.replace("Ó", QChar(0x00D3)); |
| 44 |
field.replace("Ô", QChar(0x00D4)); |
| 45 |
field.replace("ò", QChar(0x00F2)); |
| 46 |
field.replace("ö", QChar(0x00F6)); |
| 47 |
field.replace("Ÿ", QChar(0x0178)); |
| 48 |
|
| 49 |
if (field.contains(QRegExp("&[^ ]+;"))) { |
| 50 |
DEBUG << "Failed to handle entity in " << field << endl;
|
| 51 |
} |
| 52 |
|
| 53 |
// all-caps -> titlecase
|
| 54 |
QRegExp re("[A-Z][^ ,]*[A-Z][^,]+");
|
| 55 |
int mp = re.indexIn(field);
|
| 56 |
if (mp >= 0) { |
| 57 |
int ml = re.matchedLength();
|
| 58 |
bool initial = true; |
| 59 |
for (int i = 0; i < ml; ++i) { |
| 60 |
if (initial) {
|
| 61 |
initial = false;
|
| 62 |
continue;
|
| 63 |
} |
| 64 |
if (field[mp + i].isUpper()) {
|
| 65 |
field[mp + i] = field[mp + i].toLower(); |
| 66 |
} else if (field[mp + i].isSpace()) { |
| 67 |
initial = true;
|
| 68 |
} |
| 69 |
} |
| 70 |
} |
| 71 |
|
| 72 |
field = field.trimmed(); |
| 73 |
names.push_back(field); |
| 74 |
|
| 75 |
// comma
|
| 76 |
re = QRegExp("^([^,]+), ([^,]+)$");
|
| 77 |
if ((mp = re.indexIn(field)) >= 0) { |
| 78 |
QString c(re.cap(1));
|
| 79 |
QString d(re.cap(2));
|
| 80 |
names.push_back(d + " " + c);
|
| 81 |
return;
|
| 82 |
} |
| 83 |
} |
| 84 |
|
| 85 |
void
|
| 86 |
ClassicalDotNetImporter::import(QUrl source) |
| 87 |
{
|
| 88 |
//!!! for now
|
| 89 |
QString filename = source.toLocalFile(); |
| 90 |
|
| 91 |
QFile file(filename); |
| 92 |
if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
| 93 |
throw std::exception();
|
| 94 |
} |
| 95 |
|
| 96 |
QTextStream stream(&file); |
| 97 |
stream.setCodec("UTF-8");
|
| 98 |
QString all = stream.readAll(); |
| 99 |
|
| 100 |
all.replace(QRegExp("^.*<div id=\"center\">"), ""); |
| 101 |
|
| 102 |
QRegExp matcher |
| 103 |
("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");
|
| 104 |
|
| 105 |
int pos = 0, count = 0; |
| 106 |
while ((pos = matcher.indexIn(all, pos)) != -1) { |
| 107 |
pos += matcher.matchedLength(); |
| 108 |
++count; |
| 109 |
|
| 110 |
DEBUG << "Item " << count
|
| 111 |
<< ": page = " << matcher.cap(1) |
| 112 |
<< ", name = " << matcher.cap(2); |
| 113 |
|
| 114 |
QString namefield = matcher.cap(2);
|
| 115 |
QStringList names; |
| 116 |
|
| 117 |
parseNames(namefield, names); |
| 118 |
if (names.empty()) {
|
| 119 |
DEBUG << "No name!" << endl;
|
| 120 |
continue;
|
| 121 |
} |
| 122 |
|
| 123 |
if (names[0].contains(" Collections")) { |
| 124 |
continue;
|
| 125 |
} |
| 126 |
|
| 127 |
Composer *composer = new Composer();
|
| 128 |
composer->setName(names[0]);
|
| 129 |
for (int i = 1; i < names.size(); ++i) { |
| 130 |
composer->addAlias(names[i]); |
| 131 |
} |
| 132 |
|
| 133 |
if (matcher.cap(1) != "") { |
| 134 |
QString url = matcher.cap(1);
|
| 135 |
url.replace(QRegExp("^\\.\\./"), "/music/"); |
| 136 |
Document *d = new Document;
|
| 137 |
d->setUri(Uri("http://www.classical.net" + url));
|
| 138 |
d->setTopic(composer); |
| 139 |
d->setSiteName("Classical Net");
|
| 140 |
composer->addPage(d); |
| 141 |
} |
| 142 |
|
| 143 |
m_objects.push_back(composer); |
| 144 |
} |
| 145 |
|
| 146 |
|
| 147 |
DEBUG << "Found " << count << " things" << endl; |
| 148 |
} |
| 149 |
|
| 150 |
|
| 151 |
} |
| 152 |
|
| 153 |
|