annotate import/ImportClassicalDotNet.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportClassicalDotNet.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 ClassicalDotNetImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 void
Chris@0 27 parseNames(QString field, QStringList &names)
Chris@0 28 {
Chris@0 29 field.replace("&#196;", QChar(0x00C4)); // LATIN CAPITAL LETTER A WITH DIAERESIS
Chris@0 30 field.replace("&#322;", QChar(0x0142)); // LATIN SMALL LETTER L WITH STROKE
Chris@0 31 field.replace("&#344;", QChar(0x0158)); // LATIN CAPITAL LETTER R WITH CARON
Chris@0 32
Chris@0 33 field.replace("&aacute;", QChar(0x00E1));
Chris@0 34 field.replace("&Aacute;", QChar(0x00C1));
Chris@0 35 field.replace("&ccedil;", QChar(0x00E7));
Chris@0 36 field.replace("&eacute;", QChar(0x00E9));
Chris@0 37 field.replace("&Eacute;", QChar(0x00C9));
Chris@0 38 field.replace("&Egrave;", QChar(0x00C8));
Chris@0 39 field.replace("&Euml;", QChar(0x00CB));
Chris@0 40 field.replace("&iacute;", QChar(0x00ED));
Chris@0 41 field.replace("&Iuml;", QChar(0x00CF));
Chris@0 42 field.replace("&Ntilde;", QChar(0x00D1));
Chris@0 43 field.replace("&Oacute;", QChar(0x00D3));
Chris@0 44 field.replace("&Ocirc;", QChar(0x00D4));
Chris@0 45 field.replace("&ograve;", QChar(0x00F2));
Chris@0 46 field.replace("&ouml;", QChar(0x00F6));
Chris@0 47 field.replace("&Yuml;", QChar(0x0178));
Chris@0 48
Chris@0 49 if (field.contains(QRegExp("&[^ ]+;"))) {
Chris@0 50 DEBUG << "Failed to handle entity in " << field << endl;
Chris@0 51 }
Chris@0 52
Chris@0 53 // all-caps -> titlecase
Chris@0 54 QRegExp re("[A-Z][^ ,]*[A-Z][^,]+");
Chris@0 55 int mp = re.indexIn(field);
Chris@0 56 if (mp >= 0) {
Chris@0 57 int ml = re.matchedLength();
Chris@0 58 bool initial = true;
Chris@0 59 for (int i = 0; i < ml; ++i) {
Chris@0 60 if (initial) {
Chris@0 61 initial = false;
Chris@0 62 continue;
Chris@0 63 }
Chris@0 64 if (field[mp + i].isUpper()) {
Chris@0 65 field[mp + i] = field[mp + i].toLower();
Chris@0 66 } else if (field[mp + i].isSpace()) {
Chris@0 67 initial = true;
Chris@0 68 }
Chris@0 69 }
Chris@0 70 }
Chris@0 71
Chris@0 72 field = field.trimmed();
Chris@0 73 names.push_back(field);
Chris@0 74
Chris@0 75 // comma
Chris@0 76 re = QRegExp("^([^,]+), ([^,]+)$");
Chris@0 77 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 78 QString c(re.cap(1));
Chris@0 79 QString d(re.cap(2));
Chris@0 80 names.push_back(d + " " + c);
Chris@0 81 return;
Chris@0 82 }
Chris@0 83 }
Chris@0 84
Chris@0 85 void
Chris@0 86 ClassicalDotNetImporter::import(QUrl source)
Chris@0 87 {
Chris@0 88 //!!! for now
Chris@0 89 QString filename = source.toLocalFile();
Chris@0 90
Chris@0 91 QFile file(filename);
Chris@0 92 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 93 throw std::exception();
Chris@0 94 }
Chris@0 95
Chris@0 96 QTextStream stream(&file);
Chris@0 97 stream.setCodec("UTF-8");
Chris@0 98 QString all = stream.readAll();
Chris@0 99
Chris@0 100 all.replace(QRegExp("^.*<div id=\"center\">"), "");
Chris@0 101
Chris@0 102 QRegExp matcher
Chris@0 103 ("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");
Chris@0 104
Chris@0 105 int pos = 0, count = 0;
Chris@0 106 while ((pos = matcher.indexIn(all, pos)) != -1) {
Chris@0 107 pos += matcher.matchedLength();
Chris@0 108 ++count;
Chris@0 109
Chris@0 110 DEBUG << "Item " << count
Chris@0 111 << ": page = " << matcher.cap(1)
Chris@0 112 << ", name = " << matcher.cap(2);
Chris@0 113
Chris@0 114 QString namefield = matcher.cap(2);
Chris@0 115 QStringList names;
Chris@0 116
Chris@0 117 parseNames(namefield, names);
Chris@0 118 if (names.empty()) {
Chris@0 119 DEBUG << "No name!" << endl;
Chris@0 120 continue;
Chris@0 121 }
Chris@0 122
Chris@0 123 if (names[0].contains(" Collections")) {
Chris@0 124 continue;
Chris@0 125 }
Chris@0 126
Chris@0 127 Composer *composer = new Composer();
Chris@0 128 composer->setName(names[0]);
Chris@0 129 for (int i = 1; i < names.size(); ++i) {
Chris@0 130 composer->addAlias(names[i]);
Chris@0 131 }
Chris@0 132
Chris@0 133 if (matcher.cap(1) != "") {
Chris@0 134 QString url = matcher.cap(1);
Chris@0 135 url.replace(QRegExp("^\\.\\./"), "/music/");
Chris@0 136 Document *d = new Document;
Chris@18 137 d->setUri(Uri("http://www.classical.net" + url));
Chris@0 138 d->setTopic(composer);
Chris@0 139 d->setSiteName("Classical Net");
Chris@0 140 composer->addPage(d);
Chris@0 141 }
Chris@0 142
Chris@0 143 m_objects.push_back(composer);
Chris@0 144 }
Chris@0 145
Chris@0 146
Chris@0 147 DEBUG << "Found " << count << " things" << endl;
Chris@0 148 }
Chris@0 149
Chris@0 150
Chris@0 151 }
Chris@0 152
Chris@0 153