Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportClassicalDotNet.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 ClassicalDotNetImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 void
|
Chris@0
|
27 parseNames(QString field, QStringList &names)
|
Chris@0
|
28 {
|
Chris@0
|
29 field.replace("Ä", QChar(0x00C4)); // LATIN CAPITAL LETTER A WITH DIAERESIS
|
Chris@0
|
30 field.replace("ł", QChar(0x0142)); // LATIN SMALL LETTER L WITH STROKE
|
Chris@0
|
31 field.replace("Ř", QChar(0x0158)); // LATIN CAPITAL LETTER R WITH CARON
|
Chris@0
|
32
|
Chris@0
|
33 field.replace("á", QChar(0x00E1));
|
Chris@0
|
34 field.replace("Á", QChar(0x00C1));
|
Chris@0
|
35 field.replace("ç", QChar(0x00E7));
|
Chris@0
|
36 field.replace("é", QChar(0x00E9));
|
Chris@0
|
37 field.replace("É", QChar(0x00C9));
|
Chris@0
|
38 field.replace("È", QChar(0x00C8));
|
Chris@0
|
39 field.replace("Ë", QChar(0x00CB));
|
Chris@0
|
40 field.replace("í", QChar(0x00ED));
|
Chris@0
|
41 field.replace("Ï", QChar(0x00CF));
|
Chris@0
|
42 field.replace("Ñ", QChar(0x00D1));
|
Chris@0
|
43 field.replace("Ó", QChar(0x00D3));
|
Chris@0
|
44 field.replace("Ô", QChar(0x00D4));
|
Chris@0
|
45 field.replace("ò", QChar(0x00F2));
|
Chris@0
|
46 field.replace("ö", QChar(0x00F6));
|
Chris@0
|
47 field.replace("Ÿ", QChar(0x0178));
|
Chris@0
|
48
|
Chris@0
|
49 if (field.contains(QRegExp("&[^ ]+;"))) {
|
Chris@0
|
50 DEBUG << "Failed to handle entity in " << field << endl;
|
Chris@0
|
51 }
|
Chris@0
|
52
|
Chris@0
|
53 // all-caps -> titlecase
|
Chris@0
|
54 QRegExp re("[A-Z][^ ,]*[A-Z][^,]+");
|
Chris@0
|
55 int mp = re.indexIn(field);
|
Chris@0
|
56 if (mp >= 0) {
|
Chris@0
|
57 int ml = re.matchedLength();
|
Chris@0
|
58 bool initial = true;
|
Chris@0
|
59 for (int i = 0; i < ml; ++i) {
|
Chris@0
|
60 if (initial) {
|
Chris@0
|
61 initial = false;
|
Chris@0
|
62 continue;
|
Chris@0
|
63 }
|
Chris@0
|
64 if (field[mp + i].isUpper()) {
|
Chris@0
|
65 field[mp + i] = field[mp + i].toLower();
|
Chris@0
|
66 } else if (field[mp + i].isSpace()) {
|
Chris@0
|
67 initial = true;
|
Chris@0
|
68 }
|
Chris@0
|
69 }
|
Chris@0
|
70 }
|
Chris@0
|
71
|
Chris@0
|
72 field = field.trimmed();
|
Chris@0
|
73 names.push_back(field);
|
Chris@0
|
74
|
Chris@0
|
75 // comma
|
Chris@0
|
76 re = QRegExp("^([^,]+), ([^,]+)$");
|
Chris@0
|
77 if ((mp = re.indexIn(field)) >= 0) {
|
Chris@0
|
78 QString c(re.cap(1));
|
Chris@0
|
79 QString d(re.cap(2));
|
Chris@0
|
80 names.push_back(d + " " + c);
|
Chris@0
|
81 return;
|
Chris@0
|
82 }
|
Chris@0
|
83 }
|
Chris@0
|
84
|
Chris@0
|
85 void
|
Chris@0
|
86 ClassicalDotNetImporter::import(QUrl source)
|
Chris@0
|
87 {
|
Chris@0
|
88 //!!! for now
|
Chris@0
|
89 QString filename = source.toLocalFile();
|
Chris@0
|
90
|
Chris@0
|
91 QFile file(filename);
|
Chris@0
|
92 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
93 throw std::exception();
|
Chris@0
|
94 }
|
Chris@0
|
95
|
Chris@0
|
96 QTextStream stream(&file);
|
Chris@0
|
97 stream.setCodec("UTF-8");
|
Chris@0
|
98 QString all = stream.readAll();
|
Chris@0
|
99
|
Chris@0
|
100 all.replace(QRegExp("^.*<div id=\"center\">"), "");
|
Chris@0
|
101
|
Chris@0
|
102 QRegExp matcher
|
Chris@0
|
103 ("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");
|
Chris@0
|
104
|
Chris@0
|
105 int pos = 0, count = 0;
|
Chris@0
|
106 while ((pos = matcher.indexIn(all, pos)) != -1) {
|
Chris@0
|
107 pos += matcher.matchedLength();
|
Chris@0
|
108 ++count;
|
Chris@0
|
109
|
Chris@0
|
110 DEBUG << "Item " << count
|
Chris@0
|
111 << ": page = " << matcher.cap(1)
|
Chris@0
|
112 << ", name = " << matcher.cap(2);
|
Chris@0
|
113
|
Chris@0
|
114 QString namefield = matcher.cap(2);
|
Chris@0
|
115 QStringList names;
|
Chris@0
|
116
|
Chris@0
|
117 parseNames(namefield, names);
|
Chris@0
|
118 if (names.empty()) {
|
Chris@0
|
119 DEBUG << "No name!" << endl;
|
Chris@0
|
120 continue;
|
Chris@0
|
121 }
|
Chris@0
|
122
|
Chris@0
|
123 if (names[0].contains(" Collections")) {
|
Chris@0
|
124 continue;
|
Chris@0
|
125 }
|
Chris@0
|
126
|
Chris@0
|
127 Composer *composer = new Composer();
|
Chris@0
|
128 composer->setName(names[0]);
|
Chris@0
|
129 for (int i = 1; i < names.size(); ++i) {
|
Chris@0
|
130 composer->addAlias(names[i]);
|
Chris@0
|
131 }
|
Chris@0
|
132
|
Chris@0
|
133 if (matcher.cap(1) != "") {
|
Chris@0
|
134 QString url = matcher.cap(1);
|
Chris@0
|
135 url.replace(QRegExp("^\\.\\./"), "/music/");
|
Chris@0
|
136 Document *d = new Document;
|
Chris@18
|
137 d->setUri(Uri("http://www.classical.net" + url));
|
Chris@0
|
138 d->setTopic(composer);
|
Chris@0
|
139 d->setSiteName("Classical Net");
|
Chris@0
|
140 composer->addPage(d);
|
Chris@0
|
141 }
|
Chris@0
|
142
|
Chris@0
|
143 m_objects.push_back(composer);
|
Chris@0
|
144 }
|
Chris@0
|
145
|
Chris@0
|
146
|
Chris@0
|
147 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
148 }
|
Chris@0
|
149
|
Chris@0
|
150
|
Chris@0
|
151 }
|
Chris@0
|
152
|
Chris@0
|
153
|