Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportWikipediaWorksK.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 WikipediaWorksKImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "WikipediaWorksKImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 static QString
|
Chris@0
|
27 sanitise(QString field, QString &linkText)
|
Chris@0
|
28 {
|
Chris@0
|
29 int mp;
|
Chris@0
|
30
|
Chris@0
|
31 field.replace(QString::fromUtf8("\342\200\222"), "-");
|
Chris@0
|
32 field.replace(QString::fromUtf8("\342\200\223"), "-");
|
Chris@0
|
33 field.replace(QString::fromUtf8("\342\200\224"), "-");
|
Chris@0
|
34 field.replace(QString::fromUtf8("\342\200\225"), "-");
|
Chris@0
|
35
|
Chris@0
|
36 QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
|
Chris@0
|
37 if ((mp = link2.indexIn(field)) >= 0) {
|
Chris@0
|
38 if (linkText == "") linkText = link2.cap(2);
|
Chris@0
|
39 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
|
Chris@0
|
40 return sanitise(field, linkText);
|
Chris@0
|
41 }
|
Chris@0
|
42
|
Chris@0
|
43 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
|
Chris@0
|
44 if ((mp = link1.indexIn(field)) >= 0) {
|
Chris@0
|
45 if (linkText == "") linkText = link1.cap(2);
|
Chris@0
|
46 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
|
Chris@0
|
47 return sanitise(field, linkText);
|
Chris@0
|
48 }
|
Chris@0
|
49
|
Chris@0
|
50 field = field.trimmed();
|
Chris@0
|
51
|
Chris@0
|
52 field.replace("[", "");
|
Chris@0
|
53 field.replace("]", "");
|
Chris@0
|
54 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
|
Chris@0
|
55 field.replace("''", "\"");
|
Chris@0
|
56 field.replace(""", "\"");
|
Chris@0
|
57 field.replace(QRegExp("<[^&]*>"), "");
|
Chris@0
|
58 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
59
|
Chris@0
|
60 while (field.endsWith(".") || field.endsWith(",")) {
|
Chris@0
|
61 field = field.left(field.length()-1);
|
Chris@0
|
62 }
|
Chris@0
|
63
|
Chris@0
|
64 if (field.startsWith("(") && field.endsWith(")")) {
|
Chris@0
|
65 DEBUG << "before: " << field;
|
Chris@0
|
66 field = field.mid(1, field.length()-2);
|
Chris@0
|
67 DEBUG << "after: " << field;
|
Chris@0
|
68 }
|
Chris@0
|
69 field.replace(QRegExp("^\\**"), "");
|
Chris@0
|
70 if (field == ")" || field == "(") {
|
Chris@0
|
71 field = "";
|
Chris@0
|
72 }
|
Chris@0
|
73
|
Chris@0
|
74 field.replace(" - ,", ",");
|
Chris@0
|
75
|
Chris@0
|
76 return field;
|
Chris@0
|
77 }
|
Chris@0
|
78
|
Chris@0
|
79 static QString
|
Chris@0
|
80 extractYear(QString datefield)
|
Chris@0
|
81 {
|
Chris@0
|
82 QRegExp re("[0-9]{4}");
|
Chris@0
|
83 if (re.indexIn(datefield) >= 0) {
|
Chris@0
|
84 return re.cap(0);
|
Chris@0
|
85 }
|
Chris@0
|
86 return "";
|
Chris@0
|
87 }
|
Chris@0
|
88
|
Chris@0
|
89 static QString
|
Chris@0
|
90 extractKey(QString titlefield)
|
Chris@0
|
91 {
|
Chris@0
|
92 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
|
Chris@0
|
93 if (re.indexIn(titlefield) >= 0) {
|
Chris@0
|
94 return re.cap(1);
|
Chris@0
|
95 }
|
Chris@0
|
96 return "";
|
Chris@0
|
97 }
|
Chris@0
|
98
|
Chris@0
|
99 static Work *
|
Chris@0
|
100 makeWork(QString composerName, QString opfield, QString kfield,
|
Chris@0
|
101 QString numfield, QString titlefield, QString datefield,
|
Chris@0
|
102 QString placefield, QString remarksfield, Work *main)
|
Chris@0
|
103 {
|
Chris@0
|
104 QString linkText;
|
Chris@0
|
105
|
Chris@0
|
106 Work *w = new Work;
|
Chris@0
|
107
|
Chris@0
|
108 QString op = sanitise(opfield, linkText);
|
Chris@0
|
109 if (op != "") {
|
Chris@0
|
110 op.replace("Opus ", "");
|
Chris@0
|
111 op.replace("Op. ", "");
|
Chris@0
|
112 op.replace("Op ", "");
|
Chris@0
|
113 w->setOpus(op);
|
Chris@0
|
114 }
|
Chris@0
|
115
|
Chris@0
|
116 QString k = sanitise(kfield, linkText);
|
Chris@0
|
117 if (k != "") {
|
Chris@0
|
118 k.replace("K. ", "K ");
|
Chris@0
|
119 w->setCatalogue(k);
|
Chris@0
|
120 }
|
Chris@0
|
121
|
Chris@0
|
122 QString num = sanitise(numfield, linkText);
|
Chris@0
|
123 if (num != "") {
|
Chris@0
|
124 num.replace("No. ", "");
|
Chris@0
|
125 num.replace("No ", "");
|
Chris@0
|
126 w->setNumber(num);
|
Chris@0
|
127 }
|
Chris@0
|
128
|
Chris@0
|
129 QString key = extractKey(titlefield);
|
Chris@0
|
130 if (key != "") {
|
Chris@0
|
131 w->setKey(key);
|
Chris@0
|
132 }
|
Chris@0
|
133
|
Chris@0
|
134 QString title = sanitise(titlefield, linkText);
|
Chris@0
|
135 if (linkText != "") {
|
Chris@0
|
136 linkText.replace(" ", "_");
|
Chris@0
|
137 QUrl url;
|
Chris@0
|
138 url.setScheme("http");
|
Chris@0
|
139 url.setHost("en.wikipedia.org");
|
Chris@0
|
140 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
|
Chris@0
|
141 Document *d = new Document;
|
Chris@18
|
142 d->setUri(Uri(url));
|
Chris@0
|
143 d->setSiteName("Wikipedia");
|
Chris@0
|
144 d->setTopic(w);
|
Chris@0
|
145 w->addPage(d);
|
Chris@0
|
146 }
|
Chris@0
|
147
|
Chris@0
|
148 QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
|
Chris@0
|
149 int pos;
|
Chris@0
|
150 if ((pos = explicationRE.indexIn(title)) >= 0) {
|
Chris@0
|
151 w->addAlias(explicationRE.cap(2));
|
Chris@0
|
152 title = explicationRE.cap(1);
|
Chris@0
|
153 }
|
Chris@0
|
154
|
Chris@0
|
155 if (remarksfield == "") {
|
Chris@0
|
156 QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
|
Chris@0
|
157 if ((pos = remarksRE.indexIn(title)) >= 0) {
|
Chris@0
|
158 remarksfield = remarksRE.cap(2);
|
Chris@0
|
159 title = remarksRE.cap(1);
|
Chris@0
|
160 }
|
Chris@0
|
161 }
|
Chris@0
|
162
|
Chris@0
|
163 if (remarksfield == "") {
|
Chris@0
|
164 QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
|
Chris@0
|
165 if ((pos = remarksRE.indexIn(title)) >= 0) {
|
Chris@0
|
166 remarksfield = remarksRE.cap(2);
|
Chris@0
|
167 title = remarksRE.cap(1);
|
Chris@0
|
168 }
|
Chris@0
|
169 }
|
Chris@0
|
170
|
Chris@0
|
171 w->setName(title);
|
Chris@0
|
172
|
Chris@0
|
173 QString remarks = sanitise(remarksfield, linkText);
|
Chris@0
|
174 if (remarks != "") {
|
Chris@0
|
175 w->setRemarks(remarks);
|
Chris@0
|
176 }
|
Chris@0
|
177
|
Chris@0
|
178 QString year = extractYear(datefield);
|
Chris@0
|
179 QString place = sanitise(placefield, linkText);
|
Chris@0
|
180
|
Chris@0
|
181 DEBUG << "title = " << title << endl;
|
Chris@0
|
182
|
Chris@0
|
183 if (main) {
|
Chris@0
|
184 main->addPart(w);
|
Chris@0
|
185 w->setPartOf(main);
|
Chris@0
|
186 w->setComposition(main->composition());
|
Chris@0
|
187 main->composition()->addWork(w);
|
Chris@0
|
188 }
|
Chris@0
|
189
|
Chris@0
|
190 if (!main || !main->composition() ||
|
Chris@0
|
191 (year != "" && (main->composition()->year() != year.toInt()))) {
|
Chris@0
|
192 Composition *c = new Composition;
|
Chris@0
|
193 c->setComposerName(composerName);
|
Chris@0
|
194 c->addWork(w);
|
Chris@0
|
195 c->setYear(year.toInt());
|
Chris@0
|
196 c->setPlace(place);
|
Chris@0
|
197 w->setComposition(c);
|
Chris@0
|
198 }
|
Chris@0
|
199
|
Chris@0
|
200 return w;
|
Chris@0
|
201 }
|
Chris@0
|
202
|
Chris@0
|
203
|
Chris@0
|
204 void
|
Chris@0
|
205 WikipediaWorksKImporter::import(QUrl source)
|
Chris@0
|
206 {
|
Chris@0
|
207 //!!! for now
|
Chris@0
|
208 QString filename = source.toLocalFile();
|
Chris@0
|
209
|
Chris@0
|
210 QFile file(filename);
|
Chris@0
|
211 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
212 throw std::exception();
|
Chris@0
|
213 }
|
Chris@0
|
214
|
Chris@0
|
215 QTextStream stream(&file);
|
Chris@0
|
216 stream.setCodec("UTF-8");
|
Chris@0
|
217
|
Chris@0
|
218 QString composerName;
|
Chris@0
|
219 if (filename.contains("K%C3%B6chel")) {
|
Chris@0
|
220 composerName = "Wolfgang Amadeus Mozart";
|
Chris@0
|
221 } else {
|
Chris@0
|
222 QRegExp byby("by_(.*)_by");
|
Chris@0
|
223 if (byby.indexIn(filename) >= 0) {
|
Chris@0
|
224 composerName = byby.cap(1).replace('_', ' ');
|
Chris@0
|
225 } else {
|
Chris@0
|
226 QRegExp by("by_(.*)");
|
Chris@0
|
227 if (by.indexIn(filename) >= 0) {
|
Chris@0
|
228 composerName = by.cap(1).replace('_', ' ');
|
Chris@0
|
229 }
|
Chris@0
|
230 }
|
Chris@0
|
231 }
|
Chris@0
|
232 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
|
Chris@0
|
233
|
Chris@0
|
234 DEBUG << "composerName = " << composerName << endl;
|
Chris@0
|
235
|
Chris@0
|
236 // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
|
Chris@0
|
237 QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[(K\\.? *[0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
|
Chris@0
|
238
|
Chris@0
|
239 QString all = stream.readAll();
|
Chris@0
|
240
|
Chris@0
|
241 DEBUG << "Read " << all.length() << " chars" << endl;
|
Chris@0
|
242
|
Chris@0
|
243 all.replace(QRegExp("^.*<page>"), "");
|
Chris@0
|
244
|
Chris@0
|
245 int pos = 0, count = 0;
|
Chris@0
|
246
|
Chris@0
|
247 while ((pos = matcherK.indexIn(all, pos)) != -1) {
|
Chris@0
|
248
|
Chris@0
|
249 all.replace(pos, matcherK.matchedLength(), "");
|
Chris@0
|
250 ++count;
|
Chris@0
|
251
|
Chris@0
|
252 QString kfield = matcherK.cap(1);
|
Chris@0
|
253 QString titlefield = matcherK.cap(2);
|
Chris@0
|
254 QString datefield = matcherK.cap(3);
|
Chris@0
|
255 QString placefield = matcherK.cap(4);
|
Chris@0
|
256
|
Chris@0
|
257 m_objects.push_back
|
Chris@0
|
258 (makeWork(composerName, "", kfield, "",
|
Chris@0
|
259 titlefield, datefield, placefield, "", 0));
|
Chris@0
|
260 }
|
Chris@0
|
261
|
Chris@0
|
262 DEBUG << "Left over: " << all << endl;
|
Chris@0
|
263
|
Chris@0
|
264 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
265 }
|
Chris@0
|
266
|
Chris@0
|
267
|
Chris@0
|
268 }
|
Chris@0
|
269
|
Chris@0
|
270
|