Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "ImportWikipediaComposers.h"
|
Chris@0
|
4
|
Chris@0
|
5 #include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <QFile>
|
Chris@0
|
8 #include <QFileInfo>
|
Chris@0
|
9 #include <QTextStream>
|
Chris@0
|
10 #include <QRegExp>
|
Chris@0
|
11 #include <QVariant>
|
Chris@0
|
12
|
Chris@0
|
13 #include <exception>
|
Chris@0
|
14
|
Chris@0
|
15 using namespace Dataquay;
|
Chris@0
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 void
|
Chris@0
|
20 WikipediaComposersImporter::setSource(QUrl source)
|
Chris@0
|
21 {
|
Chris@0
|
22 DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
|
Chris@0
|
23 import(source);
|
Chris@0
|
24 }
|
Chris@0
|
25
|
Chris@0
|
26 Composer *
|
Chris@0
|
27 addComposer(QString namefield, QString birthfield, QString deathfield,
|
Chris@0
|
28 QString datesfield, QString nationalityfield, QString worksfield,
|
Chris@0
|
29 QString summaryfield)
|
Chris@0
|
30 {
|
Chris@0
|
31 namefield = namefield.trimmed();
|
Chris@0
|
32 birthfield = birthfield.trimmed();
|
Chris@0
|
33 deathfield = deathfield.trimmed();
|
Chris@0
|
34 datesfield = datesfield.trimmed();
|
Chris@0
|
35 nationalityfield = nationalityfield.trimmed();
|
Chris@0
|
36 worksfield = worksfield.trimmed();
|
Chris@0
|
37 summaryfield = summaryfield.trimmed();
|
Chris@0
|
38
|
Chris@0
|
39 Composer *composer = new Composer();
|
Chris@0
|
40
|
Chris@0
|
41 QString name = namefield;
|
Chris@0
|
42 name.replace("[[", "");
|
Chris@0
|
43 name.replace("]]", "");
|
Chris@0
|
44 QString pagename = name;
|
Chris@0
|
45
|
Chris@0
|
46 if (name.contains('|')) {
|
Chris@0
|
47 QStringList bits = name.split('|');
|
Chris@0
|
48 pagename = bits[0];
|
Chris@0
|
49 name = bits[1];
|
Chris@0
|
50 }
|
Chris@0
|
51
|
Chris@0
|
52 composer->setName(name);
|
Chris@0
|
53
|
Chris@0
|
54 pagename.replace(" ", "_");
|
Chris@0
|
55 QUrl url;
|
Chris@0
|
56 url.setScheme("http");
|
Chris@0
|
57 url.setHost("en.wikipedia.org");
|
Chris@0
|
58
|
Chris@0
|
59 url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));
|
Chris@0
|
60 Document *d = new Document;
|
Chris@18
|
61 d->setUri(Uri(url));
|
Chris@0
|
62 d->setSiteName("Wikipedia");
|
Chris@0
|
63 d->setTopic(composer);
|
Chris@0
|
64 composer->addPage(d);
|
Chris@0
|
65
|
Chris@0
|
66 if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment
|
Chris@0
|
67
|
Chris@0
|
68 bool approx = (datesfield.contains("c.") || datesfield.contains("?")
|
Chris@0
|
69 || datesfield.contains("before") || datesfield.contains("after"));
|
Chris@0
|
70
|
Chris@0
|
71 if (datesfield != "") {
|
Chris@0
|
72 DEBUG << "dates for " << name << ": " << datesfield << endl;
|
Chris@0
|
73 datesfield.replace("(", "");
|
Chris@0
|
74 datesfield.replace(")", "");
|
Chris@0
|
75 datesfield.replace(" ", "");
|
Chris@0
|
76 datesfield.replace(QString::fromUtf8("\342\200\222"), "-");
|
Chris@0
|
77 datesfield.replace(QString::fromUtf8("\342\200\223"), "-");
|
Chris@0
|
78 datesfield.replace(QString::fromUtf8("\342\200\224"), "-");
|
Chris@0
|
79 datesfield.replace(QString::fromUtf8("\342\200\225"), "-");
|
Chris@0
|
80 datesfield.replace("--", "-");
|
Chris@0
|
81 DEBUG << "dates for " << name << ": " << datesfield << endl;
|
Chris@0
|
82
|
Chris@0
|
83 QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
|
Chris@0
|
84 QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");
|
Chris@0
|
85
|
Chris@0
|
86 if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
|
Chris@0
|
87 else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);
|
Chris@0
|
88
|
Chris@0
|
89 QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
|
Chris@0
|
90 QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
|
Chris@0
|
91
|
Chris@0
|
92 if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
|
Chris@0
|
93 else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);
|
Chris@0
|
94
|
Chris@0
|
95 // datesfield.replace(QRegExp("[^0-9]+"), "-");
|
Chris@0
|
96 /*
|
Chris@0
|
97 QStringList list = datesfield.split('-');
|
Chris@0
|
98 if (!list.empty()) {
|
Chris@0
|
99 birthfield = list[0];
|
Chris@0
|
100 if (list.size() > 1) {
|
Chris@0
|
101 deathfield = list[1];
|
Chris@0
|
102 }
|
Chris@0
|
103 }
|
Chris@0
|
104 */
|
Chris@0
|
105 DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
|
Chris@0
|
106 }
|
Chris@0
|
107 if (birthfield != "") {
|
Chris@0
|
108 Birth *e = new Birth(birthfield.toInt());
|
Chris@0
|
109 e->setApproximate(approx);
|
Chris@0
|
110 composer->setBirth(e);
|
Chris@0
|
111 }
|
Chris@0
|
112 if (deathfield != "") {
|
Chris@0
|
113 Death *e = new Death(deathfield.toInt());
|
Chris@0
|
114 e->setApproximate(approx);
|
Chris@0
|
115 composer->setDeath(e);
|
Chris@0
|
116 }
|
Chris@0
|
117 if (nationalityfield != "") {
|
Chris@4
|
118 composer->addNationality(nationalityfield);
|
Chris@0
|
119 }
|
Chris@0
|
120 if (summaryfield != "") {
|
Chris@0
|
121 summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
|
Chris@0
|
122 summaryfield[0] = summaryfield[0].toUpper();
|
Chris@0
|
123 summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
|
Chris@0
|
124 summaryfield.replace("[[", "");
|
Chris@0
|
125 summaryfield.replace("]]", "");
|
Chris@0
|
126 summaryfield.replace("''", "\"");
|
Chris@0
|
127 summaryfield.replace(""", "'");
|
Chris@0
|
128 summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
|
Chris@0
|
129 summaryfield.replace("[", "");
|
Chris@0
|
130 summaryfield.replace("]", "");
|
Chris@0
|
131 composer->setRemarks(summaryfield);
|
Chris@0
|
132 }
|
Chris@0
|
133
|
Chris@0
|
134 return composer;
|
Chris@0
|
135 }
|
Chris@0
|
136
|
Chris@0
|
137 void
|
Chris@0
|
138 WikipediaComposersImporter::import(QUrl source)
|
Chris@0
|
139 {
|
Chris@0
|
140 //!!! for now
|
Chris@0
|
141 QString filename = source.toLocalFile();
|
Chris@0
|
142
|
Chris@0
|
143 QFile file(filename);
|
Chris@0
|
144 if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
Chris@0
|
145 throw std::exception();
|
Chris@0
|
146 }
|
Chris@0
|
147
|
Chris@0
|
148 QTextStream stream(&file);
|
Chris@0
|
149 stream.setCodec("UTF-8");
|
Chris@0
|
150
|
Chris@0
|
151 QString period;
|
Chris@0
|
152 DEBUG << "source = " << source.toString() << endl;
|
Chris@0
|
153 QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_");
|
Chris@0
|
154 QRegExp pmatcher2("List_of_([^_-]+)[_-]era_");
|
Chris@0
|
155 QRegExp pmatcher3("([^_-]+)_composers");
|
Chris@0
|
156 if (pmatcher1.indexIn(source.toString()) >= 0) period = pmatcher1.cap(1);
|
Chris@0
|
157 else if (pmatcher2.indexIn(source.toString()) >= 0) period = pmatcher2.cap(1);
|
Chris@0
|
158 else if (pmatcher3.indexIn(source.toString()) >= 0) period = pmatcher3.cap(1);
|
Chris@0
|
159 DEBUG << "period = "<< period << endl;
|
Chris@0
|
160
|
Chris@0
|
161 int count = 0;
|
Chris@0
|
162
|
Chris@0
|
163 // table form A (used of e.g. Romantic transitional composers)
|
Chris@0
|
164 // | Name || birth || death || nationality || summary || flags
|
Chris@0
|
165 // note: 5x ||
|
Chris@0
|
166 QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");
|
Chris@0
|
167
|
Chris@0
|
168 // table form B (used of e.g. 20th-century composers)
|
Chris@0
|
169 // | Name || birth-[death] || nationality || notable works || remarks
|
Chris@0
|
170 // Note name may contain a single | if in double-square brackets, hence 2a
|
Chris@0
|
171 // note: 4x ||
|
Chris@0
|
172 QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");
|
Chris@0
|
173 // just in case the final column has been omitted completely (as happens).
|
Chris@0
|
174 // this must be matched after matcher2
|
Chris@0
|
175 QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");
|
Chris@0
|
176
|
Chris@0
|
177 // list form
|
Chris@0
|
178 // * [[Name]] [alias?] (stuff about dates)[,] notes
|
Chris@0
|
179 QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");
|
Chris@0
|
180
|
Chris@0
|
181 while (!stream.atEnd()) {
|
Chris@0
|
182 QString line = stream.readLine();
|
Chris@0
|
183
|
Chris@0
|
184 Composer *o = 0;
|
Chris@0
|
185
|
Chris@0
|
186 if (matcher1.indexIn(line) >= 0) {
|
Chris@0
|
187
|
Chris@0
|
188 o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3),
|
Chris@0
|
189 "", matcher1.cap(4), "", matcher1.cap(5));
|
Chris@0
|
190
|
Chris@0
|
191 } else if (matcher2.indexIn(line) >= 0) {
|
Chris@0
|
192
|
Chris@0
|
193 o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3), "",
|
Chris@0
|
194 matcher2.cap(4), matcher2.cap(5), "");
|
Chris@0
|
195
|
Chris@0
|
196 } else if (matcher2a.indexIn(line) >= 0) {
|
Chris@0
|
197
|
Chris@0
|
198 o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3), "",
|
Chris@0
|
199 matcher2a.cap(4), "", "");
|
Chris@0
|
200
|
Chris@0
|
201 } else if (matcher3.indexIn(line) >= 0) {
|
Chris@0
|
202
|
Chris@0
|
203 o = addComposer(matcher3.cap(1), "", "", matcher3.cap(3),
|
Chris@0
|
204 "", "", matcher3.cap(5));
|
Chris@0
|
205
|
Chris@0
|
206 } else if (line.startsWith("* ") || line.startsWith("| ") ||
|
Chris@0
|
207 line.startsWith("*[") || line.startsWith("|[")) {
|
Chris@0
|
208 DEBUG << "Failed to match promising line: " << line << endl;
|
Chris@0
|
209 }
|
Chris@0
|
210
|
Chris@0
|
211 if (o) {
|
Chris@0
|
212 if (period != "") o->setPeriod(period);
|
Chris@0
|
213 m_objects.push_back(o);
|
Chris@0
|
214 ++count;
|
Chris@0
|
215 }
|
Chris@0
|
216
|
Chris@0
|
217 }
|
Chris@0
|
218
|
Chris@0
|
219 DEBUG << "Found " << count << " things" << endl;
|
Chris@0
|
220 }
|
Chris@0
|
221
|
Chris@0
|
222
|
Chris@0
|
223 }
|
Chris@0
|
224
|
Chris@0
|
225
|