To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportWikipediaWorks.cpp

History | View | Annotate | Download (10.6 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportWikipediaWorks.h"
4

    
5
#include <dataquay/Debug.h>
6

    
7
#include <QFile>
8
#include <QFileInfo>
9
#include <QTextStream>
10
#include <QRegExp>
11
#include <QVariant>
12

    
13
#include <exception>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
WikipediaWorksImporter::setSource(QUrl source)
21
{
22
    DEBUG << "WikipediaWorksImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
QString
27
sanitise(QString field, QString &linkText)
28
{
29
    int mp;
30

    
31
    field.replace(QString::fromUtf8("\342\200\222"), "-");
32
    field.replace(QString::fromUtf8("\342\200\223"), "-");
33
    field.replace(QString::fromUtf8("\342\200\224"), "-");
34
    field.replace(QString::fromUtf8("\342\200\225"), "-");
35

    
36
    QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
37
    if ((mp = link2.indexIn(field)) >= 0) {
38
        if (linkText == "") linkText = link2.cap(2);
39
        field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
40
        return sanitise(field, linkText);
41
    }
42

    
43
    QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
44
    if ((mp = link1.indexIn(field)) >= 0) {
45
        if (linkText == "") linkText = link1.cap(2);
46
        field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
47
        return sanitise(field, linkText);
48
    }
49

    
50
    field = field.trimmed();
51

    
52
    field.replace("[", "");
53
    field.replace("]", "");
54
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
55
    field.replace("''", "\"");
56
    field.replace("&quot;", "\"");
57
    field.replace(QRegExp("&lt;[^&]*&gt;"), "");
58
    field.replace(QRegExp("^\\**"), "");
59

    
60
    while (field.endsWith(".") || field.endsWith(",")) {
61
        field = field.left(field.length()-1);
62
    }
63

    
64
    if (field.startsWith("(") && field.endsWith(")")) {
65
        DEBUG << "before: " << field;
66
        field = field.mid(1, field.length()-2);
67
        DEBUG << "after: " << field;
68
    }
69
    field.replace(QRegExp("^\\**"), "");
70
    if (field == ")" || field == "(") {
71
        field = "";
72
    }
73

    
74
    field.replace(" - ,", ",");
75

    
76
    return field;
77
}
78

    
79
// Return the first run of four consecutive digits found in datefield,
// or an empty string if there is none.
QString
extractYear(QString datefield)
{
    QRegExp fourDigits("[0-9]{4}");

    if (fourDigits.indexIn(datefield) < 0) {
        return "";
    }

    return fourDigits.cap(0);
}
88

    
89
// Look for a phrase of the form "in <key>" in titlefield, where <key>
// is a note letter A-H, an optional lower-case modifier joined by a
// space or hyphen, and "major" or "minor".  Return the key without the
// leading "in ", or an empty string if no such phrase is present.
QString
extractKey(QString titlefield)
{
    QRegExp keyPattern("in ([A-H]([ -][a-z]+)? (major|minor))");

    if (keyPattern.indexIn(titlefield) < 0) {
        return "";
    }

    return keyPattern.cap(1);
}
98

    
99
// Build a new Work from the raw wikitext fields of one catalogue entry.
//
// composerName  composer to record on any Composition created here
// opfield       raw opus text; "Opus ", "Op. " and "Op " prefixes are
//               removed before it is stored
// kfield        raw catalogue number text
// numfield      raw number text; "No. " and "No " prefixes are removed
// titlefield    raw title; also searched for a key ("in X major/minor")
// datefield     raw date text; only a four-digit year is taken from it
// placefield    raw place text
// remarksfield  raw remarks; if empty, remarks may instead be split off
//               the end of a quoted title
// main          parent work if this entry is a part of another work,
//               otherwise null
//
// The target of the first wiki link met while sanitising the opus,
// catalogue, number and title fields (in that order) is attached to
// the work as a Wikipedia Document.
//
// Returns the newly allocated Work.  A new Composition is created for
// it unless it can share the one belonging to main.
Work *
makeWork(QString composerName, QString opfield, QString kfield, 
         QString numfield, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    // Filled in by sanitise() with the first link target it encounters
    QString linkText;

    Work *w = new Work;

    QString op = sanitise(opfield, linkText);
    if (op != "") {
        op.replace("Opus ", "");
        op.replace("Op. ", "");
        op.replace("Op ", "");
        w->setOpus(op);
    }

    QString k = sanitise(kfield, linkText);
    if (k != "") {
        w->setCatalogue(k);
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    }

    // The key is taken from the unsanitised title
    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    QString title = sanitise(titlefield, linkText);
    if (linkText != "") {
        // Turn the link target into the URL of the matching Wikipedia page
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    // "Quoted title" - explanation: keep the quoted part as the title
    // and record the explanation as an alias
    QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
    if (explicationRE.indexIn(title) >= 0) {
        w->addAlias(explicationRE.cap(2));
        title = explicationRE.cap(1);
    }

    // With no explicit remarks, text of the form "for ..." following a
    // quoted title is treated as the remarks
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }
    
    // Likewise for anything after a comma following a quoted title
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    w->setName(title);
    
    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        // The parent may have no composition (the test below allows for
        // that), so it must not be dereferenced unconditionally
        if (main->composition()) {
            w->setComposition(main->composition());
            main->composition()->addWork(w);
        }
    }

    // A work gets its own composition when it has no parent, when the
    // parent has no composition, or when it carries a year that differs
    // from that of the parent's composition
    if (!main || !main->composition() ||
        (year != "" && (main->composition()->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}
201

    
202

    
203
void
204
WikipediaWorksImporter::import(QUrl source)
205
{
206
    //!!! for now
207
    QString filename = source.toLocalFile();
208

    
209
    QFile file(filename);
210
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
211
        throw std::exception();
212
    }
213

    
214
    QTextStream stream(&file);
215
    stream.setCodec("UTF-8");
216
    
217
    QString composerName;
218
    if (filename.contains("K%C3%B6chel")) {
219
        composerName = "Wolfgang Amadeus Mozart";
220
    } else if (filename.contains("/Schubert_")) {
221
        composerName = "Franz Schubert";
222
    } else {
223
        QRegExp byby("by_(.*)_by");
224
        if (byby.indexIn(filename) >= 0) {
225
            composerName = byby.cap(1).replace('_', ' ');
226
        } else {
227
            QRegExp by("by_(.*)");
228
            if (by.indexIn(filename) >= 0) {
229
                composerName = by.cap(1).replace('_', ' ');
230
            }
231
        }
232
    }
233
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
234

    
235
    DEBUG << "composerName = " << composerName << endl;
236

    
237
    // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
238
    QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[K\\. *([0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
239

    
240
    QString all = stream.readAll();
241

    
242
    DEBUG << "Read " << all.length() << " chars" << endl;
243

    
244
    all.replace(QRegExp("^.*<page>"), "");
245

    
246
    int pos = 0, count = 0;
247

    
248
    while ((pos = matcherK.indexIn(all, pos)) != -1) {
249

    
250
        all.replace(pos, matcherK.matchedLength(), "");
251
        ++count;
252

    
253
        QString kfield = matcherK.cap(1);
254
        QString titlefield = matcherK.cap(2);
255
        QString datefield = matcherK.cap(3);
256
        QString placefield = matcherK.cap(4);
257

    
258
        m_objects.push_back
259
            (makeWork(composerName, "K. " + kfield, kfield, "",
260
                      titlefield, datefield, placefield, "", 0));
261
    }
262

    
263
    // Opus in list form (as used for e.g. Beethoven's works)
264
    QRegExp matcherB("[\\*:] *'*((Opus|Op\\.|WoO|Anh|H|D) [0-9][^,:'{\n]*)'*[,:{] *([^\n]*)\n");
265

    
266
    // Part of an opus (e.g. op 18 no 1), intended to be anchored to
267
    // the point at which the last matcherB or matcherB2 match ended
268
    // (note caret)
269
    QRegExp matcherB2("^[\\*:]{2} *([A-Za-z ]*)((No\\.* +)?[0-9][^ :\n]*)[: ] *([^\n]*)\n");
270

    
271
    // Date and remarks within titlefield
272
    QRegExp matcherDate("\\([^\\)]*([0-9]{4})[^0-9\\)]*\\)(.*)");
273

    
274
    pos = 0;
275

    
276
    while ((pos = matcherB.indexIn(all, pos)) != -1) {
277

    
278
        all.replace(pos, matcherB.matchedLength(), "");
279
        ++count;
280

    
281
        QString opfield = matcherB.cap(1);
282
        QString titlefield = matcherB.cap(3);
283

    
284
        QString datefield, remarksfield;
285

    
286
        if (titlefield != "") {
287
            int dpos;
288
            if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
289
                datefield = matcherDate.cap(1);
290
                remarksfield = matcherDate.cap(2);
291
                titlefield = titlefield.left(dpos);
292
            }
293
        }
294

    
295
        Work *main = makeWork(composerName, opfield, "", "",
296
                              titlefield, datefield, "", remarksfield, 0);
297

    
298
        m_objects.push_back(main);
299

    
300
        int spos = pos;
301

    
302
        while ((spos = matcherB2.indexIn(all, spos, QRegExp::CaretAtOffset))
303
               != -1) {
304

    
305
            all.replace(spos, matcherB2.matchedLength(), "");
306
            ++count;
307

    
308
            QString numfield = matcherB2.cap(2);
309

    
310
            titlefield = matcherB2.cap(4);
311

    
312
            if (matcherB2.cap(1).trimmed() != "") {
313
                titlefield = matcherB2.cap(1) + matcherB2.cap(2) + " " 
314
                    + matcherB2.cap(4);
315
                DEBUG << "prefix to number = " << matcherB2.cap(1) << ", so extending title from " << matcherB2.cap(4) << " to " << titlefield << endl;
316
            }
317

    
318
            datefield = "";
319
            remarksfield = "";
320

    
321
            if (titlefield != "") {
322
                int dpos;
323
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
324
                    datefield = matcherDate.cap(1);
325
                    remarksfield = matcherDate.cap(2);
326
                    titlefield = titlefield.left(dpos);
327
                }
328
            }
329

    
330
            Work *sub = makeWork(composerName, opfield, "", numfield,
331
                                 titlefield, datefield, "", remarksfield, main);
332

    
333
            m_objects.push_back(sub);
334
        }
335
    }
336

    
337
    // Title with date but no opus in list form (as used for e.g. Copland)
338
    QRegExp matcherC("\\* *([^\n]*)\\([^\\)]*([0-9]{4})[^\\)]*\\) *\n");
339

    
340
    // Part of the above (e.g. song in cycle), intended to be anchored to
341
    // the point at which the last matcherC or matcherC2 match ended
342
    // (note caret)
343
    QRegExp matcherC2("^\\*\\* *([^\n]*)\n");
344

    
345
    pos = 0;
346

    
347
    while ((pos = matcherC.indexIn(all, pos)) != -1) {
348

    
349
        all.replace(pos, matcherC.matchedLength(), "");
350
        ++count;
351

    
352
        QString titlefield = matcherC.cap(1);
353
        QString datefield = matcherC.cap(2);
354

    
355
        Work *main = makeWork(composerName, "", "", "",
356
                              titlefield, datefield, "", "", 0);
357

    
358
        m_objects.push_back(main);
359

    
360
        int spos = pos;
361

    
362
        while ((spos = matcherC2.indexIn(all, spos, QRegExp::CaretAtOffset))
363
               != -1) {
364

    
365
            all.replace(spos, matcherC2.matchedLength(), "");
366
            ++count;
367

    
368
            titlefield = matcherC2.cap(1);
369

    
370
            datefield = "";
371

    
372
            if (titlefield != "") {
373
                int dpos;
374
                if ((dpos = matcherDate.indexIn(titlefield)) != -1) {
375
                    datefield = matcherDate.cap(1);
376
                    titlefield = titlefield.left(dpos);
377
                }
378
            }
379

    
380
            Work *sub = makeWork(composerName, "", "", "",
381
                                 titlefield, datefield, "", "", main);
382

    
383
            m_objects.push_back(sub);
384
        }
385
    }
386

    
387

    
388

    
389
    DEBUG << "Left over: " << all << endl;
390

    
391
    // Other forms:
392
    // *March No. 1 in F major for Military Band, WoO 18 (1808)
393

    
394

    
395
    DEBUG << "Found " << count << " things" << endl;
396
}
397

    
398

    
399
}
400

    
401