To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportWikipediaWorksK.cpp

History | View | Annotate | Download (6.58 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportWikipediaWorksK.h"
4

    
5
#include <dataquay/Debug.h>
6

    
7
#include <QFile>
8
#include <QFileInfo>
9
#include <QTextStream>
10
#include <QRegExp>
11
#include <QVariant>
12

    
13
#include <exception>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
WikipediaWorksKImporter::setSource(QUrl source)
21
{
22
    DEBUG << "WikipediaWorksKImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
/**
 * Strip wiki markup and stray punctuation from a single table field.
 *
 * Any [[target|label]] or [[target]] link at the start of the field
 * (optionally preceded by non-letter characters) is collapsed to its
 * visible text, repeatedly, until no such leading link remains.  The
 * target of the first link encountered is stored in linkText, but
 * only if linkText is empty at that point, so a target recorded by an
 * earlier call is kept.
 *
 * Returns the cleaned-up field text.
 */
static QString
sanitise(QString field, QString &linkText)
{
    // Map the various Unicode dashes (U+2012..U+2015) to a plain
    // hyphen.  Doing this once is enough: the link handling below
    // only ever re-inserts text taken from the field itself.
    field.replace(QString::fromUtf8("\342\200\222"), "-");
    field.replace(QString::fromUtf8("\342\200\223"), "-");
    field.replace(QString::fromUtf8("\342\200\224"), "-");
    field.replace(QString::fromUtf8("\342\200\225"), "-");

    QRegExp pipedLink("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
    QRegExp plainLink("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");

    // Unwrap leading links one at a time; the piped form is always
    // tried before the plain form.
    for (;;) {
        int at = pipedLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = pipedLink.cap(2);
            }
            field.replace(at, pipedLink.matchedLength(),
                          pipedLink.cap(1) + pipedLink.cap(3));
            continue;
        }

        at = plainLink.indexIn(field);
        if (at >= 0) {
            if (linkText.isEmpty()) {
                linkText = plainLink.cap(2);
            }
            field.replace(at, plainLink.matchedLength(),
                          plainLink.cap(1) + plainLink.cap(2));
            continue;
        }

        break;
    }

    field = field.trimmed();

    // Remaining markup: stray brackets, {{templates}}, italic quotes,
    // escaped quotes, escaped <tags>, and leading bullet asterisks
    field.replace("[", "");
    field.replace("]", "");
    field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
    field.replace("''", "\"");
    field.replace("&quot;", "\"");
    field.replace(QRegExp("&lt;[^&]*&gt;"), "");
    field.replace(QRegExp("^\\**"), "");

    // Drop any run of trailing full stops and commas
    while (field.endsWith(".") || field.endsWith(",")) {
        field.chop(1);
    }

    // A field wholly wrapped in parentheses loses the outer pair
    if (field.startsWith("(") && field.endsWith(")")) {
        DEBUG << "before: " << field;
        field = field.mid(1, field.length()-2);
        DEBUG << "after: " << field;
    }

    field.replace(QRegExp("^\\**"), "");

    // A lone parenthesis carries no information
    if (field == ")" || field == "(") {
        field = "";
    }

    field.replace(" - ,", ",");

    return field;
}
78

    
79
/**
 * Return the first run of four consecutive digits found in datefield,
 * or an empty string if there is none.
 */
static QString
extractYear(QString datefield)
{
    QRegExp fourDigits("[0-9]{4}");

    if (fourDigits.indexIn(datefield) < 0) {
        return "";
    }

    return fourDigits.cap(0);
}
88

    
89
/**
 * Pull a musical key such as "C major" or "E flat minor" out of a
 * title containing "in <key>".  Returns the key text without the
 * leading "in ", or an empty string if the title has no such phrase.
 */
static QString
extractKey(QString titlefield)
{
    QRegExp keyPhrase("in ([A-H]([ -][a-z]+)? (major|minor))");

    if (keyPhrase.indexIn(titlefield) < 0) {
        return "";
    }

    return keyPhrase.cap(1);
}
98

    
99
/**
 * Construct a Work for one catalogue entry from its raw table fields.
 *
 * The opus, catalogue (K), number, title, remarks and place fields
 * are each cleaned with sanitise().  The first wiki link target found
 * while cleaning the opus, K, number and title fields (in that order)
 * is used to attach a Wikipedia Document to the work.  The key is
 * extracted from the raw title field and the year from the date field.
 *
 * If main is non-null, the new work is registered as a part of main
 * and is added to main's composition when main has one.  A new
 * Composition is created for the work when there is no main, when
 * main has no composition, or when a year was found that differs
 * from the year of main's composition.
 *
 * Returns the newly allocated Work.  It is created with new and
 * nothing in this function deletes it.
 */
static Work *
makeWork(QString composerName, QString opfield, QString kfield, 
         QString numfield, QString titlefield, QString datefield,
         QString placefield, QString remarksfield, Work *main)
{
    // Shared across all sanitise() calls below, so that the first
    // link target found in any field is the one that is kept
    QString linkText;

    Work *w = new Work;

    QString op = sanitise(opfield, linkText);
    if (op != "") {
        // Store the opus without its "Opus"/"Op." prefix
        op.replace("Opus ", "");
        op.replace("Op. ", "");
        op.replace("Op ", "");
        w->setOpus(op);
    }

    QString k = sanitise(kfield, linkText);
    if (k != "") {
        k.replace("K. ", "K ");
        w->setCatalogue(k);
    }

    QString num = sanitise(numfield, linkText);
    if (num != "") {
        num.replace("No. ", "");
        num.replace("No ", "");
        w->setNumber(num);
    }

    // The key is taken from the title before markup is stripped
    QString key = extractKey(titlefield);
    if (key != "") {
        w->setKey(key);
    }

    QString title = sanitise(titlefield, linkText);
    if (linkText != "") {
        // Wikipedia page paths use underscores in place of spaces
        linkText.replace(" ", "_");
        QUrl url;
        url.setScheme("http");
        url.setHost("en.wikipedia.org");
        url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
        Document *d = new Document;
        d->setUri(Uri(url));
        d->setSiteName("Wikipedia");
        d->setTopic(w);
        w->addPage(d);
    }

    // "Quoted title" - explanation: keep the quoted part as the
    // title and record the explanation as an alias
    QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
    if (explicationRE.indexIn(title) >= 0) {
        w->addAlias(explicationRE.cap(2));
        title = explicationRE.cap(1);
    }

    // With no explicit remarks, text trailing a quoted title
    // ("..." for <something>) is treated as the remarks
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }
    
    // Likewise for text following a quoted title and a comma
    if (remarksfield == "") {
        QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
        if (remarksRE.indexIn(title) >= 0) {
            remarksfield = remarksRE.cap(2);
            title = remarksRE.cap(1);
        }
    }

    w->setName(title);
    
    QString remarks = sanitise(remarksfield, linkText);
    if (remarks != "") {
        w->setRemarks(remarks);
    }

    QString year = extractYear(datefield);
    QString place = sanitise(placefield, linkText);

    DEBUG << "title = " << title << endl;

    if (main) {
        main->addPart(w);
        w->setPartOf(main);
        w->setComposition(main->composition());
        // The parent may have no composition (the test below allows
        // for that case), so only register with one that exists
        if (main->composition()) {
            main->composition()->addWork(w);
        }
    }

    // NOTE(review): when the year differs from the parent's, the work
    // has already been added to the parent's composition above and is
    // not removed from it here -- confirm whether that is intended.
    if (!main || !main->composition() ||
        (year != "" && (main->composition()->year() != year.toInt()))) {
        Composition *c = new Composition;
        c->setComposerName(composerName);
        c->addWork(w);
        c->setYear(year.toInt());
        c->setPlace(place);
        w->setComposition(c);
    }

    return w;
}
202

    
203

    
204
void
205
WikipediaWorksKImporter::import(QUrl source)
206
{
207
    //!!! for now
208
    QString filename = source.toLocalFile();
209

    
210
    QFile file(filename);
211
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
212
        throw std::exception();
213
    }
214

    
215
    QTextStream stream(&file);
216
    stream.setCodec("UTF-8");
217
    
218
    QString composerName;
219
    if (filename.contains("K%C3%B6chel")) {
220
        composerName = "Wolfgang Amadeus Mozart";
221
    } else {
222
        QRegExp byby("by_(.*)_by");
223
        if (byby.indexIn(filename) >= 0) {
224
            composerName = byby.cap(1).replace('_', ' ');
225
        } else {
226
            QRegExp by("by_(.*)");
227
            if (by.indexIn(filename) >= 0) {
228
                composerName = by.cap(1).replace('_', ' ');
229
            }
230
        }
231
    }
232
    composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
233

    
234
    DEBUG << "composerName = " << composerName << endl;
235

    
236
    // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
237
    QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[(K\\.? *[0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
238

    
239
    QString all = stream.readAll();
240

    
241
    DEBUG << "Read " << all.length() << " chars" << endl;
242

    
243
    all.replace(QRegExp("^.*<page>"), "");
244

    
245
    int pos = 0, count = 0;
246

    
247
    while ((pos = matcherK.indexIn(all, pos)) != -1) {
248

    
249
        all.replace(pos, matcherK.matchedLength(), "");
250
        ++count;
251

    
252
        QString kfield = matcherK.cap(1);
253
        QString titlefield = matcherK.cap(2);
254
        QString datefield = matcherK.cap(3);
255
        QString placefield = matcherK.cap(4);
256

    
257
        m_objects.push_back
258
            (makeWork(composerName, "", kfield, "",
259
                      titlefield, datefield, placefield, "", 0));
260
    }
261

    
262
    DEBUG << "Left over: " << all << endl;
263

    
264
    DEBUG << "Found " << count << " things" << endl;
265
}
266

    
267

    
268
}
269

    
270