To check out this repository, please hg clone the following URL, or open it using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportWikipediaComposers.cpp

History | View | Annotate | Download (7.47 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportWikipediaComposers.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>
#include <stdexcept>
#include <string>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
WikipediaComposersImporter::setSource(QUrl source)
21
{
22
    DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
/**
 * Build a Composer from the raw text fields captured from one table
 * row or list item of a Wikipedia composer list.
 *
 * All fields are trimmed before use.  An empty field means "not
 * supplied".
 *
 * @param namefield  Wiki link text for the composer, with or without
 *     the surrounding "[[" "]]".  If it has the form "Page|Label",
 *     Page is used for the Wikipedia URL and Label for the name.
 * @param birthfield  Birth year as text.  Overwritten by a year
 *     extracted from datesfield if one is found there.
 * @param deathfield  Death year as text.  Overwritten by a year
 *     extracted from datesfield if one is found there.
 * @param datesfield  Free-form date text such as "(1810-1849)",
 *     "b. 1950" or "c. 1400 - after 1450".  Discarded entirely if it
 *     contains "fl.".
 * @param nationalityfield  Nationality text, added to the composer
 *     verbatim if non-empty.
 * @param worksfield  Notable works text.  Trimmed but not otherwise
 *     used at present.
 * @param summaryfield  Remarks text; wiki markup is stripped before
 *     it is stored as the composer's remarks.
 *
 * @return A newly allocated Composer with a newly allocated Document
 *     added as its page.  Nothing here deletes either object.
 */
Composer *
addComposer(QString namefield, QString birthfield, QString deathfield,
            QString datesfield, QString nationalityfield, QString worksfield,
            QString summaryfield)
{
    namefield = namefield.trimmed();
    birthfield = birthfield.trimmed();
    deathfield = deathfield.trimmed();
    datesfield = datesfield.trimmed();
    nationalityfield = nationalityfield.trimmed();
    worksfield = worksfield.trimmed(); // not used below at present
    summaryfield = summaryfield.trimmed();

    Composer *composer = new Composer();

    // Strip wiki link brackets; a "Page|Label" link gives us a page
    // name distinct from the display name.
    QString name = namefield;
    name.replace("[[", "");
    name.replace("]]", "");
    QString pagename = name;

    if (name.contains('|')) {
        QStringList bits = name.split('|');
        pagename = bits[0];
        name = bits[1];
    }

    composer->setName(name);

    // Construct the English Wikipedia page URL for this composer and
    // attach it as a Document whose topic is the composer.
    pagename.replace(" ", "_");
    QUrl url;
    url.setScheme("http");
    url.setHost("en.wikipedia.org");

    url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));
    Document *d = new Document;
    d->setUri(Uri(url));
    d->setSiteName("Wikipedia");
    d->setTopic(composer);
    composer->addPage(d);

    if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment

    // Any hint of uncertainty in the dates text marks both the birth
    // and the death as approximate.
    bool approx = (datesfield.contains("c.") || datesfield.contains("?")
                   || datesfield.contains("before") || datesfield.contains("after"));

    if (datesfield != "") {
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Normalise: drop parentheses and spaces, and turn the
        // various Unicode dashes into a plain ASCII hyphen.
        datesfield.replace("(", "");
        datesfield.replace(")", "");
        datesfield.replace(" ", "");
        datesfield.replace(QString::fromUtf8("\342\200\222"), "-"); // U+2012 figure dash
        datesfield.replace(QString::fromUtf8("\342\200\223"), "-"); // U+2013 en dash
        datesfield.replace(QString::fromUtf8("\342\200\224"), "-"); // U+2014 em dash
        datesfield.replace(QString::fromUtf8("\342\200\225"), "-"); // U+2015 horizontal bar
        datesfield.replace("--", "-");
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Birth: a four-digit year (optionally followed by "/N" or
        // "toN") before a hyphen, or else a year introduced by "b."
        // with an optional qualifier.
        QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
        QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");

        if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
        else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);

        // Death: a four-digit year after a hyphen, or else a year
        // introduced by "d.", in either case with an optional
        // qualifier ("c.", "?" or a lower-case word) in between.
        QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
        QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");

        if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
        else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);

        DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
    }

    if (birthfield != "") {
        // toInt() yields 0 for text that is not a plain integer; do
        // not record that as a genuine birth year.
        bool ok = false;
        int year = birthfield.toInt(&ok);
        if (ok) {
            Birth *e = new Birth(year);
            e->setApproximate(approx);
            composer->setBirth(e);
        } else {
            DEBUG << "Ignoring non-numeric birth year for " << name << ": " << birthfield << endl;
        }
    }

    if (deathfield != "") {
        // As for the birth year: skip text that is not a plain integer.
        bool ok = false;
        int year = deathfield.toInt(&ok);
        if (ok) {
            Death *e = new Death(year);
            e->setApproximate(approx);
            composer->setDeath(e);
        } else {
            DEBUG << "Ignoring non-numeric death year for " << name << ": " << deathfield << endl;
        }
    }

    if (nationalityfield != "") {
        composer->addNationality(nationalityfield);
    }

    if (summaryfield != "") {
        summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
        // The replacement above can consume the whole string (e.g. a
        // summary of just "composer,"), so check before indexing.
        if (!summaryfield.isEmpty()) {
            summaryfield[0] = summaryfield[0].toUpper();
        }
        // Reduce "[[Page|Label]]" links to their label, then drop the
        // remaining link brackets.
        summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
        summaryfield.replace("[[", "");
        summaryfield.replace("]]", "");
        summaryfield.replace("''", "\"");
        summaryfield.replace("&quot;", "'");
        // A summary consisting solely of one bracketed item is
        // discarded altogether.
        summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
        summaryfield.replace("[", "");
        summaryfield.replace("]", "");
        composer->setRemarks(summaryfield);
    }

    return composer;
}
136

    
137
void
138
WikipediaComposersImporter::import(QUrl source)
139
{
140
    //!!! for now
141
    QString filename = source.toLocalFile();
142

    
143
    QFile file(filename);
144
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
145
        throw std::exception();
146
    }
147

    
148
    QTextStream stream(&file);
149
    stream.setCodec("UTF-8");
150
    
151
    QString period;
152
    DEBUG << "source = " << source.toString() << endl;
153
    QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_");
154
    QRegExp pmatcher2("List_of_([^_-]+)[_-]era_");
155
    QRegExp pmatcher3("([^_-]+)_composers");
156
    if (pmatcher1.indexIn(source.toString()) >= 0) period = pmatcher1.cap(1);
157
    else if (pmatcher2.indexIn(source.toString()) >= 0) period = pmatcher2.cap(1);
158
    else if (pmatcher3.indexIn(source.toString()) >= 0) period = pmatcher3.cap(1);
159
    DEBUG << "period = "<< period << endl;
160

    
161
    int count = 0;
162
    
163
    // table form A (used of e.g. Romantic transitional composers)
164
    // | Name || birth || death || nationality || summary || flags
165
    // note: 5x ||
166
    QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");
167

    
168
    // table form B (used of e.g. 20th-century composers)
169
    // | Name || birth-[death] || nationality || notable works || remarks
170
    // Note name may contain a single | if in double-square brackets, hence 2a
171
    // note: 4x ||
172
    QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");
173
    // just in case the final column has been omitted completely (as happens).
174
    // this must be matched after matcher2
175
    QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");
176

    
177
    // list form
178
    // * [[Name]] [alias?] (stuff about dates)[,] notes
179
    QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");
180

    
181
    while (!stream.atEnd()) {
182
        QString line = stream.readLine();
183

    
184
        Composer *o = 0;
185

    
186
        if (matcher1.indexIn(line) >= 0) {
187

    
188
            o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3),
189
                            "", matcher1.cap(4), "", matcher1.cap(5));
190

    
191
        } else if (matcher2.indexIn(line) >= 0) {
192

    
193
            o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3), "",
194
                            matcher2.cap(4), matcher2.cap(5), "");
195

    
196
        } else if (matcher2a.indexIn(line) >= 0) {
197

    
198
            o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3), "",
199
                            matcher2a.cap(4), "", "");
200

    
201
        } else if (matcher3.indexIn(line) >= 0) {
202
            
203
            o = addComposer(matcher3.cap(1), "", "", matcher3.cap(3),
204
                            "", "", matcher3.cap(5));
205

    
206
        } else if (line.startsWith("* ") || line.startsWith("| ") ||
207
                   line.startsWith("*[") || line.startsWith("|[")) {
208
            DEBUG << "Failed to match promising line: " << line << endl;
209
        }
210

    
211
        if (o) {
212
            if (period != "") o->setPeriod(period);
213
            m_objects.push_back(o);
214
            ++count;
215
        }
216

    
217
    }
218

    
219
    DEBUG << "Found " << count << " things" << endl;
220
}
221

    
222

    
223
}
224

    
225