To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportClassicalComposersOrg.cpp

History | View | Annotate | Download (11.4 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportClassicalComposersOrg.h"
4

    
5
#include <dataquay/Debug.h>
6

    
7
#include <QFile>
8
#include <QFileInfo>
9
#include <QTextStream>
10
#include <QRegExp>
11
#include <QVariant>
12

    
13
#include <exception>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
ClassicalComposersOrgImporter::setSource(QUrl source)
21
{
22
    DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
typedef QMap<QString, int> NameMap;
27

    
28
void
29
parseNames(QString field, NameMap &names, int score = 0)
30
{
31
    QString a(field), b(field);
32

    
33
    int mp;
34
    QRegExp re;
35

    
36
    /* classical-composers.org uses quite a few (not always
37
     * consistent) ways to indicate alternatives in composer
38
     * names.  Not all of them are distinguishable.
39
     * Examples:
40
     *
41
     * Pipe used to separate sorted surname from alternative for whole:
42
     * Hardin | Moondog, Louis Thomas
43
     * -> "Louis Thomas Hardin", "Moondog"
44
     * Barron | Charlotte May Wind, Bebe
45
     * -> "Bebe Barron", "Charlotte May Wind"
46
     *
47
     * Pipe used to separate alternatives for surname only (seems
48
     * slightly more common than the previous one; if there is only
49
     * one word between the pipe and a following comma, I'd be
50
     * inclined to assume this case, Moondog notwithstanding):
51
     * Mendelssohn | Hensel, Fanny Cécile
52
     * -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
53
     * Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander 
54
     * -> "Thomas Alexander Erskine, 6th Earl of Kellie",
55
     *    "Thomas Alexander Kelly"
56
     *
57
     * Round brackets used to indicate one or more alternatives for
58
     * prior word; slash for alternation:
59
     * Edelmann, Jean-Frédéric (Johann-Friedrich)
60
     * -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
61
     * Eberwein, Max (Traugott Maximilian)
62
     * -> "Max Eberwein", "Traugott Maximilian Eberwein"
63
     * Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
64
     * -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
65
     *    "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
66
     *    "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
67
     *    "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
68
     *
69
     * Round brackets used to indicate alternative to prior
70
     * names, with some meaning left implicit:
71
     * Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich) 
72
     * -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
73
     *    perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
74
     *    Kaan-Albest")
75
     *
76
     * Round brackets used to augment rather than
77
     * alternate. Probably can't identify this reliably, though
78
     * round brackets used somewhere other than at end of line
79
     * are relatively likely to be this form (?):
80
     * Linley (the elder), Thomas
81
     * -> "Thomas Linley", "Thomas Linley the elder"
82
     * Keys | Keyes, Ivor (Christopher Banfield)
83
     * -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
84
     *    "Ivor Christopher Banfield Keyes"
85
     *
86
     * Square brackets used to indicate alternative for all
87
     * forenames:
88
     * Moller | Möller, John Christopher [Johann Christoph] 
89
     * -> "John Christopher Moller", "John Christopher Möller", 
90
     *    "Johann Christoph Moller", "Johann Christoph Möller"
91
     *
92
     * Complicated examples:
93
     * Mayr | Mayer, (Johann) Simon [Giovanni Simone] 
94
     * -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
95
     *    "Johann Simon Mayer", "Giovanni Simone Mayr",
96
     *    "Geovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
97
     * Frauenlob | Heinrich von Meissen
98
     * -> "Heinrich Frauenlob", "Heinrich von Meissen", or
99
     *    perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
100
     */
101

    
102
//    DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;
103

    
104
    // round brackets used for augmentation right at the start
105
    re = QRegExp("\\(([^\\)]+)\\) ");
106
    if ((mp = re.indexIn(field)) >= 0) {
107
        int ml = re.matchedLength();
108
        QString c(re.cap(1));
109
        a.replace(mp, ml, "");
110
        b.replace(mp, ml, QString("%1 ").arg(c));
111
        parseNames(a, names, score);
112
        parseNames(b, names, score+1);
113
        return;
114
    }
115
    
116
    // round brackets used for augmentation directly after the comma
117
    re = QRegExp(", \\(([^\\)]+)\\)");
118
    if ((mp = re.indexIn(field)) >= 0) {
119
        int ml = re.matchedLength();
120
        QString c(re.cap(1));
121
        a.replace(mp, ml, ",");
122
        b.replace(mp, ml, QString(", %1").arg(c));
123
        parseNames(a, names, score);
124
        parseNames(b, names, score+1);
125
        return;
126
    }
127

    
128
    // round brackets used for augmentation directly before the comma
129
    re = QRegExp(" \\(([^\\)]+)\\),");
130
    if ((mp = re.indexIn(field)) >= 0) {
131
        int ml = re.matchedLength();
132
        QString c(re.cap(1));
133
        a.replace(mp, ml, ",");
134
        b.replace(mp, ml, QString(" %1,").arg(c));
135
        parseNames(a, names, score);
136
        parseNames(b, names, score+1);
137
        return;
138
    }
139
    
140
    // round brackets for alternation of single name, anywhere
141
    re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
142
    if ((mp = re.indexIn(field)) >= 0) {
143
        int ml = re.matchedLength();
144
        QString c(re.cap(1));
145
        QString d(re.cap(2));
146
        a.replace(mp, ml, c);
147
        b.replace(mp, ml, d);
148
        parseNames(a, names, score);
149
        parseNames(b, names, score+1);
150
        return;
151
    }
152

    
153
    // square brackets for alternation of a series of names, at end or after pipe
154
    re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
155
    if ((mp = re.indexIn(field)) >= 0) {
156
        int ml = re.matchedLength();
157
        QString p(re.cap(1));
158
        QString c(re.cap(2));
159
        QString d(re.cap(3));
160
        a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
161
        b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
162
        parseNames(a, names, score);
163
        parseNames(b, names, score+1);
164
        return;
165
    }
166

    
167
    // square brackets for alternation of a series of names, at start
168
    re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
169
    if ((mp = re.indexIn(field)) >= 0) {
170
        int ml = re.matchedLength();
171
        QString c(re.cap(1));
172
        QString d(re.cap(2));
173
        a.replace(mp, ml, c);
174
        b.replace(mp, ml, d);
175
        parseNames(a, names, score);
176
        parseNames(b, names, score+1);
177
        return;
178
    }
179

    
180
    // slash for alternation of word
181
    re = QRegExp("([^ ,|]+)/([^ ,|]+)");
182
    if ((mp = re.indexIn(field)) >= 0) {
183
        int ml = re.matchedLength();
184
        QString c(re.cap(1));
185
        QString d(re.cap(2));
186
        a.replace(mp, ml, c);
187
        b.replace(mp, ml, d);
188
        parseNames(a, names, score);
189
        parseNames(b, names, score+1);
190
        return;
191
    }
192
    
193
    // pipe for alternation of surname
194
    re = QRegExp("^(.*) \\| ([^|, ]+),");
195
    if ((mp = re.indexIn(field)) >= 0) {
196
        int ml = re.matchedLength();
197
        QString c(re.cap(1));
198
        QString d(re.cap(2));
199
        a.replace(mp, ml, c + ",");
200
        b.replace(mp, ml, d + ",");
201
        parseNames(a, names, score);
202
        parseNames(b, names, score+1);
203
        return;
204
    }
205

    
206
    // pipe for alternation of whole (before comma)
207
    re = QRegExp("^(.*) \\| ([^|,]+),");
208
    if ((mp = re.indexIn(field)) >= 0) {
209
        int ml = re.matchedLength();
210
        QString c(re.cap(1));
211
        QString d(re.cap(2));
212
        a.replace(mp, ml, c + ",");
213
        b = d;
214
        parseNames(a, names, score);
215
        parseNames(b, names, score+1);
216
        return;
217
    }
218

    
219
    // pipe for alternation of whole (at end)
220
    re = QRegExp("^(.*) \\| ([^|,]+)$");
221
    if ((mp = re.indexIn(field)) >= 0) {
222
        int ml = re.matchedLength();
223
        QString c(re.cap(1));
224
        QString d(re.cap(2));
225
        a.replace(mp, ml, c);
226
        b.replace(mp, ml, d);
227
        parseNames(a, names, score);
228
        parseNames(b, names, score+1);
229
        return;
230
    }
231
    
232
    // comma
233
    re = QRegExp("^(.+), ([^,]+)$");
234
    if ((mp = re.indexIn(field)) >= 0) {
235
        QString c(re.cap(1));
236
        QString d(re.cap(2));
237
        parseNames(d + " " + c, names, score+1);
238
        // fall through to add
239
    }
240

    
241
    field.replace("(", "");
242
    field.replace(")", "");
243

    
244
    names[field] = score;
245
}
246

    
247
void
248
ClassicalComposersOrgImporter::import(QUrl source)
249
{
250
    int i = 0;
251

    
252
    //!!! for now
253
    QString filename = source.toLocalFile();
254

    
255

    
256
    QFile file(filename);
257
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
258
        throw std::exception();
259
    }
260

    
261
    QTextStream stream(&file);
262
    stream.setCodec("UTF-8");
263
    QString all = stream.readAll();
264
    
265
    all.replace(QRegExp("^.*<div id=\"main\">"), "");
266

    
267
    QRegExp matcher
268
        (QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));
269
    
270
    int pos = 0, count = 0;
271
    while ((pos = matcher.indexIn(all, pos)) != -1) {
272

    
273
        pos += matcher.matchedLength();
274
        ++count;
275

    
276
        QString page = matcher.cap(1);
277
        QString name = matcher.cap(2);
278
        QString star = matcher.cap(5);
279
        QString birth = matcher.cap(6);
280
        QString dagger = matcher.cap(7);
281
        QString death = matcher.cap(8);
282
        QString female = matcher.cap(9);
283

    
284
        DEBUG << "Item " << count
285
              << ": page = " << page
286
              << ", name = " << name
287
              << ", birth = " << birth
288
              << ", death = " << death
289
              << ", female = " << female;
290

    
291
        QString namefield = name.trimmed();
292
        NameMap names;
293

    
294
        if (namefield.contains("P.D.Q.")) { // lose this joke
295
            continue;
296
        }
297

    
298
        parseNames(namefield, names);
299

    
300
        i = 0;
301
        QString preferred;
302
        foreach (QString n, names.keys()) {
303
            if (preferred == "" || names[n] == 0) preferred = n;
304
            DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl;
305
            ++i;
306
        }
307

    
308
        if (names.empty()) {
309
            DEBUG << "No name!" << endl;
310
            continue;
311
        }
312

    
313
        Composer *composer = new Composer();
314
        composer->setName(preferred);
315
        foreach (QString n, names.keys()) {
316
            if (n != preferred) composer->addAlias(n);
317
        }
318
        
319
        if (page != "") {
320
            Document *d = new Document;
321
            d->setUri(Uri("http://www.classical-composers.org" + page));
322
            d->setTopic(composer);
323
            d->setSiteName("Classical Composers Database");
324
            composer->addPage(d);
325
        }
326

    
327
        if (birth != "" && death == "") {
328
            if (star == "" && dagger != QString::fromUtf8("\342\200\240")) {
329
                DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
330
                birth = "";
331
            }
332
            if (star == "" && dagger == "") {
333
                DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl;
334
                birth = "";
335
            } else if (star != "" && dagger != "") {
336
                DEBUG << "Date range features both star and dagger -- ignoring" << endl;
337
                birth = "";
338
            } else if (dagger != "") {
339
                DEBUG << "dagger found: setting death to " << birth << endl;
340
                death = birth;
341
                birth = "";
342
            }
343
        }
344

    
345
        if (birth != "") {
346
            Birth *e = new Birth(birth.toInt());
347
            composer->setBirth(e);
348
        }
349
        if (death != "") {
350
            composer->setDeath(new Death(death.toInt()));
351
        }
352
        if (female != "") {
353
            composer->setGender("female");
354
        } else {
355
            composer->setGender("male");
356
        }
357

    
358
        m_objects.push_back(composer);
359
    }
360

    
361
    DEBUG << "Found " << count << " things" << endl;
362
    
363
}
364

    
365

    
366
}
367