To check out this repository, please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportClassicalArchives.cpp

History | View | Annotate | Download (9.89 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportClassicalArchives.h"
4

    
5
#include <dataquay/Debug.h>
6

    
7
#include <QFile>
8
#include <QFileInfo>
9
#include <QTextStream>
10
#include <QRegExp>
11
#include <QVariant>
12

    
13
#include <exception>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
ClassicalArchivesImporter::setSource(QUrl source)
21
{
22
    DEBUG << "ClassicalArchivesImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
// Flat lookup table of location codes, four consecutive strings per
// row:
//
//   [0] the location code compared against the source data
//   [1] the nationality adjective (returned by locationToNationality)
//   [2] the place name (not read by the lookup functions in this file)
//   [3] the numeric id appended to http://sws.geonames.org/ by
//       locationToGeonameURIs
//
// The lookup functions derive the row count from sizeof, so rows can
// be added freely as long as every row keeps exactly four strings.
// Both the strings and the pointers to them are const.
static const char *const locmap[] = {
    "ARG", "Argentinian", "Argentina", "3865483",
    "ARM", "Armenian", "Armenia", "174982",
    "AUS", "Australian", "Australia", "2077456",
    "AUT", "Austrian", "Austria", "2782113",
    "AZE", "Azeri", "Azerbaijan", "587116",
    "BEL", "Belgian", "Belgium", "2802361",
    "BGR", "Bulgarian", "Bulgaria", "732800",
    "BLR", "Belarusian", "Belarus", "630336",
    "BOH", "Bohemian", "Bohemia", "3074194",
    "BRA", "Brazilian", "Brazil", "3469058",
    "BSQ", "Basque", "Basque country", "3104499",
    "CAN", "Canadian", "Canada", "6251999",
    "CHE", "Swiss", "Switzerland", "2658434",
    "CHL", "Chilean", "Chile", "3895114",
    "CHN", "Chinese", "China", "1814991",
    "CRI", "Costa Rican", "Costa Rica", "3624060",
    "CTN", "Catalonian", "Catalonia", "3108286",
    "CUB", "Cuban", "Cuba", "3562981",
    "CZE", "Czech", "Czech Republic", "3077311",
    "DEU", "German", "Germany", "2921044",
    "DNK", "Danish", "Denmark", "2623032",
    "ECU", "Ecuadorian", "Ecuador", "3658394",
    "EGY", "Egyptian", "Egypt", "357994",
    "ENG", "English", "England", "2635167",
    "EPR", "German", "Germany", "2921044", // pardon?
    "ESP", "Spanish", "Spain", "2510769",
    "EST", "Estonian", "Estonia", "453733",
    "ETH", "Ethiopian", "Ethiopia", "337996",
    "FIN", "Finnish", "Finland", "660013",
    "FLM", "Flemish", "Flanders", "3337388",
    "FRA", "French", "France", "3017382",
    "GBR", "British", "Britain", "4839292",
    "GEO", "Georgian", "Georgia", "614540",
    "GRC", "Greek", "Greece", "390903",
    "GTM", "Guatemalan", "Guatemala", "3595528",
    "HKG", "Hong Kong Chinese", "Hong Kong", "1819729",
    "HOL", "Dutch", "Holland", "2750405",
    "HRV", "Croatian", "Croatia", "3202326",
    "HUN", "Hungarian", "Hungary", "719819",
    "IND", "Indian", "India", "1269750",
    "IRL", "Irish", "Ireland", "2963597",
    "IRN", "Iranian", "Iran", "130758",
    "ISL", "Icelandic", "Iceland", "2629691",
    "ISR", "Israeli", "Israel", "294640",
    "ITA", "Italian", "Italy", "3175395",
    "JPN", "Japanese", "Japan", "1861060",
    "KAZ", "Kazakh", "Kazakhstan", "1522867",
    "KOR", "Korean", "Korea", "1835841",
    "LBN", "Lebanese", "Lebanon", "272103",
    "LTU", "Lithuanian", "Lithuania", "597427",
    "LVA", "Latvian", "Latvia", "458258",
    "MAR", "Moroccan", "Morocco", "2542007",
    "MEX", "Mexican", "Mexico", "3996063",
    "MKD", "Macedonian", "Macedonia", "718075",
    "MOR", "Moravian", "Moravia", "3078610",
    "MYS", "Malaysian", "Malaysia", "1733045",
    "NAI", "North American Indian", "United States of America", "6252001",
    "NLD", "Dutch", "Netherlands", "2750405",
    "NOR", "Norwegian", "Norway", "3144096",
    "NZL", "New Zealander", "New Zealand", "2186224",
    "PER", "Peruvian", "Peru", "3932488",
    "PHL", "Filipino", "Philippines", "1694008",
    "POL", "Polish", "Poland", "798544",
    "PRT", "Portuguese", "Portugal", "2264397",
    "PRU", "Prussian", "Prussia", "772636",
    "PRY", "Paraguayan", "Paraguay", "3437598",
    "ROU", "Romanian", "Romania", "798549",
    "RUS", "Russian", "Russia", "2017370",
    "SCG", "Serbian-Montenegran", "Serbia-Montenegro", "3202468",
    "SCO", "Scottish", "Scotland", "2638360",
    "SGP", "Singaporean", "Singapore", "1880251",
    "SVK", "Slovakian", "Slovakia", "3057568",
    "SVN", "Slovenian", "Slovenia", "3190538",
    "SWE", "Swedish", "Sweden", "2661886",
    "TKM", "Turkmen", "Turkmenistan", "1218197",
    "TSL", "Transylvanian", "Transylvania", "4495544",
    "TSM", "Tasmanian", "Tasmania", "2147291",
    "TUR", "Turkish", "Turkey", "298795",
    "UKR", "Ukrainian", "Ukraine", "690791",
    "URY", "Uruguayan", "Uruguay", "3439705",
    "USA", "American", "United States of America", "6252001",
    "VEN", "Venezuelan", "Venezuela", "3625428",
    "VNM", "Vietnamese", "Vietnam", "1562822",
    // NOTE(review): WLS is mapped to Samoa here; if the source data
    // uses WLS for Wales this row is wrong -- confirm before changing.
    "WLS", "Samoan", "Samoa", "4034894",
    "ZAF", "South African", "South Africa", "953987",
};
113

    
114
// Translate a location string into the set of nationality adjectives
// it names.
//
// The location is split on '/' and each piece is compared, exactly as
// written, against the code in the first column of locmap.  Every
// matching row contributes the text from its second column.  Pieces
// that match no row contribute nothing, so the result may be empty.
QSet<QString>
locationToNationality(QString location)
{
    // locmap is a flat array with this many strings per row
    const size_t stride = 4;
    const size_t rows = (sizeof(locmap) / sizeof(locmap[0])) / stride;

    QSet<QString> found;

    const QStringList codes = location.split('/');
    for (int c = 0; c < codes.size(); ++c) {
        const QString &code = codes.at(c);
        for (size_t row = 0; row < rows; ++row) {
            const char *const *entry = locmap + row * stride;
            if (code == entry[0]) {
                found.insert(entry[1]);
            }
        }
    }

    return found;
}
129

    
130
// Translate a location string into a set of sws.geonames.org URIs.
//
// The location is split on '/' and each piece is compared, exactly as
// written, against the code in the first column of locmap.  Every
// matching row contributes the URI formed by placing the id from its
// fourth column between "http://sws.geonames.org/" and a trailing
// "/".  Pieces that match no row contribute nothing.
QSet<Uri>
locationToGeonameURIs(QString location)
{
    // locmap is a flat array with this many strings per row
    const size_t stride = 4;
    const size_t rows = (sizeof(locmap) / sizeof(locmap[0])) / stride;

    QSet<Uri> result;

    const QStringList codes = location.split('/');
    QStringList::const_iterator code = codes.constBegin();
    for (; code != codes.constEnd(); ++code) {
        for (size_t row = 0; row < rows; ++row) {
            const char *const *entry = locmap + row * stride;
            if (!(*code == entry[0])) {
                continue;
            }
            const QString address =
                QString("http://sws.geonames.org/") + entry[3] + "/";
            result.insert(Uri(address));
        }
    }

    return result;
}
146

    
147
void
148
parseNames(QString field, QStringList &names, int &birth, int &death,
149
           bool &approx, QString &location)
150
{
151
    field.replace(QRegExp("<[^>]*>"), "");
152

    
153
    QRegExp locre("; (.*)$");
154
    int pos;
155
    if ((pos = locre.indexIn(field)) >= 0) {
156
        location = locre.cap(1);
157
        field.replace(pos, locre.matchedLength(), "");
158
    }
159

    
160
    QRegExp datere("\\(([^\\)]+)\\) *$");
161
    if ((pos = datere.indexIn(field)) >= 0) {
162
        QString contents = datere.cap(1);
163
        if (contents.startsWith("c.")) {
164
            approx = true;
165
            contents = contents.replace("c.", "");
166
            contents = contents.trimmed();
167
        }
168
        if (QRegExp("\\d{4}").indexIn(contents) >= 0) {
169
            QStringList bits = contents.split("-");
170
            if (!bits.empty()) {
171
                QString f1 = bits[0];
172
                QString f2;
173
                if (bits.size() > 1) f2 = bits[1];
174
                if (f1.startsWith("b")) {
175
                    f1.replace(QRegExp("b[^0-9]*"), "");
176
                    birth = f1.toInt();
177
                } else if (f1.startsWith("d")) {
178
                    f1.replace(QRegExp("d[^0-9]*"), "");
179
                    death = f1.toInt();
180
                } else if (f2 != "") {
181
                    birth = f1.toInt();
182
                }
183
                if (f2 != "") {
184
                    death = f2.toInt();
185
                }
186
            }
187
        }
188
        field.replace(pos, datere.matchedLength(), "");
189
    }
190

    
191
    // we don't properly handle their slash alternatives syntax
192
    field = field.replace(QRegExp("/[^/,]*"), "");
193

    
194
    // nor these
195
    field.replace(QRegExp("\\[[^\\]]*\\]"), "");
196

    
197
    // nor these
198
    field.replace(QRegExp("\\([^\\)]*\\)"), "");
199

    
200
    field.replace(QRegExp(" +"), " ");
201

    
202
    // and let's be picky -- we don't like names with just initials,
203
    // can't properly match them
204
    if (QRegExp(",.*\\.").indexIn(field) >= 0) {
205
        return;
206
    }
207

    
208
    // and, from this particular source, I'm suspicious of single-word
209
    // names (sorry)
210
    if (!field.contains(",")) return;
211

    
212
    field.replace(QRegExp(" +,"), ",");
213
    field = field.trimmed();
214
    names.push_back(field);
215

    
216
    // comma
217
    QRegExp commare = QRegExp("^([^,]+), *([^,]+)$");
218
    if ((pos = commare.indexIn(field)) >= 0) {
219
        QString c(commare.cap(1));
220
        QString d(commare.cap(2));
221
        names.push_back(QString(d + " " + c).trimmed());
222
    }
223
}
224

    
225
void
226
ClassicalArchivesImporter::import(QUrl source)
227
{
228
    //!!! for now
229
    QString filename = source.toLocalFile();
230

    
231
    QFile file(filename);
232
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
233
        throw std::exception();
234
    }
235

    
236
    QTextStream stream(&file);
237
    stream.setCodec("UTF-8");
238
    QString all = stream.readAll();
239
    
240
    QRegExp matcher
241
        ("<a href=\"(/composer/[^\"]+)\"><div[^>]*>([^\n]+)</div></a>");
242
    
243
    DEBUG << "ClassicalArchivesImporter: Have " << all.length() << " chars" << endl;
244

    
245
    int pos = 0, count = 0;
246
    while ((pos = matcher.indexIn(all, pos)) != -1) {
247
        pos += matcher.matchedLength();
248
        ++count;
249

    
250
        QString namefield = matcher.cap(2);
251
        QStringList names;
252

    
253
        int birth = 0, death = 0;
254
        bool approx = false;
255
        QString location;
256

    
257
        parseNames(namefield, names, birth, death, approx, location);
258

    
259
        if (names.empty()) {
260
            DEBUG << "No name!" << endl;
261
            continue;
262
        }
263

    
264
        DEBUG << "Item " << count
265
              << ": page = " << matcher.cap(1)
266
              << ", name = " << names[0]
267
              << ", birth = " << birth << ", death = " << death
268
              << ", loc " << location << endl;
269

    
270
        if (names[0].contains("Anonymous") ||
271
            names[0].contains("Traditional")) {
272
            continue;
273
        }
274

    
275
        Composer *composer = new Composer();
276
        composer->setName(names[0]);
277
        for (int i = 1; i < names.size(); ++i) {
278
            composer->addAlias(names[i]);
279
        }
280

    
281
        if (birth != 0) {
282
            Birth *e = new Birth(birth);
283
            if (approx) e->setApproximate(true);
284
            composer->setBirth(e);
285
        }
286

    
287
        if (death != 0) {
288
            Death *e = new Death(death);
289
            if (approx) e->setApproximate(true);
290
            composer->setDeath(e);
291
        }
292

    
293
        if (location != "") {
294
            composer->setNationality(locationToNationality(location));
295
            composer->setGeonameURIs(locationToGeonameURIs(location));
296
        }
297
        
298
        if (matcher.cap(1) != "") {
299
            QString url = matcher.cap(1);
300
            Document *d = new Document;
301
            d->setUri(Uri("http://www.classicalarchives.com" + url));
302
            d->setTopic(composer);
303
            d->setSiteName("Classical Archives");
304
            composer->addPage(d);
305
        }
306
        
307
        m_objects.push_back(composer);
308
    }
309

    
310
    
311
    DEBUG << "Found " << count << " things" << endl;
312
}
313

    
314

    
315
}
316

    
317