To check out this repository, please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.
root / import / ImportWikipediaComposers.cpp
History | View | Annotate | Download (7.47 KB)
| 1 |
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
|---|---|
| 2 |
|
| 3 |
#include "ImportWikipediaComposers.h" |
| 4 |
|
| 5 |
#include <dataquay/Debug.h> |
| 6 |
|
| 7 |
#include <QFile> |
| 8 |
#include <QFileInfo> |
| 9 |
#include <QTextStream> |
| 10 |
#include <QRegExp> |
| 11 |
#include <QVariant> |
| 12 |
|
| 13 |
#include <exception> |
| 14 |
|
| 15 |
using namespace Dataquay; |
| 16 |
|
| 17 |
namespace ClassicalData {
|
| 18 |
|
| 19 |
void
|
| 20 |
WikipediaComposersImporter::setSource(QUrl source) |
| 21 |
{
|
| 22 |
DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
|
| 23 |
import(source); |
| 24 |
} |
| 25 |
|
| 26 |
/**
 * Build a Composer from the raw text fields captured from one line of a
 * Wikipedia composer list.
 *
 * All fields are trimmed before use and any of them may be empty.
 *
 * @param namefield        Contents of the [[...]] wiki link, either
 *                         "Page name" or "Page name|Displayed name".
 * @param birthfield       Birth year as text; overwritten if a year can
 *                         be extracted from datesfield.
 * @param deathfield       Death year as text; overwritten if a year can
 *                         be extracted from datesfield.
 * @param datesfield       Free-form dates text such as "(c. 1450 - 1521)"
 *                         or "b. 1960". Ignored if it contains "fl.".
 * @param nationalityfield Nationality text, stored verbatim when non-empty.
 * @param worksfield       Accepted for interface symmetry; not currently
 *                         stored anywhere.
 * @param summaryfield     Remarks text; wiki markup is stripped before it
 *                         is stored as the composer's remarks.
 * @return A newly allocated Composer. The caller receives the raw pointer
 *         (NOTE(review): ownership conventions are defined outside this
 *         file -- confirm who deletes it).
 */
Composer *
addComposer(QString namefield, QString birthfield, QString deathfield,
            QString datesfield, QString nationalityfield, QString worksfield,
            QString summaryfield)
{
    namefield = namefield.trimmed();
    birthfield = birthfield.trimmed();
    deathfield = deathfield.trimmed();
    datesfield = datesfield.trimmed();
    nationalityfield = nationalityfield.trimmed();
    worksfield = worksfield.trimmed(); // trimmed for consistency; otherwise unused
    summaryfield = summaryfield.trimmed();

    Composer *composer = new Composer();

    // Strip the link brackets. A piped link "Page|Label" gives the
    // Wikipedia page name and the displayed composer name separately;
    // otherwise both are the same text.
    QString name = namefield;
    name.replace("[[", "");
    name.replace("]]", "");
    QString pagename = name;

    if (name.contains('|')) {
        // contains('|') guarantees split() yields at least two parts
        QStringList bits = name.split('|');
        pagename = bits[0];
        name = bits[1];
    }

    composer->setName(name);

    // Attach the composer's English Wikipedia page as a Document.
    pagename.replace(" ", "_");
    QUrl url;
    url.setScheme("http");
    url.setHost("en.wikipedia.org");
    url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));

    Document *d = new Document;
    d->setUri(Uri(url));
    d->setSiteName("Wikipedia");
    d->setTopic(composer);
    composer->addPage(d);

    // "fl." means "flourished": no usable birth/death years in that case.
    if (datesfield.contains("fl.")) datesfield = "";

    // Any hedging in the dates text marks both birth and death as
    // approximate. This must be tested before the text is normalised below.
    bool approx = (datesfield.contains("c.") || datesfield.contains("?")
                   || datesfield.contains("before") || datesfield.contains("after"));

    if (datesfield != "") {
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Normalise: drop parentheses and spaces, and fold the Unicode
        // dashes U+2012..U+2015 (given as UTF-8 octal escapes) and doubled
        // hyphens into a single ASCII hyphen.
        datesfield.replace("(", "");
        datesfield.replace(")", "");
        datesfield.replace(" ", "");
        datesfield.replace(QString::fromUtf8("\342\200\222"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\223"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\224"), "-");
        datesfield.replace(QString::fromUtf8("\342\200\225"), "-");
        datesfield.replace("--", "-");
        DEBUG << "dates for " << name << ": " << datesfield << endl;

        // Birth: a four-digit year (optionally "/NN" or "toNN") followed
        // by a hyphen, or else a year introduced by "b.".
        QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
        QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");

        if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
        else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);

        // Death: a four-digit year after a hyphen (optionally preceded by
        // a qualifier), or else a year introduced by "d.".
        QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
        QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");

        if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
        else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);

        DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
    }

    if (birthfield != "") {
        Birth *e = new Birth(birthfield.toInt());
        e->setApproximate(approx);
        composer->setBirth(e);
    }
    if (deathfield != "") {
        Death *e = new Death(deathfield.toInt());
        e->setApproximate(approx);
        composer->setDeath(e);
    }
    if (nationalityfield != "") {
        composer->addNationality(nationalityfield);
    }
    if (summaryfield != "") {
        summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
        // Stripping the prefix can leave nothing behind (e.g. the summary
        // was just "Composer,"), so only capitalise when a first character
        // actually exists.
        if (!summaryfield.isEmpty()) {
            summaryfield[0] = summaryfield[0].toUpper();
        }
        // Reduce piped links "[[Page|Label]]" to "Label", then drop the
        // remaining link brackets.
        summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
        summaryfield.replace("[[", "");
        summaryfield.replace("]]", "");
        // Wiki italics become double quotes.
        summaryfield.replace("''", "\"");
        // NOTE(review): the first literal on the next line reached review
        // as an unterminated three-quote sequence, consistent with an HTML
        // entity having been decoded in transit; restored as "&quot;" --
        // confirm against the repository copy.
        summaryfield.replace("&quot;", "'");
        // A summary that is nothing but one bracketed note is discarded;
        // otherwise stray single brackets are removed.
        summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
        summaryfield.replace("[", "");
        summaryfield.replace("]", "");
        composer->setRemarks(summaryfield);
    }

    return composer;
}
| 136 |
|
| 137 |
void
|
| 138 |
WikipediaComposersImporter::import(QUrl source) |
| 139 |
{
|
| 140 |
//!!! for now
|
| 141 |
QString filename = source.toLocalFile(); |
| 142 |
|
| 143 |
QFile file(filename); |
| 144 |
if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
| 145 |
throw std::exception();
|
| 146 |
} |
| 147 |
|
| 148 |
QTextStream stream(&file); |
| 149 |
stream.setCodec("UTF-8");
|
| 150 |
|
| 151 |
QString period; |
| 152 |
DEBUG << "source = " << source.toString() << endl;
|
| 153 |
QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_");
|
| 154 |
QRegExp pmatcher2("List_of_([^_-]+)[_-]era_");
|
| 155 |
QRegExp pmatcher3("([^_-]+)_composers");
|
| 156 |
if (pmatcher1.indexIn(source.toString()) >= 0) period = pmatcher1.cap(1); |
| 157 |
else if (pmatcher2.indexIn(source.toString()) >= 0) period = pmatcher2.cap(1); |
| 158 |
else if (pmatcher3.indexIn(source.toString()) >= 0) period = pmatcher3.cap(1); |
| 159 |
DEBUG << "period = "<< period << endl;
|
| 160 |
|
| 161 |
int count = 0; |
| 162 |
|
| 163 |
// table form A (used of e.g. Romantic transitional composers)
|
| 164 |
// | Name || birth || death || nationality || summary || flags
|
| 165 |
// note: 5x ||
|
| 166 |
QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");
|
| 167 |
|
| 168 |
// table form B (used of e.g. 20th-century composers)
|
| 169 |
// | Name || birth-[death] || nationality || notable works || remarks
|
| 170 |
// Note name may contain a single | if in double-square brackets, hence 2a
|
| 171 |
// note: 4x ||
|
| 172 |
QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");
|
| 173 |
// just in case the final column has been omitted completely (as happens).
|
| 174 |
// this must be matched after matcher2
|
| 175 |
QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");
|
| 176 |
|
| 177 |
// list form
|
| 178 |
// * [[Name]] [alias?] (stuff about dates)[,] notes
|
| 179 |
QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");
|
| 180 |
|
| 181 |
while (!stream.atEnd()) {
|
| 182 |
QString line = stream.readLine(); |
| 183 |
|
| 184 |
Composer *o = 0;
|
| 185 |
|
| 186 |
if (matcher1.indexIn(line) >= 0) { |
| 187 |
|
| 188 |
o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3), |
| 189 |
"", matcher1.cap(4), "", matcher1.cap(5)); |
| 190 |
|
| 191 |
} else if (matcher2.indexIn(line) >= 0) { |
| 192 |
|
| 193 |
o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3), "", |
| 194 |
matcher2.cap(4), matcher2.cap(5), ""); |
| 195 |
|
| 196 |
} else if (matcher2a.indexIn(line) >= 0) { |
| 197 |
|
| 198 |
o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3), "", |
| 199 |
matcher2a.cap(4), "", ""); |
| 200 |
|
| 201 |
} else if (matcher3.indexIn(line) >= 0) { |
| 202 |
|
| 203 |
o = addComposer(matcher3.cap(1), "", "", matcher3.cap(3), |
| 204 |
"", "", matcher3.cap(5)); |
| 205 |
|
| 206 |
} else if (line.startsWith("* ") || line.startsWith("| ") || |
| 207 |
line.startsWith("*[") || line.startsWith("|[")) { |
| 208 |
DEBUG << "Failed to match promising line: " << line << endl;
|
| 209 |
} |
| 210 |
|
| 211 |
if (o) {
|
| 212 |
if (period != "") o->setPeriod(period); |
| 213 |
m_objects.push_back(o); |
| 214 |
++count; |
| 215 |
} |
| 216 |
|
| 217 |
} |
| 218 |
|
| 219 |
DEBUG << "Found " << count << " things" << endl; |
| 220 |
} |
| 221 |
|
| 222 |
|
| 223 |
} |
| 224 |
|
| 225 |
|