To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.
root / import / ImportClassicalComposersOrg.cpp
History | View | Annotate | Download (11.4 KB)
| 1 |
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
|---|---|
| 2 |
|
| 3 |
#include "ImportClassicalComposersOrg.h" |
| 4 |
|
| 5 |
#include <dataquay/Debug.h> |
| 6 |
|
| 7 |
#include <QFile> |
| 8 |
#include <QFileInfo> |
| 9 |
#include <QTextStream> |
| 10 |
#include <QRegExp> |
| 11 |
#include <QVariant> |
| 12 |
|
| 13 |
#include <exception> |
| 14 |
|
| 15 |
using namespace Dataquay; |
| 16 |
|
| 17 |
namespace ClassicalData {
|
| 18 |
|
| 19 |
void
|
| 20 |
ClassicalComposersOrgImporter::setSource(QUrl source) |
| 21 |
{
|
| 22 |
DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;
|
| 23 |
import(source); |
| 24 |
} |
| 25 |
|
| 26 |
// Maps each candidate spelling of a composer's name to the score that
// parseNames() gave it; import() treats a score of 0 as the preferred name.
typedef QMap<QString, int> NameMap; |
| 27 |
|
| 28 |
void
|
| 29 |
parseNames(QString field, NameMap &names, int score = 0) |
| 30 |
{
|
| 31 |
QString a(field), b(field); |
| 32 |
|
| 33 |
int mp;
|
| 34 |
QRegExp re; |
| 35 |
|
| 36 |
/* classical-composers.org uses quite a few (not always
|
| 37 |
* consistent) ways to indicate alternatives in composer
|
| 38 |
* names. Not all of them are distinguishable.
|
| 39 |
* Examples:
|
| 40 |
*
|
| 41 |
* Pipe used to separate sorted surname from alternative for whole:
|
| 42 |
* Hardin | Moondog, Louis Thomas
|
| 43 |
* -> "Louis Thomas Hardin", "Moondog"
|
| 44 |
* Barron | Charlotte May Wind, Bebe
|
| 45 |
* -> "Bebe Barron", "Charlotte May Wind"
|
| 46 |
*
|
| 47 |
* Pipe used to separate alternatives for surname only (seems
|
| 48 |
* slightly more common than the previous one; if there is only
|
| 49 |
* one word between the pipe and a following comma, I'd be
|
| 50 |
* inclined to assume this case, Moondog notwithstanding):
|
| 51 |
* Mendelssohn | Hensel, Fanny Cécile
|
| 52 |
* -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
|
| 53 |
* Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander
|
| 54 |
* -> "Thomas Alexander Erskine, 6th Earl of Kellie",
|
| 55 |
* "Thomas Alexander Kelly"
|
| 56 |
*
|
| 57 |
* Round brackets used to indicate one or more alternatives for
|
| 58 |
* prior word; slash for alternation:
|
| 59 |
* Edelmann, Jean-Frédéric (Johann-Friedrich)
|
| 60 |
* -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
|
| 61 |
* Eberwein, Max (Traugott Maximilian)
|
| 62 |
* -> "Max Eberwein", "Traugott Maximilian Eberwein"
|
| 63 |
* Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
|
| 64 |
* -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
|
| 65 |
* "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
|
| 66 |
* "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
|
| 67 |
* "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
|
| 68 |
*
|
| 69 |
* Round brackets used to indicate alternative to prior
|
| 70 |
* names, with some meaning left implicit:
|
| 71 |
* Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich)
|
| 72 |
* -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
|
| 73 |
* perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
|
| 74 |
* Kaan-Albest")
|
| 75 |
*
|
| 76 |
* Round brackets used to augment rather than
|
| 77 |
* alternate. Probably can't identify this reliably, though
|
| 78 |
* round brackets used somewhere other than at end of line
|
| 79 |
* are relatively likely to be this form (?):
|
| 80 |
* Linley (the elder), Thomas
|
| 81 |
* -> "Thomas Linley", "Thomas Linley the elder"
|
| 82 |
* Keys | Keyes, Ivor (Christopher Banfield)
|
| 83 |
* -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
|
| 84 |
* "Ivor Christopher Banfield Keyes"
|
| 85 |
*
|
| 86 |
* Square brackets used to indicate alternative for all
|
| 87 |
* forenames:
|
| 88 |
* Moller | Möller, John Christopher [Johann Christoph]
|
| 89 |
* -> "John Christopher Moller", "John Christopher Möller",
|
| 90 |
* "Johann Christoph Moller", "Johann Christoph Möller"
|
| 91 |
*
|
| 92 |
* Complicated examples:
|
| 93 |
* Mayr | Mayer, (Johann) Simon [Giovanni Simone]
|
| 94 |
* -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
|
| 95 |
* "Johann Simon Mayer", "Giovanni Simone Mayr",
|
| 96 |
* "Geovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
|
| 97 |
* Frauenlob | Heinrich von Meissen
|
| 98 |
* -> "Heinrich Frauenlob", "Heinrich von Meissen", or
|
| 99 |
* perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
|
| 100 |
*/
|
| 101 |
|
| 102 |
// DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;
|
| 103 |
|
| 104 |
// round brackets used for augmentation right at the start
|
| 105 |
re = QRegExp("\\(([^\\)]+)\\) ");
|
| 106 |
if ((mp = re.indexIn(field)) >= 0) { |
| 107 |
int ml = re.matchedLength();
|
| 108 |
QString c(re.cap(1));
|
| 109 |
a.replace(mp, ml, "");
|
| 110 |
b.replace(mp, ml, QString("%1 ").arg(c));
|
| 111 |
parseNames(a, names, score); |
| 112 |
parseNames(b, names, score+1);
|
| 113 |
return;
|
| 114 |
} |
| 115 |
|
| 116 |
// round brackets used for augmentation directly after the comma
|
| 117 |
re = QRegExp(", \\(([^\\)]+)\\)");
|
| 118 |
if ((mp = re.indexIn(field)) >= 0) { |
| 119 |
int ml = re.matchedLength();
|
| 120 |
QString c(re.cap(1));
|
| 121 |
a.replace(mp, ml, ",");
|
| 122 |
b.replace(mp, ml, QString(", %1").arg(c));
|
| 123 |
parseNames(a, names, score); |
| 124 |
parseNames(b, names, score+1);
|
| 125 |
return;
|
| 126 |
} |
| 127 |
|
| 128 |
// round brackets used for augmentation directly before the comma
|
| 129 |
re = QRegExp(" \\(([^\\)]+)\\),");
|
| 130 |
if ((mp = re.indexIn(field)) >= 0) { |
| 131 |
int ml = re.matchedLength();
|
| 132 |
QString c(re.cap(1));
|
| 133 |
a.replace(mp, ml, ",");
|
| 134 |
b.replace(mp, ml, QString(" %1,").arg(c));
|
| 135 |
parseNames(a, names, score); |
| 136 |
parseNames(b, names, score+1);
|
| 137 |
return;
|
| 138 |
} |
| 139 |
|
| 140 |
// round brackets for alternation of single name, anywhere
|
| 141 |
re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
|
| 142 |
if ((mp = re.indexIn(field)) >= 0) { |
| 143 |
int ml = re.matchedLength();
|
| 144 |
QString c(re.cap(1));
|
| 145 |
QString d(re.cap(2));
|
| 146 |
a.replace(mp, ml, c); |
| 147 |
b.replace(mp, ml, d); |
| 148 |
parseNames(a, names, score); |
| 149 |
parseNames(b, names, score+1);
|
| 150 |
return;
|
| 151 |
} |
| 152 |
|
| 153 |
// square brackets for alternation of a series of names, at end or after pipe
|
| 154 |
re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
|
| 155 |
if ((mp = re.indexIn(field)) >= 0) { |
| 156 |
int ml = re.matchedLength();
|
| 157 |
QString p(re.cap(1));
|
| 158 |
QString c(re.cap(2));
|
| 159 |
QString d(re.cap(3));
|
| 160 |
a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
|
| 161 |
b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
|
| 162 |
parseNames(a, names, score); |
| 163 |
parseNames(b, names, score+1);
|
| 164 |
return;
|
| 165 |
} |
| 166 |
|
| 167 |
// square brackets for alternation of a series of names, at start
|
| 168 |
re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
|
| 169 |
if ((mp = re.indexIn(field)) >= 0) { |
| 170 |
int ml = re.matchedLength();
|
| 171 |
QString c(re.cap(1));
|
| 172 |
QString d(re.cap(2));
|
| 173 |
a.replace(mp, ml, c); |
| 174 |
b.replace(mp, ml, d); |
| 175 |
parseNames(a, names, score); |
| 176 |
parseNames(b, names, score+1);
|
| 177 |
return;
|
| 178 |
} |
| 179 |
|
| 180 |
// slash for alternation of word
|
| 181 |
re = QRegExp("([^ ,|]+)/([^ ,|]+)");
|
| 182 |
if ((mp = re.indexIn(field)) >= 0) { |
| 183 |
int ml = re.matchedLength();
|
| 184 |
QString c(re.cap(1));
|
| 185 |
QString d(re.cap(2));
|
| 186 |
a.replace(mp, ml, c); |
| 187 |
b.replace(mp, ml, d); |
| 188 |
parseNames(a, names, score); |
| 189 |
parseNames(b, names, score+1);
|
| 190 |
return;
|
| 191 |
} |
| 192 |
|
| 193 |
// pipe for alternation of surname
|
| 194 |
re = QRegExp("^(.*) \\| ([^|, ]+),");
|
| 195 |
if ((mp = re.indexIn(field)) >= 0) { |
| 196 |
int ml = re.matchedLength();
|
| 197 |
QString c(re.cap(1));
|
| 198 |
QString d(re.cap(2));
|
| 199 |
a.replace(mp, ml, c + ",");
|
| 200 |
b.replace(mp, ml, d + ",");
|
| 201 |
parseNames(a, names, score); |
| 202 |
parseNames(b, names, score+1);
|
| 203 |
return;
|
| 204 |
} |
| 205 |
|
| 206 |
// pipe for alternation of whole (before comma)
|
| 207 |
re = QRegExp("^(.*) \\| ([^|,]+),");
|
| 208 |
if ((mp = re.indexIn(field)) >= 0) { |
| 209 |
int ml = re.matchedLength();
|
| 210 |
QString c(re.cap(1));
|
| 211 |
QString d(re.cap(2));
|
| 212 |
a.replace(mp, ml, c + ",");
|
| 213 |
b = d; |
| 214 |
parseNames(a, names, score); |
| 215 |
parseNames(b, names, score+1);
|
| 216 |
return;
|
| 217 |
} |
| 218 |
|
| 219 |
// pipe for alternation of whole (at end)
|
| 220 |
re = QRegExp("^(.*) \\| ([^|,]+)$");
|
| 221 |
if ((mp = re.indexIn(field)) >= 0) { |
| 222 |
int ml = re.matchedLength();
|
| 223 |
QString c(re.cap(1));
|
| 224 |
QString d(re.cap(2));
|
| 225 |
a.replace(mp, ml, c); |
| 226 |
b.replace(mp, ml, d); |
| 227 |
parseNames(a, names, score); |
| 228 |
parseNames(b, names, score+1);
|
| 229 |
return;
|
| 230 |
} |
| 231 |
|
| 232 |
// comma
|
| 233 |
re = QRegExp("^(.+), ([^,]+)$");
|
| 234 |
if ((mp = re.indexIn(field)) >= 0) { |
| 235 |
QString c(re.cap(1));
|
| 236 |
QString d(re.cap(2));
|
| 237 |
parseNames(d + " " + c, names, score+1); |
| 238 |
// fall through to add
|
| 239 |
} |
| 240 |
|
| 241 |
field.replace("(", ""); |
| 242 |
field.replace(")", ""); |
| 243 |
|
| 244 |
names[field] = score; |
| 245 |
} |
| 246 |
|
| 247 |
void
|
| 248 |
ClassicalComposersOrgImporter::import(QUrl source) |
| 249 |
{
|
| 250 |
int i = 0; |
| 251 |
|
| 252 |
//!!! for now
|
| 253 |
QString filename = source.toLocalFile(); |
| 254 |
|
| 255 |
|
| 256 |
QFile file(filename); |
| 257 |
if (!file.open(QFile::ReadOnly | QFile::Text)) {
|
| 258 |
throw std::exception();
|
| 259 |
} |
| 260 |
|
| 261 |
QTextStream stream(&file); |
| 262 |
stream.setCodec("UTF-8");
|
| 263 |
QString all = stream.readAll(); |
| 264 |
|
| 265 |
all.replace(QRegExp("^.*<div id=\"main\">"), ""); |
| 266 |
|
| 267 |
QRegExp matcher |
| 268 |
(QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));
|
| 269 |
|
| 270 |
int pos = 0, count = 0; |
| 271 |
while ((pos = matcher.indexIn(all, pos)) != -1) { |
| 272 |
|
| 273 |
pos += matcher.matchedLength(); |
| 274 |
++count; |
| 275 |
|
| 276 |
QString page = matcher.cap(1);
|
| 277 |
QString name = matcher.cap(2);
|
| 278 |
QString star = matcher.cap(5);
|
| 279 |
QString birth = matcher.cap(6);
|
| 280 |
QString dagger = matcher.cap(7);
|
| 281 |
QString death = matcher.cap(8);
|
| 282 |
QString female = matcher.cap(9);
|
| 283 |
|
| 284 |
DEBUG << "Item " << count
|
| 285 |
<< ": page = " << page
|
| 286 |
<< ", name = " << name
|
| 287 |
<< ", birth = " << birth
|
| 288 |
<< ", death = " << death
|
| 289 |
<< ", female = " << female;
|
| 290 |
|
| 291 |
QString namefield = name.trimmed(); |
| 292 |
NameMap names; |
| 293 |
|
| 294 |
if (namefield.contains("P.D.Q.")) { // lose this joke |
| 295 |
continue;
|
| 296 |
} |
| 297 |
|
| 298 |
parseNames(namefield, names); |
| 299 |
|
| 300 |
i = 0;
|
| 301 |
QString preferred; |
| 302 |
foreach (QString n, names.keys()) {
|
| 303 |
if (preferred == "" || names[n] == 0) preferred = n; |
| 304 |
DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl; |
| 305 |
++i; |
| 306 |
} |
| 307 |
|
| 308 |
if (names.empty()) {
|
| 309 |
DEBUG << "No name!" << endl;
|
| 310 |
continue;
|
| 311 |
} |
| 312 |
|
| 313 |
Composer *composer = new Composer();
|
| 314 |
composer->setName(preferred); |
| 315 |
foreach (QString n, names.keys()) {
|
| 316 |
if (n != preferred) composer->addAlias(n);
|
| 317 |
} |
| 318 |
|
| 319 |
if (page != "") { |
| 320 |
Document *d = new Document;
|
| 321 |
d->setUri(Uri("http://www.classical-composers.org" + page));
|
| 322 |
d->setTopic(composer); |
| 323 |
d->setSiteName("Classical Composers Database");
|
| 324 |
composer->addPage(d); |
| 325 |
} |
| 326 |
|
| 327 |
if (birth != "" && death == "") { |
| 328 |
if (star == "" && dagger != QString::fromUtf8("\342\200\240")) { |
| 329 |
DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
|
| 330 |
birth = "";
|
| 331 |
} |
| 332 |
if (star == "" && dagger == "") { |
| 333 |
DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl; |
| 334 |
birth = "";
|
| 335 |
} else if (star != "" && dagger != "") { |
| 336 |
DEBUG << "Date range features both star and dagger -- ignoring" << endl;
|
| 337 |
birth = "";
|
| 338 |
} else if (dagger != "") { |
| 339 |
DEBUG << "dagger found: setting death to " << birth << endl;
|
| 340 |
death = birth; |
| 341 |
birth = "";
|
| 342 |
} |
| 343 |
} |
| 344 |
|
| 345 |
if (birth != "") { |
| 346 |
Birth *e = new Birth(birth.toInt());
|
| 347 |
composer->setBirth(e); |
| 348 |
} |
| 349 |
if (death != "") { |
| 350 |
composer->setDeath(new Death(death.toInt()));
|
| 351 |
} |
| 352 |
if (female != "") { |
| 353 |
composer->setGender("female");
|
| 354 |
} else {
|
| 355 |
composer->setGender("male");
|
| 356 |
} |
| 357 |
|
| 358 |
m_objects.push_back(composer); |
| 359 |
} |
| 360 |
|
| 361 |
DEBUG << "Found " << count << " things" << endl; |
| 362 |
|
| 363 |
} |
| 364 |
|
| 365 |
|
| 366 |
} |
| 367 |
|