annotate import/ImportClassicalComposersOrg.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c4cb65c436ef
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportClassicalComposersOrg.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 ClassicalComposersOrgImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 typedef QMap<QString, int> NameMap;
Chris@0 27
Chris@0 28 void
Chris@0 29 parseNames(QString field, NameMap &names, int score = 0)
Chris@0 30 {
Chris@0 31 QString a(field), b(field);
Chris@0 32
Chris@0 33 int mp;
Chris@0 34 QRegExp re;
Chris@0 35
Chris@0 36 /* classical-composers.org uses quite a few (not always
Chris@0 37 * consistent) ways to indicate alternatives in composer
Chris@0 38 * names. Not all of them are distinguishable.
Chris@0 39 * Examples:
Chris@0 40 *
Chris@0 41 * Pipe used to separate sorted surname from alternative for whole:
Chris@0 42 * Hardin | Moondog, Louis Thomas
Chris@0 43 * -> "Louis Thomas Hardin", "Moondog"
Chris@0 44 * Barron | Charlotte May Wind, Bebe
Chris@0 45 * -> "Bebe Barron", "Charlotte May Wind"
Chris@0 46 *
Chris@0 47 * Pipe used to separate alternatives for surname only (seems
Chris@0 48 * slightly more common than the previous one; if there is only
Chris@0 49 * one word between the pipe and a following comma, I'd be
Chris@0 50 * inclined to assume this case, Moondog notwithstanding):
Chris@0 51 * Mendelssohn | Hensel, Fanny Cécile
Chris@0 52 * -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
Chris@0 53 * Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander
Chris@0 54 * -> "Thomas Alexander Erskine, 6th Earl of Kellie",
Chris@0 55 * "Thomas Alexander Kelly"
Chris@0 56 *
Chris@0 57 * Round brackets used to indicate one or more alternatives for
Chris@0 58 * prior word; slash for alternation:
Chris@0 59 * Edelmann, Jean-Frédéric (Johann-Friedrich)
Chris@0 60 * -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
Chris@0 61 * Eberwein, Max (Traugott Maximilian)
Chris@0 62 * -> "Max Eberwein", "Traugott Maximilian Eberwein"
Chris@0 63 * Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
Chris@0 64 * -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
Chris@0 65 * "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
Chris@0 66 * "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
Chris@0 67 * "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
Chris@0 68 *
Chris@0 69 * Round brackets used to indicate alternative to prior
Chris@0 70 * names, with some meaning left implicit:
Chris@0 71 * Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich)
Chris@0 72 * -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
Chris@0 73 * perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
Chris@0 74 * Kaan-Albest")
Chris@0 75 *
Chris@0 76 * Round brackets used to augment rather than
Chris@0 77 * alternate. Probably can't identify this reliably, though
Chris@0 78 * round brackets used somewhere other than at end of line
Chris@0 79 * are relatively likely to be this form (?):
Chris@0 80 * Linley (the elder), Thomas
Chris@0 81 * -> "Thomas Linley", "Thomas Linley the elder"
Chris@0 82 * Keys | Keyes, Ivor (Christopher Banfield)
Chris@0 83 * -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
Chris@0 84 * "Ivor Christopher Banfield Keyes"
Chris@0 85 *
Chris@0 86 * Square brackets used to indicate alternative for all
Chris@0 87 * forenames:
Chris@0 88 * Moller | Möller, John Christopher [Johann Christoph]
Chris@0 89 * -> "John Christopher Moller", "John Christopher Möller",
Chris@0 90 * "Johann Christoph Moller", "Johann Christoph Möller"
Chris@0 91 *
Chris@0 92 * Complicated examples:
Chris@0 93 * Mayr | Mayer, (Johann) Simon [Giovanni Simone]
Chris@0 94 * -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
Chris@0 95 * "Johann Simon Mayer", "Giovanni Simone Mayr",
Chris@0 96 * "Geovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
Chris@0 97 * Frauenlob | Heinrich von Meissen
Chris@0 98 * -> "Heinrich Frauenlob", "Heinrich von Meissen", or
Chris@0 99 * perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
Chris@0 100 */
Chris@0 101
Chris@0 102 // DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;
Chris@0 103
Chris@0 104 // round brackets used for augmentation right at the start
Chris@0 105 re = QRegExp("\\(([^\\)]+)\\) ");
Chris@0 106 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 107 int ml = re.matchedLength();
Chris@0 108 QString c(re.cap(1));
Chris@0 109 a.replace(mp, ml, "");
Chris@0 110 b.replace(mp, ml, QString("%1 ").arg(c));
Chris@0 111 parseNames(a, names, score);
Chris@0 112 parseNames(b, names, score+1);
Chris@0 113 return;
Chris@0 114 }
Chris@0 115
Chris@0 116 // round brackets used for augmentation directly after the comma
Chris@0 117 re = QRegExp(", \\(([^\\)]+)\\)");
Chris@0 118 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 119 int ml = re.matchedLength();
Chris@0 120 QString c(re.cap(1));
Chris@0 121 a.replace(mp, ml, ",");
Chris@0 122 b.replace(mp, ml, QString(", %1").arg(c));
Chris@0 123 parseNames(a, names, score);
Chris@0 124 parseNames(b, names, score+1);
Chris@0 125 return;
Chris@0 126 }
Chris@0 127
Chris@0 128 // round brackets used for augmentation directly before the comma
Chris@0 129 re = QRegExp(" \\(([^\\)]+)\\),");
Chris@0 130 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 131 int ml = re.matchedLength();
Chris@0 132 QString c(re.cap(1));
Chris@0 133 a.replace(mp, ml, ",");
Chris@0 134 b.replace(mp, ml, QString(" %1,").arg(c));
Chris@0 135 parseNames(a, names, score);
Chris@0 136 parseNames(b, names, score+1);
Chris@0 137 return;
Chris@0 138 }
Chris@0 139
Chris@0 140 // round brackets for alternation of single name, anywhere
Chris@0 141 re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
Chris@0 142 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 143 int ml = re.matchedLength();
Chris@0 144 QString c(re.cap(1));
Chris@0 145 QString d(re.cap(2));
Chris@0 146 a.replace(mp, ml, c);
Chris@0 147 b.replace(mp, ml, d);
Chris@0 148 parseNames(a, names, score);
Chris@0 149 parseNames(b, names, score+1);
Chris@0 150 return;
Chris@0 151 }
Chris@0 152
Chris@0 153 // square brackets for alternation of a series of names, at end or after pipe
Chris@0 154 re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
Chris@0 155 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 156 int ml = re.matchedLength();
Chris@0 157 QString p(re.cap(1));
Chris@0 158 QString c(re.cap(2));
Chris@0 159 QString d(re.cap(3));
Chris@0 160 a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
Chris@0 161 b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
Chris@0 162 parseNames(a, names, score);
Chris@0 163 parseNames(b, names, score+1);
Chris@0 164 return;
Chris@0 165 }
Chris@0 166
Chris@0 167 // square brackets for alternation of a series of names, at start
Chris@0 168 re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
Chris@0 169 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 170 int ml = re.matchedLength();
Chris@0 171 QString c(re.cap(1));
Chris@0 172 QString d(re.cap(2));
Chris@0 173 a.replace(mp, ml, c);
Chris@0 174 b.replace(mp, ml, d);
Chris@0 175 parseNames(a, names, score);
Chris@0 176 parseNames(b, names, score+1);
Chris@0 177 return;
Chris@0 178 }
Chris@0 179
Chris@0 180 // slash for alternation of word
Chris@0 181 re = QRegExp("([^ ,|]+)/([^ ,|]+)");
Chris@0 182 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 183 int ml = re.matchedLength();
Chris@0 184 QString c(re.cap(1));
Chris@0 185 QString d(re.cap(2));
Chris@0 186 a.replace(mp, ml, c);
Chris@0 187 b.replace(mp, ml, d);
Chris@0 188 parseNames(a, names, score);
Chris@0 189 parseNames(b, names, score+1);
Chris@0 190 return;
Chris@0 191 }
Chris@0 192
Chris@0 193 // pipe for alternation of surname
Chris@0 194 re = QRegExp("^(.*) \\| ([^|, ]+),");
Chris@0 195 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 196 int ml = re.matchedLength();
Chris@0 197 QString c(re.cap(1));
Chris@0 198 QString d(re.cap(2));
Chris@0 199 a.replace(mp, ml, c + ",");
Chris@0 200 b.replace(mp, ml, d + ",");
Chris@0 201 parseNames(a, names, score);
Chris@0 202 parseNames(b, names, score+1);
Chris@0 203 return;
Chris@0 204 }
Chris@0 205
Chris@0 206 // pipe for alternation of whole (before comma)
Chris@0 207 re = QRegExp("^(.*) \\| ([^|,]+),");
Chris@0 208 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 209 int ml = re.matchedLength();
Chris@0 210 QString c(re.cap(1));
Chris@0 211 QString d(re.cap(2));
Chris@0 212 a.replace(mp, ml, c + ",");
Chris@0 213 b = d;
Chris@0 214 parseNames(a, names, score);
Chris@0 215 parseNames(b, names, score+1);
Chris@0 216 return;
Chris@0 217 }
Chris@0 218
Chris@0 219 // pipe for alternation of whole (at end)
Chris@0 220 re = QRegExp("^(.*) \\| ([^|,]+)$");
Chris@0 221 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 222 int ml = re.matchedLength();
Chris@0 223 QString c(re.cap(1));
Chris@0 224 QString d(re.cap(2));
Chris@0 225 a.replace(mp, ml, c);
Chris@0 226 b.replace(mp, ml, d);
Chris@0 227 parseNames(a, names, score);
Chris@0 228 parseNames(b, names, score+1);
Chris@0 229 return;
Chris@0 230 }
Chris@0 231
Chris@0 232 // comma
Chris@0 233 re = QRegExp("^(.+), ([^,]+)$");
Chris@0 234 if ((mp = re.indexIn(field)) >= 0) {
Chris@0 235 QString c(re.cap(1));
Chris@0 236 QString d(re.cap(2));
Chris@0 237 parseNames(d + " " + c, names, score+1);
Chris@0 238 // fall through to add
Chris@0 239 }
Chris@0 240
Chris@4 241 field.replace("(", "");
Chris@4 242 field.replace(")", "");
Chris@4 243
Chris@0 244 names[field] = score;
Chris@0 245 }
Chris@0 246
Chris@0 247 void
Chris@0 248 ClassicalComposersOrgImporter::import(QUrl source)
Chris@0 249 {
Chris@0 250 int i = 0;
Chris@0 251
Chris@0 252 //!!! for now
Chris@0 253 QString filename = source.toLocalFile();
Chris@0 254
Chris@0 255
Chris@0 256 QFile file(filename);
Chris@0 257 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 258 throw std::exception();
Chris@0 259 }
Chris@0 260
Chris@0 261 QTextStream stream(&file);
Chris@0 262 stream.setCodec("UTF-8");
Chris@0 263 QString all = stream.readAll();
Chris@0 264
Chris@0 265 all.replace(QRegExp("^.*<div id=\"main\">"), "");
Chris@0 266
Chris@0 267 QRegExp matcher
Chris@1 268 (QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));
Chris@0 269
Chris@0 270 int pos = 0, count = 0;
Chris@0 271 while ((pos = matcher.indexIn(all, pos)) != -1) {
Chris@0 272
Chris@0 273 pos += matcher.matchedLength();
Chris@0 274 ++count;
Chris@0 275
Chris@0 276 QString page = matcher.cap(1);
Chris@0 277 QString name = matcher.cap(2);
Chris@1 278 QString star = matcher.cap(5);
Chris@0 279 QString birth = matcher.cap(6);
Chris@1 280 QString dagger = matcher.cap(7);
Chris@1 281 QString death = matcher.cap(8);
Chris@1 282 QString female = matcher.cap(9);
Chris@0 283
Chris@0 284 DEBUG << "Item " << count
Chris@0 285 << ": page = " << page
Chris@0 286 << ", name = " << name
Chris@0 287 << ", birth = " << birth
Chris@0 288 << ", death = " << death
Chris@0 289 << ", female = " << female;
Chris@0 290
Chris@0 291 QString namefield = name.trimmed();
Chris@0 292 NameMap names;
Chris@0 293
Chris@4 294 if (namefield.contains("P.D.Q.")) { // lose this joke
Chris@4 295 continue;
Chris@4 296 }
Chris@4 297
Chris@0 298 parseNames(namefield, names);
Chris@0 299
Chris@0 300 i = 0;
Chris@0 301 QString preferred;
Chris@0 302 foreach (QString n, names.keys()) {
Chris@0 303 if (preferred == "" || names[n] == 0) preferred = n;
Chris@0 304 DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl;
Chris@0 305 ++i;
Chris@0 306 }
Chris@0 307
Chris@0 308 if (names.empty()) {
Chris@0 309 DEBUG << "No name!" << endl;
Chris@0 310 continue;
Chris@0 311 }
Chris@0 312
Chris@0 313 Composer *composer = new Composer();
Chris@0 314 composer->setName(preferred);
Chris@0 315 foreach (QString n, names.keys()) {
Chris@0 316 if (n != preferred) composer->addAlias(n);
Chris@0 317 }
Chris@0 318
Chris@0 319 if (page != "") {
Chris@0 320 Document *d = new Document;
Chris@18 321 d->setUri(Uri("http://www.classical-composers.org" + page));
Chris@0 322 d->setTopic(composer);
Chris@0 323 d->setSiteName("Classical Composers Database");
Chris@0 324 composer->addPage(d);
Chris@0 325 }
Chris@1 326
Chris@1 327 if (birth != "" && death == "") {
Chris@1 328 if (star == "" && dagger != QString::fromUtf8("\342\200\240")) {
Chris@1 329 DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
Chris@1 330 birth = "";
Chris@1 331 }
Chris@1 332 if (star == "" && dagger == "") {
Chris@1 333 DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl;
Chris@1 334 birth = "";
Chris@1 335 } else if (star != "" && dagger != "") {
Chris@1 336 DEBUG << "Date range features both star and dagger -- ignoring" << endl;
Chris@1 337 birth = "";
Chris@1 338 } else if (dagger != "") {
Chris@1 339 DEBUG << "dagger found: setting death to " << birth << endl;
Chris@1 340 death = birth;
Chris@1 341 birth = "";
Chris@1 342 }
Chris@1 343 }
Chris@1 344
Chris@0 345 if (birth != "") {
Chris@0 346 Birth *e = new Birth(birth.toInt());
Chris@0 347 composer->setBirth(e);
Chris@0 348 }
Chris@0 349 if (death != "") {
Chris@0 350 composer->setDeath(new Death(death.toInt()));
Chris@0 351 }
Chris@0 352 if (female != "") {
Chris@0 353 composer->setGender("female");
Chris@20 354 } else {
Chris@20 355 composer->setGender("male");
Chris@20 356 }
Chris@0 357
Chris@0 358 m_objects.push_back(composer);
Chris@0 359 }
Chris@0 360
Chris@0 361 DEBUG << "Found " << count << " things" << endl;
Chris@0 362
Chris@0 363 }
Chris@0 364
Chris@0 365
Chris@0 366 }
Chris@0 367