annotate import/Import.cpp @ 5:d23a4c935a22 classical-rdf

* Update CMN and mbz mappings for new classical archives import
author Chris Cannam
date Fri, 11 Dec 2009 16:10:29 +0000
parents 719a4f477098
children 96bf272e74c5
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "Objects.h"
Chris@0 4
Chris@0 5 #include <dataquay/BasicStore.h>
Chris@0 6 #include <dataquay/RDFException.h>
Chris@0 7 #include <dataquay/objectmapper/ObjectMapper.h>
Chris@0 8 #include <dataquay/objectmapper/ObjectBuilder.h>
Chris@0 9 #include <dataquay/objectmapper/ContainerBuilder.h>
Chris@0 10
Chris@0 11 #include "ImportClassicalComposersOrg.h"
Chris@0 12 #include "ImportClassicalDotNet.h"
Chris@4 13 #include "ImportClassicalArchives.h"
Chris@0 14 #include "ImportWikipediaComposers.h"
Chris@0 15 #include "ImportWikipediaWorks.h"
Chris@0 16 #include "ImportWikipediaWorksK.h"
Chris@0 17 #include "ImportWikipediaWorksList.h"
Chris@0 18 #include "ImportHoboken.h"
Chris@0 19
Chris@0 20 #include <dataquay/Debug.h>
Chris@0 21
Chris@0 22 using namespace ClassicalData;
Chris@0 23 using namespace Dataquay;
Chris@0 24
Chris@0 25 #include <iostream>
Chris@0 26 #include <set>
Chris@0 27
Chris@0 28 typedef QMap<QString, QSet<Composer *> > ComposerMap; // name -> composers
Chris@0 29
Chris@0 30 void
Chris@0 31 addMiscExpansions(Composer *c)
Chris@0 32 {
Chris@0 33 QString n = c->name();
Chris@0 34
Chris@0 35 DEBUG << "addMiscExpansions: n = " << n << endl;
Chris@0 36
Chris@0 37 // lovely hard-coded special cases go here! some of these are
Chris@0 38 // needed for works->composer assignments
Chris@0 39 if (n == "Balakirev, Milii") {
Chris@0 40 c->addAlias("Mily Balakirev");
Chris@0 41 }
Chris@0 42 if (n.startsWith("Cui, C")) {
Chris@0 43 c->addAlias(QString::fromUtf8("C\303\251sar Cui"));
Chris@0 44 }
Chris@0 45 if (n == "Handel, George Frideric") {
Chris@0 46 c->addAlias("Handel, Georg Friedrich");
Chris@0 47 c->addAlias("Handel");
Chris@0 48 }
Chris@1 49 if (n == "Prokofiev, Sergey") {
Chris@1 50 c->addAlias("Prokofieff, Sergei");
Chris@1 51 c->addAlias("Sergei Prokofieff");
Chris@1 52 }
Chris@1 53 if (n == "Rossini, Gioacchino") {
Chris@1 54 c->addAlias("Rossini, Gioachino");
Chris@1 55 c->addAlias("Gioachino Rossini");
Chris@1 56 }
Chris@1 57 if (n == "Edwards, Richard") {
Chris@1 58 c->addAlias("Edwardes, Richard");
Chris@1 59 c->addAlias("Richard Edwardes");
Chris@1 60 c->addAlias("Richard Edwards");
Chris@1 61 }
Chris@1 62 if (n == "Rimsky-Korsakov, Nikolay Andreyevich") {
Chris@1 63 c->addAlias("Nikolai Rimsky-Korsakov");
Chris@1 64 }
Chris@1 65 if (n.startsWith("Piccinni, Nico")) {
Chris@1 66 c->addAlias(n);
Chris@1 67 c->setName(QString::fromUtf8("Piccinni, Niccol\303\262"));
Chris@1 68 }
Chris@1 69 if (n == "Tchaikovsky, Pyotr Ilyich") {
Chris@1 70 c->addAlias("Tchaikovsky, Piotr Ilyitch");
Chris@1 71 }
Chris@1 72 if (n == "Wilhelm Stenhammar") {
Chris@1 73 c->addAlias("Stenhammar, Vilhelm Eugene");
Chris@1 74 c->setName("Stenhammar, Wilhelm");
Chris@1 75 c->addAlias(n);
Chris@1 76 }
Chris@1 77 if (n == "Mercadante, Saverio Rafaele") {
Chris@1 78 c->addAlias("Mercadante, Giuseppe");
Chris@1 79 }
Chris@1 80 if (n == "Johann Wenzel Anton Stamitz") {
Chris@1 81 c->addAlias(n);
Chris@1 82 c->setName("Stamitz, Johann Wenzel Anton");
Chris@1 83 c->addAlias("Stamitz, Jan Vaclav");
Chris@1 84 }
Chris@1 85 if (n == "Mario Castelnuovo-Tedesco") {
Chris@1 86 c->addAlias("Castelnuovo Tedesco, Mario");
Chris@1 87 }
Chris@0 88 if (n == "Mayr, Simon") {
Chris@0 89 c->addAlias("Mayr");
Chris@0 90 }
Chris@0 91
Chris@0 92 n.replace(", Sr.", " Sr.");
Chris@0 93 n.replace(", Jr.", " Jr.");
Chris@0 94
Chris@0 95 int comma = n.indexOf(", ");
Chris@0 96 if (comma > 0 && comma + 2 < n.length()) {
Chris@0 97
Chris@0 98 QString left = n.left(comma);
Chris@0 99 QString right = n.right(n.length() - comma - 2);
Chris@0 100
Chris@0 101 QRegExp jrsr("( (Sr\\.|Jr\\.|I|II))$");
Chris@0 102 if (jrsr.indexIn(right) >= 0) {
Chris@0 103 left = left + jrsr.cap(1);
Chris@0 104 right = right.left(right.length()-jrsr.matchedLength());
Chris@0 105 }
Chris@0 106 n = right + " " + left;
Chris@0 107 }
Chris@0 108
Chris@0 109 if (n != c->name()) c->addAlias(n);
Chris@0 110
Chris@0 111 if (n.contains("Sergey")) {
Chris@0 112 QString nn(n);
Chris@0 113 nn.replace("Sergey", "Sergei");
Chris@0 114 c->addAlias(nn);
Chris@1 115 } else if (n.contains("Sergei")) {
Chris@1 116 QString nn(n);
Chris@1 117 nn.replace("Sergei", "Sergey");
Chris@1 118 c->addAlias(nn);
Chris@0 119 }
Chris@0 120
Chris@0 121 QRegExp sr("((, )?Sr\\.|Senior|\\(?the elder\\)?)", Qt::CaseInsensitive);
Chris@0 122 if (sr.indexIn(n) >= 0) {
Chris@0 123 QString nr = n;
Chris@0 124 nr.replace(sr.pos(0), sr.matchedLength(), " I");
Chris@0 125 nr.replace(" ", " ");
Chris@0 126 DEBUG << "addMiscExpansions: trying " << nr << " for " << n << endl;
Chris@0 127 c->addAlias(nr);
Chris@0 128 }
Chris@0 129 QRegExp jr("((, )?Jr\\.|Junior|\\(?the younger\\)?)", Qt::CaseInsensitive);
Chris@0 130 if (jr.indexIn(n) >= 0) {
Chris@0 131 QString nr = n;
Chris@0 132 nr.replace(jr.pos(0), jr.matchedLength(), " II");
Chris@0 133 nr.replace(" ", " ");
Chris@0 134 DEBUG << "addMiscExpansions: trying " << nr << " for " << n << endl;
Chris@0 135 c->addAlias(nr);
Chris@0 136 }
Chris@0 137 QString nr = n;
Chris@0 138 nr.replace("(I)", "I");
Chris@0 139 nr.replace("(II)", "II");
Chris@0 140 nr.replace("(III)", "III");
Chris@0 141 c->addAlias(nr);
Chris@0 142 }
Chris@0 143
Chris@5 144 QString makeNameKey(QString name)
Chris@5 145 {
Chris@5 146 QString key = name.toLower()
Chris@5 147 .replace("'", "")
Chris@5 148 .replace("x", "ks")
Chris@5 149 .replace("y", "i")
Chris@5 150 .replace("k", "c")
Chris@5 151 .replace("ch", "c")
Chris@5 152 .replace("cc", "c")
Chris@5 153 .replace("v", "f")
Chris@5 154 .replace("ff", "f")
Chris@5 155 .replace("th", "t")
Chris@5 156 .replace("tch", "ch")
Chris@5 157 .replace("er", "r");
Chris@5 158 // DEBUG << "makeNameKey(" << name << "): " << key << endl;
Chris@5 159 return key;
Chris@5 160 }
Chris@5 161
Chris@0 162 bool namesFuzzyMatch(QString an, Composer *b)
Chris@0 163 {
Chris@0 164 // ew!
Chris@0 165
Chris@0 166 QString bn = b->name();
Chris@0 167 if (bn == an) return true;
Chris@0 168 if (b->aliases().contains(an)) return true;
Chris@0 169 int aSurnameIndex = 0, bSurnameIndex = 0;
Chris@0 170 if (an.contains(",")) {
Chris@0 171 an.replace(",", "");
Chris@0 172 } else {
Chris@0 173 aSurnameIndex = -1;
Chris@0 174 }
Chris@0 175 if (bn.contains(",")) {
Chris@0 176 bn.replace(",", "");
Chris@0 177 } else {
Chris@0 178 bSurnameIndex = -1;
Chris@0 179 }
Chris@0 180 QStringList nl = an.split(QRegExp("[ -]"));
Chris@5 181 QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]"));
Chris@0 182 int matchCount = 0;
Chris@0 183 QString surnameMatch = "";
Chris@0 184 if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
Chris@0 185 if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
Chris@0 186 if (nl[aSurnameIndex][0].isUpper() &&
Chris@0 187 nl[aSurnameIndex] != "Della" &&
Chris@5 188 makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
Chris@0 189 surnameMatch = nl[aSurnameIndex];
Chris@0 190 }
Chris@5 191 int tested = 0;
Chris@0 192 foreach (QString elt, nl) {
Chris@0 193 if (!elt[0].isUpper() || elt == "Della") continue;
Chris@5 194 QString k = makeNameKey(elt);
Chris@5 195 if (bnl.contains(k)) {
Chris@0 196 ++matchCount;
Chris@5 197 }
Chris@5 198 if (++tested == 2 && matchCount == 0) {
Chris@5 199 return false;
Chris@0 200 }
Chris@0 201 }
Chris@5 202 if (surnameMatch != "") {
Chris@0 203 DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
Chris@5 204 if (matchCount > 1) {
Chris@5 205 return true;
Chris@5 206 } else {
Chris@5 207 DEBUG << "(but not enough else matched)" << endl;
Chris@5 208 return false;
Chris@5 209 }
Chris@0 210 }
Chris@0 211 return false;
Chris@0 212 }
Chris@0 213
Chris@0 214 bool
Chris@0 215 hasBetterName(Composer *c, Composer *other)
Chris@0 216 {
Chris@0 217 if (c->name() == other->name()) return false;
Chris@0 218
Chris@0 219 // Try to guess which of c and other is more likely to have a good
Chris@0 220 // "canonical form" of the composer's name
Chris@0 221
Chris@0 222 if (c->name().startsWith("van ")) {
Chris@0 223 return false; // wrong choice of sort for e.g. LvB; should be
Chris@0 224 // Beethoven, Ludwig van, not van Beethoven, Ludwig
Chris@0 225 }
Chris@0 226 if (other->name().startsWith("van ")) {
Chris@0 227 return true;
Chris@0 228 }
Chris@0 229
Chris@0 230 if (c->aliases().size() != other->aliases().size()) {
Chris@0 231 // a rather weak heuristic
Chris@0 232 return c->aliases().size() > other->aliases().size();
Chris@0 233 }
Chris@0 234
Chris@0 235 if (c->name().contains(',') && !other->name().contains(',')) {
Chris@0 236 // another rather weak heuristic
Chris@0 237 return true;
Chris@0 238 }
Chris@0 239
Chris@0 240 return false;
Chris@0 241 }
Chris@0 242
Chris@0 243 void mergeComposer(Composer *c, ComposerMap &composers)
Chris@0 244 {
Chris@0 245 QString name = c->name();
Chris@0 246
Chris@0 247 QSet<QString> allNames = c->aliases();
Chris@0 248 allNames.insert(name);
Chris@0 249
Chris@0 250 QString dates;
Chris@0 251 if (c->birth()) {
Chris@0 252 if (c->death()) {
Chris@0 253 dates = QString("%1-%2").arg(c->birth()->year()).arg(c->death()->year());
Chris@0 254 } else {
Chris@0 255 dates = QString("%1-").arg(c->birth()->year());
Chris@0 256 }
Chris@0 257 }
Chris@0 258 if (dates != "") {
Chris@0 259 allNames.insert(dates);
Chris@0 260 }
Chris@0 261
Chris@0 262 QSet<Composer *> matches;
Chris@0 263
Chris@0 264 foreach (QString candidateName, allNames) {
Chris@5 265 QString key = makeNameKey(candidateName);
Chris@0 266 if (composers.contains(key)) {
Chris@0 267 foreach (Composer *candidate, composers[key]) {
Chris@0 268 if (candidateName == dates) {
Chris@5 269 if (c->name() == candidate->name()) {
Chris@5 270 DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl;
Chris@5 271 } else if (!namesFuzzyMatch(c->name(), candidate) &&
Chris@5 272 !namesFuzzyMatch(candidate->name(), c)) {
Chris@0 273 DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl;
Chris@0 274 continue;
Chris@0 275 } else {
Chris@0 276 DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl;
Chris@0 277 }
Chris@0 278 } else {
Chris@1 279 if (!c->datesMatch(candidate)) {
Chris@0 280 DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl;
Chris@0 281 continue;
Chris@0 282 }
Chris@0 283 }
Chris@0 284 matches.insert(candidate);
Chris@0 285 }
Chris@0 286 }
Chris@0 287 }
Chris@0 288
Chris@0 289 if (matches.empty()) {
Chris@0 290 DEBUG << "mergeComposer: No existing composer with alias matching any alias of " << c->name() << ", adding" << endl;
Chris@0 291
Chris@0 292 if (!c->birth() && !c->death()) {
Chris@5 293 DEBUG << "Composer has no dates, laboriously searching for all names" << endl;
Chris@0 294 // laboriously look for fuzzy match across _all_ composers
Chris@0 295 for (ComposerMap::iterator i = composers.begin();
Chris@0 296 i != composers.end(); ++i) {
Chris@0 297 foreach (Composer *candidate, *i) {
Chris@0 298 if (namesFuzzyMatch(c->name(), candidate)) {
Chris@0 299 DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl;
Chris@0 300 matches.insert(candidate);
Chris@0 301 break;
Chris@0 302 }
Chris@0 303 }
Chris@0 304 if (!matches.empty()) break;
Chris@0 305 }
Chris@0 306 }
Chris@0 307
Chris@0 308 if (matches.empty()) {
Chris@0 309 foreach (QString candidateName, allNames) {
Chris@5 310 QString key = makeNameKey(candidateName);
Chris@5 311 composers[key].insert(c);
Chris@0 312 DEBUG << "added for alias or date " << candidateName << endl;
Chris@0 313 }
Chris@0 314 return;
Chris@0 315 }
Chris@0 316 }
Chris@0 317
Chris@0 318 if (matches.size() > 1) {
Chris@0 319 DEBUG << "mergeComposer: More than one composer matches name and date(s) for " << c->name() << " -- something fishy here" << endl;
Chris@0 320 }
Chris@0 321
Chris@0 322 Composer *other = *matches.begin();
Chris@0 323
Chris@0 324 DEBUG << "mergeComposer: Merging " << c->name() << " with " << other->name() << endl;
Chris@0 325
Chris@0 326 if (hasBetterName(c, other)) {
Chris@0 327 other->addAlias(other->name());
Chris@0 328 other->setName(c->name());
Chris@0 329 } else {
Chris@0 330 other->addAlias(c->name());
Chris@0 331 }
Chris@5 332 composers[makeNameKey(c->name())].insert(other);
Chris@0 333 DEBUG << "linking from alias " << c->name() << endl;
Chris@0 334
Chris@0 335 foreach (QString alias, c->aliases()) {
Chris@0 336 if (alias != other->name() &&
Chris@0 337 !other->aliases().contains(alias)) {
Chris@0 338 other->addAlias(alias);
Chris@5 339 composers[makeNameKey(alias)].insert(other);
Chris@0 340 DEBUG << "linking from alias " << alias << endl;
Chris@0 341 }
Chris@0 342 }
Chris@0 343
Chris@0 344 foreach (Document *d, c->pages()) {
Chris@0 345 bool found = false;
Chris@0 346 foreach (Document *dd, other->pages()) {
Chris@0 347 if (d->uri() == dd->uri()) {
Chris@0 348 found = true;
Chris@0 349 break;
Chris@0 350 }
Chris@0 351 }
Chris@0 352 if (!found) {
Chris@0 353 d->setTopic(other);
Chris@0 354 other->addPage(d);
Chris@0 355 }
Chris@0 356 }
Chris@0 357
Chris@0 358 //!!! actually the "approximate" bits of the following are bogus;
Chris@0 359 // a source reporting birth or death date as approx is probably
Chris@0 360 // more accurate than one reporting an exact date
Chris@0 361
Chris@0 362 if (c->birth()) {
Chris@0 363 if (!other->birth() || other->birth()->approximate()) {
Chris@0 364 other->setBirth(c->birth());
Chris@0 365 }
Chris@0 366 }
Chris@0 367
Chris@0 368 if (c->death()) {
Chris@0 369 if (!other->death() || other->death()->approximate()) {
Chris@0 370 other->setDeath(c->death());
Chris@0 371 }
Chris@0 372 }
Chris@0 373
Chris@0 374 if (c->gender() != "") other->setGender(c->gender());
Chris@4 375
Chris@4 376 foreach (QString s, c->nationality()) {
Chris@4 377 other->addNationality(s);
Chris@4 378 }
Chris@4 379
Chris@4 380 foreach (QUrl s, c->geonameURIs()) {
Chris@4 381 other->addGeonameURI(s);
Chris@4 382 }
Chris@4 383
Chris@0 384 if (c->remarks() != "") other->setRemarks(c->remarks());
Chris@0 385 if (c->period() != "") other->setPeriod(c->period());
Chris@0 386
Chris@0 387 }
Chris@0 388
Chris@0 389 QString
Chris@0 390 asciify(QString field)
Chris@0 391 {
Chris@0 392 // accented characters etc -- add "ascii version" for dumb search purposes
Chris@0 393 QString ascii;
Chris@0 394 for (int i = 0; i < field.length(); ++i) {
Chris@0 395 QString dc = field[i].decomposition();
Chris@0 396 if (dc != "") ascii += dc[0];
Chris@0 397 else if (field[i] == QChar(0x00DF)) {
Chris@0 398 ascii += "ss";
Chris@0 399 } else {
Chris@0 400 ascii += field[i];
Chris@0 401 }
Chris@0 402 }
Chris@0 403 ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe
Chris@0 404 ascii.replace(QString::fromUtf8("\342\200\222"), "-");
Chris@0 405 ascii.replace(QString::fromUtf8("\342\200\223"), "-");
Chris@0 406 ascii.replace(QString::fromUtf8("\342\200\224"), "-");
Chris@0 407 ascii.replace(QString::fromUtf8("\342\200\225"), "-");
Chris@0 408 return ascii;
Chris@0 409 }
Chris@0 410
Chris@0 411 void
Chris@0 412 asciify(Composer *c)
Chris@0 413 {
Chris@0 414 QString n = c->name();
Chris@0 415 QString asc = asciify(n);
Chris@0 416 if (asc != n && !c->aliases().contains(asc)) c->addAlias(asc);
Chris@0 417 foreach (QString alias, c->aliases()) {
Chris@0 418 asc = asciify(alias);
Chris@0 419 if (asc != alias && !c->aliases().contains(asc)) c->addAlias(asc);
Chris@0 420 }
Chris@0 421 }
Chris@0 422
Chris@0 423 void
Chris@0 424 asciify(Work *w)
Chris@0 425 {
Chris@0 426 QString n = w->name();
Chris@0 427 QString asc = asciify(n);
Chris@0 428 if (asc != n && !w->aliases().contains(asc)) w->addAlias(asc);
Chris@0 429 foreach (QString alias, w->aliases()) {
Chris@0 430 asc = asciify(alias);
Chris@0 431 if (asc != alias && !w->aliases().contains(asc)) w->addAlias(asc);
Chris@0 432 }
Chris@0 433 }
Chris@0 434
Chris@0 435 void
Chris@0 436 assignUri(Store *s, Composer *c)
Chris@0 437 {
Chris@0 438 static QSet<QString> convSet;
Chris@0 439 QString conv = c->name();
Chris@0 440 if (!conv.contains(",")) {
Chris@0 441 QStringList sl = conv.split(" ");
Chris@0 442 if (!sl.empty()) {
Chris@0 443 sl.push_front(sl[sl.size()-1]);
Chris@0 444 sl.removeLast();
Chris@0 445 conv = sl.join(" ");
Chris@0 446 DEBUG << "assignUri: " << c->name() << " -> " << conv << endl;
Chris@0 447 }
Chris@0 448 }
Chris@0 449 conv = asciify(conv);
Chris@0 450 conv.replace(" ", "_");
Chris@0 451 conv.replace("-", "_");
Chris@0 452 conv.replace(QRegExp("[^a-zA-Z0-9_-]"), "");
Chris@0 453 conv = conv.toLower();
Chris@0 454 QString initial = conv;
Chris@1 455 int i = 2;
Chris@0 456 while (convSet.contains(conv)) {
Chris@0 457 conv = QString("%1__%2").arg(initial).arg(i);
Chris@0 458 i++;
Chris@0 459 }
Chris@0 460 convSet.insert(conv);
Chris@1 461 c->setProperty("uri", s->expand(":composer/" + conv));
Chris@0 462 }
Chris@0 463
Chris@0 464 void
Chris@0 465 assignUri(Store *s, Work *w, Composer *c)
Chris@0 466 {
Chris@0 467 QString pfx = c->property("uri").toUrl().toString();
Chris@0 468 DEBUG << "pfx = " << pfx << endl;
Chris@2 469 if (!pfx.contains("composer/")) pfx = ":work/";
Chris@2 470 else {
Chris@2 471 pfx.replace("composer/", "work/");
Chris@2 472 pfx += "/";
Chris@2 473 }
Chris@0 474
Chris@0 475 static QSet<QString> convSet;
Chris@1 476
Chris@0 477 QString conv = w->catalogue();
Chris@0 478 if (conv == "") conv = w->opus();
Chris@0 479 conv = conv.replace(".", "");
Chris@0 480 bool hasOpus = (conv != "");
Chris@1 481 if (conv == "") conv = w->name().toLower();
Chris@0 482 if (w->number() != "") conv = conv + "_no" + w->number();
Chris@0 483 conv = asciify(conv);
Chris@0 484 conv.replace(" ", "_");
Chris@0 485 conv.replace("-", "_");
Chris@0 486 conv.replace(":", "_");
Chris@0 487 conv.replace(QRegExp("[^a-zA-Z0-9_-]"), "");
Chris@1 488
Chris@2 489 if (pfx != "") conv = pfx + conv;
Chris@1 490
Chris@0 491 // I think actually for works we want to merge duplicates rather than
Chris@0 492 // assign them separate URIs, _unless_ they lack a viable opus number
Chris@0 493 if (!hasOpus) {
Chris@0 494 QString initial = conv;
Chris@1 495 int i = 2;
Chris@0 496 while (convSet.contains(conv)) {
Chris@0 497 conv = QString("%1__%2").arg(initial).arg(i);
Chris@0 498 i++;
Chris@0 499 }
Chris@0 500 }
Chris@0 501 convSet.insert(conv);
Chris@1 502
Chris@1 503 w->setProperty("uri", conv);
Chris@0 504 }
Chris@0 505
Chris@0 506 void
Chris@0 507 addDbpediaResource(Store *store, QObject *o, QString s)
Chris@0 508 {
Chris@0 509 QUrl u = o->property("uri").toUrl();
Chris@0 510 if (u == QUrl()) return;
Chris@0 511 if (s.startsWith("http://en.wikipedia.org/wiki/")) {
Chris@0 512 store->add(Triple(u,
Chris@0 513 "mo:wikipedia",
Chris@0 514 QUrl(s)));
Chris@0 515 s.replace("http://en.wikipedia.org/wiki/",
Chris@0 516 "http://dbpedia.org/resource/");
Chris@0 517 store->add(Triple(u,
Chris@0 518 "owl:sameAs",
Chris@0 519 QUrl(s)));
Chris@0 520 }
Chris@0 521 }
Chris@0 522
Chris@0 523 int main(int argc, char **argv)
Chris@0 524 {
Chris@0 525 qRegisterMetaType<HistoricalEvent *>
Chris@0 526 ("ClassicalData::HistoricalEvent*");
Chris@0 527 qRegisterMetaType<Birth *>
Chris@0 528 ("ClassicalData::Birth*");
Chris@0 529 qRegisterMetaType<Death *>
Chris@0 530 ("ClassicalData::Death*");
Chris@0 531 qRegisterMetaType<Composition *>
Chris@0 532 ("ClassicalData::Composition*");
Chris@0 533 qRegisterMetaType<Work *>
Chris@0 534 ("ClassicalData::Work*");
Chris@0 535 qRegisterMetaType<Movement *>
Chris@0 536 ("ClassicalData::Movement*");
Chris@0 537 qRegisterMetaType<Composer *>
Chris@0 538 ("ClassicalData::Composer*");
Chris@0 539 qRegisterMetaType<Document *>
Chris@0 540 ("ClassicalData::Document*");
Chris@0 541 qRegisterMetaType<Form *>
Chris@0 542 ("ClassicalData::Form*");
Chris@0 543 qRegisterMetaType<QSet<Work *> >
Chris@0 544 ("QSet<ClassicalData::Work*>");
Chris@0 545 qRegisterMetaType<QSet<Movement *> >
Chris@0 546 ("QSet<ClassicalData::Movement*>");
Chris@0 547 qRegisterMetaType<QSet<Document *> >
Chris@0 548 ("QSet<ClassicalData::Document*>");
Chris@0 549 qRegisterMetaType<QSet<Form *> >
Chris@0 550 ("QSet<ClassicalData::Form*>");
Chris@0 551 qRegisterMetaType<QSet<QString> >
Chris@0 552 ("QSet<QString>");
Chris@4 553 qRegisterMetaType<QSet<QUrl> >
Chris@4 554 ("QSet<QUrl>");
Chris@0 555
Chris@0 556 qRegisterMetaType<ClassicalComposersOrgImporter *>
Chris@0 557 ("ClassicalData::ClassicalComposersOrgImporter*");
Chris@0 558 qRegisterMetaType<ClassicalDotNetImporter *>
Chris@0 559 ("ClassicalData::ClassicalDotNetImporter*");
Chris@4 560 qRegisterMetaType<ClassicalArchivesImporter *>
Chris@4 561 ("ClassicalData::ClassicalArchivesImporter*");
Chris@0 562 qRegisterMetaType<WikipediaComposersImporter *>
Chris@0 563 ("ClassicalData::WikipediaComposersImporter*");
Chris@0 564 qRegisterMetaType<WikipediaWorksImporter *>
Chris@0 565 ("ClassicalData::WikipediaWorksImporter*");
Chris@0 566 qRegisterMetaType<WikipediaWorksKImporter *>
Chris@0 567 ("ClassicalData::WikipediaWorksKImporter*");
Chris@0 568 qRegisterMetaType<WikipediaWorksListImporter *>
Chris@0 569 ("ClassicalData::WikipediaWorksListImporter*");
Chris@0 570 qRegisterMetaType<HobokenImporter *>
Chris@0 571 ("ClassicalData::HobokenImporter*");
Chris@0 572
Chris@0 573 ObjectBuilder::getInstance()->registerClass
Chris@0 574 <HistoricalEvent>("ClassicalData::HistoricalEvent*");
Chris@0 575 ObjectBuilder::getInstance()->registerClass
Chris@0 576 <Birth>("ClassicalData::Birth*");
Chris@0 577 ObjectBuilder::getInstance()->registerClass
Chris@0 578 <Death>("ClassicalData::Death*");
Chris@0 579 ObjectBuilder::getInstance()->registerClass
Chris@0 580 <Composition>("ClassicalData::Composition*");
Chris@0 581 ObjectBuilder::getInstance()->registerClass
Chris@0 582 <Work, QObject>("ClassicalData::Work*");
Chris@0 583 ObjectBuilder::getInstance()->registerClass
Chris@0 584 <Movement, QObject>("ClassicalData::Movement*");
Chris@0 585 ObjectBuilder::getInstance()->registerClass
Chris@0 586 <Composer, QObject>("ClassicalData::Composer*");
Chris@0 587 ObjectBuilder::getInstance()->registerClass
Chris@0 588 <Document, QObject>("ClassicalData::Document*");
Chris@0 589 ObjectBuilder::getInstance()->registerClass
Chris@0 590 <Form, QObject>("ClassicalData::Form*");
Chris@0 591
Chris@0 592 ObjectBuilder::getInstance()->registerClass
Chris@0 593 <ClassicalComposersOrgImporter>("ClassicalData::ClassicalComposersOrgImporter*");
Chris@0 594 ObjectBuilder::getInstance()->registerClass
Chris@0 595 <ClassicalDotNetImporter>("ClassicalData::ClassicalDotNetImporter*");
Chris@0 596 ObjectBuilder::getInstance()->registerClass
Chris@4 597 <ClassicalArchivesImporter>("ClassicalData::ClassicalArchivesImporter*");
Chris@4 598 ObjectBuilder::getInstance()->registerClass
Chris@0 599 <WikipediaComposersImporter>("ClassicalData::WikipediaComposersImporter*");
Chris@0 600 ObjectBuilder::getInstance()->registerClass
Chris@0 601 <WikipediaWorksImporter>("ClassicalData::WikipediaWorksImporter*");
Chris@0 602 ObjectBuilder::getInstance()->registerClass
Chris@0 603 <WikipediaWorksKImporter>("ClassicalData::WikipediaWorksKImporter*");
Chris@0 604 ObjectBuilder::getInstance()->registerClass
Chris@0 605 <WikipediaWorksListImporter>("ClassicalData::WikipediaWorksListImporter*");
Chris@0 606 ObjectBuilder::getInstance()->registerClass
Chris@0 607 <HobokenImporter>("ClassicalData::HobokenImporter*");
Chris@0 608
Chris@0 609 ContainerBuilder::getInstance()->registerContainer
Chris@0 610 <QString, QSet<QString> >
Chris@0 611 ("QString", "QSet<QString>", ContainerBuilder::SetKind);
Chris@0 612
Chris@0 613 ContainerBuilder::getInstance()->registerContainer
Chris@4 614 <QUrl, QSet<QUrl> >
Chris@4 615 ("QUrl", "QSet<QUrl>", ContainerBuilder::SetKind);
Chris@4 616
Chris@4 617 ContainerBuilder::getInstance()->registerContainer
Chris@0 618 <Work*, QSet<Work*> >
Chris@0 619 ("ClassicalData::Work*", "QSet<ClassicalData::Work*>",
Chris@0 620 ContainerBuilder::SetKind);
Chris@0 621
Chris@0 622 ContainerBuilder::getInstance()->registerContainer
Chris@0 623 <Movement*, QSet<Movement*> >
Chris@0 624 ("ClassicalData::Movement*", "QSet<ClassicalData::Movement*>",
Chris@0 625 ContainerBuilder::SetKind);
Chris@0 626
Chris@0 627 ContainerBuilder::getInstance()->registerContainer
Chris@0 628 <Document*, QSet<Document*> >
Chris@0 629 ("ClassicalData::Document*", "QSet<ClassicalData::Document*>",
Chris@0 630 ContainerBuilder::SetKind);
Chris@0 631
Chris@0 632 ContainerBuilder::getInstance()->registerContainer
Chris@0 633 <Form*, QSet<Form*> >
Chris@0 634 ("ClassicalData::Form*", "QSet<ClassicalData::Form*>",
Chris@0 635 ContainerBuilder::SetKind);
Chris@0 636
Chris@0 637 BasicStore *store = BasicStore::load("file:importers.ttl");
Chris@0 638 ObjectMapper mapper(store);
Chris@0 639 QObject *parentObject = mapper.loadAllObjects(new QObject());
Chris@0 640
Chris@0 641 BasicStore *outstore = new BasicStore();
Chris@1 642 outstore->setBaseUri("http://dbtune.org/classical/resource/");
Chris@0 643 ObjectMapper outmapper(outstore);
Chris@0 644
Chris@0 645 outmapper.setPropertyStorePolicy(ObjectMapper::StoreIfChanged);
Chris@3 646 outmapper.setObjectStorePolicy(ObjectMapper::StoreAllObjects);
Chris@3 647 outmapper.setBlankNodePolicy(ObjectMapper::NoBlankNodes);
Chris@1 648
Chris@1 649 outmapper.setObjectTypePrefix("http://dbtune.org/classical/resource/");
Chris@1 650 outmapper.setPropertyPrefix("http://dbtune.org/classical/resource/vocab/");
Chris@1 651 outmapper.setRelationshipPrefix("http://dbtune.org/classical/resource/vocab/relationship/");
Chris@0 652
Chris@0 653 outstore->addPrefix("type", outmapper.getObjectTypePrefix());
Chris@1 654 outstore->addPrefix("classical", outmapper.getObjectTypePrefix() + "type/");
Chris@0 655 outstore->addPrefix("property", outmapper.getPropertyPrefix());
Chris@0 656 outstore->addPrefix("rel", outmapper.getRelationshipPrefix());
Chris@1 657
Chris@0 658 outstore->addPrefix("foaf", "http://xmlns.com/foaf/0.1/");
Chris@0 659 outstore->addPrefix("mo", "http://purl.org/ontology/mo/");
Chris@0 660 outstore->addPrefix("dc", "http://purl.org/dc/elements/1.1/");
Chris@0 661 outstore->addPrefix("bio", "http://purl.org/vocab/bio/0.1/");
Chris@0 662 outstore->addPrefix("owl", "http://www.w3.org/2002/07/owl#");
Chris@3 663 outstore->addPrefix("rdfs", "http://www.w3.org/2000/01/rdf-schema#");
Chris@3 664 outstore->addPrefix("db", "http://dbtune.org/musicbrainz/resource/");
Chris@3 665 outstore->addPrefix("dbv", "http://dbtune.org/musicbrainz/resource/vocab/");
Chris@3 666 outstore->addPrefix("cmn", "http://purl.org/ontology/classicalmusicnav#");
Chris@3 667 outstore->addPrefix("sim", "http://purl.org/ontology/similarity/");
Chris@0 668
Chris@1 669 outmapper.addTypeMapping("ClassicalData::Composer", "classical:Composer");
Chris@1 670 outmapper.addPropertyMapping("ClassicalData::Composer", "pages", "foaf:page");
Chris@1 671 outmapper.addPropertyMapping("ClassicalData::Composer", "name", "foaf:name");
Chris@1 672 outmapper.addPropertyMapping("ClassicalData::Composer", "aliases", "dbv:alias");
Chris@1 673 outmapper.addPropertyMapping("ClassicalData::Composer", "birth", "property:birth");
Chris@1 674 outmapper.addPropertyMapping("ClassicalData::Composer", "death", "property:death");
Chris@4 675 outmapper.addPropertyMapping("ClassicalData::Composer", "geonameURIs", "foaf:based_near");
Chris@0 676
Chris@1 677 outmapper.addTypeMapping("ClassicalData::Birth", "bio:Birth");
Chris@1 678 outmapper.addTypeMapping("ClassicalData::Death", "bio:Death");
Chris@4 679 outmapper.addTypeUriPrefixMapping("ClassicalData::Birth", ":event/");
Chris@4 680 outmapper.addTypeUriPrefixMapping("ClassicalData::Death", ":event/");
Chris@1 681 outmapper.addPropertyMapping("ClassicalData::Birth", "year", "bio:date");
Chris@1 682 outmapper.addPropertyMapping("ClassicalData::Death", "year", "bio:date");
Chris@1 683 outmapper.addPropertyMapping("ClassicalData::Birth", "place", "bio:place");
Chris@1 684 outmapper.addPropertyMapping("ClassicalData::Death", "place", "bio:place");
Chris@0 685
Chris@1 686 outmapper.addTypeMapping("ClassicalData::Document", "foaf:Document");
Chris@1 687 outmapper.addPropertyMapping("ClassicalData::Document", "topic", "foaf:primaryTopic");
Chris@0 688
Chris@1 689 outmapper.addTypeMapping("ClassicalData::Work", "mo:MusicalWork");
Chris@1 690
Chris@1 691 outmapper.addPropertyMapping("ClassicalData::Work", "composition", "mo:composed_in");
Chris@1 692 outmapper.addPropertyMapping("ClassicalData::Work", "opus", "mo:opus");
Chris@1 693 outmapper.addPropertyMapping("ClassicalData::Work", "catalogue", "mo:catalogue");
Chris@1 694 outmapper.addPropertyMapping("ClassicalData::Work", "number", "mo:number");
Chris@1 695 outmapper.addPropertyMapping("ClassicalData::Work", "partOf", "dc:isPartOf");
Chris@1 696 outmapper.addPropertyMapping("ClassicalData::Work", "parts", "dc:hasPart");
Chris@1 697 outmapper.addPropertyMapping("ClassicalData::Work", "pages", "foaf:page");
Chris@1 698 outmapper.addPropertyMapping("ClassicalData::Work", "forms", "property:form");
Chris@1 699 outmapper.addPropertyMapping("ClassicalData::Work", "key", "mo:key");
Chris@1 700 outmapper.addPropertyMapping("ClassicalData::Work", "aliases", "dbv:alias");
Chris@1 701 outmapper.addPropertyMapping("ClassicalData::Work", "name", "dc:title");
Chris@1 702
Chris@1 703 outmapper.addTypeMapping("ClassicalData::Composition", "mo:Composition");
Chris@4 704 outmapper.addTypeUriPrefixMapping("ClassicalData::Composition", ":event/");
Chris@1 705 outmapper.addPropertyMapping("ClassicalData::Composition", "composer", "mo:composer");
Chris@1 706 outmapper.addPropertyMapping("ClassicalData::Composition", "works", "mo:produced_work");
Chris@1 707
Chris@1 708 outstore->add(Triple("classical:Composer", "a", outstore->expand("owl:Class")));
Chris@1 709 outstore->add(Triple("classical:Composer", "rdfs:subClassOf", outstore->expand("mo:MusicArtist")));
Chris@1 710
Chris@1 711 outstore->add(Triple("property:birth", "a", outstore->expand("owl:ObjectProperty")));
Chris@1 712 outstore->add(Triple("property:birth", "rdfs:subPropertyOf", outstore->expand("bio:event")));
Chris@1 713
Chris@1 714 outstore->add(Triple("property:death", "a", outstore->expand("owl:ObjectProperty")));
Chris@1 715 outstore->add(Triple("property:death", "rdfs:subPropertyOf", outstore->expand("bio:event")));
Chris@0 716
Chris@0 717 QList<Importer *> importers = parentObject->findChildren<Importer *>();
Chris@0 718 std::cerr << "have " << importers.size() << " importers" << std::endl;
Chris@0 719
Chris@0 720 ComposerMap composers;
Chris@0 721
Chris@0 722 QList<Composer *> dated;
Chris@0 723 QList<Composer *> undated;
Chris@0 724
Chris@0 725 QList<Work *> works;
Chris@0 726 QList<Composition *> compositions;
Chris@0 727 QList<QObject *> other;
Chris@0 728
Chris@0 729 foreach (Importer *importer, importers) {
Chris@0 730 QObjectList objects = importer->getImportedObjects();
Chris@0 731 foreach (QObject *o, objects) {
Chris@0 732 Composer *c;
Chris@0 733 if ((c = qobject_cast<Composer *>(o))) {
Chris@0 734 addMiscExpansions(c);
Chris@0 735 asciify(c);
Chris@0 736 if (c->birth() || c->death()) dated.push_back(c);
Chris@0 737 else undated.push_back(c);
Chris@0 738 continue;
Chris@0 739 }
Chris@0 740 Work *w;
Chris@0 741 if ((w = qobject_cast<Work *>(o))) {
Chris@0 742 asciify(w);
Chris@0 743 works.push_back(w);
Chris@0 744 continue;
Chris@0 745 }
Chris@0 746 Composition *cn;
Chris@0 747 if ((cn = qobject_cast<Composition *>(o))) {
Chris@0 748 compositions.push_back(cn);
Chris@0 749 continue;
Chris@0 750 }
Chris@0 751 }
Chris@0 752 }
Chris@0 753
Chris@0 754 // get all the dated composers merged before attempting to match
Chris@0 755 // the undated ones
Chris@0 756 foreach (Composer *c, dated) {
Chris@0 757 mergeComposer(c, composers);
Chris@0 758 }
Chris@0 759 foreach (Composer *c, undated) {
Chris@0 760 mergeComposer(c, composers);
Chris@0 761 }
Chris@0 762
Chris@0 763 QObjectList toStore;
Chris@0 764
Chris@0 765 QSet<Composer *> cset;
Chris@0 766 for (ComposerMap::iterator i = composers.begin(); i != composers.end(); ++i) {
Chris@0 767 foreach (Composer *c, i.value()) {
Chris@0 768 if (!cset.contains(c)) {
Chris@0 769 assignUri(outstore, c);
Chris@0 770 toStore.push_back(c);
Chris@0 771 cset.insert(c);
Chris@0 772 }
Chris@0 773 foreach (Document *d, c->pages()) {
Chris@0 774 QString s = d->uri().toString();
Chris@0 775 addDbpediaResource(outstore, c, s);
Chris@0 776 }
Chris@0 777 }
Chris@0 778 }
Chris@0 779
Chris@0 780 QSet<QString> storedUris;
Chris@0 781
Chris@0 782 foreach (Work *w, works) {
Chris@0 783 Composition *cn = w->composition();
Chris@0 784 if (!cn) continue;
Chris@0 785 if (!cn->composer()) {
Chris@0 786 QString cname = cn->composerName();
Chris@5 787 QString key = makeNameKey(cname);
Chris@0 788 if (cname != "") {
Chris@5 789 if (!composers.contains(key)) {
Chris@0 790 DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
Chris@0 791 } else {
Chris@5 792 QSet<Composer *> cs = composers[key];
Chris@0 793 if (cs.empty()) {
Chris@0 794 DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
Chris@0 795 } else if (cs.size() > 1) {
Chris@0 796 DEBUG << "Failed to assign Composition to composer: "
Chris@0 797 << cs.size() << " composers match name " << cname << endl;
Chris@0 798 } else {
Chris@0 799 cn->setComposer(*cs.begin());
Chris@0 800 }
Chris@0 801 }
Chris@0 802 } else {
Chris@0 803 DEBUG << "Failed to assign Composition to composer: composer name is empty" << endl;
Chris@0 804 }
Chris@0 805 }
Chris@0 806
Chris@0 807 if (cn->composer()) {
Chris@0 808 assignUri(outstore, w, cn->composer());
Chris@0 809 }
Chris@0 810
Chris@0 811 foreach (Document *d, w->pages()) {
Chris@0 812 QString s = d->uri().toString();
Chris@0 813 addDbpediaResource(outstore, w, s);
Chris@1 814 if (!storedUris.contains(s)) {
Chris@1 815 toStore.push_back(d);
Chris@1 816 storedUris.insert(s);
Chris@1 817 }
Chris@0 818 }
Chris@0 819
Chris@0 820 QString u = w->property("uri").toUrl().toString();
Chris@0 821 if (u == "" || !storedUris.contains(u)) {
Chris@0 822 toStore.push_back(w);
Chris@0 823 if (u != "") storedUris.insert(u);
Chris@0 824 }
Chris@0 825 }
Chris@0 826
Chris@0 827 try {
Chris@0 828 outmapper.storeAllObjects(toStore);
Chris@0 829
Chris@0 830 } catch (RDFException e) {
Chris@0 831 std::cerr << "Caught RDF exception: " << e.what() << std::endl;
Chris@0 832 }
Chris@0 833
Chris@0 834 DEBUG << "Stored, now saving" << endl;
Chris@0 835
Chris@2 836 outstore->save("imported.ttl");
Chris@0 837
Chris@0 838 DEBUG << "Saved" << endl;
Chris@0 839
Chris@0 840
Chris@0 841 QMultiMap<QString, Composer *> cmap;
Chris@0 842 foreach (Composer *c, cset) {
Chris@0 843 QString n = c->getSortName(true);
Chris@0 844 cmap.insert(n, c);
Chris@0 845 }
Chris@0 846
Chris@0 847 std::cout << "Composers: " << cmap.size() << std::endl;
Chris@0 848
Chris@0 849 for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
Chris@0 850 i != cmap.end(); ++i) {
Chris@0 851
Chris@0 852 QString n = i.key();
Chris@0 853 Composer *c = i.value();
Chris@0 854
Chris@0 855 std::cout << n.toStdString();
Chris@0 856
Chris@0 857 QString d = c->getDisplayDates();
Chris@0 858 if (d != "") std::cout << " (" << d.toStdString() << ")";
Chris@0 859 std::cout << std::endl;
Chris@0 860 }
Chris@0 861
Chris@0 862 std::cout << std::endl;
Chris@0 863
Chris@0 864 std::cout << "Works by composer:" << std::endl;
Chris@0 865
Chris@0 866 for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
Chris@0 867 i != cmap.end(); ++i) {
Chris@0 868
Chris@0 869 QString n = i.key();
Chris@0 870 Composer *c = i.value();
Chris@0 871
Chris@0 872 std::set<Work *, Work::Ordering> wmap;
Chris@0 873 foreach (Work *w, works) {
Chris@0 874 Composition *cn = w->composition();
Chris@0 875 if (!cn) continue;
Chris@0 876 if (cn->composer() != c) continue;
Chris@0 877 if (w->partOf()) continue;
Chris@0 878 wmap.insert(w);
Chris@0 879 }
Chris@0 880
Chris@0 881 if (wmap.empty()) continue;
Chris@0 882
Chris@0 883 std::cout << n.toStdString() << std::endl;
Chris@0 884
Chris@0 885 foreach (Work *w, wmap) {
Chris@0 886 std::cout << " * ";
Chris@0 887 std::cout << w->name().toStdString();
Chris@0 888 if (w->catalogue() != "") {
Chris@0 889 std::cout << " [" << w->catalogue().toStdString() << "]";
Chris@0 890 }
Chris@0 891 if (w->opus() != "") {
Chris@0 892 std::cout << " [op. " << w->opus().toStdString() << "]";
Chris@0 893 }
Chris@0 894 std::cout << std::endl;
Chris@0 895 std::set<Work *, Work::Ordering> orderedParts;
Chris@0 896 foreach (Work *ww, w->parts()) {
Chris@0 897 orderedParts.insert(ww);
Chris@0 898 }
Chris@0 899 foreach (Work *ww, orderedParts) {
Chris@0 900 std::cout << " ";
Chris@0 901 if (ww->number() != "") {
Chris@0 902 std::cout << ww->number().toStdString() << ". ";
Chris@0 903 }
Chris@0 904 std::cout << ww->name().toStdString();
Chris@0 905 if (ww->catalogue() != "" && ww->catalogue() != w->catalogue()) {
Chris@0 906 std::cout << " [" << ww->catalogue().toStdString() << "]";
Chris@0 907 }
Chris@0 908 if (ww->opus() != "" && ww->opus() != w->opus()) {
Chris@0 909 std::cout << " [op. " << ww->opus().toStdString() << "]";
Chris@0 910 }
Chris@0 911 std::cout << std::endl;
Chris@0 912 }
Chris@0 913 }
Chris@0 914
Chris@0 915 std::cout << std::endl;
Chris@0 916 }
Chris@0 917
Chris@0 918 delete outstore;
Chris@0 919
Chris@0 920 DEBUG << "Done" << endl;
Chris@0 921
Chris@0 922
Chris@0 923 }
Chris@0 924
Chris@0 925