Mercurial > hg > classical
view import/Import.cpp @ 53:bcea875d8d2f tip
More build fixes
author | Chris Cannam |
---|---|
date | Thu, 16 Oct 2014 19:03:51 +0100 |
parents | 7d8a6167febb |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Objects.h"

#include <dataquay/BasicStore.h>
#include <dataquay/RDFException.h>
#include <dataquay/objectmapper/ObjectStorer.h>
#include <dataquay/objectmapper/ObjectLoader.h>
#include <dataquay/objectmapper/ObjectBuilder.h>
#include <dataquay/objectmapper/TypeMapping.h>
#include <dataquay/objectmapper/ContainerBuilder.h>

#include "ImportClassicalComposersOrg.h"
#include "ImportClassicalDotNet.h"
#include "ImportClassicalArchives.h"
#include "ImportWikipediaComposers.h"
#include "ImportWikipediaWorks.h"
#include "ImportWikipediaWorksK.h"
#include "ImportWikipediaWorksList.h"
#include "ImportHoboken.h"

#include "TypeRegistrar.h"

#include <dataquay/Debug.h>

using namespace ClassicalData;
using namespace Dataquay;

#include <iostream>
#include <set>

typedef QMap<QString, QSet<Composer *> > ComposerMap; // name -> composers

// Add extra aliases to composer c (and in a few hard-coded cases
// replace its name): special-case spellings, a "Forename Surname"
// form derived from a "Surname, Forename" name, Sergey/Sergei
// variants, and I/II forms of Sr./Jr./elder/younger suffixes.
void
addMiscExpansions(Composer *c)
{
    QString n = c->name();

    DEBUG << "addMiscExpansions: n = " << n << endl;

    // lovely hard-coded special cases go here! some of these are
    // needed for works->composer assignments
    if (n == "Balakirev, Milii") {
        c->addAlias("Mily Balakirev");
    }
    if (n.startsWith("Cui, C")) {
        c->addAlias(QString::fromUtf8("C\303\251sar Cui"));
    }
    if (n == "Handel, George Frideric") {
        c->addAlias("Handel, Georg Friedrich");
        c->addAlias("Handel");
    }
    if (n == "Prokofiev, Sergey") {
        c->addAlias("Prokofieff, Sergei");
        c->addAlias("Sergei Prokofieff");
    }
    if (n == "Rossini, Gioacchino") {
        c->addAlias("Rossini, Gioachino");
        c->addAlias("Gioachino Rossini");
    }
    if (n == "Edwards, Richard") {
        c->addAlias("Edwardes, Richard");
        c->addAlias("Richard Edwardes");
        c->addAlias("Richard Edwards");
    }
    if (n == "Rimsky-Korsakov, Nikolay Andreyevich") {
        c->addAlias("Nikolai Rimsky-Korsakov");
    }
    if (n.startsWith("Piccinni, Nico")) {
        c->addAlias(n);
        c->setName(QString::fromUtf8("Piccinni, Niccol\303\262"));
    }
    if (n == "Tchaikovsky, Pyotr Ilyich") {
        c->addAlias("Tchaikovsky, Piotr Ilyitch");
    }
    if (n == "Wilhelm Stenhammar") {
        c->addAlias("Stenhammar, Vilhelm Eugene");
        c->setName("Stenhammar, Wilhelm");
        c->addAlias(n);
    }
    if (n == "Mercadante, Saverio Rafaele") {
        c->addAlias("Mercadante, Giuseppe");
    }
    if (n == "Johann Wenzel Anton Stamitz") {
        c->addAlias(n);
        c->setName("Stamitz, Johann Wenzel Anton");
        c->addAlias("Stamitz, Jan Vaclav");
    }
    if (n == "Mario Castelnuovo-Tedesco") {
        c->addAlias("Castelnuovo Tedesco, Mario");
    }
    if (n == "Mayr, Simon") {
        c->addAlias("Mayr");
    }

    // Fold the comma before a Sr./Jr. suffix so that it is not
    // mistaken for the surname/forename separator below
    n.replace(", Sr.", " Sr.");
    n.replace(", Jr.", " Jr.");

    // Turn "Surname, Forenames [suffix]" into "Forenames Surname[ suffix]"
    int comma = n.indexOf(", ");
    if (comma > 0 && comma + 2 < n.length()) {

        QString left = n.left(comma);
        QString right = n.right(n.length() - comma - 2);

        // A trailing suffix on the forename part belongs after the surname
        QRegExp jrsr("( (Sr\\.|Jr\\.|I|II))$");
        if (jrsr.indexIn(right) >= 0) {
            left = left + jrsr.cap(1);
            right = right.left(right.length()-jrsr.matchedLength());
        }
        n = right + " " + left;
    }

    if (n != c->name()) c->addAlias(n);

    if (n.contains("Sergey")) {
        QString nn(n);
        nn.replace("Sergey", "Sergei");
        c->addAlias(nn);
    } else if (n.contains("Sergei")) {
        QString nn(n);
        nn.replace("Sergei", "Sergey");
        c->addAlias(nn);
    }

    QRegExp sr("((, )?Sr\\.|Senior|\\(?the elder\\)?)", Qt::CaseInsensitive);
    if (sr.indexIn(n) >= 0) {
        QString nr = n;
        nr.replace(sr.pos(0), sr.matchedLength(), " I");
        // The replacement text starts with a space, so when the match
        // was itself preceded by a space we now have two in a row:
        // collapse them to one
        nr.replace("  ", " ");
        DEBUG << "addMiscExpansions: trying " << nr << " for " << n << endl;
        c->addAlias(nr);
    }

    QRegExp jr("((, )?Jr\\.|Junior|\\(?the younger\\)?)", Qt::CaseInsensitive);
    if (jr.indexIn(n) >= 0) {
        QString nr = n;
        nr.replace(jr.pos(0), jr.matchedLength(), " II");
        // As above: collapse the double space the replacement can leave
        nr.replace("  ", " ");
        DEBUG << "addMiscExpansions: trying " << nr << " for " << n << endl;
        c->addAlias(nr);
    }

    // Also offer a form with parenthesised ordinals unwrapped
    QString nr = n;
    nr.replace("(I)", "I");
    nr.replace("(II)", "II");
    nr.replace("(III)", "III");
    c->addAlias(nr);
}

// Return true if c's name looks like a better canonical name than
// other's. Returns false if the two names are equal or if none of
// the heuristics favours c.
bool
hasBetterName(Composer *c, Composer *other)
{
    if (c->name() == other->name()) return false;

    // Try to guess which of c and other is more likely to have a good
    // "canonical form" of the composer's name

    if (c->name().startsWith("van ")) {
        return false; // wrong choice of sort for e.g. LvB; should be
                      // Beethoven, Ludwig van, not van Beethoven, Ludwig
    }

    if (other->name().startsWith("van ")) {
        return true;
    }

    if (c->aliases().size() != other->aliases().size()) {
        // a rather weak heuristic
        return c->aliases().size() > other->aliases().size();
    }

    if (c->name().contains(',') && !other->name().contains(',')) {
        // another rather weak heuristic
        return true;
    }

    return false;
}

// Merge composer c into the map composers, which is keyed by
// Composer::reduceName() of each name, alias and date string.
//
// Candidates are looked up under each of c's names and aliases and
// under its "birth-death" (or "birth-") date string. A candidate
// found by name must also satisfy matchDates(); a candidate found by
// date string must have the same name or satisfy matchCatalogueName()
// in either direction. If nothing matches and c has no dates at all,
// every composer in the map is tried with matchCatalogueName().
//
// If there is still no match, c itself is inserted under all of its
// keys. Otherwise c's name, aliases, pages, dates and other fields
// are folded into the first matching composer, and c is not added.
void
mergeComposer(Composer *c, ComposerMap &composers)
{
    QString name = c->name();

    QSet<QString> allNames = c->aliases();
    allNames.insert(name);

    // Build a date string that can serve as an additional lookup key
    QString dates;
    if (c->birth()) {
        if (c->death()) {
            dates = QString("%1-%2").arg(c->birth()->year()).arg(c->death()->year());
        } else {
            dates = QString("%1-").arg(c->birth()->year());
        }
    }
    if (dates != "") {
        allNames.insert(dates);
    }

    QSet<Composer *> matches;

    foreach (QString candidateName, allNames) {
        QString key = Composer::reduceName(candidateName);
        if (composers.contains(key)) {
            foreach (Composer *candidate, composers[key]) {
                if (candidateName == dates) {
                    // Found via the date key: the names must also agree
                    if (c->name() == candidate->name()) {
                        DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl;
                    } else if (!candidate->matchCatalogueName(c->name()) &&
                               !c->matchCatalogueName(candidate->name())) {
                        DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl;
                        continue;
                    } else {
                        DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl;
                    }
                } else {
                    // Found via a name key: the dates must also agree
                    if (!c->matchDates(candidate)) {
                        DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl;
                        continue;
                    }
                }
                matches.insert(candidate);
            }
        }
    }

    if (matches.empty()) {
        DEBUG << "mergeComposer: No existing composer with alias matching any alias of " << c->name() << ", adding" << endl;

        if (!c->birth() && !c->death()) {
            DEBUG << "Composer has no dates, laboriously searching for all names" << endl;
            // laboriously look for fuzzy match across _all_ composers
            for (ComposerMap::iterator i = composers.begin();
                 i != composers.end(); ++i) {
                foreach (Composer *candidate, *i) {
                    if (candidate->matchCatalogueName(c->name())) {
                        DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl;
                        matches.insert(candidate);
                        break;
                    }
                }
                if (!matches.empty()) break;
            }
        }

        if (matches.empty()) {
            // Genuinely new composer: register it under every key
            foreach (QString candidateName, allNames) {
                QString key = Composer::reduceName(candidateName);
                composers[key].insert(c);
                DEBUG << "added for alias or date " << candidateName << endl;
            }
            return;
        }
    }

    if (matches.size() > 1) {
        DEBUG << "mergeComposer: More than one composer matches name and date(s) for " << c->name() << " -- something fishy here" << endl;
    }

    // Only the first match is merged into, even if there are several
    Composer *other = *matches.begin();

    DEBUG << "mergeComposer: Merging " << c->name() << " with " << other->name() << endl;

    if (hasBetterName(c, other)) {
        other->addAlias(other->name());
        other->setName(c->name());
    } else {
        other->addAlias(c->name());
    }
    composers[Composer::reduceName(c->name())].insert(other);
    DEBUG << "linking from alias " << c->name() << endl;

    foreach (QString alias, c->aliases()) {
        if (alias != other->name() &&
            !other->aliases().contains(alias)) {
            other->addAlias(alias);
            composers[Composer::reduceName(alias)].insert(other);
            DEBUG << "linking from alias " << alias << endl;
        }
    }

    // Move across any pages whose URI the target does not already have
    foreach (Document *d, c->pages()) {
        bool found = false;
        foreach (Document *dd, other->pages()) {
            if (d->uri() == dd->uri()) {
                found = true;
                break;
            }
        }
        if (!found) {
            d->setTopic(other);
            other->addPage(d);
        }
    }

    //!!! actually the "approximate" bits of the following are bogus;
    // a source reporting birth or death date as approx is probably
    // more accurate than one reporting an exact date

    if (c->birth()) {
        if (!other->birth() || other->birth()->approximate()) {
            other->setBirth(c->birth());
        }
    }

    if (c->death()) {
        if (!other->death() || other->death()->approximate()) {
            other->setDeath(c->death());
        }
    }

    if (c->gender() != "") other->setGender(c->gender());

    foreach (QString s, c->nationality()) {
        other->addNationality(s);
    }

    foreach (Uri s, c->geonameURIs()) {
        other->addGeonameURI(s);
    }

    if (c->remarks() != "") other->setRemarks(c->remarks());
    if (c->period() != "") other->setPeriod(c->period());
}

// Return a simplified copy of field: each character that has a
// Unicode decomposition is replaced by the first character of that
// decomposition, U+00DF becomes "ss", U+2019 becomes "'" and
// U+2012..U+2015 become "-". Other characters are copied unchanged.
QString
asciify(QString field)
{
    // accented characters etc -- add "ascii version" for dumb search purposes
    QString ascii;
    for (int i = 0; i < field.length(); ++i) {
        QString dc = field[i].decomposition();
        if (dc != "") ascii += dc[0];
        else if (field[i] == QChar(0x00DF)) {
            ascii += "ss";
        } else {
            ascii += field[i];
        }
    }
    ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe
    ascii.replace(QString::fromUtf8("\342\200\222"), "-");
    ascii.replace(QString::fromUtf8("\342\200\223"), "-");
    ascii.replace(QString::fromUtf8("\342\200\224"), "-");
    ascii.replace(QString::fromUtf8("\342\200\225"), "-");
    return ascii;
}

// Add the asciify()'d form of c's name and of each of its aliases as
// a further alias, wherever that form differs and is not already
// present.
void
asciify(Composer *c)
{
    QString n = c->name();
    QString asc = asciify(n);
    if (asc != n && !c->aliases().contains(asc)) c->addAlias(asc);
    foreach (QString alias, c->aliases()) {
        asc = asciify(alias);
        if (asc != alias && !c->aliases().contains(asc)) c->addAlias(asc);
    }
}

// As asciify(Composer *), but for the name and aliases of work w.
void
asciify(Work *w)
{
    QString n = w->name();
    QString asc = asciify(n);
    if (asc != n && !w->aliases().contains(asc)) w->addAlias(asc);
    foreach (QString alias, w->aliases()) {
        asc = asciify(alias);
        if (asc != alias && !w->aliases().contains(asc)) w->addAlias(asc);
    }
}

// Derive a URI for composer c from its name and set it as c's "uri"
// property, expanded through store s. The name is asciify()'d,
// lower-cased and reduced to [a-z0-9_]; a "__N" suffix (N from 2) is
// appended if an earlier call already produced the same string.
void
assignUri(Store *s, Composer *c)
{
    static QSet<QString> convSet; // every local name issued so far
    QString conv = c->name();
    if (!conv.contains(",")) {
        // No comma: move the last word to the front
        QStringList sl = conv.split(" ");
        if (!sl.empty()) {
            sl.push_front(sl[sl.size()-1]);
            sl.removeLast();
            conv = sl.join(" ");
            DEBUG << "assignUri: " << c->name() << " -> " << conv << endl;
        }
    }
    conv = asciify(conv);
    conv.replace(" ", "_");
    conv.replace("-", "_");
    conv.replace(QRegExp("[^a-zA-Z0-9_-]"), "");
    conv = conv.toLower();
    QString initial = conv;
    int i = 2;
    while (convSet.contains(conv)) {
        conv = QString("%1__%2").arg(initial).arg(i);
        i++;
    }
    convSet.insert(conv);
    c->setProperty("uri", QVariant::fromValue(s->expand(":composer/" + conv)));
}

// Derive a URI string for work w and set it as w's "uri" property.
// The prefix comes from composer c's "uri" property with "composer/"
// replaced by "work/", or is ":work/" if c's URI has no "composer/"
// part. The local part is taken from the catalogue, else the opus,
// else the lower-cased name, plus "_no<number>" if w has a number.
//
// NOTE(review): unlike the Composer overload, this stores a QString
// rather than a Uri, and does not use s -- confirm that code reading
// the property back with value<Uri>() gets what it expects.
void
assignUri(Store *s, Work *w, Composer *c)
{
    QString pfx = c->property("uri").value<Uri>().toString();
    DEBUG << "pfx = " << pfx << endl;
    if (!pfx.contains("composer/")) pfx = ":work/";
    else {
        pfx.replace("composer/", "work/");
        pfx += "/";
    }

    static QSet<QString> convSet; // every work URI string issued so far

    QString conv = w->catalogue();
    if (conv == "") conv = w->opus();
    conv = conv.replace(".", "");
    bool hasOpus = (conv != "");
    if (conv == "") conv = w->name().toLower();
    if (w->number() != "") conv = conv + "_no" + w->number();
    conv = asciify(conv);
    conv.replace(" ", "_");
    conv.replace("-", "_");
    conv.replace(":", "_");
    conv.replace(QRegExp("[^a-zA-Z0-9_-]"), "");

    if (pfx != "") conv = pfx + conv;

    // I think actually for works we want to merge duplicates rather than
    // assign them separate URIs, _unless_ they lack a viable opus number
    if (!hasOpus) {
        QString initial = conv;
        int i = 2;
        while (convSet.contains(conv)) {
            conv = QString("%1__%2").arg(initial).arg(i);
            i++;
        }
    }
    convSet.insert(conv);

    w->setProperty("uri", conv);
}

// If s is an English Wikipedia page URL and o has a non-empty "uri"
// property, add to store a mo:wikipedia triple from o's URI to s and
// an owl:sameAs triple from o's URI to the corresponding
// dbpedia.org/resource/ URI. Otherwise do nothing.
void
addDbpediaResource(Store *store, QObject *o, QString s)
{
    Uri u = o->property("uri").value<Uri>();
    if (u == Uri()) return;
    if (s.startsWith("http://en.wikipedia.org/wiki/")) {
        store->add(Triple(u, "mo:wikipedia", Uri(s)));
        s.replace("http://en.wikipedia.org/wiki/",
                  "http://dbpedia.org/resource/");
        store->add(Triple(u, "owl:sameAs", Uri(s)));
    }
}

// Load the importers described in importers.ttl, collect the
// composers, works and compositions they produce, merge duplicate
// composers, assign URIs, store the result to imported.ttl, and print
// a listing of composers and their works to standard output.
int
main(int argc, char **argv)
{
    qRegisterMetaType<ClassicalComposersOrgImporter *>
        ("ClassicalData::ClassicalComposersOrgImporter*");
    qRegisterMetaType<ClassicalDotNetImporter *>
        ("ClassicalData::ClassicalDotNetImporter*");
    qRegisterMetaType<ClassicalArchivesImporter *>
        ("ClassicalData::ClassicalArchivesImporter*");
    qRegisterMetaType<WikipediaComposersImporter *>
        ("ClassicalData::WikipediaComposersImporter*");
    qRegisterMetaType<WikipediaWorksImporter *>
        ("ClassicalData::WikipediaWorksImporter*");
    qRegisterMetaType<WikipediaWorksKImporter *>
        ("ClassicalData::WikipediaWorksKImporter*");
    qRegisterMetaType<WikipediaWorksListImporter *>
        ("ClassicalData::WikipediaWorksListImporter*");
    qRegisterMetaType<HobokenImporter *>
        ("ClassicalData::HobokenImporter*");

    ObjectBuilder::getInstance()->registerClass
        <ClassicalComposersOrgImporter>("ClassicalData::ClassicalComposersOrgImporter*");
    ObjectBuilder::getInstance()->registerClass
        <ClassicalDotNetImporter>("ClassicalData::ClassicalDotNetImporter*");
    ObjectBuilder::getInstance()->registerClass
        <ClassicalArchivesImporter>("ClassicalData::ClassicalArchivesImporter*");
    ObjectBuilder::getInstance()->registerClass
        <WikipediaComposersImporter>("ClassicalData::WikipediaComposersImporter*");
    ObjectBuilder::getInstance()->registerClass
        <WikipediaWorksImporter>("ClassicalData::WikipediaWorksImporter*");
    ObjectBuilder::getInstance()->registerClass
        <WikipediaWorksKImporter>("ClassicalData::WikipediaWorksKImporter*");
    ObjectBuilder::getInstance()->registerClass
        <WikipediaWorksListImporter>("ClassicalData::WikipediaWorksListImporter*");
    ObjectBuilder::getInstance()->registerClass
        <HobokenImporter>("ClassicalData::HobokenImporter*");

    BasicStore *store = BasicStore::load(QUrl("file:importers.ttl"));
    ObjectLoader loader(store);
    QObject *parentObject = loader.loadAllObjects(new QObject());

    BasicStore *outstore = new BasicStore();
    outstore->setBaseUri(Uri("http://dbtune.org/classical/resource/"));
    ObjectStorer storer(outstore);
    TypeMapping tm;

    TypeRegistrar::registerTypes();
    TypeRegistrar::addMappings(outstore, &tm);

    storer.setTypeMapping(tm);
    storer.setPropertyStorePolicy(ObjectStorer::StoreIfChanged);
    storer.setObjectStorePolicy(ObjectStorer::StoreAllObjects);
    storer.setBlankNodePolicy(ObjectStorer::NoBlankNodes);

    QList<Importer *> importers = parentObject->findChildren<Importer *>();
    std::cerr << "have " << importers.size() << " importers" << std::endl;

    ComposerMap composers;

    QList<Composer *> dated;
    QList<Composer *> undated;

    QList<Work *> works;
    QList<Composition *> compositions;

    // Sort every imported object into the lists above by type
    foreach (Importer *importer, importers) {
        QObjectList objects = importer->getImportedObjects();
        foreach (QObject *o, objects) {
            Composer *c;
            if ((c = qobject_cast<Composer *>(o))) {
                addMiscExpansions(c);
                asciify(c);
                if (c->birth() || c->death()) dated.push_back(c);
                else undated.push_back(c);
                continue;
            }
            Work *w;
            if ((w = qobject_cast<Work *>(o))) {
                asciify(w);
                works.push_back(w);
                continue;
            }
            Composition *cn;
            if ((cn = qobject_cast<Composition *>(o))) {
                compositions.push_back(cn);
                continue;
            }
        }
    }

    // get all the dated composers merged before attempting to match
    // the undated ones
    foreach (Composer *c, dated) {
        mergeComposer(c, composers);
    }
    foreach (Composer *c, undated) {
        mergeComposer(c, composers);
    }

    QObjectList toStore;

    // A composer appears in the map once per key, so track which
    // ones have already been given a URI and queued for storing
    QSet<Composer *> cset;
    for (ComposerMap::iterator i = composers.begin();
         i != composers.end(); ++i) {
        foreach (Composer *c, i.value()) {
            if (!cset.contains(c)) {
                assignUri(outstore, c);
                toStore.push_back(c);
                cset.insert(c);
            }
            foreach (Document *d, c->pages()) {
                QString s = d->uri().toString();
                addDbpediaResource(outstore, c, s);
            }
        }
    }

    QSet<QString> storedUris;

    foreach (Work *w, works) {
        Composition *cn = w->composition();
        if (!cn) continue;
        if (!cn->composer()) {
            // Try to resolve the composer from the composer name,
            // accepting only an unambiguous match
            QString cname = cn->composerName();
            QString key = Composer::reduceName(cname);
            if (cname != "") {
                if (!composers.contains(key)) {
                    DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
                } else {
                    QSet<Composer *> cs = composers[key];
                    if (cs.empty()) {
                        DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
                    } else if (cs.size() > 1) {
                        DEBUG << "Failed to assign Composition to composer: "
                              << cs.size() << " composers match name " << cname << endl;
                    } else {
                        cn->setComposer(*cs.begin());
                    }
                }
            } else {
                DEBUG << "Failed to assign Composition to composer: composer name is empty" << endl;
            }
        }

        if (cn->composer()) {
            assignUri(outstore, w, cn->composer());
        }

        foreach (Document *d, w->pages()) {
            QString s = d->uri().toString();
            addDbpediaResource(outstore, w, s);
            if (!storedUris.contains(s)) {
                toStore.push_back(d);
                storedUris.insert(s);
            }
        }

        // Queue the work unless one with the same URI is already queued
        QString u = w->property("uri").value<Uri>().toString();
        if (u == "" || !storedUris.contains(u)) {
            toStore.push_back(w);
            if (u != "") storedUris.insert(u);
        }
    }

    try {
        storer.storeAllObjects(toStore);
    } catch (const RDFException &e) {
        // Report and carry on: whatever was stored is still saved below
        std::cerr << "Caught RDF exception: " << e.what() << std::endl;
    }

    DEBUG << "Stored, now saving" << endl;

    outstore->save("imported.ttl");

    DEBUG << "Saved" << endl;

    // Index the distinct composers by sort name for the listing
    QMultiMap<QString, Composer *> cmap;
    foreach (Composer *c, cset) {
        QString n = c->getSortName(true);
        cmap.insert(n, c);
    }

    std::cout << "Composers: " << cmap.size() << std::endl;

    for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
         i != cmap.end(); ++i) {

        QString n = i.key();
        Composer *c = i.value();

        std::cout << n.toStdString();

        QString d = c->getDisplayDates();
        if (d != "") std::cout << " (" << d.toStdString() << ")";
        std::cout << std::endl;
    }

    std::cout << std::endl;

    std::cout << "Works by composer:" << std::endl;

    for (QMultiMap<QString, Composer *>::iterator i = cmap.begin();
         i != cmap.end(); ++i) {

        QString n = i.key();
        Composer *c = i.value();

        // Top-level works (those not part of another work) by this
        // composer, in Work::Ordering order
        std::set<Work *, Work::Ordering> wmap;
        foreach (Work *w, works) {
            Composition *cn = w->composition();
            if (!cn) continue;
            if (cn->composer() != c) continue;
            if (w->partOf()) continue;
            wmap.insert(w);
        }

        if (wmap.empty()) continue;

        std::cout << n.toStdString() << std::endl;

        foreach (Work *w, wmap) {
            std::cout << " * ";
            std::cout << w->name().toStdString();
            if (w->catalogue() != "") {
                std::cout << " [" << w->catalogue().toStdString() << "]";
            }
            if (w->opus() != "") {
                std::cout << " [op. " << w->opus().toStdString() << "]";
            }
            std::cout << std::endl;

            std::set<Work *, Work::Ordering> orderedParts;
            foreach (Work *ww, w->parts()) {
                orderedParts.insert(ww);
            }
            foreach (Work *ww, orderedParts) {
                std::cout << " ";
                if (ww->number() != "") {
                    std::cout << ww->number().toStdString() << ". ";
                }
                std::cout << ww->name().toStdString();
                // Only show a part's catalogue/opus when it differs
                // from that of the parent work
                if (ww->catalogue() != "" && ww->catalogue() != w->catalogue()) {
                    std::cout << " [" << ww->catalogue().toStdString() << "]";
                }
                if (ww->opus() != "" && ww->opus() != w->opus()) {
                    std::cout << " [op. " << ww->opus().toStdString() << "]";
                }
                std::cout << std::endl;
            }
        }

        std::cout << std::endl;
    }

    delete outstore;

    DEBUG << "Done" << endl;
}