changeset 24:2b574b88778e classical-rdf

* Add sameAs record to composer &c * Add merge facility to composer * Sort ntriples database file
author Chris Cannam
date Fri, 26 Feb 2010 15:26:55 +0000
parents 437442790e51
children e856df83c57f
files common/Objects.cpp common/Objects.h common/TypeRegistrar.cpp import/build-database.sh utilities/composer/composer.cpp
diffstat 5 files changed, 172 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/common/Objects.cpp	Fri Feb 26 11:26:16 2010 +0000
+++ b/common/Objects.cpp	Fri Feb 26 15:26:55 2010 +0000
@@ -539,6 +539,45 @@
     return score;
 }
 
+void
+Composer::mergeFrom(Composer *c) // fold c's data into this record; existing values here win, c is read through its getters
+{
+    QString name = c->name();
+    QSet<QString> allNames = c->aliases();
+    allNames.insert(name); // c's primary name becomes just another candidate alias
+        
+    foreach (QString n, allNames) {
+        if (n != m_name && !m_aliases.contains(n)) { // skip our own name and aliases we already hold
+            m_aliases.insert(n);
+            m_namesCached = false; // flag cached name data as stale (presumably rebuilt elsewhere -- confirm)
+        }
+    }
+
+    if (!m_birth) { // keep our own birth record when we have one
+        if (c->birth()) m_birth = new Birth(*c->birth()); // take a separate copy rather than sharing c's object
+    }
+
+    if (!m_death) { // same rule as for birth
+        if (c->death()) m_death = new Death(*c->death());
+    }
+        
+    if (c->gender() != "") { // an empty gender on c carries no information
+        if (m_gender == "") {
+            m_gender = c->gender();
+        } else if (c->gender() != m_gender) { // both set but different: keep ours and only warn
+            std::cerr << "WARNING: Composer::mergeFrom: Gender mismatch! Composer " << c->name().toStdString() << " has gender " << c->gender().toStdString() << ", but target composer " << m_name.toStdString() << " has gender " << m_gender.toStdString() << std::endl;
+        }
+    }
+
+    m_nationality.unite(c->nationality()); // collection-valued fields: add c's entries to ours
+    m_geonameURIs.unite(c->geonameURIs());
+    m_otherURIs.unite(c->otherURIs());
+    m_pages.unite(c->pages()); // NOTE(review): pages look like Document pointers, so both composers would then reference the same objects -- confirm ownership
+    
+    if (m_period == "") m_period = c->period(); // single-valued text: c only fills a blank
+    if (m_remarks == "") m_remarks = c->remarks();
+}
+
 static int
 compare(QString a, QString b)
 {
--- a/common/Objects.h	Fri Feb 26 11:26:16 2010 +0000
+++ b/common/Objects.h	Fri Feb 26 15:26:55 2010 +0000
@@ -16,6 +16,8 @@
 #include <QMutexLocker>
 #include <QMap>
 
+#include <iostream>
+
 namespace ClassicalData {
 
 class Year
@@ -29,10 +31,12 @@
 
     struct Encoder : public Dataquay::Node::VariantEncoder {
         QString fromVariant(const QVariant &v) {
-            return QString("%1").arg(v.value<Year>().toInt());
+            QString s = QString("%1").arg(v.value<Year>().toInt()); // render the year held in v as decimal text
+            return s;
         }
         QVariant toVariant(const QString &s) {
-            return QVariant::fromValue<Year>(s.toInt());
+            QVariant v = QVariant::fromValue<Year>(s.toInt()); // parse s as an integer year and box it; QString::toInt() yields 0 if s is not a number
+            return v;
         }
     };
 
@@ -52,6 +56,7 @@
     HistoricalEvent() : m_year(0), m_place(), m_approximate(false) { }
     HistoricalEvent(Year y) : m_year(y), m_approximate(false) { }
     HistoricalEvent(Year y, QString p) : m_year(y), m_place(p), m_approximate(false) { }
+    HistoricalEvent(const HistoricalEvent &h) : QObject(), m_year(h.m_year), m_place(h.m_place), m_approximate(h.m_approximate) { } // copies only the event fields; the QObject base is default-constructed, not copied
 
     Year year() const { return m_year; }
     void setYear(Year y) { m_year = y; }
@@ -154,6 +159,7 @@
     Q_PROPERTY(QSet<QString> aliases READ aliases WRITE setAliases STORED true)
     Q_PROPERTY(QString remarks READ remarks WRITE setRemarks STORED true)
     Q_PROPERTY(QSet<ClassicalData::Document*> pages READ pages WRITE setPages STORED true)
+    Q_PROPERTY(QSet<Dataquay::Uri> otherURIs READ otherURIs WRITE setOtherURIs STORED true)
 
 public:
     NamedEntity(QObject *parent = 0) : QObject(parent) { }
@@ -172,11 +178,16 @@
     void addPage(Document *p) { m_pages.insert(p); }
     void setPages(QSet<Document *> p) { m_pages = p; } //!!! destroy old ones? do we own?
 
+    QSet<Dataquay::Uri> otherURIs() const { return m_otherURIs; } // returns a copy of the stored URI set (exposed as the "otherURIs" property)
+    void addOtherURI(Dataquay::Uri u) { m_otherURIs.insert(u); } // add a single URI to the set
+    void setOtherURIs(QSet<Dataquay::Uri> u) { m_otherURIs = u; } // replace the whole set
+
 protected:
     QString m_name;
     QString m_remarks;
     QSet<QString> m_aliases;
     QSet<Document *> m_pages;
+    QSet<Dataquay::Uri> m_otherURIs;
 };
 
 class Movement;
@@ -402,6 +413,15 @@
     float matchTyping(QString text) const;
 
     /**
+     * Merge data from the given composer into this composer record.
+     * That is, add the composer's name and aliases as aliases of this
+     * composer, copy its dates where we lack them, etc.  In all
+     * cases, values that exist in this composer already are preferred
+     * over values from the "other" composer.
+     */
+    void mergeFrom(Composer *c);
+
+    /**
      * Return the supplied name reduced into a "simplified" form,
      * eliminating many of the differences often found particularly in
      * European language names that have been anglicised.  Used in
--- a/common/TypeRegistrar.cpp	Fri Feb 26 11:26:16 2010 +0000
+++ b/common/TypeRegistrar.cpp	Fri Feb 26 15:26:55 2010 +0000
@@ -138,6 +138,7 @@
     mapper->addPropertyMapping("ClassicalData::Composer", "birth", "property:birth");
     mapper->addPropertyMapping("ClassicalData::Composer", "death", "property:death");
     mapper->addPropertyMapping("ClassicalData::Composer", "geonameURIs", "foaf:based_near");
+    mapper->addPropertyMapping("ClassicalData::Composer", "otherURIs", "owl:sameAs");
 
     mapper->addTypeMapping("ClassicalData::Birth", "bio:Birth");
     mapper->addTypeMapping("ClassicalData::Death", "bio:Death");
--- a/import/build-database.sh	Fri Feb 26 11:26:16 2010 +0000
+++ b/import/build-database.sh	Fri Feb 26 15:26:55 2010 +0000
@@ -17,7 +17,7 @@
 
 echo "Running importer, log is written to importer.log"
 
-#./importer 2>importer.log || exit 1
+./importer 2>importer.log || exit 1
 
 echo "Assembling additional sources"
 
@@ -34,8 +34,10 @@
     cat extra/prefixes.ttl "$ttl" | rapper -i turtle -o ntriples - http://dbtune.org/classical/resource/ >> ready.ntriples
 done
 
+sort ready.ntriples > ready.2.ntriples && mv ready.2.ntriples ready.ntriples
+
 grep composer ready.ntriples | fgrep -v .html | sed 's/^.*composer\///' | \
-    sed 's/>.*//' | sort | uniq > check/new-composer-uris
+    sed 's/>.*//' | grep -v http | sort | uniq > check/new-composer-uris
 
 diff -u check/composer-uris check/new-composer-uris | grep -v '^---' | grep -v '^+++' > /tmp/$$
 
--- a/utilities/composer/composer.cpp	Fri Feb 26 11:26:16 2010 +0000
+++ b/utilities/composer/composer.cpp	Fri Feb 26 15:26:55 2010 +0000
@@ -35,11 +35,16 @@
 
     cerr << "Importing from URL " << url << " ...";
     try {
-	store->import(url, BasicStore::ImportPermitDuplicates, "ntriples");
+	store->import(url, BasicStore::ImportPermitDuplicates);
     } catch (RDFException e) {
-	cerr << "failed" << endl;
-	cerr << "Import failed: " << e.what() << endl;
-	return false;
+        cerr << " retrying with explicit ntriples type...";
+        try {
+            store->import(url, BasicStore::ImportPermitDuplicates, "ntriples");
+        } catch (RDFException e) {
+            cerr << "failed" << endl;
+            cerr << "Import failed: " << e.what() << endl;
+            return false;
+        }
     }
 
     cerr << " done" << endl;
@@ -52,11 +57,13 @@
     int s = 0;
     for (int i = 0; name[i]; ++i) if (name[i] == '/') s = i + 1;
     name = name + s;
-    cerr << "Usage: " << name << " <input-rdf-file> list" << endl;
-    cerr << "Usage: " << name << " <input-rdf-file> list-uris" << endl;
-    cerr << "Usage: " << name << " <input-rdf-file> show <uri> [<uri> ...]" << endl;
-    cerr << "Usage: " << name << " <input-rdf-file> search <text>" << endl;
-    cerr << "Usage: " << name << " <input-rdf-file> match <text>" << endl;
+    cerr << "Usage:" << endl;
+    cerr << "  " << name << " <input-rdf-file> list" << endl;
+    cerr << "  " << name << " <input-rdf-file> list-uris" << endl;
+    cerr << "  " << name << " <input-rdf-file> show <uri> [<uri> ...]" << endl;
+    cerr << "  " << name << " <input-rdf-file> search <text>" << endl;
+    cerr << "  " << name << " <input-rdf-file> match <text>" << endl;
+    cerr << "  " << name << " <input-rdf-file> merge <target-uri> <dup> [<dup> ...]" << endl;
     exit(-1);
 }
 
@@ -98,7 +105,10 @@
         cout << " " << c->remarks() << endl;
     }
     foreach (Document *d, c->pages()) {
-        cout << " " << d->siteName() << " -> " << d->uri() << endl;
+        cout << d->siteName() << " -> " << d->uri() << endl;
+    }
+    foreach (Uri u, c->otherURIs()) {
+        cout << "Same as " << u << endl;
     }
 }
 
@@ -205,17 +215,60 @@
     showSearchResults(matches, 5);
 }
 
+QList<Composer *>
+matchWildcard(QString text) // return every entry of allComposers whose "uri" property matches text
+{
+    if (!text.contains('/') && !text.contains('*')) { // a bare word (no '/' and no '*') becomes a substring search
+        text = "*" + text + "*";
+    }
+    QRegExp re(text, Qt::CaseInsensitive, QRegExp::Wildcard); // case-insensitive wildcard pattern
+    QList<Composer *> results;
+    foreach (Composer *c, allComposers) {
+        if (re.exactMatch(c->property("uri").value<Uri>().toString())) { // the pattern must cover the whole URI string
+            results.push_back(c);
+        }
+    }
+    return results;
+}    
+
+Composer *
+matchSingle(QString text) // resolve text to exactly one composer via matchWildcard, or return 0
+{
+    QList<Composer *> matches = matchWildcard(text);
+    if (matches.empty()) { // nothing matched: report on cerr and give up
+        cerr << "matchSingle: No matches for " << text << endl;
+        return 0;
+    } else if (matches.size() > 1) { // ambiguous: refuse to pick one
+        cerr << "matchSingle: Multiple matches for " << text << endl;
+        return 0;
+    }
+    return matches[0];
+}
+
 void
 showWildcard(QString text)
 {
     cout << "Showing URI or wildcard: " << text << endl;
-    QRegExp re(text, Qt::CaseInsensitive, QRegExp::Wildcard);
-    foreach (Composer *c, allComposers) {
-        if (re.exactMatch(c->property("uri").value<Uri>().toString())) {
-            cout << endl;
-            show(c);
-        }
+    cout << endl;
+    foreach (Composer *c, matchWildcard(text)) { // matching, including the *text* wrapping of bare words, now lives in matchWildcard
+        show(c);
+        cout << endl; // blank line after each record
     }
+}
+
+void
+merge(Composer *target, QList<Composer *> sources) // merge each of sources into target in list order, printing the records before and after
+{
+    cout << "Merging into this composer record:" << endl << endl;
+    show(target); // target as it stands before any merging
+    cout << endl << "... the following composer record(s):" << endl;
+    foreach (Composer *c, sources) {
+        cout << endl;
+        show(c);
+        target->mergeFrom(c); // fold c into target; see Composer::mergeFrom for the precedence rules
+    }
+    cout << endl << "Result after merging:" << endl << endl;;
+    show(target); // target after all sources have been folded in
     cout << endl;
 }
 
@@ -260,6 +313,8 @@
         }
     }
 
+    bool write = false;
+
     if (command == "list") {
         if (!args.empty()) usage(argv[0]);
         listBrief(allComposers);
@@ -270,9 +325,6 @@
         if (args.empty()) usage(argv[0]);
         if (command == "show") {
             foreach (QString s, args) {
-                if (!s.contains('/') && !s.contains('*')) {
-                    s = "*" + s + "*";
-                }
                 showWildcard(s);
             }
         } else if (command == "search") {
@@ -283,8 +335,43 @@
             foreach (QString s, args) {
                 match(s);
             }
+        } else if (command == "merge") {
+            if (args.size() < 2) usage(argv[0]);
+            Composer *target = matchSingle(args[0]);
+            if (!target) return 1;
+            QList<Composer *> sources;
+            for (int i = 1; i < args.size(); ++i) {
+                Composer *c = matchSingle(args[i]);
+                if (!c) return 1;
+                sources.push_back(c);
+            }
+            merge(target, sources);
+            write = true;
         }
     }
         
+    if (write) {
+        BasicStore *outstore = new BasicStore();
+        outstore->setBaseUri(Uri("http://dbtune.org/classical/resource/"));
+        ObjectMapper *outmapper = new ObjectMapper(outstore);
+
+        TypeRegistrar::addMappings(outstore, outmapper);
+
+        outmapper->setPropertyStorePolicy(ObjectMapper::StoreIfChanged);
+        outmapper->setObjectStorePolicy(ObjectMapper::StoreAllObjects);
+        outmapper->setBlankNodePolicy(ObjectMapper::NoBlankNodes);
+
+        cerr << "Mapping results back to store...";
+        outmapper->storeAllObjects(root->children());
+        cerr << " done" << endl;
+
+        cerr << "Saving to file out.ttl...";
+        outstore->save("out.ttl");
+        cerr << " done" << endl;
+        
+        delete outmapper;
+        delete outstore;
+    }
+
 }