changeset 1852:a454c7477b4f

Be more cautious about firing up an RDF file parser to identify a document - don't do it at all if the document is not apparently text
author Chris Cannam
date Thu, 30 Apr 2020 14:46:07 +0100
parents 91056142abd0
children f36fef97ac81
files rdf/RDFImporter.cpp rdf/RDFImporter.h
diffstat 2 files changed, 53 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/rdf/RDFImporter.cpp	Thu Apr 30 14:45:24 2020 +0100
+++ b/rdf/RDFImporter.cpp	Thu Apr 30 14:46:07 2020 +0100
@@ -23,6 +23,7 @@
 
 #include "base/ProgressReporter.h"
 #include "base/RealTime.h"
+#include "base/StringBits.h"
 
 #include "data/model/SparseOneDimensionalModel.h"
 #include "data/model/SparseTimeValueModel.h"
@@ -39,6 +40,9 @@
 #include <dataquay/BasicStore.h>
 #include <dataquay/PropertyObject.h>
 
+#include <QFile>
+#include <QXmlInputSource>
+
 using Dataquay::Uri;
 using Dataquay::Node;
 using Dataquay::Nodes;
@@ -790,19 +794,22 @@
 }
 
 RDFImporter::RDFDocumentType
-RDFImporter::identifyDocumentType(QString url)
+RDFImporter::identifyDocumentType(QUrl url)
 {
     bool haveAudio = false;
     bool haveAnnotations = false;
     bool haveRDF = false;
 
+    if (!isPlausibleDocumentOfAnyKind(url)) {
+        return NotRDF;
+    }
+
     BasicStore *store = nullptr;
-
+    
     // This is not expected to return anything useful, but if it does
     // anything at all then we know we have RDF
     try {
-        //!!! non-local document?
-        store = BasicStore::load(QUrl(url));
+        store = BasicStore::load(url);
         Triple t = store->matchOnce(Triple());
         if (t != Triple()) haveRDF = true;
     } catch (std::exception &) {
@@ -880,3 +887,42 @@
     return OtherRDFDocument;
 }
 
+bool
+RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
+{
+    // Return true only if the document at url can be retrieved and
+    // opened, and the bytes read from its start are either valid UTF-8
+    // (so it could be Turtle) or decode to text beginning with "<?xml"
+    
+    FileSource source(url);
+
+    if (!source.isAvailable()) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
+        return false;
+    }
+
+    QFile file(source.getLocalFilename()); // inspect the local file that FileSource reports for url
+    if (!file.open(QFile::ReadOnly)) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
+        return false;
+    }
+
+    QByteArray bytes = file.read(200); // at most the first 200 bytes are examined
+
+    if (StringBits::isValidUtf8(bytes.toStdString(), true)) { // NOTE(review): the 'true' flag presumably tolerates a character cut short by the 200-byte limit - confirm in StringBits
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
+        return true; // valid UTF-8 is good enough to be worth trying to parse
+    }
+
+    QXmlInputSource xmlSource;
+    xmlSource.setData(bytes); // QXmlInputSource guesses the text encoding of the raw bytes
+
+    if (xmlSource.data().startsWith("<?xml")) { // not UTF-8, so accept only if the decoded text begins "<?xml"
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
+        return true;
+    }
+
+    SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
+    return false;
+}
+
--- a/rdf/RDFImporter.h	Thu Apr 30 14:45:24 2020 +0100
+++ b/rdf/RDFImporter.h	Thu Apr 30 14:46:07 2020 +0100
@@ -62,8 +62,10 @@
         NotRDF
     };
 
-    static RDFDocumentType identifyDocumentType(QString url);
+    static RDFDocumentType identifyDocumentType(QUrl url); // returns NotRDF, without loading an RDF store, when url fails isPlausibleDocumentOfAnyKind
 
+    static bool isPlausibleDocumentOfAnyKind(QUrl url); // true if url can be opened and its first bytes are valid UTF-8 or decode to text starting "<?xml"
+    
 protected:
     RDFImporterImpl *m_d;
 };