Mercurial > hg > svcore
changeset 1852:a454c7477b4f
Be more cautious about firing up an RDF file parser to identify a document - don't do it at all if the document is not apparently text
author | Chris Cannam |
---|---|
date | Thu, 30 Apr 2020 14:46:07 +0100 |
parents | 91056142abd0 |
children | f36fef97ac81 |
files | rdf/RDFImporter.cpp rdf/RDFImporter.h |
diffstat | 2 files changed, 53 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/rdf/RDFImporter.cpp Thu Apr 30 14:45:24 2020 +0100
+++ b/rdf/RDFImporter.cpp Thu Apr 30 14:46:07 2020 +0100
@@ -23,6 +23,7 @@
 
 #include "base/ProgressReporter.h"
 #include "base/RealTime.h"
+#include "base/StringBits.h"
 
 #include "data/model/SparseOneDimensionalModel.h"
 #include "data/model/SparseTimeValueModel.h"
@@ -39,6 +40,9 @@
 #include <dataquay/BasicStore.h>
 #include <dataquay/PropertyObject.h>
 
+#include <QFile>
+#include <QXmlInputSource>
+
 using Dataquay::Uri;
 using Dataquay::Node;
 using Dataquay::Nodes;
@@ -790,19 +794,22 @@
 }
 
 RDFImporter::RDFDocumentType
-RDFImporter::identifyDocumentType(QString url)
+RDFImporter::identifyDocumentType(QUrl url)
 {
     bool haveAudio = false;
     bool haveAnnotations = false;
     bool haveRDF = false;
 
+    if (!isPlausibleDocumentOfAnyKind(url)) {
+        return NotRDF;
+    }
+
     BasicStore *store = nullptr;
-    
+
     // This is not expected to return anything useful, but if it does
     // anything at all then we know we have RDF
     try {
-        //!!! non-local document?
-        store = BasicStore::load(QUrl(url));
+        store = BasicStore::load(url);
         Triple t = store->matchOnce(Triple());
         if (t != Triple()) haveRDF = true;
     } catch (std::exception &) {
@@ -880,3 +887,42 @@
     return OtherRDFDocument;
 }
 
+bool
+RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
+{
+    // Return true if the document can be opened and contains some
+    // sort of text, either UTF-8 (so it could be Turtle) or another
+    // encoding that is recognised as XML
+
+    FileSource source(url);
+
+    if (!source.isAvailable()) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
+        return false;
+    }
+
+    QFile file(source.getLocalFilename());
+    if (!file.open(QFile::ReadOnly)) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
+        return false;
+    }
+
+    QByteArray bytes = file.read(200);
+
+    if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
+        return true; // good enough to be worth trying to parse
+    }
+
+    QXmlInputSource xmlSource;
+    xmlSource.setData(bytes); // guesses text encoding
+
+    if (xmlSource.data().startsWith("<?xml")) {
+        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
+        return true;
+    }
+
+    SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
+    return false;
+}
+
--- a/rdf/RDFImporter.h Thu Apr 30 14:45:24 2020 +0100
+++ b/rdf/RDFImporter.h Thu Apr 30 14:46:07 2020 +0100
@@ -62,8 +62,10 @@
         NotRDF
     };
 
-    static RDFDocumentType identifyDocumentType(QString url);
+    static RDFDocumentType identifyDocumentType(QUrl url);
 
+    static bool isPlausibleDocumentOfAnyKind(QUrl url);
+
 protected:
     RDFImporterImpl *m_d;
 };