diff rdf/RDFImporter.cpp @ 1855:db489a1ece9b

Pull out text-document check; it's useful elsewhere
author Chris Cannam
date Mon, 11 May 2020 17:27:18 +0100
parents a454c7477b4f
children
line wrap: on
line diff
--- a/rdf/RDFImporter.cpp	Mon May 11 14:43:58 2020 +0100
+++ b/rdf/RDFImporter.cpp	Mon May 11 17:27:18 2020 +0100
@@ -36,13 +36,11 @@
 #include "data/fileio/FileSource.h"
 #include "data/fileio/CachedFile.h"
 #include "data/fileio/FileFinder.h"
+#include "data/fileio/TextTest.h"
 
 #include <dataquay/BasicStore.h>
 #include <dataquay/PropertyObject.h>
 
-#include <QFile>
-#include <QXmlInputSource>
-
 using Dataquay::Uri;
 using Dataquay::Node;
 using Dataquay::Nodes;
@@ -890,39 +888,6 @@
 bool
 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
 {
-    // Return true if the document can be opened and contains some
-    // sort of text, either UTF-8 (so it could be Turtle) or another
-    // encoding that is recognised as XML
-    
-    FileSource source(url);
-
-    if (!source.isAvailable()) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
-        return false;
-    }
-
-    QFile file(source.getLocalFilename());
-    if (!file.open(QFile::ReadOnly)) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
-        return false;
-    }
-
-    QByteArray bytes = file.read(200);
-
-    if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
-        return true; // good enough to be worth trying to parse
-    }
-
-    QXmlInputSource xmlSource;
-    xmlSource.setData(bytes); // guesses text encoding
-
-    if (xmlSource.data().startsWith("<?xml")) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
-        return true;
-    }
-
-    SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
-    return false;
+    return TextTest::isApparentTextDocument(FileSource(url));
 }