comparison rdf/RDFImporter.cpp @ 1855:db489a1ece9b

Pull out text-document check; it's useful elsewhere
author Chris Cannam
date Mon, 11 May 2020 17:27:18 +0100
parents a454c7477b4f
children
comparison
Legend: equal, deleted, inserted, replaced
1854:bde22957545e 1855:db489a1ece9b
34 #include "data/model/ReadOnlyWaveFileModel.h" 34 #include "data/model/ReadOnlyWaveFileModel.h"
35 35
36 #include "data/fileio/FileSource.h" 36 #include "data/fileio/FileSource.h"
37 #include "data/fileio/CachedFile.h" 37 #include "data/fileio/CachedFile.h"
38 #include "data/fileio/FileFinder.h" 38 #include "data/fileio/FileFinder.h"
39 #include "data/fileio/TextTest.h"
39 40
40 #include <dataquay/BasicStore.h> 41 #include <dataquay/BasicStore.h>
41 #include <dataquay/PropertyObject.h> 42 #include <dataquay/PropertyObject.h>
42
43 #include <QFile>
44 #include <QXmlInputSource>
45 43
46 using Dataquay::Uri; 44 using Dataquay::Uri;
47 using Dataquay::Node; 45 using Dataquay::Node;
48 using Dataquay::Nodes; 46 using Dataquay::Nodes;
49 using Dataquay::Triple; 47 using Dataquay::Triple;
888 } 886 }
889 887
890 bool 888 bool
891 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url) 889 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
892 { 890 {
893 // Return true if the document can be opened and contains some 891 return TextTest::isApparentTextDocument(FileSource(url));
894 // sort of text, either UTF-8 (so it could be Turtle) or another 892 }
895 // encoding that is recognised as XML 893
896
897 FileSource source(url);
898
899 if (!source.isAvailable()) {
900 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
901 return false;
902 }
903
904 QFile file(source.getLocalFilename());
905 if (!file.open(QFile::ReadOnly)) {
906 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
907 return false;
908 }
909
910 QByteArray bytes = file.read(200);
911
912 if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
913 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
914 return true; // good enough to be worth trying to parse
915 }
916
917 QXmlInputSource xmlSource;
918 xmlSource.setData(bytes); // guesses text encoding
919
920 if (xmlSource.data().startsWith("<?xml")) {
921 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
922 return true;
923 }
924
925 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
926 return false;
927 }
928