comparison rdf/RDFImporter.cpp @ 1852:a454c7477b4f

Be more cautious about firing up an RDF file parser to identify a document - don't do it at all if the document is not apparently text
author Chris Cannam
date Thu, 30 Apr 2020 14:46:07 +0100
parents d484490cdf69
children db489a1ece9b
comparison
equal deleted inserted replaced
1851:91056142abd0 1852:a454c7477b4f
21 #include <iostream> 21 #include <iostream>
22 #include <cmath> 22 #include <cmath>
23 23
24 #include "base/ProgressReporter.h" 24 #include "base/ProgressReporter.h"
25 #include "base/RealTime.h" 25 #include "base/RealTime.h"
26 #include "base/StringBits.h"
26 27
27 #include "data/model/SparseOneDimensionalModel.h" 28 #include "data/model/SparseOneDimensionalModel.h"
28 #include "data/model/SparseTimeValueModel.h" 29 #include "data/model/SparseTimeValueModel.h"
29 #include "data/model/EditableDenseThreeDimensionalModel.h" 30 #include "data/model/EditableDenseThreeDimensionalModel.h"
30 #include "data/model/NoteModel.h" 31 #include "data/model/NoteModel.h"
36 #include "data/fileio/CachedFile.h" 37 #include "data/fileio/CachedFile.h"
37 #include "data/fileio/FileFinder.h" 38 #include "data/fileio/FileFinder.h"
38 39
39 #include <dataquay/BasicStore.h> 40 #include <dataquay/BasicStore.h>
40 #include <dataquay/PropertyObject.h> 41 #include <dataquay/PropertyObject.h>
42
43 #include <QFile>
44 #include <QXmlInputSource>
41 45
42 using Dataquay::Uri; 46 using Dataquay::Uri;
43 using Dataquay::Node; 47 using Dataquay::Node;
44 using Dataquay::Nodes; 48 using Dataquay::Nodes;
45 using Dataquay::Triple; 49 using Dataquay::Triple;
788 cerr << "WARNING: RDFImporterImpl::fillModel: Unknown or unexpected model type" << endl; 792 cerr << "WARNING: RDFImporterImpl::fillModel: Unknown or unexpected model type" << endl;
789 return; 793 return;
790 } 794 }
791 795
792 RDFImporter::RDFDocumentType 796 RDFImporter::RDFDocumentType
793 RDFImporter::identifyDocumentType(QString url) 797 RDFImporter::identifyDocumentType(QUrl url)
794 { 798 {
795 bool haveAudio = false; 799 bool haveAudio = false;
796 bool haveAnnotations = false; 800 bool haveAnnotations = false;
797 bool haveRDF = false; 801 bool haveRDF = false;
798 802
803 if (!isPlausibleDocumentOfAnyKind(url)) {
804 return NotRDF;
805 }
806
799 BasicStore *store = nullptr; 807 BasicStore *store = nullptr;
800 808
801 // This is not expected to return anything useful, but if it does 809 // This is not expected to return anything useful, but if it does
802 // anything at all then we know we have RDF 810 // anything at all then we know we have RDF
803 try { 811 try {
804 //!!! non-local document? 812 store = BasicStore::load(url);
805 store = BasicStore::load(QUrl(url));
806 Triple t = store->matchOnce(Triple()); 813 Triple t = store->matchOnce(Triple());
807 if (t != Triple()) haveRDF = true; 814 if (t != Triple()) haveRDF = true;
808 } catch (std::exception &) { 815 } catch (std::exception &) {
809 // nothing; haveRDF will be false so the next bit catches it 816 // nothing; haveRDF will be false so the next bit catches it
810 } 817 }
878 } 885 }
879 886
880 return OtherRDFDocument; 887 return OtherRDFDocument;
881 } 888 }
882 889
890 bool
891 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
892 {
893 // Return true if the document can be opened and contains some
894 // sort of text, either UTF-8 (so it could be Turtle) or another
895 // encoding that is recognised as XML
896
897 FileSource source(url);
898
899 if (!source.isAvailable()) {
900 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
901 return false;
902 }
903
904 QFile file(source.getLocalFilename());
905 if (!file.open(QFile::ReadOnly)) {
906 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
907 return false;
908 }
909
910 QByteArray bytes = file.read(200);
911
912 if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
913 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
914 return true; // good enough to be worth trying to parse
915 }
916
917 QXmlInputSource xmlSource;
918 xmlSource.setData(bytes); // guesses text encoding
919
920 if (xmlSource.data().startsWith("<?xml")) {
921 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
922 return true;
923 }
924
925 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
926 return false;
927 }
928