Mercurial > hg > svcore
comparison rdf/RDFImporter.cpp @ 1852:a454c7477b4f
Be more cautious about firing up an RDF file parser to identify a document - don't do it at all if the document does not appear to be text
author | Chris Cannam |
---|---|
date | Thu, 30 Apr 2020 14:46:07 +0100 |
parents | d484490cdf69 |
children | db489a1ece9b |
comparison
equal
deleted
inserted
replaced
1851:91056142abd0 | 1852:a454c7477b4f |
---|---|
21 #include <iostream> | 21 #include <iostream> |
22 #include <cmath> | 22 #include <cmath> |
23 | 23 |
24 #include "base/ProgressReporter.h" | 24 #include "base/ProgressReporter.h" |
25 #include "base/RealTime.h" | 25 #include "base/RealTime.h" |
26 #include "base/StringBits.h" | |
26 | 27 |
27 #include "data/model/SparseOneDimensionalModel.h" | 28 #include "data/model/SparseOneDimensionalModel.h" |
28 #include "data/model/SparseTimeValueModel.h" | 29 #include "data/model/SparseTimeValueModel.h" |
29 #include "data/model/EditableDenseThreeDimensionalModel.h" | 30 #include "data/model/EditableDenseThreeDimensionalModel.h" |
30 #include "data/model/NoteModel.h" | 31 #include "data/model/NoteModel.h" |
36 #include "data/fileio/CachedFile.h" | 37 #include "data/fileio/CachedFile.h" |
37 #include "data/fileio/FileFinder.h" | 38 #include "data/fileio/FileFinder.h" |
38 | 39 |
39 #include <dataquay/BasicStore.h> | 40 #include <dataquay/BasicStore.h> |
40 #include <dataquay/PropertyObject.h> | 41 #include <dataquay/PropertyObject.h> |
42 | |
43 #include <QFile> | |
44 #include <QXmlInputSource> | |
41 | 45 |
42 using Dataquay::Uri; | 46 using Dataquay::Uri; |
43 using Dataquay::Node; | 47 using Dataquay::Node; |
44 using Dataquay::Nodes; | 48 using Dataquay::Nodes; |
45 using Dataquay::Triple; | 49 using Dataquay::Triple; |
788 cerr << "WARNING: RDFImporterImpl::fillModel: Unknown or unexpected model type" << endl; | 792 cerr << "WARNING: RDFImporterImpl::fillModel: Unknown or unexpected model type" << endl; |
789 return; | 793 return; |
790 } | 794 } |
791 | 795 |
792 RDFImporter::RDFDocumentType | 796 RDFImporter::RDFDocumentType |
793 RDFImporter::identifyDocumentType(QString url) | 797 RDFImporter::identifyDocumentType(QUrl url) |
794 { | 798 { |
795 bool haveAudio = false; | 799 bool haveAudio = false; |
796 bool haveAnnotations = false; | 800 bool haveAnnotations = false; |
797 bool haveRDF = false; | 801 bool haveRDF = false; |
798 | 802 |
803 if (!isPlausibleDocumentOfAnyKind(url)) { | |
804 return NotRDF; | |
805 } | |
806 | |
799 BasicStore *store = nullptr; | 807 BasicStore *store = nullptr; |
800 | 808 |
801 // This is not expected to return anything useful, but if it does | 809 // This is not expected to return anything useful, but if it does |
802 // anything at all then we know we have RDF | 810 // anything at all then we know we have RDF |
803 try { | 811 try { |
804 //!!! non-local document? | 812 store = BasicStore::load(url); |
805 store = BasicStore::load(QUrl(url)); | |
806 Triple t = store->matchOnce(Triple()); | 813 Triple t = store->matchOnce(Triple()); |
807 if (t != Triple()) haveRDF = true; | 814 if (t != Triple()) haveRDF = true; |
808 } catch (std::exception &) { | 815 } catch (std::exception &) { |
809 // nothing; haveRDF will be false so the next bit catches it | 816 // nothing; haveRDF will be false so the next bit catches it |
810 } | 817 } |
878 } | 885 } |
879 | 886 |
880 return OtherRDFDocument; | 887 return OtherRDFDocument; |
881 } | 888 } |
882 | 889 |
890 bool | |
891 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url) | |
892 { | |
893 // Return true if the document can be opened and contains some | |
894 // sort of text, either UTF-8 (so it could be Turtle) or another | |
895 // encoding that is recognised as XML | |
896 | |
897 FileSource source(url); | |
898 | |
899 if (!source.isAvailable()) { | |
900 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl; | |
901 return false; | |
902 } | |
903 | |
904 QFile file(source.getLocalFilename()); | |
905 if (!file.open(QFile::ReadOnly)) { | |
906 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl; | |
907 return false; | |
908 } | |
909 | |
910 QByteArray bytes = file.read(200); | |
911 | |
912 if (StringBits::isValidUtf8(bytes.toStdString(), true)) { | |
913 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl; | |
914 return true; // good enough to be worth trying to parse | |
915 } | |
916 | |
917 QXmlInputSource xmlSource; | |
918 xmlSource.setData(bytes); // guesses text encoding | |
919 | |
920 if (xmlSource.data().startsWith("<?xml")) { | |
921 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl; | |
922 return true; | |
923 } | |
924 | |
925 SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl; | |
926 return false; | |
927 } | |
928 |