Mercurial > hg > svcore
changeset 1855:db489a1ece9b
Pull out text-document check; it's useful elsewhere
author | Chris Cannam |
---|---|
date | Mon, 11 May 2020 17:27:18 +0100 |
parents | bde22957545e |
children | ecd3152750a5 |
files | data/fileio/TextTest.cpp data/fileio/TextTest.h files.pri rdf/RDFImporter.cpp |
diffstat | 4 files changed, 105 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data/fileio/TextTest.cpp Mon May 11 17:27:18 2020 +0100 @@ -0,0 +1,58 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Sonic Visualiser + An audio file viewer and annotation editor. + Centre for Digital Music, Queen Mary, University of London. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +#include "TextTest.h" + +#include "base/Debug.h" +#include "base/StringBits.h" + +#include <QFile> +#include <QXmlInputSource> + +bool +TextTest::isApparentTextDocument(FileSource source) +{ + // Return true if the document can be opened and contains some + // sort of text, either UTF-8 (so it could be Turtle) or another + // encoding that is recognised as XML + + if (!source.isAvailable()) { + SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to retrieve document from " << source.getLocation() << endl; + return false; + } + + QFile file(source.getLocalFilename()); + if (!file.open(QFile::ReadOnly)) { + SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to open local file from " << source.getLocalFilename() << endl; + return false; + } + + QByteArray bytes = file.read(200); + + if (StringBits::isValidUtf8(bytes.toStdString(), true)) { + SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be UTF-8" << endl; + return true; // good enough to be worth trying to parse + } + + QXmlInputSource xmlSource; + xmlSource.setData(bytes); // guesses text encoding + + if (xmlSource.data().startsWith("<?xml")) { + SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be XML" << endl; + return true; + } + + SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document is not UTF-8 and is not XML, rejecting" << endl; + return false; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data/fileio/TextTest.h Mon May 11 17:27:18 2020 +0100 @@ -0,0 +1,41 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Sonic Visualiser + An audio file viewer and annotation editor. + Centre for Digital Music, Queen Mary, University of London. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +#ifndef SV_TEXT_TEST_H +#define SV_TEXT_TEST_H + +#include "data/fileio/FileSource.h" + +class TextTest +{ +public: + /** + * Return true if the source appears to point to a text format of + * some kind (could be CSV, XML, RDF/Turtle etc). + * + * We apply two tests and report success if either succeeds: + * + * 1. The first few hundred bytes (where present) of the document + * are valid UTF-8 + * + * 2. The document starts with the text "<?xml" when opened using + * QXmlInputSource (which guesses its text encoding) + * + * So we only accept non-UTF-8 encodings where they also happen to + * be XML documents. + */ + static bool isApparentTextDocument(FileSource); +}; + +#endif
--- a/files.pri Mon May 11 14:43:58 2020 +0100 +++ b/files.pri Mon May 11 17:27:18 2020 +0100 @@ -63,6 +63,7 @@ data/fileio/CSVStreamWriter.h \ data/fileio/DataFileReader.h \ data/fileio/DataFileReaderFactory.h \ + data/fileio/DecodingWavFileReader.h \ data/fileio/FileFinder.h \ data/fileio/FileReadThread.h \ data/fileio/FileSource.h \ @@ -70,7 +71,7 @@ data/fileio/MIDIFileWriter.h \ data/fileio/MP3FileReader.h \ data/fileio/PlaylistFileReader.h \ - data/fileio/DecodingWavFileReader.h \ + data/fileio/TextTest.h \ data/fileio/WavFileReader.h \ data/fileio/WavFileWriter.h \ data/midi/MIDIEvent.h \ @@ -199,13 +200,14 @@ data/fileio/CSVFileWriter.cpp \ data/fileio/CSVFormat.cpp \ data/fileio/DataFileReaderFactory.cpp \ + data/fileio/DecodingWavFileReader.cpp \ data/fileio/FileReadThread.cpp \ data/fileio/FileSource.cpp \ data/fileio/MIDIFileReader.cpp \ data/fileio/MIDIFileWriter.cpp \ data/fileio/MP3FileReader.cpp \ data/fileio/PlaylistFileReader.cpp \ - data/fileio/DecodingWavFileReader.cpp \ + data/fileio/TextTest.cpp \ data/fileio/WavFileReader.cpp \ data/fileio/WavFileWriter.cpp \ data/midi/MIDIInput.cpp \
--- a/rdf/RDFImporter.cpp Mon May 11 14:43:58 2020 +0100 +++ b/rdf/RDFImporter.cpp Mon May 11 17:27:18 2020 +0100 @@ -36,13 +36,11 @@ #include "data/fileio/FileSource.h" #include "data/fileio/CachedFile.h" #include "data/fileio/FileFinder.h" +#include "data/fileio/TextTest.h" #include <dataquay/BasicStore.h> #include <dataquay/PropertyObject.h> -#include <QFile> -#include <QXmlInputSource> - using Dataquay::Uri; using Dataquay::Node; using Dataquay::Nodes; @@ -890,39 +888,6 @@ bool RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url) { - // Return true if the document can be opened and contains some - // sort of text, either UTF-8 (so it could be Turtle) or another - // encoding that is recognised as XML - - FileSource source(url); - - if (!source.isAvailable()) { - SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl; - return false; - } - - QFile file(source.getLocalFilename()); - if (!file.open(QFile::ReadOnly)) { - SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl; - return false; - } - - QByteArray bytes = file.read(200); - - if (StringBits::isValidUtf8(bytes.toStdString(), true)) { - SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl; - return true; // good enough to be worth trying to parse - } - - QXmlInputSource xmlSource; - xmlSource.setData(bytes); // guesses text encoding - - if (xmlSource.data().startsWith("<?xml")) { - SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl; - return true; - } - - SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl; - return false; + return TextTest::isApparentTextDocument(FileSource(url)); }