changeset 1855:db489a1ece9b

Pull out text-document check; it's useful elsewhere
author Chris Cannam
date Mon, 11 May 2020 17:27:18 +0100
parents bde22957545e
children ecd3152750a5
files data/fileio/TextTest.cpp data/fileio/TextTest.h files.pri rdf/RDFImporter.cpp
diffstat 4 files changed, 105 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/TextTest.cpp	Mon May 11 17:27:18 2020 +0100
@@ -0,0 +1,58 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Sonic Visualiser
+    An audio file viewer and annotation editor.
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#include "TextTest.h"
+
+#include "base/Debug.h"
+#include "base/StringBits.h"
+
+#include <QFile>
+#include <QXmlInputSource>
+
+bool
+TextTest::isApparentTextDocument(FileSource source)
+{
+    // Return true if the document can be opened and its leading bytes
+    // look like text: either valid UTF-8 (so it could be Turtle) or
+    // some other encoding in which they begin with "<?xml"
+
+    if (!source.isAvailable()) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to retrieve document from " << source.getLocation() << endl;
+        return false;
+    }
+
+    QFile file(source.getLocalFilename());
+    if (!file.open(QFile::ReadOnly)) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to open local file from " << source.getLocalFilename() << endl;
+        return false;
+    }
+
+    QByteArray bytes = file.read(200); // sample at most the first 200 bytes; both tests below use only this
+    // NOTE(review): the 'true' argument presumably tolerates a character cut off at the read limit -- confirm in StringBits
+    if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be UTF-8" << endl;
+        return true; // good enough to be worth trying to parse
+    }
+
+    QXmlInputSource xmlSource;
+    xmlSource.setData(bytes); // decodes the sample, guessing its text encoding
+
+    if (xmlSource.data().startsWith("<?xml")) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be XML" << endl;
+        return true;
+    }
+
+    SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document is not UTF-8 and is not XML, rejecting" << endl;
+    return false;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/TextTest.h	Mon May 11 17:27:18 2020 +0100
@@ -0,0 +1,41 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Sonic Visualiser
+    An audio file viewer and annotation editor.
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#ifndef SV_TEXT_TEST_H
+#define SV_TEXT_TEST_H
+
+#include "data/fileio/FileSource.h"
+
+class TextTest
+{
+public:
+    /**
+     * Return true if the source appears to point to a text format of
+     * some kind (could be CSV, XML, RDF/Turtle etc), false otherwise.
+     *
+     * We apply two tests and report success if either succeeds:
+     *
+     * 1. The first 200 bytes (or fewer, if the document is shorter)
+     *    are valid UTF-8
+     *
+     * 2. Those same bytes start with the text "<?xml" when decoded
+     *    by QXmlInputSource (which guesses their text encoding)
+     *
+     * So a non-UTF-8 document is accepted only if it is also XML. A
+     * source that is unavailable or cannot be opened yields false.
+     */
+    static bool isApparentTextDocument(FileSource);
+};
+
+#endif
--- a/files.pri	Mon May 11 14:43:58 2020 +0100
+++ b/files.pri	Mon May 11 17:27:18 2020 +0100
@@ -63,6 +63,7 @@
            data/fileio/CSVStreamWriter.h \
            data/fileio/DataFileReader.h \
            data/fileio/DataFileReaderFactory.h \
+           data/fileio/DecodingWavFileReader.h \
            data/fileio/FileFinder.h \
            data/fileio/FileReadThread.h \
            data/fileio/FileSource.h \
@@ -70,7 +71,7 @@
            data/fileio/MIDIFileWriter.h \
            data/fileio/MP3FileReader.h \
            data/fileio/PlaylistFileReader.h \
-           data/fileio/DecodingWavFileReader.h \
+           data/fileio/TextTest.h \
            data/fileio/WavFileReader.h \
            data/fileio/WavFileWriter.h \
            data/midi/MIDIEvent.h \
@@ -199,13 +200,14 @@
            data/fileio/CSVFileWriter.cpp \
            data/fileio/CSVFormat.cpp \
            data/fileio/DataFileReaderFactory.cpp \
+           data/fileio/DecodingWavFileReader.cpp \
            data/fileio/FileReadThread.cpp \
            data/fileio/FileSource.cpp \
            data/fileio/MIDIFileReader.cpp \
            data/fileio/MIDIFileWriter.cpp \
            data/fileio/MP3FileReader.cpp \
            data/fileio/PlaylistFileReader.cpp \
-           data/fileio/DecodingWavFileReader.cpp \
+           data/fileio/TextTest.cpp \
            data/fileio/WavFileReader.cpp \
            data/fileio/WavFileWriter.cpp \
            data/midi/MIDIInput.cpp \
--- a/rdf/RDFImporter.cpp	Mon May 11 14:43:58 2020 +0100
+++ b/rdf/RDFImporter.cpp	Mon May 11 17:27:18 2020 +0100
@@ -36,13 +36,11 @@
 #include "data/fileio/FileSource.h"
 #include "data/fileio/CachedFile.h"
 #include "data/fileio/FileFinder.h"
+#include "data/fileio/TextTest.h"
 
 #include <dataquay/BasicStore.h>
 #include <dataquay/PropertyObject.h>
 
-#include <QFile>
-#include <QXmlInputSource>
-
 using Dataquay::Uri;
 using Dataquay::Node;
 using Dataquay::Nodes;
@@ -890,39 +888,6 @@
 bool
 RDFImporter::isPlausibleDocumentOfAnyKind(QUrl url)
 {
-    // Return true if the document can be opened and contains some
-    // sort of text, either UTF-8 (so it could be Turtle) or another
-    // encoding that is recognised as XML
-    
-    FileSource source(url);
-
-    if (!source.isAvailable()) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to retrieve document from " << url << endl;
-        return false;
-    }
-
-    QFile file(source.getLocalFilename());
-    if (!file.open(QFile::ReadOnly)) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Failed to open local file from " << source.getLocalFilename() << endl;
-        return false;
-    }
-
-    QByteArray bytes = file.read(200);
-
-    if (StringBits::isValidUtf8(bytes.toStdString(), true)) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be UTF-8" << endl;
-        return true; // good enough to be worth trying to parse
-    }
-
-    QXmlInputSource xmlSource;
-    xmlSource.setData(bytes); // guesses text encoding
-
-    if (xmlSource.data().startsWith("<?xml")) {
-        SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document appears to be XML" << endl;
-        return true;
-    }
-
-    SVDEBUG << "NOTE: RDFImporter::isPlausibleDocumentOfAnyKind: Document is not UTF-8 and is not XML, rejecting" << endl;
-    return false;
+    return TextTest::isApparentTextDocument(FileSource(url)); // text sniffing now lives in the shared TextTest helper
 }