diff data/fileio/TextTest.cpp @ 1855:db489a1ece9b

Pull out text-document check; it's useful elsewhere
author Chris Cannam
date Mon, 11 May 2020 17:27:18 +0100
parents
children 4c5736a517e0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/TextTest.cpp	Mon May 11 17:27:18 2020 +0100
@@ -0,0 +1,58 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Sonic Visualiser
+    An audio file viewer and annotation editor.
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#include "TextTest.h"
+
+#include "base/Debug.h"
+#include "base/StringBits.h"
+
+#include <QFile>
+#include <QXmlInputSource>
+
+bool
+TextTest::isApparentTextDocument(FileSource source)
+{
+    // Heuristic test on (at most) the first 200 bytes of the local
+    // copy of the source: accept valid UTF-8 (which might be Turtle,
+    // for example), or anything that decodes to an XML declaration.
+
+    if (!source.isAvailable()) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to retrieve document from " << source.getLocation() << endl;
+        return false;
+    }
+
+    QFile localFile(source.getLocalFilename());
+    if (!localFile.open(QFile::ReadOnly)) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Failed to open local file from " << source.getLocalFilename() << endl;
+        return false;
+    }
+
+    const QByteArray head = localFile.read(200);
+    const bool utf8 = StringBits::isValidUtf8(head.toStdString(), true);
+    if (utf8) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be UTF-8" << endl;
+        return true; // plausible enough that a parse is worth attempting
+    }
+
+    // Not UTF-8: hand the bytes to QXmlInputSource, which guesses the encoding
+    QXmlInputSource decoder;
+    decoder.setData(head);
+    if (!decoder.data().startsWith("<?xml")) {
+        SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document is not UTF-8 and is not XML, rejecting" << endl;
+        return false;
+    }
+
+    SVDEBUG << "NOTE: TextTest::isApparentTextDocument: Document appears to be XML" << endl;
+    return true;
+}