changeset 1524:64ef24ebb19c

Some CSV format tests and minor fixes
author Chris Cannam
date Fri, 14 Sep 2018 09:25:17 +0100
parents c1b2eab6ac51
children a92e94215863
files base/StringBits.h base/test/TestStringBits.h data/fileio/CSVFormat.cpp data/fileio/CSVFormat.h data/fileio/test/CSVFormatTest.h data/fileio/test/csv/column-qualities.csv data/fileio/test/csv/comment.csv data/fileio/test/csv/separator-colon.csv data/fileio/test/csv/separator-comma.csv data/fileio/test/csv/separator-pipe.csv data/fileio/test/csv/separator-space.csv data/fileio/test/csv/separator-tab.csv data/fileio/test/files.pri data/fileio/test/svcore-data-fileio-test.cpp
diffstat 14 files changed, 270 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/base/StringBits.h	Wed Sep 12 18:49:32 2018 +0100
+++ b/base/StringBits.h	Fri Sep 14 09:25:17 2018 +0100
@@ -18,8 +18,8 @@
    This file copyright 2000-2010 Chris Cannam.
 */
 
-#ifndef _STRING_BITS_H_
-#define _STRING_BITS_H_
+#ifndef SV_STRING_BITS_H
+#define SV_STRING_BITS_H
 
 #include <QString>
 #include <QStringList>
--- a/base/test/TestStringBits.h	Wed Sep 12 18:49:32 2018 +0100
+++ b/base/test/TestStringBits.h	Fri Sep 14 09:25:17 2018 +0100
@@ -128,6 +128,34 @@
         testSplitQuoted(in, out);
     }
 
+    void snested3() {
+        QString in = "'aa bb cc\"' dd";
+        QStringList out;            
+        out << "aa bb cc\"" << "dd";
+        testSplitQuoted(in, out);
+    }
+
+    void snested3a() {
+        QString in = "\"aa bb cc'\" dd";
+        QStringList out;            
+        out << "aa bb cc'" << "dd";
+        testSplitQuoted(in, out);
+    }
+
+    void snested4() {
+        QString in = "'aa \"bb cc\" dd'";
+        QStringList out;            
+        out << "aa \"bb cc\" dd";
+        testSplitQuoted(in, out);
+    }
+
+    void snested4a() {
+        QString in = "\"aa 'bb cc' dd\"";
+        QStringList out;            
+        out << "aa 'bb cc' dd";
+        testSplitQuoted(in, out);
+    }
+
     void qquoted() {
         QString in = "a'a 'bb' \\\"cc\" dd\\\"";
         QStringList out;                 
@@ -135,6 +163,19 @@
         testSplitQuoted(in, out);
     }
 
+    void qspace() {
+        QString in = "\"a a\":\"b:b\":\"c d\"";
+        QStringList out1;
+        // A quote can't start in the middle of a bare field - quotes
+        // are handled only if the first character in the field is a
+        // quote. Otherwise we'd have trouble with apostrophes etc
+        out1 << "a a:\"b:b\":\"c" << "d\"";
+        QCOMPARE(StringBits::splitQuoted(in, ' '), out1);
+        QStringList out2;
+        out2 << "a a" << "b:b" << "c d";
+        QCOMPARE(StringBits::splitQuoted(in, ':'), out2);
+    }
+    
     void multispace() {
         QString in = "  a'a \\'         'bb'    '      \\\"cc\" ' dd\\\" '";
         QStringList out;                                            
--- a/data/fileio/CSVFormat.cpp	Wed Sep 12 18:49:32 2018 +0100
+++ b/data/fileio/CSVFormat.cpp	Fri Sep 14 09:25:17 2018 +0100
@@ -33,12 +33,14 @@
     m_windowSize(1024),
     m_allowQuoting(true)
 {
-    guessFormatFor(path);
+    (void)guessFormatFor(path);
 }
 
-void
+bool
 CSVFormat::guessFormatFor(QString path)
 {
+    m_separator = ""; // to prompt guessing for it
+
     m_modelType = TwoDimensionalModel;
     m_timingType = ExplicitTiming;
     m_timeUnits = TimeSeconds;
@@ -53,8 +55,17 @@
     m_prevValues.clear();
 
     QFile file(path);
-    if (!file.exists()) return;
-    if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
+    if (!file.exists()) {
+        SVCERR << "CSVFormat::guessFormatFor(" << path
+               << "): File does not exist" << endl;
+        return false;
+    }
+    if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
+        SVCERR << "CSVFormat::guessFormatFor(" << path
+               << "): File could not be opened for reading" << endl;
+        return false;
+    }
+    SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl;
 
     QTextStream in(&file);
     in.seek(0);
@@ -85,14 +96,22 @@
 
     guessPurposes();
     guessAudioSampleRange();
+
+    return true;
 }
 
 void
 CSVFormat::guessSeparator(QString line)
 {
-    char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
-    for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
-        if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
+    QString candidates = "\t|,/: ";
+
+    for (int i = 0; i < candidates.length(); ++i) {
+        auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
+        if (bits.size() >= 2) {
+            SVDEBUG << "Successfully split the line into:" << endl;
+            for (auto b: bits) {
+                SVDEBUG << b << endl;
+            }
             m_separator = candidates[i];
             SVDEBUG << "Estimated column separator: '" << m_separator
                     << "'" << endl;
@@ -104,7 +123,9 @@
 void
 CSVFormat::guessQualities(QString line, int lineno)
 {
-    if (m_separator == "") guessSeparator(line);
+    if (m_separator == "") {
+        guessSeparator(line);
+    }
 
     QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
 
@@ -167,6 +188,14 @@
                 }
             } else {
                 numeric = false;
+
+                // If the column is not numeric, it can't be any of
+                // these things either
+                integral = false;
+                increasing = false;
+                small = false;
+                large = false;
+                signd = false;
             }
         }
 
@@ -186,7 +215,7 @@
 
             m_prevValues[i] = value;
         }
-
+        
         m_columnQualities[i] =
             (numeric    ? ColumnNumeric : 0) |
             (integral   ? ColumnIntegral : 0) |
--- a/data/fileio/CSVFormat.h	Wed Sep 12 18:49:32 2018 +0100
+++ b/data/fileio/CSVFormat.h	Fri Sep 14 09:25:17 2018 +0100
@@ -95,8 +95,21 @@
      * string, the separator character will also be guessed; otherwise
      * the current separator will be used.  The other properties of
      * this object will be set according to guesses from the file.
+     *
+     * The properties that are guessed from the file contents are:
+     * separator, column count, variable-column-count flag, audio
+     * sample range, timing type, time units, column qualities, column
+     * purposes, and model type. The sample rate and window size
+     * cannot be guessed and will not be changed by this function.
+     * Note also that this function will never guess WaveFileModel for
+     * the model type.
+     *
+     * Return false if there is some fundamental error, e.g. the file
+     * could not be opened at all. Return true otherwise. Note that
+     * this function returns true even if the file doesn't appear to
+     * make much sense as a data format.
      */
-    void guessFormatFor(QString path);
+    bool guessFormatFor(QString path);
  
     ModelType    getModelType()     const { return m_modelType;     }
     TimingType   getTimingType()    const { return m_timingType;    }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/CSVFormatTest.h	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,130 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    Sonic Visualiser
+    An audio file viewer and annotation editor.
+    Centre for Digital Music, Queen Mary, University of London.
+    
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License as
+    published by the Free Software Foundation; either version 2 of the
+    License, or (at your option) any later version.  See the file
+    COPYING included with this distribution for more information.
+*/
+
+#ifndef TEST_CSV_FORMAT_H
+#define TEST_CSV_FORMAT_H
+
+// Tests for the code that guesses the most likely format for parsing a CSV file
+
+#include "../CSVFormat.h"
+
+#include "base/Debug.h"
+
+#include <cmath>
+
+#include <QObject>
+#include <QtTest>
+#include <QDir>
+
+#include <iostream>
+
+using namespace std;
+
+class CSVFormatTest : public QObject
+{
+    Q_OBJECT
+
+private:
+    QDir csvDir;
+
+public:
+    CSVFormatTest(QString base) {
+        if (base == "") {
+            base = "svcore/data/fileio/test";
+        }
+        csvDir = QDir(base + "/csv");
+    }
+
+private slots:
+    void init() {
+        if (!csvDir.exists()) {
+            SVCERR << "ERROR: CSV test file directory \"" << csvDir.absolutePath() << "\" does not exist" << endl;
+            QVERIFY2(csvDir.exists(), "CSV test file directory not found");
+        }
+    }
+
+    void separatorComma() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-comma.csv")));
+        QCOMPARE(f.getSeparator(), QChar(','));
+        QCOMPARE(f.getColumnCount(), 3);
+    }
+    
+    void separatorTab() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-tab.csv")));
+        QCOMPARE(f.getSeparator(), QChar('\t'));
+        QCOMPARE(f.getColumnCount(), 3);
+    }
+    
+    void separatorPipe() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-pipe.csv")));
+        QCOMPARE(f.getSeparator(), QChar('|'));
+        // 4 columns here, not the 3 expected in the other separator tests
+        QCOMPARE(f.getColumnCount(), 4);
+    }
+    
+    void separatorSpace() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-space.csv")));
+        QCOMPARE(f.getSeparator(), QChar(' '));
+        // NB fields are separated by 1 or more spaces, not necessarily exactly 1
+        QCOMPARE(f.getColumnCount(), 3);
+    }
+    
+    void separatorColon() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-colon.csv")));
+        QCOMPARE(f.getSeparator(), QChar(':'));
+        QCOMPARE(f.getColumnCount(), 3);
+    }
+    
+    void comment() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("comment.csv")));
+        QCOMPARE(f.getSeparator(), QChar(','));
+        QCOMPARE(f.getColumnCount(), 4);
+    }
+
+    void qualities() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("column-qualities.csv")));
+        QCOMPARE(f.getSeparator(), QChar(','));
+        QCOMPARE(f.getColumnCount(), 7);
+        QList<CSVFormat::ColumnQualities> q = f.getColumnQualities();
+        QList<CSVFormat::ColumnQualities> expected;
+        expected << 0;
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric |
+                                               CSVFormat::ColumnIntegral |
+                                               CSVFormat::ColumnIncreasing);
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric |
+                                               CSVFormat::ColumnIntegral |
+                                               CSVFormat::ColumnIncreasing |
+                                               CSVFormat::ColumnLarge);
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric);
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric |
+                                               CSVFormat::ColumnIncreasing);
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric |
+                                               CSVFormat::ColumnSmall |
+                                               CSVFormat::ColumnSigned);
+        expected << CSVFormat::ColumnQualities(CSVFormat::ColumnNumeric |
+                                               CSVFormat::ColumnIntegral |
+                                               CSVFormat::ColumnIncreasing |
+                                               CSVFormat::ColumnNearEmpty);
+        QCOMPARE(q, expected);
+    }
+};
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/column-qualities.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,9 @@
+Text only,4,1024,45.6,45.7,-0.001,987
+Blah,5,2048,45.1,45.9,0.0123,
+
+# Include the odd blank line, space, and comment
+
+
+Parp, 6, 3072 , 44.7 ,52.1, 0.26,
+
+Toot,7,4096,42.2,57.9,0.0,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/comment.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,5 @@
+# This is a comment
+# This is a comment with various | possible | but not real	separators in it
+This is,the first,of the,real data lines
+# This,is,one,that,would,cause,more,columns,to,be,counted,if,it,were,real
+This is the,second
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-colon.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,4 @@
+"This thing":"That thing":"The other thing"
+1:12,4:16,3
+2:14,2
+3:16,1:1901
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-comma.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,4 @@
+This thing,That thing,The other thing
+1,12.4,16.3
+2,14.2
+3,16.1,"This, that\", and the other"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-pipe.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,4 @@
+This thing|That thing|The other thing
+1|12,4|16,3
+2|14,2|And another|column
+3|16,1|1901|"Not another|column - we have four columns, not five"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-space.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,4 @@
+"This thing" "That thing" "The other thing"
+1            12,4         16,3
+2            14,2
+3            16,1         1901
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-tab.csv	Fri Sep 14 09:25:17 2018 +0100
@@ -0,0 +1,4 @@
+This thing	That thing	The other thing
+1	12,4	16,3
+2	14,2
+3	16,1	1901
--- a/data/fileio/test/files.pri	Wed Sep 12 18:49:32 2018 +0100
+++ b/data/fileio/test/files.pri	Fri Sep 14 09:25:17 2018 +0100
@@ -6,6 +6,7 @@
 	AudioTestData.h \
 	EncodingTest.h \
 	MIDIFileReaderTest.h \
+	CSVFormatTest.h \
 	CSVStreamWriterTest.h
      
 TEST_SOURCES += \
--- a/data/fileio/test/svcore-data-fileio-test.cpp	Wed Sep 12 18:49:32 2018 +0100
+++ b/data/fileio/test/svcore-data-fileio-test.cpp	Fri Sep 14 09:25:17 2018 +0100
@@ -16,6 +16,7 @@
 #include "AudioFileWriterTest.h"
 #include "EncodingTest.h"
 #include "MIDIFileReaderTest.h"
+#include "CSVFormatTest.h"
 #include "CSVStreamWriterTest.h"
 
 #include <QtTest>
@@ -40,7 +41,7 @@
 
     QCoreApplication app(argc, argv);
     app.setOrganizationName("sonic-visualiser");
-    app.setApplicationName("test-fileio");
+    app.setApplicationName("test-svcore-data-fileio");
 
     if (testDir != "") {
         SVCERR << "Setting test directory base path to \"" << testDir << "\"" << endl;
@@ -71,16 +72,22 @@
     }
 
     {
+        CSVFormatTest t(testDir);
+        if (QTest::qExec(&t, argc, argv) == 0) ++good;
+        else ++bad;
+    }
+
+    {
         CSVStreamWriterTest t;
         if (QTest::qExec(&t, argc, argv) == 0) ++good;
         else ++bad;
     }
 
     if (bad > 0) {
-    SVCERR << "\n********* " << bad << " test suite(s) failed!\n" << endl;
+        SVCERR << "\n********* " << bad << " test suite(s) failed!\n" << endl;
         return 1;
     } else {
-    SVCERR << "All tests passed" << endl;
+        SVCERR << "All tests passed" << endl;
         return 0;
     }
 }