changeset 1585:9570ef94eaa3

Add a mechanism to retrieve the set of plausible separators found in a CSV-like file when guessing its format
author Chris Cannam
date Wed, 09 Jan 2019 14:39:50 +0000
parents 07f23b90701a
children 841b2a3e606d
files data/fileio/CSVFormat.cpp data/fileio/CSVFormat.h data/fileio/test/CSVFormatTest.h data/fileio/test/csv/separator-many.csv
diffstat 4 files changed, 32 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/data/fileio/CSVFormat.cpp	Wed Nov 14 15:46:35 2018 +0000
+++ b/data/fileio/CSVFormat.cpp	Wed Jan 09 14:39:50 2019 +0000
@@ -39,8 +39,6 @@
 bool
 CSVFormat::guessFormatFor(QString path)
 {
-    m_separator = ""; // to prompt guessing for it
-
     m_modelType = TwoDimensionalModel;
     m_timingType = ExplicitTiming;
     m_timeUnits = TimeSeconds;
@@ -108,14 +106,12 @@
     for (int i = 0; i < candidates.length(); ++i) {
         auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
         if (bits.size() >= 2) {
-            SVDEBUG << "Successfully split the line into:" << endl;
-            for (auto b: bits) {
-                SVDEBUG << b << endl;
+            m_plausibleSeparators.insert(candidates[i]);
+            if (m_separator == "") {
+                m_separator = candidates[i];
+                SVDEBUG << "Estimated column separator: '" << m_separator
+                        << "'" << endl;
             }
-            m_separator = candidates[i];
-            SVDEBUG << "Estimated column separator: '" << m_separator
-                    << "'" << endl;
-            return;
         }
     }
 }
@@ -123,9 +119,7 @@
 void
 CSVFormat::guessQualities(QString line, int lineno)
 {
-    if (m_separator == "") {
-        guessSeparator(line);
-    }
+    guessSeparator(line);
 
     QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
 
--- a/data/fileio/CSVFormat.h	Wed Nov 14 15:46:35 2018 +0000
+++ b/data/fileio/CSVFormat.h	Wed Jan 09 14:39:50 2019 +0000
@@ -19,6 +19,8 @@
 #include <QString>
 #include <QStringList>
 
+#include <set>
+
 #include "base/BaseTypes.h"
 
 class CSVFormat
@@ -77,7 +79,7 @@
         m_modelType(TwoDimensionalModel),
         m_timingType(ExplicitTiming),
         m_timeUnits(TimeSeconds),
-        m_separator(","),
+        m_separator(""),
         m_sampleRate(44100),
         m_windowSize(1024),
         m_columnCount(0),
@@ -120,9 +122,13 @@
     AudioSampleRange getAudioSampleRange() const { return m_audioSampleRange; }
     bool         getAllowQuoting()  const { return m_allowQuoting;  }
     QChar        getSeparator()     const { 
-        if (m_separator == "") return ' ';
+        if (m_separator == "") return ',';
         else return m_separator[0];
     }
+    // std::set rather than QSet, so that iteration order is fixed (sorted)
+    std::set<QChar> getPlausibleSeparators() const {
+        return m_plausibleSeparators;
+    }
 
     void setModelType(ModelType t)        { m_modelType    = t; }
     void setTimingType(TimingType t)      { m_timingType   = t; }
@@ -157,7 +163,8 @@
     ModelType    m_modelType;
     TimingType   m_timingType;
     TimeUnits    m_timeUnits;
-    QString      m_separator;
+    QString      m_separator; // "" or a single char - basically QChar option
+    std::set<QChar> m_plausibleSeparators;
     sv_samplerate_t m_sampleRate;
     int          m_windowSize;
 
--- a/data/fileio/test/CSVFormatTest.h	Wed Nov 14 15:46:35 2018 +0000
+++ b/data/fileio/test/CSVFormatTest.h	Wed Jan 09 14:39:50 2019 +0000
@@ -91,6 +91,18 @@
         QCOMPARE(f.getColumnCount(), 3);
     }
     
+    void plausibleSeparators() {
+        CSVFormat f;
+        QVERIFY(f.guessFormatFor(csvDir.filePath("separator-many.csv")));
+        std::set<QChar> p;
+        p.insert(QChar('|'));
+        p.insert(QChar(','));
+        p.insert(QChar(':'));
+        p.insert(QChar(' '));
+        std::set<QChar> actual = f.getPlausibleSeparators();
+        QCOMPARE(actual, p);
+    }
+    
     void comment() {
         CSVFormat f;
         QVERIFY(f.guessFormatFor(csvDir.filePath("comment.csv")));
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/fileio/test/csv/separator-many.csv	Wed Jan 09 14:39:50 2019 +0000
@@ -0,0 +1,4 @@
+This thing|That thing|The other thing
+1|12,4|16,3
+2|14,2|And:another|column
+3|16,1|1901|