changeset 1854:bde22957545e

Add support for doubling escapes for quotes (e.g. "" for ") within quoted text when importing CSV-like formats (similar to how we, and the relevant RFC, now do escaping on export)
author Chris Cannam
date Mon, 11 May 2020 14:43:58 +0100
parents f36fef97ac81
children db489a1ece9b
files base/StringBits.cpp base/StringBits.h base/test/TestStringBits.h data/fileio/CSVFormat.cpp data/fileio/test/csv/separator-comma.csv data/fileio/test/csv/separator-space.csv
diffstat 6 files changed, 139 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- a/base/StringBits.cpp	Wed May 06 09:08:06 2020 +0100
+++ b/base/StringBits.cpp	Mon May 11 14:43:58 2020 +0100
@@ -72,17 +72,23 @@
 }
     
 QStringList
-StringBits::splitQuoted(QString s, QChar separator)
+StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode)
 {
     QStringList tokens;
     QString tok;
 
-    // sep -> just seen a field separator (or start of line)
+    // beg -> at beginning of line
+    // sep -> just seen a field separator
     // unq -> in an unquoted field
     // q1  -> in a single-quoted field
     // q2  -> in a double-quoted field
 
-    enum { sep, unq, q1, q2 } mode = sep;
+    enum { beg, sep, unq, q1, q2 } mode = beg;
+
+    bool use_doubling = (escapeMode == EscapeDoubling ||
+                         escapeMode == EscapeAny);
+    bool use_backslash = (escapeMode == EscapeBackslash ||
+                          escapeMode == EscapeAny);
 
     for (int i = 0; i < s.length(); ++i) {
         
@@ -90,43 +96,54 @@
 
         if (c == '\'') {
             switch (mode) {
-            case sep: mode = q1; break;
+            case beg: case sep: mode = q1; break;
             case unq: case q2: tok += c; break;
-            case q1: mode = unq; break;
+            case q1:
+                if (use_doubling && i+1 < s.length() && s[i+1] == c) {
+                    tok += c; ++i; break;
+                } else {
+                    mode = unq; break;
+                }
             }
 
         } else if (c == '"') {
             switch (mode) {
-            case sep: mode = q2; break;
+            case beg: case sep: mode = q2; break;
             case unq: case q1: tok += c; break;
-            case q2: mode = unq; break;
+            case q2: 
+                if (use_doubling && i+1 < s.length() && s[i+1] == c) {
+                    tok += c; ++i; break;
+                } else {
+                    mode = unq; break;
+                }
             }
 
         } else if (c == separator || (separator == ' ' && c.isSpace())) {
             switch (mode) {
+            case beg: mode = sep; tokens << ""; break;
             case sep: if (separator != ' ') tokens << ""; break;
             case unq: mode = sep; tokens << tok; tok = ""; break;
             case q1: case q2: tok += c; break;
             }
 
-        } else if (c == '\\') {
+        } else if (c == '\\' && use_backslash) {
             if (++i < s.length()) {
                 c = s[i];
                 switch (mode) {
-                case sep: mode = unq; tok += c; break;
+                case beg: case sep: mode = unq; tok += c; break;
                 case unq: case q1: case q2: tok += c; break;
                 }
             }
 
         } else {
             switch (mode) {
-            case sep: mode = unq; tok += c; break;
+            case beg: case sep: mode = unq; tok += c; break;
             case unq: case q1: case q2: tok += c; break;
             }
         }
     }
 
-    if (tok != "" || mode != sep) {
+    if (tok != "" || mode != beg) {
         if (mode == q1) {
             tokens << ("'" + tok);  // turns out it wasn't quoted after all
         } else if (mode == q2) {
--- a/base/StringBits.h	Wed May 06 09:08:06 2020 +0100
+++ b/base/StringBits.h	Mon May 11 14:43:58 2020 +0100
@@ -38,6 +38,13 @@
      */
     static double stringToDoubleLocaleFree(QString s, bool *ok = 0);
 
+    enum EscapeMode {
+        EscapeAny,             // support both backslash and doubling escapes
+        EscapeBackslash,       // support backslash escapes only
+        EscapeDoubling,        // support doubling escapes ("" for " etc) only
+        EscapeNone             // support no escapes
+    };
+    
     /**
      * Split a string at the given separator character, allowing
      * quoted sections that contain the separator.  If the separator
@@ -45,16 +52,20 @@
      * single separator.  If the separator is another whitespace
      * character such as '\t', it will be used literally.
      */
-    static QStringList splitQuoted(QString s, QChar separator);
+    static QStringList splitQuoted(QString s,
+                                   QChar separator,
+                                   EscapeMode escapeMode = EscapeAny);
 
     /**
      * Split a string at the given separator character.  If quoted is
-     * true, do so by calling splitQuoted (above).  If quoted is
-     * false, use QString::split; if separator is ' ', use
-     * SkipEmptyParts behaviour, otherwise use KeepEmptyParts (this is
-     * analogous to the behaviour of splitQuoted).
+     * true, do so by calling splitQuoted (above) in EscapeAny escape
+     * mode.  If quoted is false, use QString::split; if separator is
+     * ' ', use SkipEmptyParts behaviour, otherwise use KeepEmptyParts
+     * (this is analogous to the behaviour of splitQuoted).
      */
-    static QStringList split(QString s, QChar separator, bool quoted);
+    static QStringList split(QString s,
+                             QChar separator,
+                             bool quoted);
 
     /**
      * Join a vector of strings into a single string, with the
--- a/base/test/TestStringBits.h	Wed May 06 09:08:06 2020 +0100
+++ b/base/test/TestStringBits.h	Mon May 11 14:43:58 2020 +0100
@@ -44,6 +44,35 @@
     }
 
 private slots:
+    void empty() {
+        QString in = "";
+        QStringList out;     
+        testSplitQuoted(in, out);
+    }
+
+    void empties() {
+
+        // Behaviour here differs based on what the separator is: a run
+        // of spaces is coalesced into one separator, other separators are not
+
+        QString in;
+        QStringList out;
+        out << "" << "";
+
+        in = " ";
+        QCOMPARE(StringBits::splitQuoted(in, ' '), out);
+
+        in = ",";
+        QCOMPARE(StringBits::splitQuoted(in, ','), out);
+
+        in = "  ";
+        QCOMPARE(StringBits::splitQuoted(in, ' '), out);
+
+        in = ",,";
+        out << "";
+        QCOMPARE(StringBits::splitQuoted(in, ','), out);
+    }
+
     void simple() {
         QString in = "a b c d";
         QStringList out;     
@@ -107,6 +136,20 @@
         testSplitQuoted(in, out);
     }
 
+    void ddescaped() {
+        QString in = "a \"b c\"\" d\"";
+        QStringList out;         
+        out << "a" << "b c\" d"; 
+        testSplitQuoted(in, out);
+    }
+
+    void sdescaped() {
+        QString in = "a 'b c'' d'";
+        QStringList out;         
+        out << "a" << "b c' d"; 
+        testSplitQuoted(in, out);
+    }
+
     void dnested() {
         QString in = "a \"b c' d\"";
         QStringList out;        
@@ -179,7 +222,7 @@
     void multispace() {
         QString in = "  a'a \\'         'bb'    '      \\\"cc\" ' dd\\\" '";
         QStringList out;                                            
-        out << "a'a" << "'" << "bb" << "      \"cc\" " << "dd\"" << "'";
+        out << "" << "a'a" << "'" << "bb" << "      \"cc\" " << "dd\"" << "'";
         QCOMPARE(StringBits::splitQuoted(in, ' '), out);
 
         QString in2 = ",,a'a,\\',,,,,,,,,'bb',,,,',,,,,,\\\"cc\",',dd\\\",'";
--- a/data/fileio/CSVFormat.cpp	Wed May 06 09:08:06 2020 +0100
+++ b/data/fileio/CSVFormat.cpp	Mon May 11 14:43:58 2020 +0100
@@ -135,7 +135,9 @@
         ColumnIncreasing | ColumnNearEmpty;
     
     for (int i = 0; i < cols; ++i) {
-            
+
+        SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
+        
         while (m_columnQualities.size() <= i) {
             m_columnQualities.push_back(defaultQualities);
             m_prevValues.push_back(0.f);
@@ -157,57 +159,60 @@
         bool signd      = (qualities & ColumnSigned); // also defaults to off
         bool emptyish   = (qualities & ColumnNearEmpty);
 
-        if (lineno > 1 && s.trimmed() != "") {
-            emptyish = false;
-        }
+        if (s.trimmed() != "") {
         
-        float value = 0.f;
+            if (lineno > 1) {
+                emptyish = false;
+            }
+        
+            float value = 0.f;
 
-        //!!! how to take into account headers?
+            //!!! how to take into account headers?
 
-        if (numeric) {
-            value = s.toFloat(&ok);
-            if (!ok) {
-                value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
-            }
-            if (ok) {
-                if (lineno < 2 && value > 1000.f) {
-                    large = true;
+            if (numeric) {
+                value = s.toFloat(&ok);
+                if (!ok) {
+                    value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
                 }
-                if (value < 0.f) {
-                    signd = true;
-                }
-                if (value < -1.f || value > 1.f) {
+                if (ok) {
+                    if (lineno < 2 && value > 1000.f) {
+                        large = true;
+                    }
+                    if (value < 0.f) {
+                        signd = true;
+                    }
+                    if (value < -1.f || value > 1.f) {
+                        small = false;
+                    }
+                } else {
+                    numeric = false;
+
+                    // If the column is not numeric, it can't be any of
+                    // these things either
+                    integral = false;
+                    increasing = false;
                     small = false;
-                }
-            } else {
-                numeric = false;
-
-                // If the column is not numeric, it can't be any of
-                // these things either
-                integral = false;
-                increasing = false;
-                small = false;
-                large = false;
-                signd = false;
-            }
-        }
-
-        if (numeric) {
-
-            if (integral) {
-                if (s.contains('.') || s.contains(',')) {
-                    integral = false;
+                    large = false;
+                    signd = false;
                 }
             }
 
-            if (increasing) {
-                if (lineno > 0 && value <= m_prevValues[i]) {
-                    increasing = false;
+            if (numeric) {
+
+                if (integral) {
+                    if (s.contains('.') || s.contains(',')) {
+                        integral = false;
+                    }
                 }
+
+                if (increasing) {
+                    if (lineno > 0 && value <= m_prevValues[i]) {
+                        increasing = false;
+                    }
+                }
+
+                m_prevValues[i] = value;
             }
-
-            m_prevValues[i] = value;
         }
         
         m_columnQualities[i] =
--- a/data/fileio/test/csv/separator-comma.csv	Wed May 06 09:08:06 2020 +0100
+++ b/data/fileio/test/csv/separator-comma.csv	Mon May 11 14:43:58 2020 +0100
@@ -1,4 +1,6 @@
 This thing,That thing,The other thing
 1,12.4,16.3
 2,14.2
+3,"This, that", "and the other"
 3,16.1,"This, that\", and the other"
+3,16.1,"This, that"", and the other"
--- a/data/fileio/test/csv/separator-space.csv	Wed May 06 09:08:06 2020 +0100
+++ b/data/fileio/test/csv/separator-space.csv	Mon May 11 14:43:58 2020 +0100
@@ -2,3 +2,4 @@
 1            12,4         16,3
 2            14,2
 3            16,1         1901
+"This thing" "The ""second"" thing" "The other thing"