diff base/StringBits.cpp @ 1854:bde22957545e

Add support for doubling escapes for quotes in quoted texts in CSV-like formats on import (similar to how we, and the relevant RFC, do escaping on export now)
author Chris Cannam
date Mon, 11 May 2020 14:43:58 +0100
parents 91056142abd0
children
line wrap: on
line diff
--- a/base/StringBits.cpp	Wed May 06 09:08:06 2020 +0100
+++ b/base/StringBits.cpp	Mon May 11 14:43:58 2020 +0100
@@ -72,17 +72,23 @@
 }
     
 QStringList
-StringBits::splitQuoted(QString s, QChar separator)
+StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode)
 {
     QStringList tokens;
     QString tok;
 
-    // sep -> just seen a field separator (or start of line)
+    // beg -> at beginning of line
+    // sep -> just seen a field separator
     // unq -> in an unquoted field
     // q1  -> in a single-quoted field
     // q2  -> in a double-quoted field
 
-    enum { sep, unq, q1, q2 } mode = sep;
+    enum { beg, sep, unq, q1, q2 } mode = beg;
+
+    bool use_doubling = (escapeMode == EscapeDoubling ||
+                         escapeMode == EscapeAny);
+    bool use_backslash = (escapeMode == EscapeBackslash ||
+                          escapeMode == EscapeAny);
 
     for (int i = 0; i < s.length(); ++i) {
         
@@ -90,43 +96,54 @@
 
         if (c == '\'') {
             switch (mode) {
-            case sep: mode = q1; break;
+            case beg: case sep: mode = q1; break;
             case unq: case q2: tok += c; break;
-            case q1: mode = unq; break;
+            case q1:
+                if (use_doubling && i+1 < s.length() && s[i+1] == c) {
+                    tok += c; ++i; break;
+                } else {
+                    mode = unq; break;
+                }
             }
 
         } else if (c == '"') {
             switch (mode) {
-            case sep: mode = q2; break;
+            case beg: case sep: mode = q2; break;
             case unq: case q1: tok += c; break;
-            case q2: mode = unq; break;
+            case q2: 
+                if (use_doubling && i+1 < s.length() && s[i+1] == c) {
+                    tok += c; ++i; break;
+                } else {
+                    mode = unq; break;
+                }
             }
 
         } else if (c == separator || (separator == ' ' && c.isSpace())) {
             switch (mode) {
+            case beg: mode = sep; tokens << ""; break;
             case sep: if (separator != ' ') tokens << ""; break;
             case unq: mode = sep; tokens << tok; tok = ""; break;
             case q1: case q2: tok += c; break;
             }
 
-        } else if (c == '\\') {
+        } else if (c == '\\' && use_backslash) {
             if (++i < s.length()) {
                 c = s[i];
                 switch (mode) {
-                case sep: mode = unq; tok += c; break;
+                case beg: case sep: mode = unq; tok += c; break;
                 case unq: case q1: case q2: tok += c; break;
                 }
             }
 
         } else {
             switch (mode) {
-            case sep: mode = unq; tok += c; break;
+            case beg: case sep: mode = unq; tok += c; break;
             case unq: case q1: case q2: tok += c; break;
             }
         }
     }
 
-    if (tok != "" || mode != sep) {
+    if (tok != "" || mode != beg) {
         if (mode == q1) {
             tokens << ("'" + tok);  // turns out it wasn't quoted after all
         } else if (mode == q2) {