# HG changeset patch # User Chris Cannam # Date 1589204638 -3600 # Node ID bde22957545e05eceac4b0462c7b23feac108e0b # Parent f36fef97ac8183ef06566e882434e6bde52e32dc Add support for doubling escapes for quotes in quoted texts in CSV-like formats on import (similar to how we, and the relevant RFC, do escaping on export now) diff -r f36fef97ac81 -r bde22957545e base/StringBits.cpp --- a/base/StringBits.cpp Wed May 06 09:08:06 2020 +0100 +++ b/base/StringBits.cpp Mon May 11 14:43:58 2020 +0100 @@ -72,17 +72,23 @@ } QStringList -StringBits::splitQuoted(QString s, QChar separator) +StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode) { QStringList tokens; QString tok; - // sep -> just seen a field separator (or start of line) + // beg -> at beginning of line + // sep -> just seen a field separator // unq -> in an unquoted field // q1 -> in a single-quoted field // q2 -> in a double-quoted field - enum { sep, unq, q1, q2 } mode = sep; + enum { beg, sep, unq, q1, q2 } mode = beg; + + bool use_doubling = (escapeMode == EscapeDoubling || + escapeMode == EscapeAny); + bool use_backslash = (escapeMode == EscapeBackslash || + escapeMode == EscapeAny); for (int i = 0; i < s.length(); ++i) { @@ -90,43 +96,54 @@ if (c == '\'') { switch (mode) { - case sep: mode = q1; break; + case beg: case sep: mode = q1; break; case unq: case q2: tok += c; break; - case q1: mode = unq; break; + case q1: + if (use_doubling && i+1 < s.length() && s[i+1] == c) { + tok += c; ++i; break; + } else { + mode = unq; break; + } } } else if (c == '"') { switch (mode) { - case sep: mode = q2; break; + case beg: case sep: mode = q2; break; case unq: case q1: tok += c; break; - case q2: mode = unq; break; + case q2: + if (use_doubling && i+1 < s.length() && s[i+1] == c) { + tok += c; ++i; break; + } else { + mode = unq; break; + } } } else if (c == separator || (separator == ' ' && c.isSpace())) { switch (mode) { + case beg: mode = sep; tokens << ""; break; case sep: if (separator != ' ') tokens << ""; break; case unq: mode = sep; tokens << tok; tok = ""; break; case q1: case q2: tok += c; break; } - } else if (c == '\\') { + } else if (c == '\\' && use_backslash) { if (++i < s.length()) { c = s[i]; switch (mode) { - case sep: mode = unq; tok += c; break; + case beg: case sep: mode = unq; tok += c; break; case unq: case q1: case q2: tok += c; break; } } } else { switch (mode) { - case sep: mode = unq; tok += c; break; + case beg: case sep: mode = unq; tok += c; break; case unq: case q1: case q2: tok += c; break; } } } - if (tok != "" || mode != sep) { + if (tok != "" || mode != beg) { if (mode == q1) { tokens << ("'" + tok); // turns out it wasn't quoted after all } else if (mode == q2) { diff -r f36fef97ac81 -r bde22957545e base/StringBits.h --- a/base/StringBits.h Wed May 06 09:08:06 2020 +0100 +++ b/base/StringBits.h Mon May 11 14:43:58 2020 +0100 @@ -38,6 +38,13 @@ */ static double stringToDoubleLocaleFree(QString s, bool *ok = 0); + enum EscapeMode { + EscapeAny, // support both backslash and doubling escapes + EscapeBackslash, // support backslash escapes only + EscapeDoubling, // support doubling escapes ("" for " etc) only + EscapeNone // support no escapes + }; + /** * Split a string at the given separator character, allowing * quoted sections that contain the separator. If the separator @@ -45,16 +52,20 @@ * single separator. If the separator is another whitespace * character such as '\t', it will be used literally. */ - static QStringList splitQuoted(QString s, QChar separator); + static QStringList splitQuoted(QString s, + QChar separator, + EscapeMode escapeMode = EscapeAny); /** * Split a string at the given separator character. If quoted is - * true, do so by calling splitQuoted (above). If quoted is - * false, use QString::split; if separator is ' ', use - * SkipEmptyParts behaviour, otherwise use KeepEmptyParts (this is - * analogous to the behaviour of splitQuoted). + * true, do so by calling splitQuoted (above) in EscapeAny escape + * mode. If quoted is false, use QString::split; if separator is + * ' ', use SkipEmptyParts behaviour, otherwise use KeepEmptyParts + * (this is analogous to the behaviour of splitQuoted). */ - static QStringList split(QString s, QChar separator, bool quoted); + static QStringList split(QString s, + QChar separator, + bool quoted); /** * Join a vector of strings into a single string, with the diff -r f36fef97ac81 -r bde22957545e base/test/TestStringBits.h --- a/base/test/TestStringBits.h Wed May 06 09:08:06 2020 +0100 +++ b/base/test/TestStringBits.h Mon May 11 14:43:58 2020 +0100 @@ -44,6 +44,35 @@ } private slots: + void empty() { + QString in = ""; + QStringList out; + testSplitQuoted(in, out); + } + + void empties() { + + // Behaviour here differs based on what the separator is + // (spaces are coalesced) + + QString in; + QStringList out; + out << "" << ""; + + in = " "; + QCOMPARE(StringBits::splitQuoted(in, ' '), out); + + in = ","; + QCOMPARE(StringBits::splitQuoted(in, ','), out); + + in = " "; + QCOMPARE(StringBits::splitQuoted(in, ' '), out); + + in = ",,"; + out << ""; + QCOMPARE(StringBits::splitQuoted(in, ','), out); + } + void simple() { QString in = "a b c d"; QStringList out; @@ -107,6 +136,20 @@ testSplitQuoted(in, out); } + void ddescaped() { + QString in = "a \"b c\"\" d\""; + QStringList out; + out << "a" << "b c\" d"; + testSplitQuoted(in, out); + } + + void sdescaped() { + QString in = "a 'b c'' d'"; + QStringList out; + out << "a" << "b c' d"; + testSplitQuoted(in, out); + } + void dnested() { QString in = "a \"b c' d\""; QStringList out; @@ -179,7 +222,7 @@ void multispace() { QString in = " a'a \\' 'bb' ' \\\"cc\" ' dd\\\" '"; QStringList out; - out << "a'a" << "'" << "bb" << " \"cc\" " << "dd\"" << "'"; + out << "" << "a'a" << "'" << "bb" << " \"cc\" " << "dd\"" << "'"; QCOMPARE(StringBits::splitQuoted(in, ' '), out); QString in2 = ",,a'a,\\',,,,,,,,,'bb',,,,',,,,,,\\\"cc\",',dd\\\",'"; diff -r f36fef97ac81 -r bde22957545e data/fileio/CSVFormat.cpp --- a/data/fileio/CSVFormat.cpp Wed May 06 09:08:06 2020 +0100 +++ b/data/fileio/CSVFormat.cpp Mon May 11 14:43:58 2020 +0100 @@ -135,7 +135,9 @@ ColumnIncreasing | ColumnNearEmpty; for (int i = 0; i < cols; ++i) { - + + SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; + while (m_columnQualities.size() <= i) { m_columnQualities.push_back(defaultQualities); m_prevValues.push_back(0.f); @@ -157,57 +159,60 @@ bool signd = (qualities & ColumnSigned); // also defaults to off bool emptyish = (qualities & ColumnNearEmpty); - if (lineno > 1 && s.trimmed() != "") { - emptyish = false; - } + if (s.trimmed() != "") { - float value = 0.f; + if (lineno > 1) { + emptyish = false; + } + + float value = 0.f; - //!!! how to take into account headers? + //!!! how to take into account headers? - if (numeric) { - value = s.toFloat(&ok); - if (!ok) { - value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); - } - if (ok) { - if (lineno < 2 && value > 1000.f) { - large = true; + if (numeric) { + value = s.toFloat(&ok); + if (!ok) { + value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); } - if (value < 0.f) { - signd = true; - } - if (value < -1.f || value > 1.f) { + if (ok) { + if (lineno < 2 && value > 1000.f) { + large = true; + } + if (value < 0.f) { + signd = true; + } + if (value < -1.f || value > 1.f) { + small = false; + } + } else { + numeric = false; + + // If the column is not numeric, it can't be any of + // these things either + integral = false; + increasing = false; small = false; - } - } else { - numeric = false; - - // If the column is not numeric, it can't be any of - // these things either - integral = false; - increasing = false; - small = false; - large = false; - signd = false; - } - } - - if (numeric) { - - if (integral) { - if (s.contains('.') || s.contains(',')) { - integral = false; + large = false; + signd = false; } } - if (increasing) { - if (lineno > 0 && value <= m_prevValues[i]) { - increasing = false; + if (numeric) { + + if (integral) { + if (s.contains('.') || s.contains(',')) { + integral = false; + } } + + if (increasing) { + if (lineno > 0 && value <= m_prevValues[i]) { + increasing = false; + } + } + + m_prevValues[i] = value; } - - m_prevValues[i] = value; } m_columnQualities[i] = diff -r f36fef97ac81 -r bde22957545e data/fileio/test/csv/separator-comma.csv --- a/data/fileio/test/csv/separator-comma.csv Wed May 06 09:08:06 2020 +0100 +++ b/data/fileio/test/csv/separator-comma.csv Mon May 11 14:43:58 2020 +0100 @@ -1,4 +1,6 @@ This thing,That thing,The other thing 1,12.4,16.3 2,14.2 +3,"This, that", "and the other" 3,16.1,"This, that\", and the other" +3,16.1,"This, that"", and the other" diff -r f36fef97ac81 -r bde22957545e data/fileio/test/csv/separator-space.csv --- a/data/fileio/test/csv/separator-space.csv Wed May 06 09:08:06 2020 +0100 +++ b/data/fileio/test/csv/separator-space.csv Mon May 11 14:43:58 2020 +0100 @@ -2,3 +2,4 @@ 1 12,4 16,3 2 14,2 3 16,1 1901 +"This thing" "The ""second"" thing" "The other thing"