Mercurial > hg > svcore
changeset 1854:bde22957545e
Add support for doubling escapes for quotes in quoted text in CSV-like formats on import (similar to how we, and the relevant RFC, do escaping on export now)
author | Chris Cannam |
---|---|
date | Mon, 11 May 2020 14:43:58 +0100 |
parents | f36fef97ac81 |
children | db489a1ece9b |
files | base/StringBits.cpp base/StringBits.h base/test/TestStringBits.h data/fileio/CSVFormat.cpp data/fileio/test/csv/separator-comma.csv data/fileio/test/csv/separator-space.csv |
diffstat | 6 files changed, 139 insertions(+), 60 deletions(-) [+] |
line wrap: on
line diff
--- a/base/StringBits.cpp Wed May 06 09:08:06 2020 +0100 +++ b/base/StringBits.cpp Mon May 11 14:43:58 2020 +0100 @@ -72,17 +72,23 @@ } QStringList -StringBits::splitQuoted(QString s, QChar separator) +StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode) { QStringList tokens; QString tok; - // sep -> just seen a field separator (or start of line) + // beg -> at beginning of line + // sep -> just seen a field separator // unq -> in an unquoted field // q1 -> in a single-quoted field // q2 -> in a double-quoted field - enum { sep, unq, q1, q2 } mode = sep; + enum { beg, sep, unq, q1, q2 } mode = beg; + + bool use_doubling = (escapeMode == EscapeDoubling || + escapeMode == EscapeAny); + bool use_backslash = (escapeMode == EscapeBackslash || + escapeMode == EscapeAny); for (int i = 0; i < s.length(); ++i) { @@ -90,43 +96,54 @@ if (c == '\'') { switch (mode) { - case sep: mode = q1; break; + case beg: case sep: mode = q1; break; case unq: case q2: tok += c; break; - case q1: mode = unq; break; + case q1: + if (use_doubling && i+1 < s.length() && s[i+1] == c) { + tok += c; ++i; break; + } else { + mode = unq; break; + } } } else if (c == '"') { switch (mode) { - case sep: mode = q2; break; + case beg: case sep: mode = q2; break; case unq: case q1: tok += c; break; - case q2: mode = unq; break; + case q2: + if (use_doubling && i+1 < s.length() && s[i+1] == c) { + tok += c; ++i; break; + } else { + mode = unq; break; + } } } else if (c == separator || (separator == ' ' && c.isSpace())) { switch (mode) { + case beg: mode = sep; tokens << ""; break; case sep: if (separator != ' ') tokens << ""; break; case unq: mode = sep; tokens << tok; tok = ""; break; case q1: case q2: tok += c; break; } - } else if (c == '\\') { + } else if (c == '\\' && use_backslash) { if (++i < s.length()) { c = s[i]; switch (mode) { - case sep: mode = unq; tok += c; break; + case beg: case sep: mode = unq; tok += c; break; case unq: case q1: case q2: tok += c; break; } } } else 
{ switch (mode) { - case sep: mode = unq; tok += c; break; + case beg: case sep: mode = unq; tok += c; break; case unq: case q1: case q2: tok += c; break; } } } - if (tok != "" || mode != sep) { + if (tok != "" || mode != beg) { if (mode == q1) { tokens << ("'" + tok); // turns out it wasn't quoted after all } else if (mode == q2) {
--- a/base/StringBits.h Wed May 06 09:08:06 2020 +0100 +++ b/base/StringBits.h Mon May 11 14:43:58 2020 +0100 @@ -38,6 +38,13 @@ */ static double stringToDoubleLocaleFree(QString s, bool *ok = 0); + enum EscapeMode { + EscapeAny, // support both backslash and doubling escapes + EscapeBackslash, // support backslash escapes only + EscapeDoubling, // support doubling escapes ("" for " etc) only + EscapeNone // support no escapes + }; + /** * Split a string at the given separator character, allowing * quoted sections that contain the separator. If the separator @@ -45,16 +52,20 @@ * single separator. If the separator is another whitespace * character such as '\t', it will be used literally. */ - static QStringList splitQuoted(QString s, QChar separator); + static QStringList splitQuoted(QString s, + QChar separator, + EscapeMode escapeMode = EscapeAny); /** * Split a string at the given separator character. If quoted is - * true, do so by calling splitQuoted (above). If quoted is - * false, use QString::split; if separator is ' ', use - * SkipEmptyParts behaviour, otherwise use KeepEmptyParts (this is - * analogous to the behaviour of splitQuoted). + * true, do so by calling splitQuoted (above) in EscapeAny escape + * mode. If quoted is false, use QString::split; if separator is + * ' ', use SkipEmptyParts behaviour, otherwise use KeepEmptyParts + * (this is analogous to the behaviour of splitQuoted). */ - static QStringList split(QString s, QChar separator, bool quoted); + static QStringList split(QString s, + QChar separator, + bool quoted); /** * Join a vector of strings into a single string, with the
--- a/base/test/TestStringBits.h Wed May 06 09:08:06 2020 +0100 +++ b/base/test/TestStringBits.h Mon May 11 14:43:58 2020 +0100 @@ -44,6 +44,35 @@ } private slots: + void empty() { + QString in = ""; + QStringList out; + testSplitQuoted(in, out); + } + + void empties() { + + // Behaviour here differs based on what the separator is + // (spaces are coalesced) + + QString in; + QStringList out; + out << "" << ""; + + in = " "; + QCOMPARE(StringBits::splitQuoted(in, ' '), out); + + in = ","; + QCOMPARE(StringBits::splitQuoted(in, ','), out); + + in = " "; + QCOMPARE(StringBits::splitQuoted(in, ' '), out); + + in = ",,"; + out << ""; + QCOMPARE(StringBits::splitQuoted(in, ','), out); + } + void simple() { QString in = "a b c d"; QStringList out; @@ -107,6 +136,20 @@ testSplitQuoted(in, out); } + void ddescaped() { + QString in = "a \"b c\"\" d\""; + QStringList out; + out << "a" << "b c\" d"; + testSplitQuoted(in, out); + } + + void sdescaped() { + QString in = "a 'b c'' d'"; + QStringList out; + out << "a" << "b c' d"; + testSplitQuoted(in, out); + } + void dnested() { QString in = "a \"b c' d\""; QStringList out; @@ -179,7 +222,7 @@ void multispace() { QString in = " a'a \\' 'bb' ' \\\"cc\" ' dd\\\" '"; QStringList out; - out << "a'a" << "'" << "bb" << " \"cc\" " << "dd\"" << "'"; + out << "" << "a'a" << "'" << "bb" << " \"cc\" " << "dd\"" << "'"; QCOMPARE(StringBits::splitQuoted(in, ' '), out); QString in2 = ",,a'a,\\',,,,,,,,,'bb',,,,',,,,,,\\\"cc\",',dd\\\",'";
--- a/data/fileio/CSVFormat.cpp Wed May 06 09:08:06 2020 +0100 +++ b/data/fileio/CSVFormat.cpp Mon May 11 14:43:58 2020 +0100 @@ -135,7 +135,9 @@ ColumnIncreasing | ColumnNearEmpty; for (int i = 0; i < cols; ++i) { - + + SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; + while (m_columnQualities.size() <= i) { m_columnQualities.push_back(defaultQualities); m_prevValues.push_back(0.f); @@ -157,57 +159,60 @@ bool signd = (qualities & ColumnSigned); // also defaults to off bool emptyish = (qualities & ColumnNearEmpty); - if (lineno > 1 && s.trimmed() != "") { - emptyish = false; - } + if (s.trimmed() != "") { - float value = 0.f; + if (lineno > 1) { + emptyish = false; + } + + float value = 0.f; - //!!! how to take into account headers? + //!!! how to take into account headers? - if (numeric) { - value = s.toFloat(&ok); - if (!ok) { - value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); - } - if (ok) { - if (lineno < 2 && value > 1000.f) { - large = true; + if (numeric) { + value = s.toFloat(&ok); + if (!ok) { + value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); } - if (value < 0.f) { - signd = true; - } - if (value < -1.f || value > 1.f) { + if (ok) { + if (lineno < 2 && value > 1000.f) { + large = true; + } + if (value < 0.f) { + signd = true; + } + if (value < -1.f || value > 1.f) { + small = false; + } + } else { + numeric = false; + + // If the column is not numeric, it can't be any of + // these things either + integral = false; + increasing = false; small = false; - } - } else { - numeric = false; - - // If the column is not numeric, it can't be any of - // these things either - integral = false; - increasing = false; - small = false; - large = false; - signd = false; - } - } - - if (numeric) { - - if (integral) { - if (s.contains('.') || s.contains(',')) { - integral = false; + large = false; + signd = false; } } - if (increasing) { - if (lineno > 0 && value <= m_prevValues[i]) { - increasing 
= false; + if (numeric) { + + if (integral) { + if (s.contains('.') || s.contains(',')) { + integral = false; + } } + + if (increasing) { + if (lineno > 0 && value <= m_prevValues[i]) { + increasing = false; + } + } + + m_prevValues[i] = value; } - - m_prevValues[i] = value; } m_columnQualities[i] =
--- a/data/fileio/test/csv/separator-comma.csv Wed May 06 09:08:06 2020 +0100 +++ b/data/fileio/test/csv/separator-comma.csv Mon May 11 14:43:58 2020 +0100 @@ -1,4 +1,6 @@ This thing,That thing,The other thing 1,12.4,16.3 2,14.2 +3,"This, that", "and the other" 3,16.1,"This, that\", and the other" +3,16.1,"This, that"", and the other"