Mercurial > hg > svcore
comparison base/StringBits.cpp @ 1854:bde22957545e
Add support for doubling escapes for quotes in quoted texts in CSV-like formats on import (similar to how we, and the relevant RFC, do escaping on export now)
author | Chris Cannam |
---|---|
date | Mon, 11 May 2020 14:43:58 +0100 |
parents | 91056142abd0 |
children |
comparison
equal
deleted
inserted
replaced
1853:f36fef97ac81 | 1854:bde22957545e |
---|---|
70 | 70 |
71 return result * sign; | 71 return result * sign; |
72 } | 72 } |
73 | 73 |
74 QStringList | 74 QStringList |
75 StringBits::splitQuoted(QString s, QChar separator) | 75 StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode) |
76 { | 76 { |
77 QStringList tokens; | 77 QStringList tokens; |
78 QString tok; | 78 QString tok; |
79 | 79 |
80 // sep -> just seen a field separator (or start of line) | 80 // beg -> at beginning of line |
81 // sep -> just seen a field separator | |
81 // unq -> in an unquoted field | 82 // unq -> in an unquoted field |
82 // q1 -> in a single-quoted field | 83 // q1 -> in a single-quoted field |
83 // q2 -> in a double-quoted field | 84 // q2 -> in a double-quoted field |
84 | 85 |
85 enum { sep, unq, q1, q2 } mode = sep; | 86 enum { beg, sep, unq, q1, q2 } mode = beg; |
87 | |
88 bool use_doubling = (escapeMode == EscapeDoubling || | |
89 escapeMode == EscapeAny); | |
90 bool use_backslash = (escapeMode == EscapeBackslash || | |
91 escapeMode == EscapeAny); | |
86 | 92 |
87 for (int i = 0; i < s.length(); ++i) { | 93 for (int i = 0; i < s.length(); ++i) { |
88 | 94 |
89 QChar c = s[i]; | 95 QChar c = s[i]; |
90 | 96 |
91 if (c == '\'') { | 97 if (c == '\'') { |
92 switch (mode) { | 98 switch (mode) { |
93 case sep: mode = q1; break; | 99 case beg: case sep: mode = q1; break; |
94 case unq: case q2: tok += c; break; | 100 case unq: case q2: tok += c; break; |
95 case q1: mode = unq; break; | 101 case q1: |
102 if (use_doubling && i+1 < s.length() && s[i+1] == c) { | |
103 tok += c; ++i; break; | |
104 } else { | |
105 mode = unq; break; | |
106 } | |
96 } | 107 } |
97 | 108 |
98 } else if (c == '"') { | 109 } else if (c == '"') { |
99 switch (mode) { | 110 switch (mode) { |
100 case sep: mode = q2; break; | 111 case beg: case sep: mode = q2; break; |
101 case unq: case q1: tok += c; break; | 112 case unq: case q1: tok += c; break; |
102 case q2: mode = unq; break; | 113 case q2: |
114 if (use_doubling && i+1 < s.length() && s[i+1] == c) { | |
115 tok += c; ++i; break; | |
116 } else { | |
117 mode = unq; break; | |
118 } | |
103 } | 119 } |
104 | 120 |
105 } else if (c == separator || (separator == ' ' && c.isSpace())) { | 121 } else if (c == separator || (separator == ' ' && c.isSpace())) { |
106 switch (mode) { | 122 switch (mode) { |
123 case beg: mode = sep; tokens << ""; break; | |
107 case sep: if (separator != ' ') tokens << ""; break; | 124 case sep: if (separator != ' ') tokens << ""; break; |
108 case unq: mode = sep; tokens << tok; tok = ""; break; | 125 case unq: mode = sep; tokens << tok; tok = ""; break; |
109 case q1: case q2: tok += c; break; | 126 case q1: case q2: tok += c; break; |
110 } | 127 } |
111 | 128 |
112 } else if (c == '\\') { | 129 } else if (c == '\\' && use_backslash) { |
113 if (++i < s.length()) { | 130 if (++i < s.length()) { |
114 c = s[i]; | 131 c = s[i]; |
115 switch (mode) { | 132 switch (mode) { |
116 case sep: mode = unq; tok += c; break; | 133 case beg: case sep: mode = unq; tok += c; break; |
117 case unq: case q1: case q2: tok += c; break; | 134 case unq: case q1: case q2: tok += c; break; |
118 } | 135 } |
119 } | 136 } |
120 | 137 |
121 } else { | 138 } else { |
122 switch (mode) { | 139 switch (mode) { |
123 case sep: mode = unq; tok += c; break; | 140 case beg: case sep: mode = unq; tok += c; break; |
124 case unq: case q1: case q2: tok += c; break; | 141 case unq: case q1: case q2: tok += c; break; |
125 } | 142 } |
126 } | 143 } |
127 } | 144 } |
128 | 145 |
129 if (tok != "" || mode != sep) { | 146 if (tok != "" || mode != beg) { |
130 if (mode == q1) { | 147 if (mode == q1) { |
131 tokens << ("'" + tok); // turns out it wasn't quoted after all | 148 tokens << ("'" + tok); // turns out it wasn't quoted after all |
132 } else if (mode == q2) { | 149 } else if (mode == q2) { |
133 tokens << ("\"" + tok); | 150 tokens << ("\"" + tok); |
134 } else { | 151 } else { |