comparison base/StringBits.cpp @ 1854:bde22957545e

Add support for doubled-quote escapes (a quote character written twice) inside quoted fields when importing CSV-like formats, similar to how we, and the relevant RFC, already escape quotes on export
author Chris Cannam
date Mon, 11 May 2020 14:43:58 +0100
parents 91056142abd0
children
comparison
equal deleted inserted replaced
1853:f36fef97ac81 1854:bde22957545e
70 70
71 return result * sign; 71 return result * sign;
72 } 72 }
73 73
74 QStringList 74 QStringList
75 StringBits::splitQuoted(QString s, QChar separator) 75 StringBits::splitQuoted(QString s, QChar separator, EscapeMode escapeMode)
76 { 76 {
77 QStringList tokens; 77 QStringList tokens;
78 QString tok; 78 QString tok;
79 79
80 // sep -> just seen a field separator (or start of line) 80 // beg -> at beginning of line
81 // sep -> just seen a field separator
81 // unq -> in an unquoted field 82 // unq -> in an unquoted field
82 // q1 -> in a single-quoted field 83 // q1 -> in a single-quoted field
83 // q2 -> in a double-quoted field 84 // q2 -> in a double-quoted field
84 85
85 enum { sep, unq, q1, q2 } mode = sep; 86 enum { beg, sep, unq, q1, q2 } mode = beg;
87
88 bool use_doubling = (escapeMode == EscapeDoubling ||
89 escapeMode == EscapeAny);
90 bool use_backslash = (escapeMode == EscapeBackslash ||
91 escapeMode == EscapeAny);
86 92
87 for (int i = 0; i < s.length(); ++i) { 93 for (int i = 0; i < s.length(); ++i) {
88 94
89 QChar c = s[i]; 95 QChar c = s[i];
90 96
91 if (c == '\'') { 97 if (c == '\'') {
92 switch (mode) { 98 switch (mode) {
93 case sep: mode = q1; break; 99 case beg: case sep: mode = q1; break;
94 case unq: case q2: tok += c; break; 100 case unq: case q2: tok += c; break;
95 case q1: mode = unq; break; 101 case q1:
102 if (use_doubling && i+1 < s.length() && s[i+1] == c) {
103 tok += c; ++i; break;
104 } else {
105 mode = unq; break;
106 }
96 } 107 }
97 108
98 } else if (c == '"') { 109 } else if (c == '"') {
99 switch (mode) { 110 switch (mode) {
100 case sep: mode = q2; break; 111 case beg: case sep: mode = q2; break;
101 case unq: case q1: tok += c; break; 112 case unq: case q1: tok += c; break;
102 case q2: mode = unq; break; 113 case q2:
114 if (use_doubling && i+1 < s.length() && s[i+1] == c) {
115 tok += c; ++i; break;
116 } else {
117 mode = unq; break;
118 }
103 } 119 }
104 120
105 } else if (c == separator || (separator == ' ' && c.isSpace())) { 121 } else if (c == separator || (separator == ' ' && c.isSpace())) {
106 switch (mode) { 122 switch (mode) {
123 case beg: mode = sep; tokens << ""; break;
107 case sep: if (separator != ' ') tokens << ""; break; 124 case sep: if (separator != ' ') tokens << ""; break;
108 case unq: mode = sep; tokens << tok; tok = ""; break; 125 case unq: mode = sep; tokens << tok; tok = ""; break;
109 case q1: case q2: tok += c; break; 126 case q1: case q2: tok += c; break;
110 } 127 }
111 128
112 } else if (c == '\\') { 129 } else if (c == '\\' && use_backslash) {
113 if (++i < s.length()) { 130 if (++i < s.length()) {
114 c = s[i]; 131 c = s[i];
115 switch (mode) { 132 switch (mode) {
116 case sep: mode = unq; tok += c; break; 133 case beg: case sep: mode = unq; tok += c; break;
117 case unq: case q1: case q2: tok += c; break; 134 case unq: case q1: case q2: tok += c; break;
118 } 135 }
119 } 136 }
120 137
121 } else { 138 } else {
122 switch (mode) { 139 switch (mode) {
123 case sep: mode = unq; tok += c; break; 140 case beg: case sep: mode = unq; tok += c; break;
124 case unq: case q1: case q2: tok += c; break; 141 case unq: case q1: case q2: tok += c; break;
125 } 142 }
126 } 143 }
127 } 144 }
128 145
129 if (tok != "" || mode != sep) { 146 if (tok != "" || mode != beg) {
130 if (mode == q1) { 147 if (mode == q1) {
131 tokens << ("'" + tok); // turns out it wasn't quoted after all 148 tokens << ("'" + tok); // turns out it wasn't quoted after all
132 } else if (mode == q2) { 149 } else if (mode == q2) {
133 tokens << ("\"" + tok); 150 tokens << ("\"" + tok);
134 } else { 151 } else {