comparison base/StringBits.cpp @ 1851:91056142abd0

Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author Chris Cannam
date Thu, 30 Apr 2020 14:45:24 +0100
parents 21c792334c2e
children bde22957545e
comparison
equal deleted inserted replaced
1850:e830b252267e 1851:91056142abd0
166 s += col; 166 s += col;
167 } 167 }
168 return s; 168 return s;
169 } 169 }
170 170
171 bool
172 StringBits::isValidUtf8(const std::string &bytes, bool isTruncated)
173 {
174 size_t len = bytes.length();
175 size_t mblen = 0;
176 unsigned char first = '\0';
177
178 for (size_t i = 0; i < len; ++i) {
179
180 unsigned char c = bytes[i];
181
182 if (((c & 0xc0) == 0xc0) || !(c & 0x80)) {
183
184 // 11xxxxxx or 0xxxxxxx: first byte of a character sequence
185
186 if (mblen > 0) {
187
188 // have we seen a valid sequence?
189 size_t length =
190 (!(first & 0x20)) ? 2 :
191 (!(first & 0x10)) ? 3 :
192 (!(first & 0x08)) ? 4 :
193 (!(first & 0x04)) ? 5 : 0;
194
195 if (length != 0 && mblen != length) {
196 // previous multibyte sequence had invalid length
197 return false;
198 }
199 }
200
201 mblen = 0;
202 first = c;
203
204 } else {
205
206 // second or subsequent byte
207
208 if (mblen == 0) {
209 // ... without a first byte!
210 return false;
211 }
212 }
213 }
214
215 // at the end
216
217 if (isTruncated) {
218 // can't trust any errors arising now
219 return true;
220 }
221
222 if (mblen > 0) {
223
224 // have we seen a valid sequence?
225 size_t length =
226 (!(first & 0x20)) ? 2 :
227 (!(first & 0x10)) ? 3 :
228 (!(first & 0x08)) ? 4 :
229 (!(first & 0x04)) ? 5 : 0;
230
231 if (length != 0 && mblen != length) {
232 // final multibyte sequence had invalid length
233 return false;
234 }
235 }
236
237 return true;
238 }
239
240
241
242