Mercurial > hg > svcore
comparison base/StringBits.cpp @ 1851:91056142abd0
Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author | Chris Cannam |
---|---|
date | Thu, 30 Apr 2020 14:45:24 +0100 |
parents | 21c792334c2e |
children | bde22957545e |
comparison
equal
deleted
inserted
replaced
1850:e830b252267e | 1851:91056142abd0 |
---|---|
166 s += col; | 166 s += col; |
167 } | 167 } |
168 return s; | 168 return s; |
169 } | 169 } |
170 | 170 |
171 bool | |
172 StringBits::isValidUtf8(const std::string &bytes, bool isTruncated) | |
173 { | |
174 size_t len = bytes.length(); | |
175 size_t mblen = 0; | |
176 unsigned char first = '\0'; | |
177 | |
178 for (size_t i = 0; i < len; ++i) { | |
179 | |
180 unsigned char c = bytes[i]; | |
181 | |
182 if (((c & 0xc0) == 0xc0) || !(c & 0x80)) { | |
183 | |
184 // 11xxxxxx or 0xxxxxxx: first byte of a character sequence | |
185 | |
186 if (mblen > 0) { | |
187 | |
188 // have we seen a valid sequence? | |
189 size_t length = | |
190 (!(first & 0x20)) ? 2 : | |
191 (!(first & 0x10)) ? 3 : | |
192 (!(first & 0x08)) ? 4 : | |
193 (!(first & 0x04)) ? 5 : 0; | |
194 | |
195 if (length != 0 && mblen != length) { | |
196 // previous multibyte sequence had invalid length | |
197 return false; | |
198 } | |
199 } | |
200 | |
201 mblen = 0; | |
202 first = c; | |
203 | |
204 } else { | |
205 | |
206 // second or subsequent byte | |
207 | |
208 if (mblen == 0) { | |
209 // ... without a first byte! | |
210 return false; | |
211 } | |
212 } | |
213 } | |
214 | |
215 // at the end | |
216 | |
217 if (isTruncated) { | |
218 // can't trust any errors arising now | |
219 return true; | |
220 } | |
221 | |
222 if (mblen > 0) { | |
223 | |
224 // have we seen a valid sequence? | |
225 size_t length = | |
226 (!(first & 0x20)) ? 2 : | |
227 (!(first & 0x10)) ? 3 : | |
228 (!(first & 0x08)) ? 4 : | |
229 (!(first & 0x04)) ? 5 : 0; | |
230 | |
231 if (length != 0 && mblen != length) { | |
232 // final multibyte sequence had invalid length | |
233 return false; | |
234 } | |
235 } | |
236 | |
237 return true; | |
238 } | |
239 | |
240 | |
241 | |
242 |