Mercurial > hg > svcore
changeset 1851:91056142abd0
Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author | Chris Cannam |
---|---|
date | Thu, 30 Apr 2020 14:45:24 +0100 (2020-04-30) |
parents | e830b252267e |
children | a454c7477b4f |
files | base/StringBits.cpp base/StringBits.h |
diffstat | 2 files changed, 81 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/base/StringBits.cpp Wed Apr 29 13:25:19 2020 +0100 +++ b/base/StringBits.cpp Thu Apr 30 14:45:24 2020 +0100 @@ -168,3 +168,75 @@ return s; } +bool +StringBits::isValidUtf8(const std::string &bytes, bool isTruncated) +{ + size_t len = bytes.length(); + size_t mblen = 0; + unsigned char first = '\0'; + + for (size_t i = 0; i < len; ++i) { + + unsigned char c = bytes[i]; + + if (((c & 0xc0) == 0xc0) || !(c & 0x80)) { + + // 11xxxxxx or 0xxxxxxx: first byte of a character sequence + + if (mblen > 0) { + + // have we seen a valid sequence? + size_t length = + (!(first & 0x20)) ? 2 : + (!(first & 0x10)) ? 3 : + (!(first & 0x08)) ? 4 : + (!(first & 0x04)) ? 5 : 0; + + if (length != 0 && mblen != length) { + // previous multibyte sequence had invalid length + return false; + } + } + + mblen = 0; + first = c; + + } else { + + // second or subsequent byte + + if (mblen == 0) { + // ... without a first byte! + return false; + } + } + } + + // at the end + + if (isTruncated) { + // can't trust any errors arising now + return true; + } + + if (mblen > 0) { + + // have we seen a valid sequence? + size_t length = + (!(first & 0x20)) ? 2 : + (!(first & 0x10)) ? 3 : + (!(first & 0x08)) ? 4 : + (!(first & 0x04)) ? 5 : 0; + + if (length != 0 && mblen != length) { + // final multibyte sequence had invalid length + return false; + } + } + + return true; +} + + + +
--- a/base/StringBits.h Wed Apr 29 13:25:19 2020 +0100 +++ b/base/StringBits.h Thu Apr 30 14:45:24 2020 +0100 @@ -65,6 +65,15 @@ * Comma-Separated Values (CSV) Files. */ static QString joinDelimited(QVector<QString> row, QString delimiter); + + /** + * Return true if the given byte array contains a valid UTF-8 + * sequence, false if not. If isTruncated is true, the byte array + * will be treated as the prefix of a longer byte sequence, and + * any errors resulting from a multibyte code ending prematurely + * at the end of the array will be ignored. + */ + static bool isValidUtf8(const std::string &bytes, bool isTruncated); }; #endif