# HG changeset patch
# User Chris Cannam
# Date 1588254324 -3600
# Node ID 91056142abd019533686924e752f3467c79aa9ff
# Parent  e830b252267ed6fda2a22b49bb79160bde842cb5
Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8

diff -r e830b252267e -r 91056142abd0 base/StringBits.cpp
--- a/base/StringBits.cpp	Wed Apr 29 13:25:19 2020 +0100
+++ b/base/StringBits.cpp	Thu Apr 30 14:45:24 2020 +0100
@@ -168,3 +168,75 @@
     return s;
 }
 
+bool
+StringBits::isValidUtf8(const std::string &bytes, bool isTruncated)
+{
+    // Checks well-formedness as defined by RFC 3629: sequences of 1
+    // to 4 bytes, with no overlong encodings, no UTF-16 surrogates
+    // and nothing beyond U+10FFFF.
+
+    const size_t len = bytes.length();
+    size_t i = 0;
+
+    while (i < len) {
+
+        const unsigned char lead = static_cast<unsigned char>(bytes[i]);
+        ++i;
+
+        if (lead < 0x80) {
+            // 0xxxxxxx: single-byte (ASCII) character
+            continue;
+        }
+
+        // Number of continuation bytes that must follow the lead
+        // byte, and the range permitted for the first of them. That
+        // range is narrower than 80..bf for some lead bytes.
+        size_t trailing = 0;
+        unsigned char lowest = 0x80;
+        unsigned char highest = 0xbf;
+
+        if (lead >= 0xc2 && lead <= 0xdf) {
+            trailing = 1;
+        } else if (lead >= 0xe0 && lead <= 0xef) {
+            trailing = 2;
+            if (lead == 0xe0) {
+                lowest = 0xa0; // anything lower is an overlong encoding
+            } else if (lead == 0xed) {
+                highest = 0x9f; // anything higher is a UTF-16 surrogate
+            }
+        } else if (lead >= 0xf0 && lead <= 0xf4) {
+            trailing = 3;
+            if (lead == 0xf0) {
+                lowest = 0x90; // anything lower is an overlong encoding
+            } else if (lead == 0xf4) {
+                highest = 0x8f; // anything higher is beyond U+10FFFF
+            }
+        } else {
+            // 10xxxxxx with no lead byte before it, or a byte that
+            // never appears in UTF-8 at all (c0, c1, f5..ff)
+            return false;
+        }
+
+        for (size_t k = 0; k < trailing; ++k) {
+
+            if (i == len) {
+                // Input ends part-way through a character whose bytes
+                // have all been acceptable so far: this is an error
+                // only if the input is supposed to be complete
+                return isTruncated;
+            }
+
+            const unsigned char c = static_cast<unsigned char>(bytes[i]);
+            if (c < lowest || c > highest) {
+                return false;
+            }
+            ++i;
+
+            // Only the first continuation byte has a narrowed range
+            lowest = 0x80;
+            highest = 0xbf;
+        }
+    }
+
+    return true;
+}
diff -r e830b252267e -r 91056142abd0 base/StringBits.h
--- a/base/StringBits.h	Wed Apr 29 13:25:19 2020 +0100
+++ b/base/StringBits.h	Thu Apr 30 14:45:24 2020 +0100
@@ -65,6 +65,19 @@
      * Comma-Separated Values (CSV) Files.
      */
     static QString joinDelimited(QVector row, QString delimiter);
+
+    /**
+     * Return true if the given byte array is well-formed UTF-8 as
+     * defined by RFC 3629, false if not. Overlong encodings, UTF-16
+     * surrogates and values beyond U+10FFFF are all rejected. An
+     * empty array is valid.
+     *
+     * If isTruncated is true, the byte array is treated as the
+     * prefix of a longer byte sequence, and a multibyte code that
+     * ends prematurely at the end of the array is not an error.
+     * Errors found anywhere else are still reported.
+     */
+    static bool isValidUtf8(const std::string &bytes, bool isTruncated);
 };
 
 #endif