changeset 1851:91056142abd0

Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author Chris Cannam
date Thu, 30 Apr 2020 14:45:24 +0100 (2020-04-30)
parents e830b252267e
children a454c7477b4f
files base/StringBits.cpp base/StringBits.h
diffstat 2 files changed, 81 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/base/StringBits.cpp	Wed Apr 29 13:25:19 2020 +0100
+++ b/base/StringBits.cpp	Thu Apr 30 14:45:24 2020 +0100
@@ -168,3 +168,75 @@
     return s;    
 }
 
+bool
+StringBits::isValidUtf8(const std::string &bytes, bool isTruncated)
+{
+    // Checks that bytes is well-formed UTF-8 as defined by RFC 3629:
+    // every character is either a single ASCII byte or a lead byte
+    // followed by exactly the number of continuation bytes (10xxxxxx)
+    // that the lead byte announces. Overlong encodings, UTF-16
+    // surrogates and code points above U+10FFFF are rejected.
+    //
+    // If isTruncated is true, a multibyte character cut short by the
+    // end of the string is tolerated, since the missing bytes may
+    // follow in data we have not been given. Errors anywhere else
+    // cause a false return regardless of isTruncated.
+
+    const size_t len = bytes.length();
+    size_t i = 0;
+
+    while (i < len) {
+        const unsigned char lead = static_cast<unsigned char>(bytes[i]);
+
+        if (!(lead & 0x80)) {
+            // 0xxxxxxx: plain ASCII, a complete character in itself
+            ++i;
+            continue;
+        }
+
+        size_t length = 0;        // total bytes in this character
+        unsigned long code = 0;   // code point accumulated so far
+        unsigned long lowest = 0; // smallest code point needing length bytes
+
+        if ((lead & 0xe0) == 0xc0) {        // 110xxxxx
+            length = 2; code = lead & 0x1f; lowest = 0x80;
+        } else if ((lead & 0xf0) == 0xe0) { // 1110xxxx
+            length = 3; code = lead & 0x0f; lowest = 0x800;
+        } else if ((lead & 0xf8) == 0xf0) { // 11110xxx
+            length = 4; code = lead & 0x07; lowest = 0x10000;
+        } else {
+            // 10xxxxxx with no lead byte before it, or 11111xxx which
+            // never appears in UTF-8
+            return false;
+        }
+
+        for (size_t j = 1; j < length; ++j) {
+            if (i + j >= len) {
+                // ran out of data part-way through a character: only
+                // acceptable if more data is known to follow
+                return isTruncated;
+            }
+
+            const unsigned char c = static_cast<unsigned char>(bytes[i + j]);
+            if ((c & 0xc0) != 0x80) {
+                // expected a continuation byte here
+                return false;
+            }
+            code = (code << 6) | (c & 0x3f);
+        }
+
+        if (code < lowest || code > 0x10ffff ||
+            (code >= 0xd800 && code <= 0xdfff)) {
+            // overlong form, beyond Unicode range, or surrogate
+            return false;
+        }
+
+        i += length;
+    }
+
+    return true;
+}
+
+
+        
+
--- a/base/StringBits.h	Wed Apr 29 13:25:19 2020 +0100
+++ b/base/StringBits.h	Thu Apr 30 14:45:24 2020 +0100
@@ -65,6 +65,15 @@
      * Comma-Separated Values (CSV) Files.
      */
     static QString joinDelimited(QVector<QString> row, QString delimiter);
+
+    /**
+     * Return true if the bytes in the given string form a valid UTF-8
+     * sequence, false if not. If isTruncated is true, the string is
+     * treated as the prefix of a longer byte sequence: an incomplete
+     * multibyte character at the very end is then not reported as an
+     * error, although errors found earlier in the string still are.
+     */
+    static bool isValidUtf8(const std::string &bytes, bool isTruncated);
 };
 
 #endif