annotate base/StringBits.cpp @ 1851:91056142abd0

Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author Chris Cannam
date Thu, 30 Apr 2020 14:45:24 +0100
parents 21c792334c2e
children bde22957545e
rev   line source
Chris@629 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@629 2
Chris@629 3 /*
Chris@629 4 Sonic Visualiser
Chris@629 5 An audio file viewer and annotation editor.
Chris@629 6 Centre for Digital Music, Queen Mary, University of London.
Chris@629 7
Chris@629 8 This program is free software; you can redistribute it and/or
Chris@629 9 modify it under the terms of the GNU General Public License as
Chris@629 10 published by the Free Software Foundation; either version 2 of the
Chris@629 11 License, or (at your option) any later version. See the file
Chris@629 12 COPYING included with this distribution for more information.
Chris@629 13 */
Chris@629 14
Chris@629 15 /*
Chris@629 16 This is a modified version of a source file from the
Chris@629 17 Rosegarden MIDI and audio sequencer and notation editor.
Chris@629 18 This file copyright 2000-2010 Chris Cannam.
Chris@629 19 */
Chris@629 20
Chris@629 21 #include "StringBits.h"
Chris@629 22
Chris@1022 23 #include "Debug.h"
Chris@1022 24
Chris@1022 25 using namespace std;
Chris@1022 26
Chris@629 27 double
Chris@629 28 StringBits::stringToDoubleLocaleFree(QString s, bool *ok)
Chris@629 29 {
Chris@629 30 int dp = 0;
Chris@629 31 int sign = 1;
Chris@629 32 int i = 0;
Chris@629 33 double result = 0.0;
Chris@629 34 int len = s.length();
Chris@629 35
Chris@629 36 result = 0.0;
Chris@629 37
Chris@629 38 if (ok) *ok = true;
Chris@629 39
Chris@629 40 while (i < len && s[i].isSpace()) ++i;
Chris@629 41 if (i < len && s[i] == '-') sign = -1;
Chris@629 42
Chris@629 43 while (i < len) {
Chris@629 44
Chris@1429 45 QChar c = s[i];
Chris@629 46
Chris@629 47 if (c.isDigit()) {
Chris@629 48
Chris@629 49 double d = c.digitValue();
Chris@629 50
Chris@629 51 if (dp > 0) {
Chris@629 52 for (int p = dp; p > 0; --p) d /= 10.0;
Chris@629 53 ++dp;
Chris@629 54 } else {
Chris@629 55 result *= 10.0;
Chris@629 56 }
Chris@629 57
Chris@629 58 result += d;
Chris@629 59
Chris@629 60 } else if (c == '.') {
Chris@629 61
Chris@629 62 dp = 1;
Chris@629 63
Chris@629 64 } else if (ok) {
Chris@629 65 *ok = false;
Chris@629 66 }
Chris@629 67
Chris@629 68 ++i;
Chris@629 69 }
Chris@629 70
Chris@629 71 return result * sign;
Chris@629 72 }
Chris@629 73
Chris@629 74 QStringList
Chris@629 75 StringBits::splitQuoted(QString s, QChar separator)
Chris@629 76 {
Chris@629 77 QStringList tokens;
Chris@629 78 QString tok;
Chris@629 79
Chris@1022 80 // sep -> just seen a field separator (or start of line)
Chris@1022 81 // unq -> in an unquoted field
Chris@1022 82 // q1 -> in a single-quoted field
Chris@1022 83 // q2 -> in a double-quoted field
Chris@1022 84
Chris@629 85 enum { sep, unq, q1, q2 } mode = sep;
Chris@629 86
Chris@629 87 for (int i = 0; i < s.length(); ++i) {
Chris@1429 88
Chris@1429 89 QChar c = s[i];
Chris@629 90
Chris@1429 91 if (c == '\'') {
Chris@1429 92 switch (mode) {
Chris@1429 93 case sep: mode = q1; break;
Chris@1429 94 case unq: case q2: tok += c; break;
Chris@1429 95 case q1: mode = unq; break;
Chris@1429 96 }
Chris@629 97
Chris@1429 98 } else if (c == '"') {
Chris@1429 99 switch (mode) {
Chris@1429 100 case sep: mode = q2; break;
Chris@1429 101 case unq: case q1: tok += c; break;
Chris@1429 102 case q2: mode = unq; break;
Chris@1429 103 }
Chris@629 104
Chris@1429 105 } else if (c == separator || (separator == ' ' && c.isSpace())) {
Chris@1429 106 switch (mode) {
Chris@1429 107 case sep: if (separator != ' ') tokens << ""; break;
Chris@1429 108 case unq: mode = sep; tokens << tok; tok = ""; break;
Chris@1429 109 case q1: case q2: tok += c; break;
Chris@1429 110 }
Chris@629 111
Chris@1429 112 } else if (c == '\\') {
Chris@1429 113 if (++i < s.length()) {
Chris@1429 114 c = s[i];
Chris@1429 115 switch (mode) {
Chris@1429 116 case sep: mode = unq; tok += c; break;
Chris@928 117 case unq: case q1: case q2: tok += c; break;
Chris@1429 118 }
Chris@1429 119 }
Chris@629 120
Chris@1429 121 } else {
Chris@1429 122 switch (mode) {
Chris@1429 123 case sep: mode = unq; tok += c; break;
Chris@928 124 case unq: case q1: case q2: tok += c; break;
Chris@1429 125 }
Chris@1429 126 }
Chris@629 127 }
Chris@629 128
Chris@1022 129 if (tok != "" || mode != sep) {
Chris@1022 130 if (mode == q1) {
Chris@1022 131 tokens << ("'" + tok); // turns out it wasn't quoted after all
Chris@1022 132 } else if (mode == q2) {
Chris@1022 133 tokens << ("\"" + tok);
Chris@1022 134 } else {
Chris@1022 135 tokens << tok;
Chris@1022 136 }
Chris@1022 137 }
Chris@1022 138
Chris@629 139 return tokens;
Chris@629 140 }
Chris@629 141
Chris@629 142 QStringList
Chris@629 143 StringBits::split(QString line, QChar separator, bool quoted)
Chris@629 144 {
Chris@629 145 if (quoted) {
Chris@629 146 return splitQuoted(line, separator);
Chris@629 147 } else {
Chris@629 148 return line.split(separator,
Chris@629 149 separator == ' ' ? QString::SkipEmptyParts :
Chris@629 150 QString::KeepEmptyParts);
Chris@629 151 }
Chris@629 152 }
Chris@629 153
Chris@1833 154 QString
Chris@1833 155 StringBits::joinDelimited(QVector<QString> row, QString delimiter)
Chris@1833 156 {
Chris@1833 157 QString s;
Chris@1833 158 for (auto col: row) {
Chris@1833 159 if (s != "") {
Chris@1833 160 s += delimiter;
Chris@1833 161 }
Chris@1833 162 if (col.contains(delimiter)) {
Chris@1833 163 col.replace("\"", "\"\"");
Chris@1833 164 col = "\"" + col + "\"";
Chris@1833 165 }
Chris@1833 166 s += col;
Chris@1833 167 }
Chris@1833 168 return s;
Chris@1833 169 }
Chris@1833 170
/*
 * Return true if bytes is well-formed UTF-8.
 *
 * Each character must be either a single byte below 0x80 or a lead
 * byte followed by exactly the number of continuation bytes the lead
 * byte calls for (one to three). Overlong encodings, UTF-16 surrogate
 * code points, values above U+10FFFF, and the obsolete five- and
 * six-byte forms are all rejected.
 *
 * If isTruncated is true, the data is taken to have been cut off at
 * an arbitrary byte position, so a final sequence that is correct as
 * far as it goes but runs off the end is accepted. Errors found
 * before the end are still reported.
 */
bool
StringBits::isValidUtf8(const std::string &bytes, bool isTruncated)
{
    const size_t len = bytes.length();
    size_t i = 0;

    while (i < len) {

        const unsigned char lead = static_cast<unsigned char>(bytes[i]);

        if (lead < 0x80) {
            // 0xxxxxxx: single-byte character
            ++i;
            continue;
        }

        // Number of continuation bytes expected, and the range
        // permitted for the first of them. That range is narrower
        // than 0x80-0xbf for the lead bytes where the full range
        // would admit overlong forms, surrogates, or values beyond
        // U+10FFFF.
        size_t extra = 0;
        unsigned char lo = 0x80;
        unsigned char hi = 0xbf;

        if (lead >= 0xc2 && lead <= 0xdf) {
            extra = 1;
        } else if (lead >= 0xe0 && lead <= 0xef) {
            extra = 2;
            if (lead == 0xe0) lo = 0xa0;        // would be overlong
            else if (lead == 0xed) hi = 0x9f;   // would be a surrogate
        } else if (lead >= 0xf0 && lead <= 0xf4) {
            extra = 3;
            if (lead == 0xf0) lo = 0x90;        // would be overlong
            else if (lead == 0xf4) hi = 0x8f;   // would exceed U+10FFFF
        } else {
            // A continuation byte without a lead byte (0x80-0xbf), a
            // lead byte that can only begin an overlong form (0xc0,
            // 0xc1), or a byte that never occurs in UTF-8 (0xf5-0xff)
            return false;
        }

        for (size_t k = 1; k <= extra; ++k) {

            if (i + k >= len) {
                // The sequence runs off the end of the data: this is
                // only an error if the data is supposed to be complete
                return isTruncated;
            }

            const unsigned char c = static_cast<unsigned char>(bytes[i + k]);
            const unsigned char min = (k == 1 ? lo : 0x80);
            const unsigned char max = (k == 1 ? hi : 0xbf);

            if (c < min || c > max) {
                return false;
            }
        }

        i += extra + 1;
    }

    return true;
}
Chris@1851 239
Chris@1851 240
Chris@1851 241
Chris@1851 242