view base/StringBits.cpp @ 1851:91056142abd0

Add function (cribbed from Rosegarden source) to check whether a string is valid UTF-8
author Chris Cannam
date Thu, 30 Apr 2020 14:45:24 +0100
parents 21c792334c2e
children bde22957545e
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Sonic Visualiser
    An audio file viewer and annotation editor.
    Centre for Digital Music, Queen Mary, University of London.
    
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

/*
   This is a modified version of a source file from the 
   Rosegarden MIDI and audio sequencer and notation editor.
   This file copyright 2000-2010 Chris Cannam.
*/

#include "StringBits.h"

#include "Debug.h"

using namespace std;

double
StringBits::stringToDoubleLocaleFree(QString s, bool *ok)
{
    int dp = 0;
    int sign = 1;
    int i = 0;
    double result = 0.0;
    int len = s.length();

    result = 0.0;

    if (ok) *ok = true;

    while (i < len && s[i].isSpace()) ++i;
    if (i < len && s[i] == '-') sign = -1;

    while (i < len) {

        QChar c = s[i];

        if (c.isDigit()) {

            double d = c.digitValue();

            if (dp > 0) {
                for (int p = dp; p > 0; --p) d /= 10.0;
                ++dp;
            } else {
                result *= 10.0;
            }

            result += d;

        } else if (c == '.') {

            dp = 1;

        } else if (ok) {
            *ok = false;
        }

        ++i;
    }

    return result * sign;
}
    
QStringList
StringBits::splitQuoted(QString s, QChar separator)
{
    QStringList tokens;
    QString tok;

    // sep -> just seen a field separator (or start of line)
    // unq -> in an unquoted field
    // q1  -> in a single-quoted field
    // q2  -> in a double-quoted field

    enum { sep, unq, q1, q2 } mode = sep;

    for (int i = 0; i < s.length(); ++i) {
        
        QChar c = s[i];

        if (c == '\'') {
            switch (mode) {
            case sep: mode = q1; break;
            case unq: case q2: tok += c; break;
            case q1: mode = unq; break;
            }

        } else if (c == '"') {
            switch (mode) {
            case sep: mode = q2; break;
            case unq: case q1: tok += c; break;
            case q2: mode = unq; break;
            }

        } else if (c == separator || (separator == ' ' && c.isSpace())) {
            switch (mode) {
            case sep: if (separator != ' ') tokens << ""; break;
            case unq: mode = sep; tokens << tok; tok = ""; break;
            case q1: case q2: tok += c; break;
            }

        } else if (c == '\\') {
            if (++i < s.length()) {
                c = s[i];
                switch (mode) {
                case sep: mode = unq; tok += c; break;
                case unq: case q1: case q2: tok += c; break;
                }
            }

        } else {
            switch (mode) {
            case sep: mode = unq; tok += c; break;
            case unq: case q1: case q2: tok += c; break;
            }
        }
    }

    if (tok != "" || mode != sep) {
        if (mode == q1) {
            tokens << ("'" + tok);  // turns out it wasn't quoted after all
        } else if (mode == q2) {
            tokens << ("\"" + tok);
        } else {
            tokens << tok;
        }
    }

    return tokens;
}

QStringList
StringBits::split(QString line, QChar separator, bool quoted)
{
    if (quoted) {
        return splitQuoted(line, separator);
    } else {
        return line.split(separator,
                          separator == ' ' ? QString::SkipEmptyParts :
                          QString::KeepEmptyParts);
    }
}

QString
StringBits::joinDelimited(QVector<QString> row, QString delimiter)
{
    QString s;
    for (auto col: row) {
        if (s != "") {
            s += delimiter;
        }
        if (col.contains(delimiter)) {
            col.replace("\"", "\"\"");
            col = "\"" + col + "\"";
        }
        s += col;
    }
    return s;    
}

bool
StringBits::isValidUtf8(const std::string &bytes, bool isTruncated)
{
    size_t len = bytes.length();
    size_t mblen = 0;
    unsigned char first = '\0';
    
    for (size_t i = 0; i < len; ++i) {

	unsigned char c = bytes[i];

	if (((c & 0xc0) == 0xc0) || !(c & 0x80)) {

	    // 11xxxxxx or 0xxxxxxx: first byte of a character sequence

	    if (mblen > 0) {
                
		// have we seen a valid sequence?
		size_t length = 
		    (!(first & 0x20)) ? 2 :
		    (!(first & 0x10)) ? 3 :
		    (!(first & 0x08)) ? 4 :
		    (!(first & 0x04)) ? 5 : 0;

                if (length != 0 && mblen != length) {
                    // previous multibyte sequence had invalid length
                    return false;
                }
            }

            mblen = 0;
            first = c;

        } else {
            
	    // second or subsequent byte

	    if (mblen == 0) {
                // ... without a first byte!                
                return false;
            }
        }
    }

    // at the end

    if (isTruncated) {
        // can't trust any errors arising now
        return true;
    }

    if (mblen > 0) {
                
        // have we seen a valid sequence?
        size_t length = 
            (!(first & 0x20)) ? 2 :
            (!(first & 0x10)) ? 3 :
            (!(first & 0x08)) ? 4 :
            (!(first & 0x04)) ? 5 : 0;
        
        if (length != 0 && mblen != length) {
            // final multibyte sequence had invalid length
            return false;
        }
    }

    return true;
}