Mercurial > hg > svcore
changeset 629:35499d48a5d1
* Start overhauling CSV parser to associate purposes with columns en route to its guesses; add some string manipulation code
author | Chris Cannam |
---|---|
date | Thu, 15 Jul 2010 15:27:21 +0000 |
parents | 001db550bd48 |
children | 11a664058dd8 |
files | base/StringBits.cpp base/StringBits.h base/base.pro data/fileio/CSVFormat.cpp data/fileio/CSVFormat.h |
diffstat | 5 files changed, 529 insertions(+), 150 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/base/StringBits.cpp Thu Jul 15 15:27:21 2010 +0000 @@ -0,0 +1,211 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Sonic Visualiser + An audio file viewer and annotation editor. + Centre for Digital Music, Queen Mary, University of London. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +/* + This is a modified version of a source file from the + Rosegarden MIDI and audio sequencer and notation editor. + This file copyright 2000-2010 Chris Cannam. +*/ + +#include "StringBits.h" + +double +StringBits::stringToDoubleLocaleFree(QString s, bool *ok) +{ + int dp = 0; + int sign = 1; + int i = 0; + double result = 0.0; + int len = s.length(); + + result = 0.0; + + if (ok) *ok = true; + + while (i < len && s[i].isSpace()) ++i; + if (i < len && s[i] == '-') sign = -1; + + while (i < len) { + + QChar c = s[i]; + + if (c.isDigit()) { + + double d = c.digitValue(); + + if (dp > 0) { + for (int p = dp; p > 0; --p) d /= 10.0; + ++dp; + } else { + result *= 10.0; + } + + result += d; + + } else if (c == '.') { + + dp = 1; + + } else if (ok) { + *ok = false; + } + + ++i; + } + + return result * sign; +} + +QStringList +StringBits::splitQuoted(QString s, QChar separator) +{ + QStringList tokens; + QString tok; + + enum { sep, unq, q1, q2 } mode = sep; + + for (int i = 0; i < s.length(); ++i) { + + QChar c = s[i]; + + if (c == '\'') { + switch (mode) { + case sep: mode = q1; break; + case unq: case q2: tok += c; break; + case q1: mode = sep; tokens << tok; tok = ""; break; + } + + } else if (c == '"') { + switch (mode) { + case sep: mode = q2; break; + case unq: case q1: tok += c; break; + case q2: mode = sep; tokens << tok; tok = ""; break; + } + + } else if (c == separator || (separator == ' ' && c.isSpace())) { + switch (mode) { + case sep: if (separator != ' ') tokens << ""; break; + case unq: mode = sep; tokens << tok; tok = ""; break; + case q1: case q2: tok += c; break; + } + + } else if (c == '\\') { + if (++i < s.length()) { + c = s[i]; + switch (mode) { + case sep: mode = unq; tok += c; break; + default: tok += c; break; + } + } + + } else { + switch (mode) { + case sep: mode = unq; tok += c; break; + default: tok += c; break; + } + } + } + + if (tok != "" || mode != sep) tokens << tok; + return tokens; +} + +/* + +void testSplit() +{ + QStringList tests; + tests << "a b c d"; + tests << "a \"b c\" d"; + tests << "a 'b c' d"; + tests << "a \"b c\\\" d\""; + tests << "a 'b c\\' d'"; + tests << "a \"b c' d\""; + tests << "a 'b c\" d'"; + tests << "aa 'bb cc\" dd'"; + tests << "a'a 'bb' \\\"cc\" dd\\\""; + tests << " a'a \\\' 'bb' \' \\\"cc\" ' dd\\\" '"; + + for (int j = 0; j < tests.size(); ++j) { + cout << endl; + cout << tests[j].toStdString() << endl; + cout << "->" << endl << "("; + QStringList l = splitQuoted(tests[j], ' '); + for (int i = 0; i < l.size(); ++i) { + if (i > 0) cout << ";"; + cout << l[i].toStdString(); + } + cout << ")" << endl; + } +} + +*/ + +/* + Results: + +a b c d +-> +(a;b;c;d) + +a "b c" d +-> +(a;b c;d) + +a 'b c' d +-> +(a;b c;d) + +a "b c\" d" +-> +(a;b c" d) + +a 'b c\' d' +-> +(a;b c' d) + +a "b c' d" +-> +(a;b c' d) + +a 'b c" d' +-> +(a;b c" d) + +aa 'bb cc" dd' +-> +(aa;bb cc" dd) + +a'a 'bb' \"cc" dd\" +-> +(a'a;bb;"cc";dd") + + a'a \' 'bb' ' \"cc" ' dd\" ' +-> +(a'a;';bb; "cc" ;dd";) + +*/ + +QStringList +StringBits::split(QString line, QChar separator, bool quoted) +{ + if (quoted) { + return splitQuoted(line, separator); + } else { + return line.split(separator, + separator == ' ' ? QString::SkipEmptyParts : + QString::KeepEmptyParts); + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/base/StringBits.h Thu Jul 15 15:27:21 2010 +0000 @@ -0,0 +1,60 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + Sonic Visualiser + An audio file viewer and annotation editor. + Centre for Digital Music, Queen Mary, University of London. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. See the file + COPYING included with this distribution for more information. +*/ + +/* + This is a modified version of a source file from the + Rosegarden MIDI and audio sequencer and notation editor. + This file copyright 2000-2010 Chris Cannam. +*/ + +#ifndef _STRING_BITS_H_ +#define _STRING_BITS_H_ + +#include <QString> +#include <QStringList> +#include <QChar> + +class StringBits +{ +public: + /** + * Convert a string to a double using basic "C"-locale syntax, + * i.e. always using '.' as a decimal point. We use this as a + * fallback when parsing files from an unknown source, if + * locale-specific conversion fails. Does not support e notation. + * If ok is non-NULL, *ok will be set to true if conversion + * succeeds or false otherwise. + */ + static double stringToDoubleLocaleFree(QString s, bool *ok = 0); + + /** + * Split a string at the given separator character, allowing + * quoted sections that contain the separator. If the separator + * is ' ', any (amount of) whitespace will be considered as a + * single separator. If the separator is another whitespace + * character such as '\t', it will be used literally. + */ + static QStringList splitQuoted(QString s, QChar separator); + + /** + * Split a string at the given separator character. If quoted is + * true, do so by calling splitQuoted (above). If quoted is + * false, use QString::split; if separator is ' ', use + * SkipEmptyParts behaviour, otherwise use KeepEmptyParts (this is + * analogous to the behaviour of splitQuoted). + */ + static QStringList split(QString s, QChar separator, bool quoted); +}; + +#endif
--- a/base/base.pro Thu Jul 08 14:22:28 2010 +0000 +++ b/base/base.pro Thu Jul 15 15:27:21 2010 +0000 @@ -39,6 +39,7 @@ Selection.h \ Serialiser.h \ StorageAdviser.h \ + StringBits.h \ TempDirectory.h \ TextMatcher.h \ Thread.h \ @@ -67,6 +68,7 @@ Selection.cpp \ Serialiser.cpp \ StorageAdviser.cpp \ + StringBits.cpp \ TempDirectory.cpp \ TextMatcher.cpp \ Thread.cpp \
--- a/data/fileio/CSVFormat.cpp Thu Jul 08 14:22:28 2010 +0000 +++ b/data/fileio/CSVFormat.cpp Thu Jul 15 15:27:21 2010 +0000 @@ -15,6 +15,8 @@ #include "CSVFormat.h" +#include "base/StringBits.h" + #include <QFile> #include <QString> #include <QRegExp> @@ -23,39 +25,41 @@ #include <iostream> -CSVFormat::CSVFormat(QString filename) : - m_modelType(TwoDimensionalModel), - m_timingType(ExplicitTiming), - m_durationType(Durations), - m_timeUnits(TimeSeconds), - m_separator(","), +CSVFormat::CSVFormat(QString path) : + m_separator(""), m_sampleRate(44100), m_windowSize(1024), - m_behaviour(QString::KeepEmptyParts), - m_maxExampleCols(0) + m_allowQuoting(true) { - QFile file(filename); + guessFormatFor(path); +} + +void +CSVFormat::guessFormatFor(QString path) +{ + m_modelType = TwoDimensionalModel; + m_timingType = ExplicitTiming; + m_durationType = Durations; + m_timeUnits = TimeSeconds; + m_behaviour = QString::KeepEmptyParts; + + m_maxExampleCols = 0; + m_columnCount = 0; + m_variableColumnCount = false; + + m_example.clear(); + m_columnQualities.clear(); + m_columnPurposes.clear(); + m_prevValues.clear(); + + QFile file(path); if (!file.exists()) return; if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; QTextStream in(&file); in.seek(0); - unsigned int lineno = 0; - - bool nonIncreasingPrimaries = false; - bool nonIncreasingSecondaries = false; - bool nonNumericPrimaries = false; - bool floatPrimaries = false; - bool variableItemCount = false; - int itemCount = 1; - int earliestNonNumericItem = -1; - - float prevPrimary = 0.0; - float prevSecondary = 0.0; - - m_maxExampleCols = 0; - m_separator = ""; + int lineno = 0; while (!in.atEnd()) { @@ -67,148 +71,198 @@ for (size_t li = 0; li < lines.size(); ++li) { QString line = lines[li]; + if (line.startsWith("#") || line == "") continue; - if (line.startsWith("#")) continue; + guessQualities(line, lineno); - m_behaviour = QString::KeepEmptyParts; + if (++lineno == 50) break; + } + } - if (m_separator == "") { - //!!! to do: ask the user - if (line.split(",").size() >= 2) m_separator = ","; - else if (line.split("\t").size() >= 2) m_separator = "\t"; - else if (line.split("|").size() >= 2) m_separator = "|"; - else if (line.split("/").size() >= 2) m_separator = "/"; - else if (line.split(":").size() >= 2) m_separator = ":"; - else { - m_separator = " "; - m_behaviour = QString::SkipEmptyParts; + guessPurposes(); +} + +void +CSVFormat::guessSeparator(QString line) +{ + char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; + for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) { + if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { + m_separator = candidates[i]; + return; + } + } + m_separator = " "; +} + +void +CSVFormat::guessQualities(QString line, int lineno) +{ + if (m_separator == "") guessSeparator(line); + + QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting); + + int cols = list.size(); + if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols; + if (cols != m_columnCount) m_variableColumnCount = true; + + // All columns are regarded as having these qualities until we see + // something that indicates otherwise: + + ColumnQualities defaultQualities = + ColumnNumeric | ColumnIntegral | ColumnIncreasing; + + for (int i = 0; i < cols; ++i) { + + while (m_columnQualities.size() <= i) { + m_columnQualities.push_back(defaultQualities); + m_prevValues.push_back(0.f); + } + + QString s(list[i]); + bool ok = false; + + ColumnQualities qualities = m_columnQualities[i]; + + bool numeric = (qualities & ColumnNumeric); + bool integral = (qualities & ColumnIntegral); + bool increasing = (qualities & ColumnIncreasing); + bool large = (qualities & ColumnLarge); // this one defaults to off + + float value = 0.f; + + //!!! how to take into account headers? + + if (numeric) { + value = s.toFloat(&ok); + if (!ok) { + value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); + } + if (ok) { + if (lineno < 2 && value > 1000.f) large = true; + } else { + numeric = false; + } + } + + if (numeric) { + + if (integral) { + if (s.contains('.') || s.contains(',')) { + integral = false; } } -// std::cerr << "separator = \"" << m_separator.toStdString() << "\"" << std::endl; - - QStringList list = line.split(m_separator, m_behaviour); - QStringList tidyList; - - for (int i = 0; i < list.size(); ++i) { - - QString s(list[i]); - bool numeric = false; - - if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { - s = s.mid(1, s.length() - 2); - } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { - s = s.mid(1, s.length() - 2); - } else { - float f = s.toFloat(&numeric); -// std::cerr << "converted \"" << s.toStdString() << "\" to float, got " << f << " and success = " << numeric << std::endl; - } - - tidyList.push_back(s); - - if (lineno == 0 || (list.size() < itemCount)) { - itemCount = list.size(); - } else { - if (itemCount != list.size()) { - variableItemCount = true; - } - } - - if (i == 0) { // primary - - if (numeric) { - - float primary = s.toFloat(); - - if (lineno > 0 && primary <= prevPrimary) { - nonIncreasingPrimaries = true; - } - - if (s.contains(".") || s.contains(",")) { - floatPrimaries = true; - } - - prevPrimary = primary; - - } else { - nonNumericPrimaries = true; - } - } else { // secondary - - if (!numeric) { - if (earliestNonNumericItem < 0 || - i < earliestNonNumericItem) { - earliestNonNumericItem = i; - } - } else if (i == 1) { - float secondary = s.toFloat(); - if (lineno > 0 && secondary <= prevSecondary) { - nonIncreasingSecondaries = true; - } - prevSecondary = secondary; - } + if (increasing) { + if (lineno > 0 && value <= m_prevValues[i]) { + increasing = false; } } - if (lineno < 10) { - m_example.push_back(tidyList); - if (lineno == 0 || tidyList.size() > m_maxExampleCols) { - m_maxExampleCols = tidyList.size(); - } - } + m_prevValues[i] = value; + } - ++lineno; + m_columnQualities[i] = + (numeric ? ColumnNumeric : 0) | + (integral ? ColumnIntegral : 0) | + (increasing ? ColumnIncreasing : 0) | + (large ? ColumnLarge : 0); + } - if (lineno == 50) break; + if (lineno < 10) { + m_example.push_back(list); + if (lineno == 0 || cols > m_maxExampleCols) { + m_maxExampleCols = cols; } } - if (nonNumericPrimaries || nonIncreasingPrimaries) { + std::cerr << "Estimated column qualities: "; + for (int i = 0; i < m_columnCount; ++i) { + std::cerr << int(m_columnQualities[i]) << " "; + } + std::cerr << std::endl; +} + +void +CSVFormat::guessPurposes() +{ + while (m_columnPurposes.size() <= m_columnCount) { + m_columnPurposes.push_back(ColumnUnknown); + } + + m_timingType = CSVFormat::ImplicitTiming; + m_timeUnits = CSVFormat::TimeWindows; - // Primaries are probably not a series of times + int timingColumnCount = 0; + + for (int i = 0; i < m_columnCount; ++i) { + + ColumnPurpose purpose = ColumnUnknown; + bool primary = (i == 0); - m_timingType = CSVFormat::ImplicitTiming; - m_timeUnits = CSVFormat::TimeWindows; - - if (nonNumericPrimaries) { - m_modelType = CSVFormat::OneDimensionalModel; - } else if (itemCount == 1 || variableItemCount || - (earliestNonNumericItem != -1)) { - m_modelType = CSVFormat::TwoDimensionalModel; - } else { - m_modelType = CSVFormat::ThreeDimensionalModel; - } + ColumnQualities qualities = m_columnQualities[i]; + bool numeric = (qualities & ColumnNumeric); + bool integral = (qualities & ColumnIntegral); + bool increasing = (qualities & ColumnIncreasing); + bool large = (qualities & ColumnLarge); + + bool timingColumn = (numeric && increasing); + + if (timingColumn) { + + ++timingColumnCount; + + if (primary) { + + purpose = ColumnStartTime; + + m_timingType = ExplicitTiming; + + if (integral && large) { + m_timeUnits = TimeAudioFrames; + } else { + m_timeUnits = TimeSeconds; + } + + } else { + + if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { + purpose = ColumnEndTime; + m_durationType = EndTimes; + } + } + } + + if (purpose == ColumnUnknown) { + if (numeric) { + purpose = ColumnValue; + } else { + purpose = ColumnLabel; + } + } + + m_columnPurposes[i] = purpose; + } + + int valueCount = 0; + for (int i = 0; i < m_columnCount; ++i) { + if (m_columnPurposes[i] == ColumnValue) ++valueCount; + } + + if (valueCount == 0) { + m_modelType = OneDimensionalModel; + } else if (valueCount == 1) { + m_modelType = TwoDimensionalModel; } else { + m_modelType = ThreeDimensionalModel; + } - // Increasing numeric primaries -- likely to be time - - m_timingType = CSVFormat::ExplicitTiming; - - if (floatPrimaries) { - m_timeUnits = CSVFormat::TimeSeconds; - } else { - m_timeUnits = CSVFormat::TimeAudioFrames; - } - - if (itemCount == 1) { - m_modelType = CSVFormat::OneDimensionalModel; - } else if (variableItemCount || (earliestNonNumericItem != -1)) { - if (earliestNonNumericItem != -1 && earliestNonNumericItem < 2) { - m_modelType = CSVFormat::OneDimensionalModel; - } else { - m_modelType = CSVFormat::TwoDimensionalModel; - } - } else { - m_modelType = CSVFormat::ThreeDimensionalModel; - } - - if (nonIncreasingSecondaries) { - m_durationType = Durations; - } else { - m_durationType = EndTimes; - } + std::cerr << "Estimated column purposes: "; + for (int i = 0; i < m_columnCount; ++i) { + std::cerr << int(m_columnPurposes[i]) << " "; } + std::cerr << std::endl; std::cerr << "Estimated model type: " << m_modelType << std::endl; std::cerr << "Estimated timing type: " << m_timingType << std::endl; @@ -216,3 +270,4 @@ std::cerr << "Estimated units: " << m_timeUnits << std::endl; } +
--- a/data/fileio/CSVFormat.h Thu Jul 08 14:22:28 2010 +0000 +++ b/data/fileio/CSVFormat.h Thu Jul 15 15:27:21 2010 +0000 @@ -45,7 +45,22 @@ TimeWindows }; - CSVFormat(QString path); // guess format + enum ColumnPurpose { + ColumnUnknown, + ColumnStartTime, + ColumnEndTime, + ColumnDuration, + ColumnValue, + ColumnLabel + }; + + enum ColumnQuality { + ColumnNumeric = 0x1, + ColumnIntegral = 0x2, + ColumnIncreasing = 0x4, + ColumnLarge = 0x8 + }; + typedef unsigned int ColumnQualities; CSVFormat() : // arbitrary defaults m_modelType(TwoDimensionalModel), @@ -55,8 +70,23 @@ m_separator(","), m_sampleRate(44100), m_windowSize(1024), - m_behaviour(QString::KeepEmptyParts) + m_columnCount(0), + m_variableColumnCount(false), + m_behaviour(QString::KeepEmptyParts), + m_allowQuoting(true), + m_maxExampleCols(0) { } + + CSVFormat(QString path); // guess format + + /** + * Guess the format of the given CSV file, setting the fields in + * this object accordingly. If the current separator is the empty + * string, the separator character will also be guessed; otherwise + * the current separator will be used. The other properties of + * this object will be set according to guesses from the file. + */ + void guessFormatFor(QString path); ModelType getModelType() const { return m_modelType; } TimingType getTimingType() const { return m_timingType; } @@ -65,8 +95,9 @@ QString getSeparator() const { return m_separator; } size_t getSampleRate() const { return m_sampleRate; } size_t getWindowSize() const { return m_windowSize; } - + QString::SplitBehavior getSplitBehaviour() const { return m_behaviour; } + QList<ColumnPurpose> getColumnPurposes() const { return m_columnPurposes; } void setModelType(ModelType t) { m_modelType = t; } void setTimingType(TimingType t) { m_timingType = t; } @@ -77,8 +108,12 @@ void setWindowSize(size_t s) { m_windowSize = s; } void setSplitBehaviour(QString::SplitBehavior b) { m_behaviour = b; } + void setColumnPurposes(QList<ColumnPurpose> cl) { m_columnPurposes = cl; } - // only valid if constructor that guesses format was used: + // read-only; only valid if format has been guessed: + QList<ColumnQualities> getColumnQualities() const { return m_columnQualities; } + + // read-only; only valid if format has been guessed: QList<QStringList> getExample() const { return m_example; } int getMaxExampleCols() const { return m_maxExampleCols; } @@ -91,10 +126,26 @@ size_t m_sampleRate; size_t m_windowSize; + int m_columnCount; + bool m_variableColumnCount; + + QList<ColumnQualities> m_columnQualities; + QList<ColumnPurpose> m_columnPurposes; + + QList<float> m_prevValues; + QString::SplitBehavior m_behaviour; + bool m_allowQuoting; QList<QStringList> m_example; int m_maxExampleCols; + + void guessSeparator(QString line); + void guessQualities(QString line, int lineno); + void guessPurposes(); + + void guessFormatFor_Old(QString path); + }; #endif