Mercurial > hg > svcore
view data/fileio/CSVFormat.cpp @ 1881:b504df98c3be
Ensure completion on output model is started at zero, so if it's checked before the input model has become ready and the transform has begun, it is not accidentally reported as complete (affected re-aligning models in Sonic Lineup when replacing the session)
author | Chris Cannam |
---|---|
date | Fri, 26 Jun 2020 11:45:39 +0100 |
parents | bed42ce4d3ab |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ /* Sonic Visualiser An audio file viewer and annotation editor. Centre for Digital Music, Queen Mary, University of London. This file copyright 2006 Chris Cannam. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. See the file COPYING included with this distribution for more information. */ #include "CSVFormat.h" #include "base/StringBits.h" #include <QFile> #include <QString> #include <QRegExp> #include <QStringList> #include <QTextStream> #include <iostream> #include "base/Debug.h" CSVFormat::CSVFormat(QString path) : m_separator(""), m_sampleRate(44100), m_windowSize(1024), m_headerStatus(HeaderUnknown), m_allowQuoting(true), m_maxExampleCols(0) { (void)guessFormatFor(path); } bool CSVFormat::guessFormatFor(QString path) { m_modelType = TwoDimensionalModel; m_timingType = ExplicitTiming; m_timeUnits = TimeSeconds; m_maxExampleCols = 0; m_columnCount = 0; m_variableColumnCount = false; m_example.clear(); m_columnQualities.clear(); m_columnPurposes.clear(); m_prevValues.clear(); QFile file(path); if (!file.exists()) { SVCERR << "CSVFormat::guessFormatFor(" << path << "): File does not exist" << endl; return false; } if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { SVCERR << "CSVFormat::guessFormatFor(" << path << "): File could not be opened for reading" << endl; return false; } SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl; QTextStream in(&file); in.seek(0); int lineno = 0; while (!in.atEnd()) { // See comment about line endings in CSVFileReader::load() QString chunk = in.readLine(); QStringList lines = chunk.split('\r', QString::SkipEmptyParts); for (int li = 0; li < lines.size(); ++li) { QString line = lines[li]; if (line.startsWith("#") || line == "") { continue; } guessQualities(line, lineno); ++lineno; } if (lineno >= 150) break; } guessPurposes(); guessAudioSampleRange(); return true; } void CSVFormat::guessSeparator(QString line) { QString candidates = "\t|,/: "; for (int i = 0; i < candidates.length(); ++i) { auto bits = StringBits::split(line, candidates[i], m_allowQuoting); if (bits.size() >= 2) { m_plausibleSeparators.insert(candidates[i]); if (m_separator == "") { m_separator = candidates[i]; SVDEBUG << "Estimated column separator: '" << m_separator << "'" << endl; } } } } void CSVFormat::guessQualities(QString line, int lineno) { guessSeparator(line); QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); int cols = list.size(); int firstLine = 0; if (m_headerStatus == HeaderPresent) { firstLine = 1; } if (lineno == firstLine || (cols > m_columnCount)) { m_columnCount = cols; } if (cols != m_columnCount) { m_variableColumnCount = true; } // All columns are regarded as having these qualities until we see // something that indicates otherwise: ColumnQualities defaultQualities = ColumnNumeric | ColumnIntegral | ColumnSmall | ColumnIncreasing | ColumnNearEmpty; for (int i = 0; i < cols; ++i) { SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; if (m_columnQualities.find(i) == m_columnQualities.end()) { m_columnQualities[i] = defaultQualities; m_prevValues[i] = 0.f; } QString s(list[i]); bool ok = false; ColumnQualities qualities = m_columnQualities[i]; // Looks like this is defined on Windows #undef small bool numeric = (qualities & ColumnNumeric); bool integral = (qualities & ColumnIntegral); bool increasing = (qualities & ColumnIncreasing); bool small = (qualities & ColumnSmall); bool large = (qualities & ColumnLarge); // this one defaults to off bool signd = (qualities & ColumnSigned); // also defaults to off bool emptyish = (qualities & ColumnNearEmpty); if (s.trimmed() != "") { if (lineno > firstLine) { emptyish = false; } float value = 0.f; if (numeric) { value = s.toFloat(&ok); if (!ok) { value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); } if (ok) { if (lineno < firstLine + 2 && value > 1000.f) { large = true; } if (value < 0.f) { signd = true; } if (value < -1.f || value > 1.f) { small = false; } } else { numeric = false; // If the column is not numeric, it can't be any of // these things either integral = false; increasing = false; small = false; large = false; signd = false; } } if (numeric) { if (integral) { if (s.contains('.') || s.contains(',')) { integral = false; } } if (increasing) { if (lineno > firstLine && value <= m_prevValues[i]) { increasing = false; } } m_prevValues[i] = value; } } m_columnQualities[i] = (numeric ? ColumnNumeric : 0) | (integral ? ColumnIntegral : 0) | (increasing ? ColumnIncreasing : 0) | (small ? ColumnSmall : 0) | (large ? ColumnLarge : 0) | (signd ? ColumnSigned : 0) | (emptyish ? ColumnNearEmpty : 0); } if (lineno == 0 && m_headerStatus == HeaderUnknown) { // If we have at least one column, and every column has // quality == ColumnNearEmpty, i.e. not empty and not numeric, // then we probably have a header row bool couldBeHeader = (cols > 0); std::map<int, QString> headings; for (int i = 0; i < cols; ++i) { if (m_columnQualities[i] != ColumnNearEmpty) { couldBeHeader = false; } else { headings[i] = list[i].trimmed().toLower(); } } if (couldBeHeader) { m_headerStatus = HeaderPresent; m_columnHeadings = headings; } else { m_headerStatus = HeaderAbsent; } } if (lineno == 0 && m_headerStatus == HeaderPresent) { // Start again with the qualities: m_columnQualities.clear(); m_prevValues.clear(); } if (lineno < firstLine + 10) { m_example.push_back(list); if (lineno == 0 || cols > m_maxExampleCols) { m_maxExampleCols = cols; } } if (lineno < firstLine + 10) { SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; if (lineno == 0 && m_headerStatus == HeaderPresent && m_columnCount > 0 && m_columnQualities.empty()) { SVDEBUG << "[whole line classified as a header row]"; } else { for (int i = 0; i < cols; ++i) { if (m_columnQualities.find(i) == m_columnQualities.end()) { SVDEBUG << "(not set) "; } else { SVDEBUG << int(m_columnQualities[i]) << " "; } } } SVDEBUG << endl; SVDEBUG << "Estimated header status: " << m_headerStatus << endl; } } void CSVFormat::guessPurposes() { m_timingType = CSVFormat::ImplicitTiming; m_timeUnits = CSVFormat::TimeWindows; int timingColumnCount = 0; bool haveDurationOrEndTime = false; SVDEBUG << "Estimated column qualities overall: "; for (int i = 0; i < m_columnCount; ++i) { if (m_columnQualities.find(i) == m_columnQualities.end()) { SVDEBUG << "(not set) "; } else { SVDEBUG << int(m_columnQualities[i]) << " "; } } SVDEBUG << endl; // if our first column has zero or one entries in it and the rest // have more, then we'll default to ignoring the first column and // counting the next one as primary. (e.g. Sonic Annotator output // with filename at start of first column.) int primaryColumnNo = 0; if (m_columnCount >= 2) { if ( (m_columnQualities[0] & ColumnNearEmpty) && !(m_columnQualities[1] & ColumnNearEmpty)) { primaryColumnNo = 1; } } for (int i = 0; i < m_columnCount; ++i) { ColumnPurpose purpose = ColumnUnknown; if (i < primaryColumnNo) { setColumnPurpose(i, purpose); continue; } bool primary = (i == primaryColumnNo); ColumnQualities qualities = m_columnQualities[i]; bool numeric = (qualities & ColumnNumeric); bool integral = (qualities & ColumnIntegral); bool increasing = (qualities & ColumnIncreasing); bool large = (qualities & ColumnLarge); bool timingColumn = (numeric && increasing); QString heading; if (m_columnHeadings.find(i) != m_columnHeadings.end()) { heading = m_columnHeadings[i]; } if (heading == "time" || heading == "frame" || heading == "duration" || heading == "endtime") { timingColumn = true; } if (heading == "value" || heading == "height" || heading == "label") { timingColumn = false; } if (timingColumn) { ++timingColumnCount; if (heading == "endtime") { purpose = ColumnEndTime; haveDurationOrEndTime = true; } else if (heading == "duration") { purpose = ColumnDuration; haveDurationOrEndTime = true; } else if (primary || heading == "time" || heading == "frame") { purpose = ColumnStartTime; m_timingType = ExplicitTiming; if ((integral && large) || heading == "frame") { m_timeUnits = TimeAudioFrames; } else { m_timeUnits = TimeSeconds; } } else if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { purpose = ColumnEndTime; haveDurationOrEndTime = true; } } if (purpose == ColumnUnknown) { if (heading == "label") { purpose = ColumnLabel; } else if (numeric || heading == "value" || heading == "height") { purpose = ColumnValue; } else { purpose = ColumnLabel; } } setColumnPurpose(i, purpose); } int valueCount = 0; for (int i = 0; i < m_columnCount; ++i) { if (m_columnPurposes[i] == ColumnValue) { ++valueCount; } } if (valueCount == 2 && timingColumnCount == 1) { // If we have exactly two apparent value columns and only one // timing column, but one value column is integral and the // other is not, guess that whichever one matches the integral // status of the time column is either duration or end time if (m_timingType == ExplicitTiming) { int a = -1, b = -1; for (int i = 0; i < m_columnCount; ++i) { if (m_columnPurposes[i] == ColumnValue) { if (a == -1) a = i; else b = i; } } if ((m_columnQualities[a] & ColumnIntegral) != (m_columnQualities[b] & ColumnIntegral)) { int timecol = a; if ((m_columnQualities[a] & ColumnIntegral) != (m_columnQualities[0] & ColumnIntegral)) { timecol = b; } if (m_columnQualities[timecol] & ColumnIncreasing) { // This shouldn't happen; should have been settled above m_columnPurposes[timecol] = ColumnEndTime; haveDurationOrEndTime = true; } else { m_columnPurposes[timecol] = ColumnDuration; haveDurationOrEndTime = true; } --valueCount; } } } if (timingColumnCount > 1 || haveDurationOrEndTime) { m_modelType = TwoDimensionalModelWithDuration; } else { if (valueCount == 0) { m_modelType = OneDimensionalModel; } else if (valueCount == 1) { m_modelType = TwoDimensionalModel; } else { m_modelType = ThreeDimensionalModel; } } SVDEBUG << "Estimated column purposes: "; for (int i = 0; i < m_columnCount; ++i) { SVDEBUG << int(m_columnPurposes[i]) << " "; } SVDEBUG << endl; SVDEBUG << "Estimated model type: " << m_modelType << endl; SVDEBUG << "Estimated timing type: " << m_timingType << endl; SVDEBUG << "Estimated units: " << m_timeUnits << endl; } void CSVFormat::guessAudioSampleRange() { AudioSampleRange range = SampleRangeSigned1; range = SampleRangeSigned1; bool knownSigned = false; bool knownNonIntegral = false; SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of " << range << endl; for (int i = 0; i < m_columnCount; ++i) { if (m_columnPurposes[i] != ColumnValue) { SVDEBUG << "... column " << i << " is not apparently a value, ignoring" << endl; continue; } if (!(m_columnQualities[i] & ColumnIntegral)) { knownNonIntegral = true; if (range == SampleRangeUnsigned255 || range == SampleRangeSigned32767) { range = SampleRangeOther; } SVDEBUG << "... column " << i << " is non-integral, updating range to " << range << endl; } if (m_columnQualities[i] & ColumnLarge) { if (range == SampleRangeSigned1 || range == SampleRangeUnsigned255) { if (knownNonIntegral) { range = SampleRangeOther; } else { range = SampleRangeSigned32767; } } SVDEBUG << "... column " << i << " is large, updating range to " << range << endl; } if (m_columnQualities[i] & ColumnSigned) { knownSigned = true; if (range == SampleRangeUnsigned255) { range = SampleRangeSigned32767; } SVDEBUG << "... column " << i << " is signed, updating range to " << range << endl; } if (!(m_columnQualities[i] & ColumnSmall)) { if (range == SampleRangeSigned1) { if (knownNonIntegral) { range = SampleRangeOther; } else if (knownSigned) { range = SampleRangeSigned32767; } else { range = SampleRangeUnsigned255; } } SVDEBUG << "... column " << i << " is not small, updating range to " << range << endl; } } SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range " << range << endl; m_audioSampleRange = range; } QList<CSVFormat::ColumnPurpose> CSVFormat::getColumnPurposes() const { QList<ColumnPurpose> purposes; for (int i = 0; i < m_columnCount; ++i) { purposes.push_back(getColumnPurpose(i)); } return purposes; } void CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl) { m_columnPurposes.clear(); for (int i = 0; in_range_for(cl, i); ++i) { m_columnPurposes[i] = cl[i]; } } CSVFormat::ColumnPurpose CSVFormat::getColumnPurpose(int i) const { if (m_columnPurposes.find(i) == m_columnPurposes.end()) { return ColumnUnknown; } else { return m_columnPurposes.at(i); } } void CSVFormat::setColumnPurpose(int i, ColumnPurpose p) { m_columnPurposes[i] = p; } QList<CSVFormat::ColumnQualities> CSVFormat::getColumnQualities() const { QList<ColumnQualities> qualities; for (int i = 0; i < m_columnCount; ++i) { if (m_columnQualities.find(i) == m_columnQualities.end()) { qualities.push_back(0); } else { qualities.push_back(m_columnQualities.at(i)); } } return qualities; }