Chris@392: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@392: Chris@392: /* Chris@392: Sonic Visualiser Chris@392: An audio file viewer and annotation editor. Chris@392: Centre for Digital Music, Queen Mary, University of London. Chris@392: This file copyright 2006 Chris Cannam. Chris@392: Chris@392: This program is free software; you can redistribute it and/or Chris@392: modify it under the terms of the GNU General Public License as Chris@392: published by the Free Software Foundation; either version 2 of the Chris@392: License, or (at your option) any later version. See the file Chris@392: COPYING included with this distribution for more information. Chris@392: */ Chris@392: Chris@392: #include "CSVFormat.h" Chris@392: Chris@629: #include "base/StringBits.h" Chris@629: Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: Chris@392: #include Chris@392: Chris@1362: #include "base/Debug.h" Chris@1362: Chris@629: CSVFormat::CSVFormat(QString path) : Chris@629: m_separator(""), Chris@392: m_sampleRate(44100), Chris@392: m_windowSize(1024), Chris@629: m_allowQuoting(true) Chris@392: { Chris@1524: (void)guessFormatFor(path); Chris@629: } Chris@629: Chris@1524: bool Chris@629: CSVFormat::guessFormatFor(QString path) Chris@629: { Chris@629: m_modelType = TwoDimensionalModel; Chris@629: m_timingType = ExplicitTiming; Chris@629: m_timeUnits = TimeSeconds; Chris@629: Chris@629: m_maxExampleCols = 0; Chris@629: m_columnCount = 0; Chris@629: m_variableColumnCount = false; Chris@629: Chris@629: m_example.clear(); Chris@629: m_columnQualities.clear(); Chris@629: m_columnPurposes.clear(); Chris@629: m_prevValues.clear(); Chris@629: Chris@629: QFile file(path); Chris@1524: if (!file.exists()) { Chris@1524: SVCERR << "CSVFormat::guessFormatFor(" << path Chris@1524: << "): File does not exist" << endl; Chris@1524: return false; Chris@1524: } Chris@1524: if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { Chris@1524: SVCERR << "CSVFormat::guessFormatFor(" << path Chris@1524: << "): File could not be opened for reading" << endl; Chris@1524: return false; Chris@1524: } Chris@1524: SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl; Chris@392: Chris@392: QTextStream in(&file); Chris@392: in.seek(0); Chris@392: Chris@629: int lineno = 0; Chris@392: Chris@392: while (!in.atEnd()) { Chris@392: Chris@392: // See comment about line endings in CSVFileReader::load() Chris@392: Chris@392: QString chunk = in.readLine(); Chris@392: QStringList lines = chunk.split('\r', QString::SkipEmptyParts); Chris@392: Chris@897: for (int li = 0; li < lines.size(); ++li) { Chris@392: Chris@392: QString line = lines[li]; Chris@1512: if (line.startsWith("#") || line == "") { Chris@1512: continue; Chris@1512: } Chris@392: Chris@629: guessQualities(line, lineno); Chris@392: Chris@840: ++lineno; Chris@629: } Chris@840: Chris@1512: if (lineno >= 150) break; Chris@629: } Chris@392: Chris@629: guessPurposes(); Chris@1515: guessAudioSampleRange(); Chris@1524: Chris@1524: return true; Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessSeparator(QString line) Chris@629: { Chris@1524: QString candidates = "\t|,/: "; Chris@1524: Chris@1524: for (int i = 0; i < candidates.length(); ++i) { Chris@1524: auto bits = StringBits::split(line, candidates[i], m_allowQuoting); Chris@1524: if (bits.size() >= 2) { Chris@1585: m_plausibleSeparators.insert(candidates[i]); Chris@1585: if (m_separator == "") { Chris@1585: m_separator = candidates[i]; Chris@1585: SVDEBUG << "Estimated column separator: '" << m_separator Chris@1585: << "'" << endl; Chris@1524: } Chris@629: } Chris@629: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessQualities(QString line, int lineno) Chris@629: { Chris@1585: guessSeparator(line); Chris@629: Chris@1362: QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); Chris@629: Chris@629: int cols = list.size(); Chris@991: if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; Chris@629: if (cols != m_columnCount) m_variableColumnCount = true; Chris@629: Chris@629: // All columns are regarded as having these qualities until we see Chris@629: // something that indicates otherwise: Chris@629: Chris@629: ColumnQualities defaultQualities = Chris@1512: ColumnNumeric | ColumnIntegral | ColumnSmall | Chris@1512: ColumnIncreasing | ColumnNearEmpty; Chris@629: Chris@629: for (int i = 0; i < cols; ++i) { Chris@1429: Chris@629: while (m_columnQualities.size() <= i) { Chris@629: m_columnQualities.push_back(defaultQualities); Chris@629: m_prevValues.push_back(0.f); Chris@629: } Chris@629: Chris@629: QString s(list[i]); Chris@629: bool ok = false; Chris@629: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@629: Chris@1523: // Looks like this is defined on Windows Chris@1523: #undef small Chris@1523: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@1512: bool small = (qualities & ColumnSmall); Chris@629: bool large = (qualities & ColumnLarge); // this one defaults to off Chris@1512: bool signd = (qualities & ColumnSigned); // also defaults to off Chris@1021: bool emptyish = (qualities & ColumnNearEmpty); Chris@629: Chris@1021: if (lineno > 1 && s.trimmed() != "") { Chris@1021: emptyish = false; Chris@1021: } Chris@1021: Chris@629: float value = 0.f; Chris@629: Chris@629: //!!! how to take into account headers? Chris@629: Chris@629: if (numeric) { Chris@629: value = s.toFloat(&ok); Chris@629: if (!ok) { Chris@629: value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); Chris@629: } Chris@629: if (ok) { Chris@1512: if (lineno < 2 && value > 1000.f) { Chris@1512: large = true; Chris@1512: } Chris@1512: if (value < 0.f) { Chris@1512: signd = true; Chris@1512: } Chris@1512: if (value < -1.f || value > 1.f) { Chris@1512: small = false; Chris@1512: } Chris@629: } else { Chris@629: numeric = false; Chris@1524: Chris@1524: // If the column is not numeric, it can't be any of Chris@1524: // these things either Chris@1524: integral = false; Chris@1524: increasing = false; Chris@1524: small = false; Chris@1524: large = false; Chris@1524: signd = false; Chris@629: } Chris@629: } Chris@629: Chris@629: if (numeric) { Chris@629: Chris@629: if (integral) { Chris@629: if (s.contains('.') || s.contains(',')) { Chris@629: integral = false; Chris@392: } Chris@392: } Chris@392: Chris@629: if (increasing) { Chris@629: if (lineno > 0 && value <= m_prevValues[i]) { Chris@629: increasing = false; Chris@392: } Chris@392: } Chris@392: Chris@629: m_prevValues[i] = value; Chris@629: } Chris@1524: Chris@629: m_columnQualities[i] = Chris@629: (numeric ? ColumnNumeric : 0) | Chris@629: (integral ? ColumnIntegral : 0) | Chris@629: (increasing ? ColumnIncreasing : 0) | Chris@1512: (small ? ColumnSmall : 0) | Chris@1021: (large ? ColumnLarge : 0) | Chris@1512: (signd ? ColumnSigned : 0) | Chris@1021: (emptyish ? ColumnNearEmpty : 0); Chris@629: } Chris@392: Chris@629: if (lineno < 10) { Chris@629: m_example.push_back(list); Chris@629: if (lineno == 0 || cols > m_maxExampleCols) { Chris@629: m_maxExampleCols = cols; Chris@392: } Chris@392: } Chris@392: Chris@1362: if (lineno < 10) { Chris@1362: SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; Chris@1362: for (int i = 0; i < m_columnCount; ++i) { Chris@1362: SVDEBUG << int(m_columnQualities[i]) << " "; Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@1362: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessPurposes() Chris@629: { Chris@629: m_timingType = CSVFormat::ImplicitTiming; Chris@629: m_timeUnits = CSVFormat::TimeWindows; Chris@1429: Chris@629: int timingColumnCount = 0; Chris@1525: bool haveDurationOrEndTime = false; Chris@1021: Chris@1510: SVDEBUG << "Estimated column qualities overall: "; Chris@1510: for (int i = 0; i < m_columnCount; ++i) { Chris@1510: SVDEBUG << int(m_columnQualities[i]) << " "; Chris@1510: } Chris@1510: SVDEBUG << endl; Chris@1510: Chris@1021: // if our first column has zero or one entries in it and the rest Chris@1021: // have more, then we'll default to ignoring the first column and Chris@1021: // counting the next one as primary. (e.g. Sonic Annotator output Chris@1021: // with filename at start of first column.) Chris@1021: Chris@1021: int primaryColumnNo = 0; Chris@1021: Chris@1021: if (m_columnCount >= 2) { Chris@1021: if ( (m_columnQualities[0] & ColumnNearEmpty) && Chris@1021: !(m_columnQualities[1] & ColumnNearEmpty)) { Chris@1021: primaryColumnNo = 1; Chris@1021: } Chris@1021: } Chris@629: Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@629: Chris@629: ColumnPurpose purpose = ColumnUnknown; Chris@1021: Chris@1021: if (i < primaryColumnNo) { Chris@1021: setColumnPurpose(i, purpose); Chris@1021: continue; Chris@1021: } Chris@1021: Chris@1021: bool primary = (i == primaryColumnNo); Chris@392: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@392: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@629: bool large = (qualities & ColumnLarge); Chris@629: Chris@629: bool timingColumn = (numeric && increasing); Chris@629: Chris@629: if (timingColumn) { Chris@629: Chris@629: ++timingColumnCount; Chris@629: Chris@629: if (primary) { Chris@629: Chris@629: purpose = ColumnStartTime; Chris@629: Chris@629: m_timingType = ExplicitTiming; Chris@629: Chris@629: if (integral && large) { Chris@629: m_timeUnits = TimeAudioFrames; Chris@629: } else { Chris@629: m_timeUnits = TimeSeconds; Chris@629: } Chris@629: Chris@629: } else { Chris@629: Chris@629: if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { Chris@629: purpose = ColumnEndTime; Chris@1525: haveDurationOrEndTime = true; Chris@629: } Chris@629: } Chris@629: } Chris@629: Chris@629: if (purpose == ColumnUnknown) { Chris@629: if (numeric) { Chris@629: purpose = ColumnValue; Chris@629: } else { Chris@629: purpose = ColumnLabel; Chris@629: } Chris@629: } Chris@629: Chris@631: setColumnPurpose(i, purpose); Chris@629: } Chris@629: Chris@629: int valueCount = 0; Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@629: if (m_columnPurposes[i] == ColumnValue) ++valueCount; Chris@629: } Chris@629: Chris@630: if (valueCount == 2 && timingColumnCount == 1) { Chris@630: // If we have exactly two apparent value columns and only one Chris@630: // timing column, but one value column is integral and the Chris@630: // other is not, guess that whichever one matches the integral Chris@630: // status of the time column is either duration or end time Chris@630: if (m_timingType == ExplicitTiming) { Chris@630: int a = -1, b = -1; Chris@630: for (int i = 0; i < m_columnCount; ++i) { Chris@630: if (m_columnPurposes[i] == ColumnValue) { Chris@630: if (a == -1) a = i; Chris@630: else b = i; Chris@630: } Chris@630: } Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[b] & ColumnIntegral)) { Chris@630: int timecol = a; Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[0] & ColumnIntegral)) { Chris@630: timecol = b; Chris@630: } Chris@630: if (m_columnQualities[timecol] & ColumnIncreasing) { Chris@630: // This shouldn't happen; should have been settled above Chris@630: m_columnPurposes[timecol] = ColumnEndTime; Chris@1525: haveDurationOrEndTime = true; Chris@630: } else { Chris@630: m_columnPurposes[timecol] = ColumnDuration; Chris@1525: haveDurationOrEndTime = true; Chris@630: } Chris@630: --valueCount; Chris@630: } Chris@630: } Chris@630: } Chris@630: Chris@1525: if (timingColumnCount > 1 || haveDurationOrEndTime) { Chris@631: m_modelType = TwoDimensionalModelWithDuration; Chris@392: } else { Chris@631: if (valueCount == 0) { Chris@631: m_modelType = OneDimensionalModel; Chris@631: } else if (valueCount == 1) { Chris@631: m_modelType = TwoDimensionalModel; Chris@631: } else { Chris@631: m_modelType = ThreeDimensionalModel; Chris@631: } Chris@629: } Chris@392: Chris@1362: SVDEBUG << "Estimated column purposes: "; Chris@1362: for (int i = 0; i < m_columnCount; ++i) { Chris@1362: SVDEBUG << int(m_columnPurposes[i]) << " "; Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@392: Chris@1362: SVDEBUG << "Estimated model type: " << m_modelType << endl; Chris@1362: SVDEBUG << "Estimated timing type: " << m_timingType << endl; Chris@1362: SVDEBUG << "Estimated units: " << m_timeUnits << endl; Chris@392: } Chris@392: Chris@1515: void Chris@1515: CSVFormat::guessAudioSampleRange() Chris@1515: { Chris@1515: AudioSampleRange range = SampleRangeSigned1; Chris@1515: Chris@1515: range = SampleRangeSigned1; Chris@1515: bool knownSigned = false; Chris@1515: bool knownNonIntegral = false; Chris@1521: Chris@1521: SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of " Chris@1521: << range << endl; Chris@1515: Chris@1515: for (int i = 0; i < m_columnCount; ++i) { Chris@1521: if (m_columnPurposes[i] != ColumnValue) { Chris@1521: SVDEBUG << "... column " << i Chris@1521: << " is not apparently a value, ignoring" << endl; Chris@1521: continue; Chris@1521: } Chris@1515: if (!(m_columnQualities[i] & ColumnIntegral)) { Chris@1515: knownNonIntegral = true; Chris@1515: if (range == SampleRangeUnsigned255 || Chris@1515: range == SampleRangeSigned32767) { Chris@1515: range = SampleRangeOther; Chris@1515: } Chris@1521: SVDEBUG << "... column " << i Chris@1521: << " is non-integral, updating range to " << range << endl; Chris@1515: } Chris@1515: if (m_columnQualities[i] & ColumnLarge) { Chris@1515: if (range == SampleRangeSigned1 || Chris@1515: range == SampleRangeUnsigned255) { Chris@1515: if (knownNonIntegral) { Chris@1515: range = SampleRangeOther; Chris@1515: } else { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is large, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: if (m_columnQualities[i] & ColumnSigned) { Chris@1515: knownSigned = true; Chris@1515: if (range == SampleRangeUnsigned255) { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is signed, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: if (!(m_columnQualities[i] & ColumnSmall)) { Chris@1515: if (range == SampleRangeSigned1) { Chris@1515: if (knownNonIntegral) { Chris@1515: range = SampleRangeOther; Chris@1515: } else if (knownSigned) { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } else { Chris@1515: range = SampleRangeUnsigned255; Chris@1515: } Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is not small, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: } Chris@1515: Chris@1521: SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range " Chris@1521: << range << endl; Chris@1521: Chris@1515: m_audioSampleRange = range; Chris@1515: } Chris@1515: Chris@631: CSVFormat::ColumnPurpose Chris@631: CSVFormat::getColumnPurpose(int i) Chris@631: { Chris@631: while (m_columnPurposes.size() <= i) { Chris@631: m_columnPurposes.push_back(ColumnUnknown); Chris@631: } Chris@631: return m_columnPurposes[i]; Chris@631: } Chris@629: Chris@631: CSVFormat::ColumnPurpose Chris@631: CSVFormat::getColumnPurpose(int i) const Chris@631: { Chris@668: if (m_columnPurposes.size() <= i) { Chris@668: return ColumnUnknown; Chris@668: } Chris@631: return m_columnPurposes[i]; Chris@631: } Chris@631: Chris@631: void Chris@631: CSVFormat::setColumnPurpose(int i, ColumnPurpose p) Chris@631: { Chris@631: while (m_columnPurposes.size() <= i) { Chris@631: m_columnPurposes.push_back(ColumnUnknown); Chris@631: } Chris@631: m_columnPurposes[i] = p; Chris@631: } Chris@631: Chris@631: Chris@631: Chris@631: