Chris@392: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@392: Chris@392: /* Chris@392: Sonic Visualiser Chris@392: An audio file viewer and annotation editor. Chris@392: Centre for Digital Music, Queen Mary, University of London. Chris@392: This file copyright 2006 Chris Cannam. Chris@392: Chris@392: This program is free software; you can redistribute it and/or Chris@392: modify it under the terms of the GNU General Public License as Chris@392: published by the Free Software Foundation; either version 2 of the Chris@392: License, or (at your option) any later version. See the file Chris@392: COPYING included with this distribution for more information. Chris@392: */ Chris@392: Chris@392: #include "CSVFormat.h" Chris@392: Chris@629: #include "base/StringBits.h" Chris@629: Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: Chris@392: #include Chris@392: Chris@1362: #include "base/Debug.h" Chris@1362: Chris@629: CSVFormat::CSVFormat(QString path) : Chris@629: m_separator(""), Chris@392: m_sampleRate(44100), Chris@392: m_windowSize(1024), Chris@629: m_allowQuoting(true) Chris@392: { Chris@629: guessFormatFor(path); Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessFormatFor(QString path) Chris@629: { Chris@629: m_modelType = TwoDimensionalModel; Chris@629: m_timingType = ExplicitTiming; Chris@629: m_timeUnits = TimeSeconds; Chris@629: Chris@629: m_maxExampleCols = 0; Chris@629: m_columnCount = 0; Chris@629: m_variableColumnCount = false; Chris@629: Chris@629: m_example.clear(); Chris@629: m_columnQualities.clear(); Chris@629: m_columnPurposes.clear(); Chris@629: m_prevValues.clear(); Chris@629: Chris@629: QFile file(path); Chris@392: if (!file.exists()) return; Chris@392: if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; Chris@392: Chris@392: QTextStream in(&file); Chris@392: in.seek(0); Chris@392: Chris@629: int lineno = 0; Chris@392: Chris@392: while (!in.atEnd()) { Chris@392: Chris@392: // See comment about line endings in CSVFileReader::load() Chris@392: Chris@392: QString chunk = in.readLine(); Chris@392: QStringList lines = chunk.split('\r', QString::SkipEmptyParts); Chris@392: Chris@897: for (int li = 0; li < lines.size(); ++li) { Chris@392: Chris@392: QString line = lines[li]; Chris@629: if (line.startsWith("#") || line == "") continue; Chris@392: Chris@629: guessQualities(line, lineno); Chris@392: Chris@840: ++lineno; Chris@629: } Chris@840: Chris@840: if (lineno >= 50) break; Chris@629: } Chris@392: Chris@629: guessPurposes(); Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessSeparator(QString line) Chris@629: { Chris@629: char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; Chris@897: for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) { Chris@629: if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { Chris@629: m_separator = candidates[i]; Chris@629: return; Chris@629: } Chris@629: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessQualities(QString line, int lineno) Chris@629: { Chris@629: if (m_separator == "") guessSeparator(line); Chris@629: Chris@1362: QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); Chris@629: Chris@629: int cols = list.size(); Chris@991: if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; Chris@629: if (cols != m_columnCount) m_variableColumnCount = true; Chris@629: Chris@629: // All columns are regarded as having these qualities until we see Chris@629: // something that indicates otherwise: Chris@629: Chris@629: ColumnQualities defaultQualities = Chris@1021: ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty; Chris@629: Chris@629: for (int i = 0; i < cols; ++i) { Chris@1429: Chris@629: while (m_columnQualities.size() <= i) { Chris@629: m_columnQualities.push_back(defaultQualities); Chris@629: m_prevValues.push_back(0.f); Chris@629: } Chris@629: Chris@629: QString s(list[i]); Chris@629: bool ok = false; Chris@629: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@629: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@629: bool large = (qualities & ColumnLarge); // this one defaults to off Chris@1021: bool emptyish = (qualities & ColumnNearEmpty); Chris@629: Chris@1021: if (lineno > 1 && s.trimmed() != "") { Chris@1021: emptyish = false; Chris@1021: } Chris@1021: Chris@629: float value = 0.f; Chris@629: Chris@629: //!!! how to take into account headers? Chris@629: Chris@629: if (numeric) { Chris@629: value = s.toFloat(&ok); Chris@629: if (!ok) { Chris@629: value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); Chris@629: } Chris@629: if (ok) { Chris@629: if (lineno < 2 && value > 1000.f) large = true; Chris@629: } else { Chris@629: numeric = false; Chris@629: } Chris@629: } Chris@629: Chris@629: if (numeric) { Chris@629: Chris@629: if (integral) { Chris@629: if (s.contains('.') || s.contains(',')) { Chris@629: integral = false; Chris@392: } Chris@392: } Chris@392: Chris@629: if (increasing) { Chris@629: if (lineno > 0 && value <= m_prevValues[i]) { Chris@629: increasing = false; Chris@392: } Chris@392: } Chris@392: Chris@629: m_prevValues[i] = value; Chris@629: } Chris@392: Chris@629: m_columnQualities[i] = Chris@629: (numeric ? ColumnNumeric : 0) | Chris@629: (integral ? ColumnIntegral : 0) | Chris@629: (increasing ? ColumnIncreasing : 0) | Chris@1021: (large ? ColumnLarge : 0) | Chris@1021: (emptyish ? ColumnNearEmpty : 0); Chris@629: } Chris@392: Chris@629: if (lineno < 10) { Chris@629: m_example.push_back(list); Chris@629: if (lineno == 0 || cols > m_maxExampleCols) { Chris@629: m_maxExampleCols = cols; Chris@392: } Chris@392: } Chris@392: Chris@1362: if (lineno < 10) { Chris@1362: SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; Chris@1362: for (int i = 0; i < m_columnCount; ++i) { Chris@1362: SVDEBUG << int(m_columnQualities[i]) << " "; Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@1362: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessPurposes() Chris@629: { Chris@629: m_timingType = CSVFormat::ImplicitTiming; Chris@629: m_timeUnits = CSVFormat::TimeWindows; Chris@1429: Chris@629: int timingColumnCount = 0; Chris@1021: Chris@1021: // if our first column has zero or one entries in it and the rest Chris@1021: // have more, then we'll default to ignoring the first column and Chris@1021: // counting the next one as primary. (e.g. Sonic Annotator output Chris@1021: // with filename at start of first column.) Chris@1021: Chris@1021: int primaryColumnNo = 0; Chris@1021: Chris@1021: if (m_columnCount >= 2) { Chris@1021: if ( (m_columnQualities[0] & ColumnNearEmpty) && Chris@1021: !(m_columnQualities[1] & ColumnNearEmpty)) { Chris@1021: primaryColumnNo = 1; Chris@1021: } Chris@1021: } Chris@629: Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@629: Chris@629: ColumnPurpose purpose = ColumnUnknown; Chris@1021: Chris@1021: if (i < primaryColumnNo) { Chris@1021: setColumnPurpose(i, purpose); Chris@1021: continue; Chris@1021: } Chris@1021: Chris@1021: bool primary = (i == primaryColumnNo); Chris@392: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@392: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@629: bool large = (qualities & ColumnLarge); Chris@629: Chris@629: bool timingColumn = (numeric && increasing); Chris@629: Chris@629: if (timingColumn) { Chris@629: Chris@629: ++timingColumnCount; Chris@629: Chris@629: if (primary) { Chris@629: Chris@629: purpose = ColumnStartTime; Chris@629: Chris@629: m_timingType = ExplicitTiming; Chris@629: Chris@629: if (integral && large) { Chris@629: m_timeUnits = TimeAudioFrames; Chris@629: } else { Chris@629: m_timeUnits = TimeSeconds; Chris@629: } Chris@629: Chris@629: } else { Chris@629: Chris@629: if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { Chris@629: purpose = ColumnEndTime; Chris@629: } Chris@629: } Chris@629: } Chris@629: Chris@629: if (purpose == ColumnUnknown) { Chris@629: if (numeric) { Chris@629: purpose = ColumnValue; Chris@629: } else { Chris@629: purpose = ColumnLabel; Chris@629: } Chris@629: } Chris@629: Chris@631: setColumnPurpose(i, purpose); Chris@629: } Chris@629: Chris@629: int valueCount = 0; Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@629: if (m_columnPurposes[i] == ColumnValue) ++valueCount; Chris@629: } Chris@629: Chris@630: if (valueCount == 2 && timingColumnCount == 1) { Chris@630: // If we have exactly two apparent value columns and only one Chris@630: // timing column, but one value column is integral and the Chris@630: // other is not, guess that whichever one matches the integral Chris@630: // status of the time column is either duration or end time Chris@630: if (m_timingType == ExplicitTiming) { Chris@630: int a = -1, b = -1; Chris@630: for (int i = 0; i < m_columnCount; ++i) { Chris@630: if (m_columnPurposes[i] == ColumnValue) { Chris@630: if (a == -1) a = i; Chris@630: else b = i; Chris@630: } Chris@630: } Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[b] & ColumnIntegral)) { Chris@630: int timecol = a; Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[0] & ColumnIntegral)) { Chris@630: timecol = b; Chris@630: } Chris@630: if (m_columnQualities[timecol] & ColumnIncreasing) { Chris@630: // This shouldn't happen; should have been settled above Chris@630: m_columnPurposes[timecol] = ColumnEndTime; Chris@630: } else { Chris@630: m_columnPurposes[timecol] = ColumnDuration; Chris@630: } Chris@630: --valueCount; Chris@630: } Chris@630: } Chris@630: } Chris@630: Chris@631: if (timingColumnCount > 1) { Chris@631: m_modelType = TwoDimensionalModelWithDuration; Chris@392: } else { Chris@631: if (valueCount == 0) { Chris@631: m_modelType = OneDimensionalModel; Chris@631: } else if (valueCount == 1) { Chris@631: m_modelType = TwoDimensionalModel; Chris@631: } else { Chris@631: m_modelType = ThreeDimensionalModel; Chris@631: } Chris@629: } Chris@392: Chris@1362: SVDEBUG << "Estimated column purposes: "; Chris@1362: for (int i = 0; i < m_columnCount; ++i) { Chris@1362: SVDEBUG << int(m_columnPurposes[i]) << " "; Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@392: Chris@1362: SVDEBUG << "Estimated model type: " << m_modelType << endl; Chris@1362: SVDEBUG << "Estimated timing type: " << m_timingType << endl; Chris@1362: SVDEBUG << "Estimated units: " << m_timeUnits << endl; Chris@392: } Chris@392: Chris@631: CSVFormat::ColumnPurpose Chris@631: CSVFormat::getColumnPurpose(int i) Chris@631: { Chris@631: while (m_columnPurposes.size() <= i) { Chris@631: m_columnPurposes.push_back(ColumnUnknown); Chris@631: } Chris@631: return m_columnPurposes[i]; Chris@631: } Chris@629: Chris@631: CSVFormat::ColumnPurpose Chris@631: CSVFormat::getColumnPurpose(int i) const Chris@631: { Chris@668: if (m_columnPurposes.size() <= i) { Chris@668: return ColumnUnknown; Chris@668: } Chris@631: return m_columnPurposes[i]; Chris@631: } Chris@631: Chris@631: void Chris@631: CSVFormat::setColumnPurpose(int i, ColumnPurpose p) Chris@631: { Chris@631: while (m_columnPurposes.size() <= i) { Chris@631: m_columnPurposes.push_back(ColumnUnknown); Chris@631: } Chris@631: m_columnPurposes[i] = p; Chris@631: } Chris@631: Chris@631: Chris@631: Chris@631: