Chris@392: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@392: Chris@392: /* Chris@392: Sonic Visualiser Chris@392: An audio file viewer and annotation editor. Chris@392: Centre for Digital Music, Queen Mary, University of London. Chris@392: This file copyright 2006 Chris Cannam. Chris@392: Chris@392: This program is free software; you can redistribute it and/or Chris@392: modify it under the terms of the GNU General Public License as Chris@392: published by the Free Software Foundation; either version 2 of the Chris@392: License, or (at your option) any later version. See the file Chris@392: COPYING included with this distribution for more information. Chris@392: */ Chris@392: Chris@392: #include "CSVFormat.h" Chris@392: Chris@629: #include "base/StringBits.h" Chris@629: Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: #include Chris@392: Chris@392: #include Chris@392: Chris@1362: #include "base/Debug.h" Chris@1362: Chris@629: CSVFormat::CSVFormat(QString path) : Chris@629: m_separator(""), Chris@392: m_sampleRate(44100), Chris@392: m_windowSize(1024), Chris@1870: m_headerStatus(HeaderUnknown), Chris@1870: m_allowQuoting(true), Chris@1870: m_maxExampleCols(0) Chris@392: { Chris@1524: (void)guessFormatFor(path); Chris@629: } Chris@629: Chris@1524: bool Chris@629: CSVFormat::guessFormatFor(QString path) Chris@629: { Chris@629: m_modelType = TwoDimensionalModel; Chris@629: m_timingType = ExplicitTiming; Chris@629: m_timeUnits = TimeSeconds; Chris@629: Chris@629: m_maxExampleCols = 0; Chris@629: m_columnCount = 0; Chris@629: m_variableColumnCount = false; Chris@629: Chris@629: m_example.clear(); Chris@629: m_columnQualities.clear(); Chris@629: m_columnPurposes.clear(); Chris@629: m_prevValues.clear(); Chris@629: Chris@629: QFile file(path); Chris@1524: if (!file.exists()) { Chris@1524: SVCERR << "CSVFormat::guessFormatFor(" << path Chris@1524: << "): File does not exist" << endl; Chris@1524: return false; Chris@1524: } Chris@1524: if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { Chris@1524: SVCERR << "CSVFormat::guessFormatFor(" << path Chris@1524: << "): File could not be opened for reading" << endl; Chris@1524: return false; Chris@1524: } Chris@1524: SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl; Chris@392: Chris@392: QTextStream in(&file); Chris@392: in.seek(0); Chris@392: Chris@629: int lineno = 0; Chris@392: Chris@392: while (!in.atEnd()) { Chris@392: Chris@392: // See comment about line endings in CSVFileReader::load() Chris@392: Chris@392: QString chunk = in.readLine(); Chris@392: QStringList lines = chunk.split('\r', QString::SkipEmptyParts); Chris@392: Chris@897: for (int li = 0; li < lines.size(); ++li) { Chris@392: Chris@392: QString line = lines[li]; Chris@1512: if (line.startsWith("#") || line == "") { Chris@1512: continue; Chris@1512: } Chris@392: Chris@629: guessQualities(line, lineno); Chris@392: Chris@840: ++lineno; Chris@629: } Chris@840: Chris@1512: if (lineno >= 150) break; Chris@629: } Chris@392: Chris@629: guessPurposes(); Chris@1515: guessAudioSampleRange(); Chris@1524: Chris@1524: return true; Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessSeparator(QString line) Chris@629: { Chris@1524: QString candidates = "\t|,/: "; Chris@1524: Chris@1524: for (int i = 0; i < candidates.length(); ++i) { Chris@1524: auto bits = StringBits::split(line, candidates[i], m_allowQuoting); Chris@1524: if (bits.size() >= 2) { Chris@1585: m_plausibleSeparators.insert(candidates[i]); Chris@1585: if (m_separator == "") { Chris@1585: m_separator = candidates[i]; Chris@1585: SVDEBUG << "Estimated column separator: '" << m_separator Chris@1585: << "'" << endl; Chris@1524: } Chris@629: } Chris@629: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessQualities(QString line, int lineno) Chris@629: { Chris@1585: guessSeparator(line); Chris@629: Chris@1362: QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); Chris@629: Chris@629: int cols = list.size(); Chris@1870: Chris@1870: int firstLine = 0; Chris@1870: if (m_headerStatus == HeaderPresent) { Chris@1870: firstLine = 1; Chris@1870: } Chris@1870: Chris@1870: if (lineno == firstLine || (cols > m_columnCount)) { Chris@1870: m_columnCount = cols; Chris@1870: } Chris@1870: if (cols != m_columnCount) { Chris@1870: m_variableColumnCount = true; Chris@1870: } Chris@629: Chris@629: // All columns are regarded as having these qualities until we see Chris@629: // something that indicates otherwise: Chris@629: Chris@629: ColumnQualities defaultQualities = Chris@1512: ColumnNumeric | ColumnIntegral | ColumnSmall | Chris@1512: ColumnIncreasing | ColumnNearEmpty; Chris@629: Chris@629: for (int i = 0; i < cols; ++i) { Chris@1854: Chris@1854: SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; Chris@1870: Chris@1870: if (m_columnQualities.find(i) == m_columnQualities.end()) { Chris@1870: m_columnQualities[i] = defaultQualities; Chris@1870: m_prevValues[i] = 0.f; Chris@629: } Chris@629: Chris@629: QString s(list[i]); Chris@629: bool ok = false; Chris@629: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@629: Chris@1523: // Looks like this is defined on Windows Chris@1523: #undef small Chris@1523: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@1512: bool small = (qualities & ColumnSmall); Chris@629: bool large = (qualities & ColumnLarge); // this one defaults to off Chris@1512: bool signd = (qualities & ColumnSigned); // also defaults to off Chris@1021: bool emptyish = (qualities & ColumnNearEmpty); Chris@629: Chris@1854: if (s.trimmed() != "") { Chris@1021: Chris@1870: if (lineno > firstLine) { Chris@1854: emptyish = false; Chris@1854: } Chris@1854: Chris@1854: float value = 0.f; Chris@629: Chris@1854: if (numeric) { Chris@1854: value = s.toFloat(&ok); Chris@1854: if (!ok) { Chris@1854: value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); Chris@1512: } Chris@1854: if (ok) { Chris@1870: if (lineno < firstLine + 2 && value > 1000.f) { Chris@1854: large = true; Chris@1854: } Chris@1854: if (value < 0.f) { Chris@1854: signd = true; Chris@1854: } Chris@1854: if (value < -1.f || value > 1.f) { Chris@1854: small = false; Chris@1854: } Chris@1854: } else { Chris@1854: numeric = false; Chris@1854: Chris@1854: // If the column is not numeric, it can't be any of Chris@1854: // these things either Chris@1854: integral = false; Chris@1854: increasing = false; Chris@1512: small = false; Chris@1854: large = false; Chris@1854: signd = false; Chris@392: } Chris@392: } Chris@392: Chris@1854: if (numeric) { Chris@1854: Chris@1854: if (integral) { Chris@1854: if (s.contains('.') || s.contains(',')) { Chris@1854: integral = false; Chris@1854: } Chris@392: } Chris@1854: Chris@1854: if (increasing) { Chris@1870: if (lineno > firstLine && value <= m_prevValues[i]) { Chris@1854: increasing = false; Chris@1854: } Chris@1854: } Chris@1854: Chris@1854: m_prevValues[i] = value; Chris@392: } Chris@629: } Chris@1524: Chris@629: m_columnQualities[i] = Chris@629: (numeric ? ColumnNumeric : 0) | Chris@629: (integral ? ColumnIntegral : 0) | Chris@629: (increasing ? ColumnIncreasing : 0) | Chris@1512: (small ? ColumnSmall : 0) | Chris@1021: (large ? ColumnLarge : 0) | Chris@1512: (signd ? ColumnSigned : 0) | Chris@1021: (emptyish ? ColumnNearEmpty : 0); Chris@629: } Chris@392: Chris@1870: if (lineno == 0 && m_headerStatus == HeaderUnknown) { Chris@1870: // If we have at least one column, and every column has Chris@1870: // quality == ColumnNearEmpty, i.e. not empty and not numeric, Chris@1870: // then we probably have a header row Chris@1870: bool couldBeHeader = (cols > 0); Chris@1870: std::map headings; Chris@1870: for (int i = 0; i < cols; ++i) { Chris@1870: if (m_columnQualities[i] != ColumnNearEmpty) { Chris@1870: couldBeHeader = false; Chris@1870: } else { Chris@1870: headings[i] = list[i].trimmed().toLower(); Chris@1870: } Chris@1870: } Chris@1870: if (couldBeHeader) { Chris@1870: m_headerStatus = HeaderPresent; Chris@1870: m_columnHeadings = headings; Chris@1870: } else { Chris@1870: m_headerStatus = HeaderAbsent; Chris@1870: } Chris@1870: } Chris@1870: Chris@1870: if (lineno == 0 && m_headerStatus == HeaderPresent) { Chris@1870: // Start again with the qualities: Chris@1870: m_columnQualities.clear(); Chris@1870: m_prevValues.clear(); Chris@1870: } else if (lineno < firstLine + 10) { Chris@1870: // Not a header row, so add it to the example column output Chris@629: m_example.push_back(list); Chris@1870: if (lineno == firstLine || cols > m_maxExampleCols) { Chris@629: m_maxExampleCols = cols; Chris@392: } Chris@392: } Chris@392: Chris@1870: if (lineno < firstLine + 10) { Chris@1362: SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; Chris@1870: if (lineno == 0 && m_headerStatus == HeaderPresent && Chris@1870: m_columnCount > 0 && m_columnQualities.empty()) { Chris@1870: SVDEBUG << "[whole line classified as a header row]"; Chris@1870: } else { Chris@1870: for (int i = 0; i < cols; ++i) { Chris@1870: if (m_columnQualities.find(i) == m_columnQualities.end()) { Chris@1870: SVDEBUG << "(not set) "; Chris@1870: } else { Chris@1870: SVDEBUG << int(m_columnQualities[i]) << " "; Chris@1870: } Chris@1870: } Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@1870: SVDEBUG << "Estimated header status: " << m_headerStatus << endl; Chris@1362: } Chris@629: } Chris@629: Chris@629: void Chris@629: CSVFormat::guessPurposes() Chris@629: { Chris@629: m_timingType = CSVFormat::ImplicitTiming; Chris@629: m_timeUnits = CSVFormat::TimeWindows; Chris@1429: Chris@629: int timingColumnCount = 0; Chris@1525: bool haveDurationOrEndTime = false; Chris@1021: Chris@1510: SVDEBUG << "Estimated column qualities overall: "; Chris@1510: for (int i = 0; i < m_columnCount; ++i) { Chris@1870: if (m_columnQualities.find(i) == m_columnQualities.end()) { Chris@1870: SVDEBUG << "(not set) "; Chris@1870: } else { Chris@1870: SVDEBUG << int(m_columnQualities[i]) << " "; Chris@1870: } Chris@1510: } Chris@1510: SVDEBUG << endl; Chris@1510: Chris@1021: // if our first column has zero or one entries in it and the rest Chris@1021: // have more, then we'll default to ignoring the first column and Chris@1021: // counting the next one as primary. (e.g. Sonic Annotator output Chris@1021: // with filename at start of first column.) Chris@1021: Chris@1021: int primaryColumnNo = 0; Chris@1021: Chris@1021: if (m_columnCount >= 2) { Chris@1021: if ( (m_columnQualities[0] & ColumnNearEmpty) && Chris@1021: !(m_columnQualities[1] & ColumnNearEmpty)) { Chris@1021: primaryColumnNo = 1; Chris@1021: } Chris@1021: } Chris@629: Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@629: Chris@629: ColumnPurpose purpose = ColumnUnknown; Chris@1021: Chris@1021: if (i < primaryColumnNo) { Chris@1021: setColumnPurpose(i, purpose); Chris@1021: continue; Chris@1021: } Chris@1021: Chris@1021: bool primary = (i == primaryColumnNo); Chris@392: Chris@629: ColumnQualities qualities = m_columnQualities[i]; Chris@392: Chris@629: bool numeric = (qualities & ColumnNumeric); Chris@629: bool integral = (qualities & ColumnIntegral); Chris@629: bool increasing = (qualities & ColumnIncreasing); Chris@629: bool large = (qualities & ColumnLarge); Chris@629: Chris@629: bool timingColumn = (numeric && increasing); Chris@629: Chris@1870: QString heading; Chris@1870: if (m_columnHeadings.find(i) != m_columnHeadings.end()) { Chris@1870: heading = m_columnHeadings[i]; Chris@1870: } Chris@1870: Chris@1870: if (heading == "time" || heading == "frame" || Chris@1870: heading == "duration" || heading == "endtime") { Chris@1870: timingColumn = true; Chris@1870: } Chris@1870: Chris@1870: if (heading == "value" || heading == "height" || heading == "label") { Chris@1870: timingColumn = false; Chris@1870: } Chris@1870: Chris@629: if (timingColumn) { Chris@629: Chris@629: ++timingColumnCount; Chris@1870: Chris@1870: if (heading == "endtime") { Chris@1870: Chris@1870: purpose = ColumnEndTime; Chris@1870: haveDurationOrEndTime = true; Chris@1870: Chris@1870: } else if (heading == "duration") { Chris@1870: Chris@1870: purpose = ColumnDuration; Chris@1870: haveDurationOrEndTime = true; Chris@629: Chris@1870: } else if (primary || heading == "time" || heading == "frame") { Chris@629: Chris@629: purpose = ColumnStartTime; Chris@629: m_timingType = ExplicitTiming; Chris@629: Chris@1870: if ((integral && large) || heading == "frame") { Chris@629: m_timeUnits = TimeAudioFrames; Chris@629: } else { Chris@629: m_timeUnits = TimeSeconds; Chris@629: } Chris@629: Chris@1870: } else if (timingColumnCount == 2 && Chris@1870: m_timingType == ExplicitTiming) { Chris@1870: purpose = ColumnEndTime; Chris@1870: haveDurationOrEndTime = true; Chris@629: } Chris@629: } Chris@629: Chris@629: if (purpose == ColumnUnknown) { Chris@1870: if (heading == "label") { Chris@1870: purpose = ColumnLabel; Chris@1870: } else if (numeric || heading == "value" || heading == "height") { Chris@629: purpose = ColumnValue; Chris@629: } else { Chris@629: purpose = ColumnLabel; Chris@629: } Chris@629: } Chris@629: Chris@631: setColumnPurpose(i, purpose); Chris@629: } Chris@629: Chris@629: int valueCount = 0; Chris@629: for (int i = 0; i < m_columnCount; ++i) { Chris@1870: if (m_columnPurposes[i] == ColumnValue) { Chris@1870: ++valueCount; Chris@1870: } Chris@629: } Chris@629: Chris@630: if (valueCount == 2 && timingColumnCount == 1) { Chris@630: // If we have exactly two apparent value columns and only one Chris@630: // timing column, but one value column is integral and the Chris@630: // other is not, guess that whichever one matches the integral Chris@630: // status of the time column is either duration or end time Chris@630: if (m_timingType == ExplicitTiming) { Chris@630: int a = -1, b = -1; Chris@630: for (int i = 0; i < m_columnCount; ++i) { Chris@630: if (m_columnPurposes[i] == ColumnValue) { Chris@630: if (a == -1) a = i; Chris@630: else b = i; Chris@630: } Chris@630: } Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[b] & ColumnIntegral)) { Chris@630: int timecol = a; Chris@630: if ((m_columnQualities[a] & ColumnIntegral) != Chris@630: (m_columnQualities[0] & ColumnIntegral)) { Chris@630: timecol = b; Chris@630: } Chris@630: if (m_columnQualities[timecol] & ColumnIncreasing) { Chris@630: // This shouldn't happen; should have been settled above Chris@630: m_columnPurposes[timecol] = ColumnEndTime; Chris@1525: haveDurationOrEndTime = true; Chris@630: } else { Chris@630: m_columnPurposes[timecol] = ColumnDuration; Chris@1525: haveDurationOrEndTime = true; Chris@630: } Chris@630: --valueCount; Chris@630: } Chris@630: } Chris@630: } Chris@630: Chris@1525: if (timingColumnCount > 1 || haveDurationOrEndTime) { Chris@631: m_modelType = TwoDimensionalModelWithDuration; Chris@392: } else { Chris@631: if (valueCount == 0) { Chris@631: m_modelType = OneDimensionalModel; Chris@631: } else if (valueCount == 1) { Chris@631: m_modelType = TwoDimensionalModel; Chris@631: } else { Chris@631: m_modelType = ThreeDimensionalModel; Chris@631: } Chris@629: } Chris@392: Chris@1362: SVDEBUG << "Estimated column purposes: "; Chris@1362: for (int i = 0; i < m_columnCount; ++i) { Chris@1362: SVDEBUG << int(m_columnPurposes[i]) << " "; Chris@1362: } Chris@1362: SVDEBUG << endl; Chris@392: Chris@1362: SVDEBUG << "Estimated model type: " << m_modelType << endl; Chris@1362: SVDEBUG << "Estimated timing type: " << m_timingType << endl; Chris@1362: SVDEBUG << "Estimated units: " << m_timeUnits << endl; Chris@392: } Chris@392: Chris@1515: void Chris@1515: CSVFormat::guessAudioSampleRange() Chris@1515: { Chris@1515: AudioSampleRange range = SampleRangeSigned1; Chris@1515: Chris@1515: range = SampleRangeSigned1; Chris@1515: bool knownSigned = false; Chris@1515: bool knownNonIntegral = false; Chris@1521: Chris@1521: SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of " Chris@1521: << range << endl; Chris@1515: Chris@1515: for (int i = 0; i < m_columnCount; ++i) { Chris@1521: if (m_columnPurposes[i] != ColumnValue) { Chris@1521: SVDEBUG << "... column " << i Chris@1521: << " is not apparently a value, ignoring" << endl; Chris@1521: continue; Chris@1521: } Chris@1515: if (!(m_columnQualities[i] & ColumnIntegral)) { Chris@1515: knownNonIntegral = true; Chris@1515: if (range == SampleRangeUnsigned255 || Chris@1515: range == SampleRangeSigned32767) { Chris@1515: range = SampleRangeOther; Chris@1515: } Chris@1521: SVDEBUG << "... column " << i Chris@1521: << " is non-integral, updating range to " << range << endl; Chris@1515: } Chris@1515: if (m_columnQualities[i] & ColumnLarge) { Chris@1515: if (range == SampleRangeSigned1 || Chris@1515: range == SampleRangeUnsigned255) { Chris@1515: if (knownNonIntegral) { Chris@1515: range = SampleRangeOther; Chris@1515: } else { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is large, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: if (m_columnQualities[i] & ColumnSigned) { Chris@1515: knownSigned = true; Chris@1515: if (range == SampleRangeUnsigned255) { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is signed, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: if (!(m_columnQualities[i] & ColumnSmall)) { Chris@1515: if (range == SampleRangeSigned1) { Chris@1515: if (knownNonIntegral) { Chris@1515: range = SampleRangeOther; Chris@1515: } else if (knownSigned) { Chris@1515: range = SampleRangeSigned32767; Chris@1515: } else { Chris@1515: range = SampleRangeUnsigned255; Chris@1515: } Chris@1515: } Chris@1521: SVDEBUG << "... column " << i << " is not small, updating range to " Chris@1521: << range << endl; Chris@1515: } Chris@1515: } Chris@1515: Chris@1521: SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range " Chris@1521: << range << endl; Chris@1521: Chris@1515: m_audioSampleRange = range; Chris@1515: } Chris@1515: Chris@1870: QList Chris@1870: CSVFormat::getColumnPurposes() const Chris@631: { Chris@1870: QList purposes; Chris@1870: for (int i = 0; i < m_columnCount; ++i) { Chris@1870: purposes.push_back(getColumnPurpose(i)); Chris@631: } Chris@1870: return purposes; Chris@1870: } Chris@1870: Chris@1870: void Chris@1870: CSVFormat::setColumnPurposes(QList cl) Chris@1870: { Chris@1870: m_columnPurposes.clear(); Chris@1870: for (int i = 0; in_range_for(cl, i); ++i) { Chris@1870: m_columnPurposes[i] = cl[i]; Chris@1870: } Chris@631: } Chris@629: Chris@631: CSVFormat::ColumnPurpose Chris@631: CSVFormat::getColumnPurpose(int i) const Chris@631: { Chris@1870: if (m_columnPurposes.find(i) == m_columnPurposes.end()) { Chris@668: return ColumnUnknown; Chris@1870: } else { Chris@1870: return m_columnPurposes.at(i); Chris@668: } Chris@631: } Chris@631: Chris@631: void Chris@631: CSVFormat::setColumnPurpose(int i, ColumnPurpose p) Chris@631: { Chris@631: m_columnPurposes[i] = p; Chris@631: } Chris@631: Chris@1870: QList Chris@1870: CSVFormat::getColumnQualities() const Chris@1870: { Chris@1870: QList qualities; Chris@1870: for (int i = 0; i < m_columnCount; ++i) { Chris@1870: if (m_columnQualities.find(i) == m_columnQualities.end()) { Chris@1870: qualities.push_back(0); Chris@1870: } else { Chris@1870: qualities.push_back(m_columnQualities.at(i)); Chris@1870: } Chris@1870: } Chris@1870: return qualities; Chris@1870: }