| Chris@392 | 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */ | 
| Chris@392 | 2 | 
| Chris@392 | 3 /* | 
| Chris@392 | 4     Sonic Visualiser | 
| Chris@392 | 5     An audio file viewer and annotation editor. | 
| Chris@392 | 6     Centre for Digital Music, Queen Mary, University of London. | 
| Chris@392 | 7     This file copyright 2006 Chris Cannam. | 
| Chris@392 | 8 | 
| Chris@392 | 9     This program is free software; you can redistribute it and/or | 
| Chris@392 | 10     modify it under the terms of the GNU General Public License as | 
| Chris@392 | 11     published by the Free Software Foundation; either version 2 of the | 
| Chris@392 | 12     License, or (at your option) any later version.  See the file | 
| Chris@392 | 13     COPYING included with this distribution for more information. | 
| Chris@392 | 14 */ | 
| Chris@392 | 15 | 
| Chris@392 | 16 #include "CSVFormat.h" | 
| Chris@392 | 17 | 
| Chris@629 | 18 #include "base/StringBits.h" | 
| Chris@629 | 19 | 
| Chris@392 | 20 #include <QFile> | 
| Chris@392 | 21 #include <QString> | 
| Chris@392 | 22 #include <QRegExp> | 
| Chris@392 | 23 #include <QStringList> | 
| Chris@392 | 24 #include <QTextStream> | 
| Chris@392 | 25 | 
| Chris@392 | 26 #include <iostream> | 
| Chris@392 | 27 | 
| Chris@1362 | 28 #include "base/Debug.h" | 
| Chris@1362 | 29 | 
| Chris@629 | 30 CSVFormat::CSVFormat(QString path) : | 
| Chris@629 | 31     m_separator(""), | 
| Chris@392 | 32     m_sampleRate(44100), | 
| Chris@392 | 33     m_windowSize(1024), | 
| Chris@629 | 34     m_allowQuoting(true) | 
| Chris@392 | 35 { | 
| Chris@629 | 36     guessFormatFor(path); | 
| Chris@629 | 37 } | 
| Chris@629 | 38 | 
| Chris@629 | 39 void | 
| Chris@629 | 40 CSVFormat::guessFormatFor(QString path) | 
| Chris@629 | 41 { | 
| Chris@629 | 42     m_modelType = TwoDimensionalModel; | 
| Chris@629 | 43     m_timingType = ExplicitTiming; | 
| Chris@629 | 44     m_timeUnits = TimeSeconds; | 
| Chris@629 | 45 | 
| Chris@629 | 46     m_maxExampleCols = 0; | 
| Chris@629 | 47     m_columnCount = 0; | 
| Chris@629 | 48     m_variableColumnCount = false; | 
| Chris@629 | 49 | 
| Chris@629 | 50     m_example.clear(); | 
| Chris@629 | 51     m_columnQualities.clear(); | 
| Chris@629 | 52     m_columnPurposes.clear(); | 
| Chris@629 | 53     m_prevValues.clear(); | 
| Chris@629 | 54 | 
| Chris@629 | 55     QFile file(path); | 
| Chris@392 | 56     if (!file.exists()) return; | 
| Chris@392 | 57     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; | 
| Chris@392 | 58 | 
| Chris@392 | 59     QTextStream in(&file); | 
| Chris@392 | 60     in.seek(0); | 
| Chris@392 | 61 | 
| Chris@629 | 62     int lineno = 0; | 
| Chris@392 | 63 | 
| Chris@392 | 64     while (!in.atEnd()) { | 
| Chris@392 | 65 | 
| Chris@392 | 66         // See comment about line endings in CSVFileReader::load() | 
| Chris@392 | 67 | 
| Chris@392 | 68         QString chunk = in.readLine(); | 
| Chris@392 | 69         QStringList lines = chunk.split('\r', QString::SkipEmptyParts); | 
| Chris@392 | 70 | 
| Chris@897 | 71         for (int li = 0; li < lines.size(); ++li) { | 
| Chris@392 | 72 | 
| Chris@392 | 73             QString line = lines[li]; | 
| Chris@629 | 74             if (line.startsWith("#") || line == "") continue; | 
| Chris@392 | 75 | 
| Chris@629 | 76             guessQualities(line, lineno); | 
| Chris@392 | 77 | 
| Chris@840 | 78             ++lineno; | 
| Chris@629 | 79         } | 
| Chris@840 | 80 | 
| Chris@840 | 81         if (lineno >= 50) break; | 
| Chris@629 | 82     } | 
| Chris@392 | 83 | 
| Chris@629 | 84     guessPurposes(); | 
| Chris@629 | 85 } | 
| Chris@629 | 86 | 
| Chris@629 | 87 void | 
| Chris@629 | 88 CSVFormat::guessSeparator(QString line) | 
| Chris@629 | 89 { | 
| Chris@629 | 90     char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; | 
| Chris@897 | 91     for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) { | 
| Chris@629 | 92         if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { | 
| Chris@629 | 93             m_separator = candidates[i]; | 
| Chris@1510 | 94             SVDEBUG << "Estimated column separator: '" << m_separator | 
| Chris@1510 | 95                     << "'" << endl; | 
| Chris@629 | 96             return; | 
| Chris@629 | 97         } | 
| Chris@629 | 98     } | 
| Chris@629 | 99 } | 
| Chris@629 | 100 | 
| Chris@629 | 101 void | 
| Chris@629 | 102 CSVFormat::guessQualities(QString line, int lineno) | 
| Chris@629 | 103 { | 
| Chris@629 | 104     if (m_separator == "") guessSeparator(line); | 
| Chris@629 | 105 | 
| Chris@1362 | 106     QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); | 
| Chris@629 | 107 | 
| Chris@629 | 108     int cols = list.size(); | 
| Chris@991 | 109     if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; | 
| Chris@629 | 110     if (cols != m_columnCount) m_variableColumnCount = true; | 
| Chris@629 | 111 | 
| Chris@629 | 112     // All columns are regarded as having these qualities until we see | 
| Chris@629 | 113     // something that indicates otherwise: | 
| Chris@629 | 114 | 
| Chris@629 | 115     ColumnQualities defaultQualities = | 
| Chris@1021 | 116         ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty; | 
| Chris@629 | 117 | 
| Chris@629 | 118     for (int i = 0; i < cols; ++i) { | 
| Chris@1429 | 119 | 
| Chris@629 | 120         while (m_columnQualities.size() <= i) { | 
| Chris@629 | 121             m_columnQualities.push_back(defaultQualities); | 
| Chris@629 | 122             m_prevValues.push_back(0.f); | 
| Chris@629 | 123         } | 
| Chris@629 | 124 | 
| Chris@629 | 125         QString s(list[i]); | 
| Chris@629 | 126         bool ok = false; | 
| Chris@629 | 127 | 
| Chris@629 | 128         ColumnQualities qualities = m_columnQualities[i]; | 
| Chris@629 | 129 | 
| Chris@629 | 130         bool numeric    = (qualities & ColumnNumeric); | 
| Chris@629 | 131         bool integral   = (qualities & ColumnIntegral); | 
| Chris@629 | 132         bool increasing = (qualities & ColumnIncreasing); | 
| Chris@629 | 133         bool large      = (qualities & ColumnLarge); // this one defaults to off | 
| Chris@1021 | 134         bool emptyish   = (qualities & ColumnNearEmpty); | 
| Chris@629 | 135 | 
| Chris@1021 | 136         if (lineno > 1 && s.trimmed() != "") { | 
| Chris@1021 | 137             emptyish = false; | 
| Chris@1021 | 138         } | 
| Chris@1021 | 139 | 
| Chris@629 | 140         float value = 0.f; | 
| Chris@629 | 141 | 
| Chris@629 | 142         //!!! how to take into account headers? | 
| Chris@629 | 143 | 
| Chris@629 | 144         if (numeric) { | 
| Chris@629 | 145             value = s.toFloat(&ok); | 
| Chris@629 | 146             if (!ok) { | 
| Chris@629 | 147                 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); | 
| Chris@629 | 148             } | 
| Chris@629 | 149             if (ok) { | 
| Chris@629 | 150                 if (lineno < 2 && value > 1000.f) large = true; | 
| Chris@629 | 151             } else { | 
| Chris@629 | 152                 numeric = false; | 
| Chris@629 | 153             } | 
| Chris@629 | 154         } | 
| Chris@629 | 155 | 
| Chris@629 | 156         if (numeric) { | 
| Chris@629 | 157 | 
| Chris@629 | 158             if (integral) { | 
| Chris@629 | 159                 if (s.contains('.') || s.contains(',')) { | 
| Chris@629 | 160                     integral = false; | 
| Chris@392 | 161                 } | 
| Chris@392 | 162             } | 
| Chris@392 | 163 | 
| Chris@629 | 164             if (increasing) { | 
| Chris@629 | 165                 if (lineno > 0 && value <= m_prevValues[i]) { | 
| Chris@629 | 166                     increasing = false; | 
| Chris@392 | 167                 } | 
| Chris@392 | 168             } | 
| Chris@392 | 169 | 
| Chris@629 | 170             m_prevValues[i] = value; | 
| Chris@629 | 171         } | 
| Chris@392 | 172 | 
| Chris@629 | 173         m_columnQualities[i] = | 
| Chris@629 | 174             (numeric    ? ColumnNumeric : 0) | | 
| Chris@629 | 175             (integral   ? ColumnIntegral : 0) | | 
| Chris@629 | 176             (increasing ? ColumnIncreasing : 0) | | 
| Chris@1021 | 177             (large      ? ColumnLarge : 0) | | 
| Chris@1021 | 178             (emptyish   ? ColumnNearEmpty : 0); | 
| Chris@629 | 179     } | 
| Chris@392 | 180 | 
| Chris@629 | 181     if (lineno < 10) { | 
| Chris@629 | 182         m_example.push_back(list); | 
| Chris@629 | 183         if (lineno == 0 || cols > m_maxExampleCols) { | 
| Chris@629 | 184             m_maxExampleCols = cols; | 
| Chris@392 | 185         } | 
| Chris@392 | 186     } | 
| Chris@392 | 187 | 
| Chris@1362 | 188     if (lineno < 10) { | 
| Chris@1362 | 189         SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; | 
| Chris@1362 | 190         for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@1362 | 191             SVDEBUG << int(m_columnQualities[i]) << " "; | 
| Chris@1362 | 192         } | 
| Chris@1362 | 193         SVDEBUG << endl; | 
| Chris@1362 | 194     } | 
| Chris@629 | 195 } | 
| Chris@629 | 196 | 
| Chris@629 | 197 void | 
| Chris@629 | 198 CSVFormat::guessPurposes() | 
| Chris@629 | 199 { | 
| Chris@629 | 200     m_timingType = CSVFormat::ImplicitTiming; | 
| Chris@629 | 201     m_timeUnits = CSVFormat::TimeWindows; | 
| Chris@1429 | 202 | 
| Chris@629 | 203     int timingColumnCount = 0; | 
| Chris@1021 | 204 | 
| Chris@1510 | 205     SVDEBUG << "Estimated column qualities overall: "; | 
| Chris@1510 | 206     for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@1510 | 207         SVDEBUG << int(m_columnQualities[i]) << " "; | 
| Chris@1510 | 208     } | 
| Chris@1510 | 209     SVDEBUG << endl; | 
| Chris@1510 | 210 | 
| Chris@1021 | 211     // if our first column has zero or one entries in it and the rest | 
| Chris@1021 | 212     // have more, then we'll default to ignoring the first column and | 
| Chris@1021 | 213     // counting the next one as primary. (e.g. Sonic Annotator output | 
| Chris@1021 | 214     // with filename at start of first column.) | 
| Chris@1021 | 215 | 
| Chris@1021 | 216     int primaryColumnNo = 0; | 
| Chris@1021 | 217 | 
| Chris@1021 | 218     if (m_columnCount >= 2) { | 
| Chris@1021 | 219         if ( (m_columnQualities[0] & ColumnNearEmpty) && | 
| Chris@1021 | 220             !(m_columnQualities[1] & ColumnNearEmpty)) { | 
| Chris@1021 | 221             primaryColumnNo = 1; | 
| Chris@1021 | 222         } | 
| Chris@1021 | 223     } | 
| Chris@629 | 224 | 
| Chris@629 | 225     for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@629 | 226 | 
| Chris@629 | 227         ColumnPurpose purpose = ColumnUnknown; | 
| Chris@1021 | 228 | 
| Chris@1021 | 229         if (i < primaryColumnNo) { | 
| Chris@1021 | 230             setColumnPurpose(i, purpose); | 
| Chris@1021 | 231             continue; | 
| Chris@1021 | 232         } | 
| Chris@1021 | 233 | 
| Chris@1021 | 234         bool primary = (i == primaryColumnNo); | 
| Chris@392 | 235 | 
| Chris@629 | 236         ColumnQualities qualities = m_columnQualities[i]; | 
| Chris@392 | 237 | 
| Chris@629 | 238         bool numeric    = (qualities & ColumnNumeric); | 
| Chris@629 | 239         bool integral   = (qualities & ColumnIntegral); | 
| Chris@629 | 240         bool increasing = (qualities & ColumnIncreasing); | 
| Chris@629 | 241         bool large      = (qualities & ColumnLarge); | 
| Chris@629 | 242 | 
| Chris@629 | 243         bool timingColumn = (numeric && increasing); | 
| Chris@629 | 244 | 
| Chris@629 | 245         if (timingColumn) { | 
| Chris@629 | 246 | 
| Chris@629 | 247             ++timingColumnCount; | 
| Chris@629 | 248 | 
| Chris@629 | 249             if (primary) { | 
| Chris@629 | 250 | 
| Chris@629 | 251                 purpose = ColumnStartTime; | 
| Chris@629 | 252 | 
| Chris@629 | 253                 m_timingType = ExplicitTiming; | 
| Chris@629 | 254 | 
| Chris@629 | 255                 if (integral && large) { | 
| Chris@629 | 256                     m_timeUnits = TimeAudioFrames; | 
| Chris@629 | 257                 } else { | 
| Chris@629 | 258                     m_timeUnits = TimeSeconds; | 
| Chris@629 | 259                 } | 
| Chris@629 | 260 | 
| Chris@629 | 261             } else { | 
| Chris@629 | 262 | 
| Chris@629 | 263                 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { | 
| Chris@629 | 264                     purpose = ColumnEndTime; | 
| Chris@629 | 265                 } | 
| Chris@629 | 266             } | 
| Chris@629 | 267         } | 
| Chris@629 | 268 | 
| Chris@629 | 269         if (purpose == ColumnUnknown) { | 
| Chris@629 | 270             if (numeric) { | 
| Chris@629 | 271                 purpose = ColumnValue; | 
| Chris@629 | 272             } else { | 
| Chris@629 | 273                 purpose = ColumnLabel; | 
| Chris@629 | 274             } | 
| Chris@629 | 275         } | 
| Chris@629 | 276 | 
| Chris@631 | 277         setColumnPurpose(i, purpose); | 
| Chris@629 | 278     } | 
| Chris@629 | 279 | 
| Chris@629 | 280     int valueCount = 0; | 
| Chris@629 | 281     for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@629 | 282         if (m_columnPurposes[i] == ColumnValue) ++valueCount; | 
| Chris@629 | 283     } | 
| Chris@629 | 284 | 
| Chris@630 | 285     if (valueCount == 2 && timingColumnCount == 1) { | 
| Chris@630 | 286         // If we have exactly two apparent value columns and only one | 
| Chris@630 | 287         // timing column, but one value column is integral and the | 
| Chris@630 | 288         // other is not, guess that whichever one matches the integral | 
| Chris@630 | 289         // status of the time column is either duration or end time | 
| Chris@630 | 290         if (m_timingType == ExplicitTiming) { | 
| Chris@630 | 291             int a = -1, b = -1; | 
| Chris@630 | 292             for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@630 | 293                 if (m_columnPurposes[i] == ColumnValue) { | 
| Chris@630 | 294                     if (a == -1) a = i; | 
| Chris@630 | 295                     else b = i; | 
| Chris@630 | 296                 } | 
| Chris@630 | 297             } | 
| Chris@630 | 298             if ((m_columnQualities[a] & ColumnIntegral) != | 
| Chris@630 | 299                 (m_columnQualities[b] & ColumnIntegral)) { | 
| Chris@630 | 300                 int timecol = a; | 
| Chris@630 | 301                 if ((m_columnQualities[a] & ColumnIntegral) != | 
| Chris@630 | 302                     (m_columnQualities[0] & ColumnIntegral)) { | 
| Chris@630 | 303                     timecol = b; | 
| Chris@630 | 304                 } | 
| Chris@630 | 305                 if (m_columnQualities[timecol] & ColumnIncreasing) { | 
| Chris@630 | 306                     // This shouldn't happen; should have been settled above | 
| Chris@630 | 307                     m_columnPurposes[timecol] = ColumnEndTime; | 
| Chris@630 | 308                 } else { | 
| Chris@630 | 309                     m_columnPurposes[timecol] = ColumnDuration; | 
| Chris@630 | 310                 } | 
| Chris@630 | 311                 --valueCount; | 
| Chris@630 | 312             } | 
| Chris@630 | 313         } | 
| Chris@630 | 314     } | 
| Chris@630 | 315 | 
| Chris@631 | 316     if (timingColumnCount > 1) { | 
| Chris@631 | 317         m_modelType = TwoDimensionalModelWithDuration; | 
| Chris@392 | 318     } else { | 
| Chris@631 | 319         if (valueCount == 0) { | 
| Chris@631 | 320             m_modelType = OneDimensionalModel; | 
| Chris@631 | 321         } else if (valueCount == 1) { | 
| Chris@631 | 322             m_modelType = TwoDimensionalModel; | 
| Chris@631 | 323         } else { | 
| Chris@631 | 324             m_modelType = ThreeDimensionalModel; | 
| Chris@631 | 325         } | 
| Chris@629 | 326     } | 
| Chris@392 | 327 | 
| Chris@1362 | 328     SVDEBUG << "Estimated column purposes: "; | 
| Chris@1362 | 329     for (int i = 0; i < m_columnCount; ++i) { | 
| Chris@1362 | 330         SVDEBUG << int(m_columnPurposes[i]) << " "; | 
| Chris@1362 | 331     } | 
| Chris@1362 | 332     SVDEBUG << endl; | 
| Chris@392 | 333 | 
| Chris@1362 | 334     SVDEBUG << "Estimated model type: " << m_modelType << endl; | 
| Chris@1362 | 335     SVDEBUG << "Estimated timing type: " << m_timingType << endl; | 
| Chris@1362 | 336     SVDEBUG << "Estimated units: " << m_timeUnits << endl; | 
| Chris@392 | 337 } | 
| Chris@392 | 338 | 
| Chris@631 | 339 CSVFormat::ColumnPurpose | 
| Chris@631 | 340 CSVFormat::getColumnPurpose(int i) | 
| Chris@631 | 341 { | 
| Chris@631 | 342     while (m_columnPurposes.size() <= i) { | 
| Chris@631 | 343         m_columnPurposes.push_back(ColumnUnknown); | 
| Chris@631 | 344     } | 
| Chris@631 | 345     return m_columnPurposes[i]; | 
| Chris@631 | 346 } | 
| Chris@629 | 347 | 
| Chris@631 | 348 CSVFormat::ColumnPurpose | 
| Chris@631 | 349 CSVFormat::getColumnPurpose(int i) const | 
| Chris@631 | 350 { | 
| Chris@668 | 351     if (m_columnPurposes.size() <= i) { | 
| Chris@668 | 352         return ColumnUnknown; | 
| Chris@668 | 353     } | 
| Chris@631 | 354     return m_columnPurposes[i]; | 
| Chris@631 | 355 } | 
| Chris@631 | 356 | 
| Chris@631 | 357 void | 
| Chris@631 | 358 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) | 
| Chris@631 | 359 { | 
| Chris@631 | 360     while (m_columnPurposes.size() <= i) { | 
| Chris@631 | 361         m_columnPurposes.push_back(ColumnUnknown); | 
| Chris@631 | 362     } | 
| Chris@631 | 363     m_columnPurposes[i] = p; | 
| Chris@631 | 364 } | 
| Chris@631 | 365 | 
| Chris@631 | 366 | 
| Chris@631 | 367 | 
| Chris@631 | 368 |