annotate data/fileio/CSVFormat.cpp @ 1078:ce82bcdc95d0

Fail upfront if the file is going to be too large. We expect the caller to split up large data sets into several MatrixFiles
author Chris Cannam
date Wed, 10 Jun 2015 13:10:26 +0100
parents 1888ca033a84
children 1bf38a4b91c4
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@629 28 CSVFormat::CSVFormat(QString path) :
Chris@629 29 m_separator(""),
Chris@392 30 m_sampleRate(44100),
Chris@392 31 m_windowSize(1024),
Chris@629 32 m_allowQuoting(true)
Chris@392 33 {
Chris@629 34 guessFormatFor(path);
Chris@629 35 }
Chris@629 36
Chris@629 37 void
Chris@629 38 CSVFormat::guessFormatFor(QString path)
Chris@629 39 {
Chris@629 40 m_modelType = TwoDimensionalModel;
Chris@629 41 m_timingType = ExplicitTiming;
Chris@629 42 m_timeUnits = TimeSeconds;
Chris@629 43
Chris@629 44 m_maxExampleCols = 0;
Chris@629 45 m_columnCount = 0;
Chris@629 46 m_variableColumnCount = false;
Chris@629 47
Chris@629 48 m_example.clear();
Chris@629 49 m_columnQualities.clear();
Chris@629 50 m_columnPurposes.clear();
Chris@629 51 m_prevValues.clear();
Chris@629 52
Chris@629 53 QFile file(path);
Chris@392 54 if (!file.exists()) return;
Chris@392 55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 56
Chris@392 57 QTextStream in(&file);
Chris@392 58 in.seek(0);
Chris@392 59
Chris@629 60 int lineno = 0;
Chris@392 61
Chris@392 62 while (!in.atEnd()) {
Chris@392 63
Chris@392 64 // See comment about line endings in CSVFileReader::load()
Chris@392 65
Chris@392 66 QString chunk = in.readLine();
Chris@392 67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 68
Chris@897 69 for (int li = 0; li < lines.size(); ++li) {
Chris@392 70
Chris@392 71 QString line = lines[li];
Chris@629 72 if (line.startsWith("#") || line == "") continue;
Chris@392 73
Chris@629 74 guessQualities(line, lineno);
Chris@392 75
Chris@840 76 ++lineno;
Chris@629 77 }
Chris@840 78
Chris@840 79 if (lineno >= 50) break;
Chris@629 80 }
Chris@392 81
Chris@629 82 guessPurposes();
Chris@629 83 }
Chris@629 84
Chris@629 85 void
Chris@629 86 CSVFormat::guessSeparator(QString line)
Chris@629 87 {
Chris@629 88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@897 89 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
Chris@629 90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 91 m_separator = candidates[i];
Chris@629 92 return;
Chris@629 93 }
Chris@629 94 }
Chris@629 95 m_separator = " ";
Chris@629 96 }
Chris@629 97
Chris@629 98 void
Chris@629 99 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 100 {
Chris@629 101 if (m_separator == "") guessSeparator(line);
Chris@629 102
Chris@629 103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
Chris@629 104
Chris@629 105 int cols = list.size();
Chris@991 106 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 107 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 108
Chris@629 109 // All columns are regarded as having these qualities until we see
Chris@629 110 // something that indicates otherwise:
Chris@629 111
Chris@629 112 ColumnQualities defaultQualities =
Chris@1021 113 ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty;
Chris@629 114
Chris@629 115 for (int i = 0; i < cols; ++i) {
Chris@629 116
Chris@629 117 while (m_columnQualities.size() <= i) {
Chris@629 118 m_columnQualities.push_back(defaultQualities);
Chris@629 119 m_prevValues.push_back(0.f);
Chris@629 120 }
Chris@629 121
Chris@629 122 QString s(list[i]);
Chris@629 123 bool ok = false;
Chris@629 124
Chris@629 125 ColumnQualities qualities = m_columnQualities[i];
Chris@629 126
Chris@629 127 bool numeric = (qualities & ColumnNumeric);
Chris@629 128 bool integral = (qualities & ColumnIntegral);
Chris@629 129 bool increasing = (qualities & ColumnIncreasing);
Chris@629 130 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@1021 131 bool emptyish = (qualities & ColumnNearEmpty);
Chris@629 132
Chris@1021 133 if (lineno > 1 && s.trimmed() != "") {
Chris@1021 134 emptyish = false;
Chris@1021 135 }
Chris@1021 136
Chris@629 137 float value = 0.f;
Chris@629 138
Chris@629 139 //!!! how to take into account headers?
Chris@629 140
Chris@629 141 if (numeric) {
Chris@629 142 value = s.toFloat(&ok);
Chris@629 143 if (!ok) {
Chris@629 144 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 145 }
Chris@629 146 if (ok) {
Chris@629 147 if (lineno < 2 && value > 1000.f) large = true;
Chris@629 148 } else {
Chris@629 149 numeric = false;
Chris@629 150 }
Chris@629 151 }
Chris@629 152
Chris@629 153 if (numeric) {
Chris@629 154
Chris@629 155 if (integral) {
Chris@629 156 if (s.contains('.') || s.contains(',')) {
Chris@629 157 integral = false;
Chris@392 158 }
Chris@392 159 }
Chris@392 160
Chris@629 161 if (increasing) {
Chris@629 162 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 163 increasing = false;
Chris@392 164 }
Chris@392 165 }
Chris@392 166
Chris@629 167 m_prevValues[i] = value;
Chris@629 168 }
Chris@392 169
Chris@629 170 m_columnQualities[i] =
Chris@629 171 (numeric ? ColumnNumeric : 0) |
Chris@629 172 (integral ? ColumnIntegral : 0) |
Chris@629 173 (increasing ? ColumnIncreasing : 0) |
Chris@1021 174 (large ? ColumnLarge : 0) |
Chris@1021 175 (emptyish ? ColumnNearEmpty : 0);
Chris@629 176 }
Chris@392 177
Chris@629 178 if (lineno < 10) {
Chris@629 179 m_example.push_back(list);
Chris@629 180 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 181 m_maxExampleCols = cols;
Chris@392 182 }
Chris@392 183 }
Chris@392 184
Chris@843 185 // cerr << "Estimated column qualities: ";
Chris@676 186 // for (int i = 0; i < m_columnCount; ++i) {
Chris@843 187 // cerr << int(m_columnQualities[i]) << " ";
Chris@676 188 // }
Chris@843 189 // cerr << endl;
Chris@629 190 }
Chris@629 191
Chris@629 192 void
Chris@629 193 CSVFormat::guessPurposes()
Chris@629 194 {
Chris@629 195 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 196 m_timeUnits = CSVFormat::TimeWindows;
Chris@392 197
Chris@629 198 int timingColumnCount = 0;
Chris@1021 199
Chris@1021 200 // if our first column has zero or one entries in it and the rest
Chris@1021 201 // have more, then we'll default to ignoring the first column and
Chris@1021 202 // counting the next one as primary. (e.g. Sonic Annotator output
Chris@1021 203 // with filename at start of first column.)
Chris@1021 204
Chris@1021 205 int primaryColumnNo = 0;
Chris@1021 206
Chris@1021 207 if (m_columnCount >= 2) {
Chris@1021 208 if ( (m_columnQualities[0] & ColumnNearEmpty) &&
Chris@1021 209 !(m_columnQualities[1] & ColumnNearEmpty)) {
Chris@1021 210 primaryColumnNo = 1;
Chris@1021 211 }
Chris@1021 212 }
Chris@629 213
Chris@629 214 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 215
Chris@629 216 ColumnPurpose purpose = ColumnUnknown;
Chris@1021 217
Chris@1021 218 if (i < primaryColumnNo) {
Chris@1021 219 setColumnPurpose(i, purpose);
Chris@1021 220 continue;
Chris@1021 221 }
Chris@1021 222
Chris@1021 223 bool primary = (i == primaryColumnNo);
Chris@392 224
Chris@629 225 ColumnQualities qualities = m_columnQualities[i];
Chris@392 226
Chris@629 227 bool numeric = (qualities & ColumnNumeric);
Chris@629 228 bool integral = (qualities & ColumnIntegral);
Chris@629 229 bool increasing = (qualities & ColumnIncreasing);
Chris@629 230 bool large = (qualities & ColumnLarge);
Chris@629 231
Chris@629 232 bool timingColumn = (numeric && increasing);
Chris@629 233
Chris@629 234 if (timingColumn) {
Chris@629 235
Chris@629 236 ++timingColumnCount;
Chris@629 237
Chris@629 238 if (primary) {
Chris@629 239
Chris@629 240 purpose = ColumnStartTime;
Chris@629 241
Chris@629 242 m_timingType = ExplicitTiming;
Chris@629 243
Chris@629 244 if (integral && large) {
Chris@629 245 m_timeUnits = TimeAudioFrames;
Chris@629 246 } else {
Chris@629 247 m_timeUnits = TimeSeconds;
Chris@629 248 }
Chris@629 249
Chris@629 250 } else {
Chris@629 251
Chris@629 252 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 253 purpose = ColumnEndTime;
Chris@629 254 }
Chris@629 255 }
Chris@629 256 }
Chris@629 257
Chris@629 258 if (purpose == ColumnUnknown) {
Chris@629 259 if (numeric) {
Chris@629 260 purpose = ColumnValue;
Chris@629 261 } else {
Chris@629 262 purpose = ColumnLabel;
Chris@629 263 }
Chris@629 264 }
Chris@629 265
Chris@631 266 setColumnPurpose(i, purpose);
Chris@629 267 }
Chris@629 268
Chris@629 269 int valueCount = 0;
Chris@629 270 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 271 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 272 }
Chris@629 273
Chris@630 274 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 275 // If we have exactly two apparent value columns and only one
Chris@630 276 // timing column, but one value column is integral and the
Chris@630 277 // other is not, guess that whichever one matches the integral
Chris@630 278 // status of the time column is either duration or end time
Chris@630 279 if (m_timingType == ExplicitTiming) {
Chris@630 280 int a = -1, b = -1;
Chris@630 281 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 282 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 283 if (a == -1) a = i;
Chris@630 284 else b = i;
Chris@630 285 }
Chris@630 286 }
Chris@630 287 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 288 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 289 int timecol = a;
Chris@630 290 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 291 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 292 timecol = b;
Chris@630 293 }
Chris@630 294 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 295 // This shouldn't happen; should have been settled above
Chris@630 296 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 297 } else {
Chris@630 298 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 299 }
Chris@630 300 --valueCount;
Chris@630 301 }
Chris@630 302 }
Chris@630 303 }
Chris@630 304
Chris@631 305 if (timingColumnCount > 1) {
Chris@631 306 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 307 } else {
Chris@631 308 if (valueCount == 0) {
Chris@631 309 m_modelType = OneDimensionalModel;
Chris@631 310 } else if (valueCount == 1) {
Chris@631 311 m_modelType = TwoDimensionalModel;
Chris@631 312 } else {
Chris@631 313 m_modelType = ThreeDimensionalModel;
Chris@631 314 }
Chris@629 315 }
Chris@392 316
Chris@843 317 // cerr << "Estimated column purposes: ";
Chris@676 318 // for (int i = 0; i < m_columnCount; ++i) {
Chris@843 319 // cerr << int(m_columnPurposes[i]) << " ";
Chris@676 320 // }
Chris@843 321 // cerr << endl;
Chris@392 322
Chris@843 323 // cerr << "Estimated model type: " << m_modelType << endl;
Chris@843 324 // cerr << "Estimated timing type: " << m_timingType << endl;
Chris@843 325 // cerr << "Estimated units: " << m_timeUnits << endl;
Chris@392 326 }
Chris@392 327
Chris@631 328 CSVFormat::ColumnPurpose
Chris@631 329 CSVFormat::getColumnPurpose(int i)
Chris@631 330 {
Chris@631 331 while (m_columnPurposes.size() <= i) {
Chris@631 332 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 333 }
Chris@631 334 return m_columnPurposes[i];
Chris@631 335 }
Chris@629 336
Chris@631 337 CSVFormat::ColumnPurpose
Chris@631 338 CSVFormat::getColumnPurpose(int i) const
Chris@631 339 {
Chris@668 340 if (m_columnPurposes.size() <= i) {
Chris@668 341 return ColumnUnknown;
Chris@668 342 }
Chris@631 343 return m_columnPurposes[i];
Chris@631 344 }
Chris@631 345
Chris@631 346 void
Chris@631 347 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 348 {
Chris@631 349 while (m_columnPurposes.size() <= i) {
Chris@631 350 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 351 }
Chris@631 352 m_columnPurposes[i] = p;
Chris@631 353 }
Chris@631 354
Chris@631 355
Chris@631 356
Chris@631 357