annotate data/fileio/CSVFormat.cpp @ 1773:fadd9f8aaa27

This output is too annoying, in the perfectly innocuous case of reading from an aggregate model whose components are different lengths
author Chris Cannam
date Wed, 14 Aug 2019 13:54:23 +0100
parents 9570ef94eaa3
children bde22957545e
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@1362 28 #include "base/Debug.h"
Chris@1362 29
Chris@629 30 CSVFormat::CSVFormat(QString path) :
Chris@629 31 m_separator(""),
Chris@392 32 m_sampleRate(44100),
Chris@392 33 m_windowSize(1024),
Chris@629 34 m_allowQuoting(true)
Chris@392 35 {
Chris@1524 36 (void)guessFormatFor(path);
Chris@629 37 }
Chris@629 38
Chris@1524 39 bool
Chris@629 40 CSVFormat::guessFormatFor(QString path)
Chris@629 41 {
Chris@629 42 m_modelType = TwoDimensionalModel;
Chris@629 43 m_timingType = ExplicitTiming;
Chris@629 44 m_timeUnits = TimeSeconds;
Chris@629 45
Chris@629 46 m_maxExampleCols = 0;
Chris@629 47 m_columnCount = 0;
Chris@629 48 m_variableColumnCount = false;
Chris@629 49
Chris@629 50 m_example.clear();
Chris@629 51 m_columnQualities.clear();
Chris@629 52 m_columnPurposes.clear();
Chris@629 53 m_prevValues.clear();
Chris@629 54
Chris@629 55 QFile file(path);
Chris@1524 56 if (!file.exists()) {
Chris@1524 57 SVCERR << "CSVFormat::guessFormatFor(" << path
Chris@1524 58 << "): File does not exist" << endl;
Chris@1524 59 return false;
Chris@1524 60 }
Chris@1524 61 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
Chris@1524 62 SVCERR << "CSVFormat::guessFormatFor(" << path
Chris@1524 63 << "): File could not be opened for reading" << endl;
Chris@1524 64 return false;
Chris@1524 65 }
Chris@1524 66 SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl;
Chris@392 67
Chris@392 68 QTextStream in(&file);
Chris@392 69 in.seek(0);
Chris@392 70
Chris@629 71 int lineno = 0;
Chris@392 72
Chris@392 73 while (!in.atEnd()) {
Chris@392 74
Chris@392 75 // See comment about line endings in CSVFileReader::load()
Chris@392 76
Chris@392 77 QString chunk = in.readLine();
Chris@392 78 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 79
Chris@897 80 for (int li = 0; li < lines.size(); ++li) {
Chris@392 81
Chris@392 82 QString line = lines[li];
Chris@1512 83 if (line.startsWith("#") || line == "") {
Chris@1512 84 continue;
Chris@1512 85 }
Chris@392 86
Chris@629 87 guessQualities(line, lineno);
Chris@392 88
Chris@840 89 ++lineno;
Chris@629 90 }
Chris@840 91
Chris@1512 92 if (lineno >= 150) break;
Chris@629 93 }
Chris@392 94
Chris@629 95 guessPurposes();
Chris@1515 96 guessAudioSampleRange();
Chris@1524 97
Chris@1524 98 return true;
Chris@629 99 }
Chris@629 100
Chris@629 101 void
Chris@629 102 CSVFormat::guessSeparator(QString line)
Chris@629 103 {
Chris@1524 104 QString candidates = "\t|,/: ";
Chris@1524 105
Chris@1524 106 for (int i = 0; i < candidates.length(); ++i) {
Chris@1524 107 auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
Chris@1524 108 if (bits.size() >= 2) {
Chris@1585 109 m_plausibleSeparators.insert(candidates[i]);
Chris@1585 110 if (m_separator == "") {
Chris@1585 111 m_separator = candidates[i];
Chris@1585 112 SVDEBUG << "Estimated column separator: '" << m_separator
Chris@1585 113 << "'" << endl;
Chris@1524 114 }
Chris@629 115 }
Chris@629 116 }
Chris@629 117 }
Chris@629 118
Chris@629 119 void
Chris@629 120 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 121 {
Chris@1585 122 guessSeparator(line);
Chris@629 123
Chris@1362 124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
Chris@629 125
Chris@629 126 int cols = list.size();
Chris@991 127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 128 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 129
Chris@629 130 // All columns are regarded as having these qualities until we see
Chris@629 131 // something that indicates otherwise:
Chris@629 132
Chris@629 133 ColumnQualities defaultQualities =
Chris@1512 134 ColumnNumeric | ColumnIntegral | ColumnSmall |
Chris@1512 135 ColumnIncreasing | ColumnNearEmpty;
Chris@629 136
Chris@629 137 for (int i = 0; i < cols; ++i) {
Chris@1429 138
Chris@629 139 while (m_columnQualities.size() <= i) {
Chris@629 140 m_columnQualities.push_back(defaultQualities);
Chris@629 141 m_prevValues.push_back(0.f);
Chris@629 142 }
Chris@629 143
Chris@629 144 QString s(list[i]);
Chris@629 145 bool ok = false;
Chris@629 146
Chris@629 147 ColumnQualities qualities = m_columnQualities[i];
Chris@629 148
Chris@1523 149 // Looks like this is defined on Windows
Chris@1523 150 #undef small
Chris@1523 151
Chris@629 152 bool numeric = (qualities & ColumnNumeric);
Chris@629 153 bool integral = (qualities & ColumnIntegral);
Chris@629 154 bool increasing = (qualities & ColumnIncreasing);
Chris@1512 155 bool small = (qualities & ColumnSmall);
Chris@629 156 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@1512 157 bool signd = (qualities & ColumnSigned); // also defaults to off
Chris@1021 158 bool emptyish = (qualities & ColumnNearEmpty);
Chris@629 159
Chris@1021 160 if (lineno > 1 && s.trimmed() != "") {
Chris@1021 161 emptyish = false;
Chris@1021 162 }
Chris@1021 163
Chris@629 164 float value = 0.f;
Chris@629 165
Chris@629 166 //!!! how to take into account headers?
Chris@629 167
Chris@629 168 if (numeric) {
Chris@629 169 value = s.toFloat(&ok);
Chris@629 170 if (!ok) {
Chris@629 171 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 172 }
Chris@629 173 if (ok) {
Chris@1512 174 if (lineno < 2 && value > 1000.f) {
Chris@1512 175 large = true;
Chris@1512 176 }
Chris@1512 177 if (value < 0.f) {
Chris@1512 178 signd = true;
Chris@1512 179 }
Chris@1512 180 if (value < -1.f || value > 1.f) {
Chris@1512 181 small = false;
Chris@1512 182 }
Chris@629 183 } else {
Chris@629 184 numeric = false;
Chris@1524 185
Chris@1524 186 // If the column is not numeric, it can't be any of
Chris@1524 187 // these things either
Chris@1524 188 integral = false;
Chris@1524 189 increasing = false;
Chris@1524 190 small = false;
Chris@1524 191 large = false;
Chris@1524 192 signd = false;
Chris@629 193 }
Chris@629 194 }
Chris@629 195
Chris@629 196 if (numeric) {
Chris@629 197
Chris@629 198 if (integral) {
Chris@629 199 if (s.contains('.') || s.contains(',')) {
Chris@629 200 integral = false;
Chris@392 201 }
Chris@392 202 }
Chris@392 203
Chris@629 204 if (increasing) {
Chris@629 205 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 206 increasing = false;
Chris@392 207 }
Chris@392 208 }
Chris@392 209
Chris@629 210 m_prevValues[i] = value;
Chris@629 211 }
Chris@1524 212
Chris@629 213 m_columnQualities[i] =
Chris@629 214 (numeric ? ColumnNumeric : 0) |
Chris@629 215 (integral ? ColumnIntegral : 0) |
Chris@629 216 (increasing ? ColumnIncreasing : 0) |
Chris@1512 217 (small ? ColumnSmall : 0) |
Chris@1021 218 (large ? ColumnLarge : 0) |
Chris@1512 219 (signd ? ColumnSigned : 0) |
Chris@1021 220 (emptyish ? ColumnNearEmpty : 0);
Chris@629 221 }
Chris@392 222
Chris@629 223 if (lineno < 10) {
Chris@629 224 m_example.push_back(list);
Chris@629 225 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 226 m_maxExampleCols = cols;
Chris@392 227 }
Chris@392 228 }
Chris@392 229
Chris@1362 230 if (lineno < 10) {
Chris@1362 231 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
Chris@1362 232 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 233 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1362 234 }
Chris@1362 235 SVDEBUG << endl;
Chris@1362 236 }
Chris@629 237 }
Chris@629 238
Chris@629 239 void
Chris@629 240 CSVFormat::guessPurposes()
Chris@629 241 {
Chris@629 242 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 243 m_timeUnits = CSVFormat::TimeWindows;
Chris@1429 244
Chris@629 245 int timingColumnCount = 0;
Chris@1525 246 bool haveDurationOrEndTime = false;
Chris@1021 247
Chris@1510 248 SVDEBUG << "Estimated column qualities overall: ";
Chris@1510 249 for (int i = 0; i < m_columnCount; ++i) {
Chris@1510 250 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1510 251 }
Chris@1510 252 SVDEBUG << endl;
Chris@1510 253
Chris@1021 254 // if our first column has zero or one entries in it and the rest
Chris@1021 255 // have more, then we'll default to ignoring the first column and
Chris@1021 256 // counting the next one as primary. (e.g. Sonic Annotator output
Chris@1021 257 // with filename at start of first column.)
Chris@1021 258
Chris@1021 259 int primaryColumnNo = 0;
Chris@1021 260
Chris@1021 261 if (m_columnCount >= 2) {
Chris@1021 262 if ( (m_columnQualities[0] & ColumnNearEmpty) &&
Chris@1021 263 !(m_columnQualities[1] & ColumnNearEmpty)) {
Chris@1021 264 primaryColumnNo = 1;
Chris@1021 265 }
Chris@1021 266 }
Chris@629 267
Chris@629 268 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 269
Chris@629 270 ColumnPurpose purpose = ColumnUnknown;
Chris@1021 271
Chris@1021 272 if (i < primaryColumnNo) {
Chris@1021 273 setColumnPurpose(i, purpose);
Chris@1021 274 continue;
Chris@1021 275 }
Chris@1021 276
Chris@1021 277 bool primary = (i == primaryColumnNo);
Chris@392 278
Chris@629 279 ColumnQualities qualities = m_columnQualities[i];
Chris@392 280
Chris@629 281 bool numeric = (qualities & ColumnNumeric);
Chris@629 282 bool integral = (qualities & ColumnIntegral);
Chris@629 283 bool increasing = (qualities & ColumnIncreasing);
Chris@629 284 bool large = (qualities & ColumnLarge);
Chris@629 285
Chris@629 286 bool timingColumn = (numeric && increasing);
Chris@629 287
Chris@629 288 if (timingColumn) {
Chris@629 289
Chris@629 290 ++timingColumnCount;
Chris@629 291
Chris@629 292 if (primary) {
Chris@629 293
Chris@629 294 purpose = ColumnStartTime;
Chris@629 295
Chris@629 296 m_timingType = ExplicitTiming;
Chris@629 297
Chris@629 298 if (integral && large) {
Chris@629 299 m_timeUnits = TimeAudioFrames;
Chris@629 300 } else {
Chris@629 301 m_timeUnits = TimeSeconds;
Chris@629 302 }
Chris@629 303
Chris@629 304 } else {
Chris@629 305
Chris@629 306 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 307 purpose = ColumnEndTime;
Chris@1525 308 haveDurationOrEndTime = true;
Chris@629 309 }
Chris@629 310 }
Chris@629 311 }
Chris@629 312
Chris@629 313 if (purpose == ColumnUnknown) {
Chris@629 314 if (numeric) {
Chris@629 315 purpose = ColumnValue;
Chris@629 316 } else {
Chris@629 317 purpose = ColumnLabel;
Chris@629 318 }
Chris@629 319 }
Chris@629 320
Chris@631 321 setColumnPurpose(i, purpose);
Chris@629 322 }
Chris@629 323
Chris@629 324 int valueCount = 0;
Chris@629 325 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 326 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 327 }
Chris@629 328
Chris@630 329 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 330 // If we have exactly two apparent value columns and only one
Chris@630 331 // timing column, but one value column is integral and the
Chris@630 332 // other is not, guess that whichever one matches the integral
Chris@630 333 // status of the time column is either duration or end time
Chris@630 334 if (m_timingType == ExplicitTiming) {
Chris@630 335 int a = -1, b = -1;
Chris@630 336 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 337 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 338 if (a == -1) a = i;
Chris@630 339 else b = i;
Chris@630 340 }
Chris@630 341 }
Chris@630 342 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 343 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 344 int timecol = a;
Chris@630 345 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 346 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 347 timecol = b;
Chris@630 348 }
Chris@630 349 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 350 // This shouldn't happen; should have been settled above
Chris@630 351 m_columnPurposes[timecol] = ColumnEndTime;
Chris@1525 352 haveDurationOrEndTime = true;
Chris@630 353 } else {
Chris@630 354 m_columnPurposes[timecol] = ColumnDuration;
Chris@1525 355 haveDurationOrEndTime = true;
Chris@630 356 }
Chris@630 357 --valueCount;
Chris@630 358 }
Chris@630 359 }
Chris@630 360 }
Chris@630 361
Chris@1525 362 if (timingColumnCount > 1 || haveDurationOrEndTime) {
Chris@631 363 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 364 } else {
Chris@631 365 if (valueCount == 0) {
Chris@631 366 m_modelType = OneDimensionalModel;
Chris@631 367 } else if (valueCount == 1) {
Chris@631 368 m_modelType = TwoDimensionalModel;
Chris@631 369 } else {
Chris@631 370 m_modelType = ThreeDimensionalModel;
Chris@631 371 }
Chris@629 372 }
Chris@392 373
Chris@1362 374 SVDEBUG << "Estimated column purposes: ";
Chris@1362 375 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 376 SVDEBUG << int(m_columnPurposes[i]) << " ";
Chris@1362 377 }
Chris@1362 378 SVDEBUG << endl;
Chris@392 379
Chris@1362 380 SVDEBUG << "Estimated model type: " << m_modelType << endl;
Chris@1362 381 SVDEBUG << "Estimated timing type: " << m_timingType << endl;
Chris@1362 382 SVDEBUG << "Estimated units: " << m_timeUnits << endl;
Chris@392 383 }
Chris@392 384
Chris@1515 385 void
Chris@1515 386 CSVFormat::guessAudioSampleRange()
Chris@1515 387 {
Chris@1515 388 AudioSampleRange range = SampleRangeSigned1;
Chris@1515 389
Chris@1515 390 range = SampleRangeSigned1;
Chris@1515 391 bool knownSigned = false;
Chris@1515 392 bool knownNonIntegral = false;
Chris@1521 393
Chris@1521 394 SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of "
Chris@1521 395 << range << endl;
Chris@1515 396
Chris@1515 397 for (int i = 0; i < m_columnCount; ++i) {
Chris@1521 398 if (m_columnPurposes[i] != ColumnValue) {
Chris@1521 399 SVDEBUG << "... column " << i
Chris@1521 400 << " is not apparently a value, ignoring" << endl;
Chris@1521 401 continue;
Chris@1521 402 }
Chris@1515 403 if (!(m_columnQualities[i] & ColumnIntegral)) {
Chris@1515 404 knownNonIntegral = true;
Chris@1515 405 if (range == SampleRangeUnsigned255 ||
Chris@1515 406 range == SampleRangeSigned32767) {
Chris@1515 407 range = SampleRangeOther;
Chris@1515 408 }
Chris@1521 409 SVDEBUG << "... column " << i
Chris@1521 410 << " is non-integral, updating range to " << range << endl;
Chris@1515 411 }
Chris@1515 412 if (m_columnQualities[i] & ColumnLarge) {
Chris@1515 413 if (range == SampleRangeSigned1 ||
Chris@1515 414 range == SampleRangeUnsigned255) {
Chris@1515 415 if (knownNonIntegral) {
Chris@1515 416 range = SampleRangeOther;
Chris@1515 417 } else {
Chris@1515 418 range = SampleRangeSigned32767;
Chris@1515 419 }
Chris@1515 420 }
Chris@1521 421 SVDEBUG << "... column " << i << " is large, updating range to "
Chris@1521 422 << range << endl;
Chris@1515 423 }
Chris@1515 424 if (m_columnQualities[i] & ColumnSigned) {
Chris@1515 425 knownSigned = true;
Chris@1515 426 if (range == SampleRangeUnsigned255) {
Chris@1515 427 range = SampleRangeSigned32767;
Chris@1515 428 }
Chris@1521 429 SVDEBUG << "... column " << i << " is signed, updating range to "
Chris@1521 430 << range << endl;
Chris@1515 431 }
Chris@1515 432 if (!(m_columnQualities[i] & ColumnSmall)) {
Chris@1515 433 if (range == SampleRangeSigned1) {
Chris@1515 434 if (knownNonIntegral) {
Chris@1515 435 range = SampleRangeOther;
Chris@1515 436 } else if (knownSigned) {
Chris@1515 437 range = SampleRangeSigned32767;
Chris@1515 438 } else {
Chris@1515 439 range = SampleRangeUnsigned255;
Chris@1515 440 }
Chris@1515 441 }
Chris@1521 442 SVDEBUG << "... column " << i << " is not small, updating range to "
Chris@1521 443 << range << endl;
Chris@1515 444 }
Chris@1515 445 }
Chris@1515 446
Chris@1521 447 SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range "
Chris@1521 448 << range << endl;
Chris@1521 449
Chris@1515 450 m_audioSampleRange = range;
Chris@1515 451 }
Chris@1515 452
Chris@631 453 CSVFormat::ColumnPurpose
Chris@631 454 CSVFormat::getColumnPurpose(int i)
Chris@631 455 {
Chris@631 456 while (m_columnPurposes.size() <= i) {
Chris@631 457 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 458 }
Chris@631 459 return m_columnPurposes[i];
Chris@631 460 }
Chris@629 461
Chris@631 462 CSVFormat::ColumnPurpose
Chris@631 463 CSVFormat::getColumnPurpose(int i) const
Chris@631 464 {
Chris@668 465 if (m_columnPurposes.size() <= i) {
Chris@668 466 return ColumnUnknown;
Chris@668 467 }
Chris@631 468 return m_columnPurposes[i];
Chris@631 469 }
Chris@631 470
Chris@631 471 void
Chris@631 472 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 473 {
Chris@631 474 while (m_columnPurposes.size() <= i) {
Chris@631 475 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 476 }
Chris@631 477 m_columnPurposes[i] = p;
Chris@631 478 }
Chris@631 479
Chris@631 480
Chris@631 481
Chris@631 482