annotate data/fileio/CSVFormat.cpp @ 1855:db489a1ece9b

Pull out text-document check; it's useful elsewhere
author Chris Cannam
date Mon, 11 May 2020 17:27:18 +0100
parents bde22957545e
children 1b8c4ee06f6d
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@1362 28 #include "base/Debug.h"
Chris@1362 29
Chris@629 30 CSVFormat::CSVFormat(QString path) :
Chris@629 31 m_separator(""),
Chris@392 32 m_sampleRate(44100),
Chris@392 33 m_windowSize(1024),
Chris@629 34 m_allowQuoting(true)
Chris@392 35 {
Chris@1524 36 (void)guessFormatFor(path);
Chris@629 37 }
Chris@629 38
Chris@1524 39 bool
Chris@629 40 CSVFormat::guessFormatFor(QString path)
Chris@629 41 {
Chris@629 42 m_modelType = TwoDimensionalModel;
Chris@629 43 m_timingType = ExplicitTiming;
Chris@629 44 m_timeUnits = TimeSeconds;
Chris@629 45
Chris@629 46 m_maxExampleCols = 0;
Chris@629 47 m_columnCount = 0;
Chris@629 48 m_variableColumnCount = false;
Chris@629 49
Chris@629 50 m_example.clear();
Chris@629 51 m_columnQualities.clear();
Chris@629 52 m_columnPurposes.clear();
Chris@629 53 m_prevValues.clear();
Chris@629 54
Chris@629 55 QFile file(path);
Chris@1524 56 if (!file.exists()) {
Chris@1524 57 SVCERR << "CSVFormat::guessFormatFor(" << path
Chris@1524 58 << "): File does not exist" << endl;
Chris@1524 59 return false;
Chris@1524 60 }
Chris@1524 61 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
Chris@1524 62 SVCERR << "CSVFormat::guessFormatFor(" << path
Chris@1524 63 << "): File could not be opened for reading" << endl;
Chris@1524 64 return false;
Chris@1524 65 }
Chris@1524 66 SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl;
Chris@392 67
Chris@392 68 QTextStream in(&file);
Chris@392 69 in.seek(0);
Chris@392 70
Chris@629 71 int lineno = 0;
Chris@392 72
Chris@392 73 while (!in.atEnd()) {
Chris@392 74
Chris@392 75 // See comment about line endings in CSVFileReader::load()
Chris@392 76
Chris@392 77 QString chunk = in.readLine();
Chris@392 78 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 79
Chris@897 80 for (int li = 0; li < lines.size(); ++li) {
Chris@392 81
Chris@392 82 QString line = lines[li];
Chris@1512 83 if (line.startsWith("#") || line == "") {
Chris@1512 84 continue;
Chris@1512 85 }
Chris@392 86
Chris@629 87 guessQualities(line, lineno);
Chris@392 88
Chris@840 89 ++lineno;
Chris@629 90 }
Chris@840 91
Chris@1512 92 if (lineno >= 150) break;
Chris@629 93 }
Chris@392 94
Chris@629 95 guessPurposes();
Chris@1515 96 guessAudioSampleRange();
Chris@1524 97
Chris@1524 98 return true;
Chris@629 99 }
Chris@629 100
Chris@629 101 void
Chris@629 102 CSVFormat::guessSeparator(QString line)
Chris@629 103 {
Chris@1524 104 QString candidates = "\t|,/: ";
Chris@1524 105
Chris@1524 106 for (int i = 0; i < candidates.length(); ++i) {
Chris@1524 107 auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
Chris@1524 108 if (bits.size() >= 2) {
Chris@1585 109 m_plausibleSeparators.insert(candidates[i]);
Chris@1585 110 if (m_separator == "") {
Chris@1585 111 m_separator = candidates[i];
Chris@1585 112 SVDEBUG << "Estimated column separator: '" << m_separator
Chris@1585 113 << "'" << endl;
Chris@1524 114 }
Chris@629 115 }
Chris@629 116 }
Chris@629 117 }
Chris@629 118
Chris@629 119 void
Chris@629 120 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 121 {
Chris@1585 122 guessSeparator(line);
Chris@629 123
Chris@1362 124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
Chris@629 125
Chris@629 126 int cols = list.size();
Chris@991 127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 128 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 129
Chris@629 130 // All columns are regarded as having these qualities until we see
Chris@629 131 // something that indicates otherwise:
Chris@629 132
Chris@629 133 ColumnQualities defaultQualities =
Chris@1512 134 ColumnNumeric | ColumnIntegral | ColumnSmall |
Chris@1512 135 ColumnIncreasing | ColumnNearEmpty;
Chris@629 136
Chris@629 137 for (int i = 0; i < cols; ++i) {
Chris@1854 138
Chris@1854 139 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
Chris@1854 140
Chris@629 141 while (m_columnQualities.size() <= i) {
Chris@629 142 m_columnQualities.push_back(defaultQualities);
Chris@629 143 m_prevValues.push_back(0.f);
Chris@629 144 }
Chris@629 145
Chris@629 146 QString s(list[i]);
Chris@629 147 bool ok = false;
Chris@629 148
Chris@629 149 ColumnQualities qualities = m_columnQualities[i];
Chris@629 150
Chris@1523 151 // Looks like this is defined on Windows
Chris@1523 152 #undef small
Chris@1523 153
Chris@629 154 bool numeric = (qualities & ColumnNumeric);
Chris@629 155 bool integral = (qualities & ColumnIntegral);
Chris@629 156 bool increasing = (qualities & ColumnIncreasing);
Chris@1512 157 bool small = (qualities & ColumnSmall);
Chris@629 158 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@1512 159 bool signd = (qualities & ColumnSigned); // also defaults to off
Chris@1021 160 bool emptyish = (qualities & ColumnNearEmpty);
Chris@629 161
Chris@1854 162 if (s.trimmed() != "") {
Chris@1021 163
Chris@1854 164 if (lineno > 1) {
Chris@1854 165 emptyish = false;
Chris@1854 166 }
Chris@1854 167
Chris@1854 168 float value = 0.f;
Chris@629 169
Chris@1854 170 //!!! how to take into account headers?
Chris@629 171
Chris@1854 172 if (numeric) {
Chris@1854 173 value = s.toFloat(&ok);
Chris@1854 174 if (!ok) {
Chris@1854 175 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@1512 176 }
Chris@1854 177 if (ok) {
Chris@1854 178 if (lineno < 2 && value > 1000.f) {
Chris@1854 179 large = true;
Chris@1854 180 }
Chris@1854 181 if (value < 0.f) {
Chris@1854 182 signd = true;
Chris@1854 183 }
Chris@1854 184 if (value < -1.f || value > 1.f) {
Chris@1854 185 small = false;
Chris@1854 186 }
Chris@1854 187 } else {
Chris@1854 188 numeric = false;
Chris@1854 189
Chris@1854 190 // If the column is not numeric, it can't be any of
Chris@1854 191 // these things either
Chris@1854 192 integral = false;
Chris@1854 193 increasing = false;
Chris@1512 194 small = false;
Chris@1854 195 large = false;
Chris@1854 196 signd = false;
Chris@392 197 }
Chris@392 198 }
Chris@392 199
Chris@1854 200 if (numeric) {
Chris@1854 201
Chris@1854 202 if (integral) {
Chris@1854 203 if (s.contains('.') || s.contains(',')) {
Chris@1854 204 integral = false;
Chris@1854 205 }
Chris@392 206 }
Chris@1854 207
Chris@1854 208 if (increasing) {
Chris@1854 209 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@1854 210 increasing = false;
Chris@1854 211 }
Chris@1854 212 }
Chris@1854 213
Chris@1854 214 m_prevValues[i] = value;
Chris@392 215 }
Chris@629 216 }
Chris@1524 217
Chris@629 218 m_columnQualities[i] =
Chris@629 219 (numeric ? ColumnNumeric : 0) |
Chris@629 220 (integral ? ColumnIntegral : 0) |
Chris@629 221 (increasing ? ColumnIncreasing : 0) |
Chris@1512 222 (small ? ColumnSmall : 0) |
Chris@1021 223 (large ? ColumnLarge : 0) |
Chris@1512 224 (signd ? ColumnSigned : 0) |
Chris@1021 225 (emptyish ? ColumnNearEmpty : 0);
Chris@629 226 }
Chris@392 227
Chris@629 228 if (lineno < 10) {
Chris@629 229 m_example.push_back(list);
Chris@629 230 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 231 m_maxExampleCols = cols;
Chris@392 232 }
Chris@392 233 }
Chris@392 234
Chris@1362 235 if (lineno < 10) {
Chris@1362 236 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
Chris@1362 237 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 238 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1362 239 }
Chris@1362 240 SVDEBUG << endl;
Chris@1362 241 }
Chris@629 242 }
Chris@629 243
Chris@629 244 void
Chris@629 245 CSVFormat::guessPurposes()
Chris@629 246 {
Chris@629 247 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 248 m_timeUnits = CSVFormat::TimeWindows;
Chris@1429 249
Chris@629 250 int timingColumnCount = 0;
Chris@1525 251 bool haveDurationOrEndTime = false;
Chris@1021 252
Chris@1510 253 SVDEBUG << "Estimated column qualities overall: ";
Chris@1510 254 for (int i = 0; i < m_columnCount; ++i) {
Chris@1510 255 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1510 256 }
Chris@1510 257 SVDEBUG << endl;
Chris@1510 258
Chris@1021 259 // if our first column has zero or one entries in it and the rest
Chris@1021 260 // have more, then we'll default to ignoring the first column and
Chris@1021 261 // counting the next one as primary. (e.g. Sonic Annotator output
Chris@1021 262 // with filename at start of first column.)
Chris@1021 263
Chris@1021 264 int primaryColumnNo = 0;
Chris@1021 265
Chris@1021 266 if (m_columnCount >= 2) {
Chris@1021 267 if ( (m_columnQualities[0] & ColumnNearEmpty) &&
Chris@1021 268 !(m_columnQualities[1] & ColumnNearEmpty)) {
Chris@1021 269 primaryColumnNo = 1;
Chris@1021 270 }
Chris@1021 271 }
Chris@629 272
Chris@629 273 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 274
Chris@629 275 ColumnPurpose purpose = ColumnUnknown;
Chris@1021 276
Chris@1021 277 if (i < primaryColumnNo) {
Chris@1021 278 setColumnPurpose(i, purpose);
Chris@1021 279 continue;
Chris@1021 280 }
Chris@1021 281
Chris@1021 282 bool primary = (i == primaryColumnNo);
Chris@392 283
Chris@629 284 ColumnQualities qualities = m_columnQualities[i];
Chris@392 285
Chris@629 286 bool numeric = (qualities & ColumnNumeric);
Chris@629 287 bool integral = (qualities & ColumnIntegral);
Chris@629 288 bool increasing = (qualities & ColumnIncreasing);
Chris@629 289 bool large = (qualities & ColumnLarge);
Chris@629 290
Chris@629 291 bool timingColumn = (numeric && increasing);
Chris@629 292
Chris@629 293 if (timingColumn) {
Chris@629 294
Chris@629 295 ++timingColumnCount;
Chris@629 296
Chris@629 297 if (primary) {
Chris@629 298
Chris@629 299 purpose = ColumnStartTime;
Chris@629 300
Chris@629 301 m_timingType = ExplicitTiming;
Chris@629 302
Chris@629 303 if (integral && large) {
Chris@629 304 m_timeUnits = TimeAudioFrames;
Chris@629 305 } else {
Chris@629 306 m_timeUnits = TimeSeconds;
Chris@629 307 }
Chris@629 308
Chris@629 309 } else {
Chris@629 310
Chris@629 311 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 312 purpose = ColumnEndTime;
Chris@1525 313 haveDurationOrEndTime = true;
Chris@629 314 }
Chris@629 315 }
Chris@629 316 }
Chris@629 317
Chris@629 318 if (purpose == ColumnUnknown) {
Chris@629 319 if (numeric) {
Chris@629 320 purpose = ColumnValue;
Chris@629 321 } else {
Chris@629 322 purpose = ColumnLabel;
Chris@629 323 }
Chris@629 324 }
Chris@629 325
Chris@631 326 setColumnPurpose(i, purpose);
Chris@629 327 }
Chris@629 328
Chris@629 329 int valueCount = 0;
Chris@629 330 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 331 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 332 }
Chris@629 333
Chris@630 334 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 335 // If we have exactly two apparent value columns and only one
Chris@630 336 // timing column, but one value column is integral and the
Chris@630 337 // other is not, guess that whichever one matches the integral
Chris@630 338 // status of the time column is either duration or end time
Chris@630 339 if (m_timingType == ExplicitTiming) {
Chris@630 340 int a = -1, b = -1;
Chris@630 341 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 342 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 343 if (a == -1) a = i;
Chris@630 344 else b = i;
Chris@630 345 }
Chris@630 346 }
Chris@630 347 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 348 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 349 int timecol = a;
Chris@630 350 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 351 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 352 timecol = b;
Chris@630 353 }
Chris@630 354 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 355 // This shouldn't happen; should have been settled above
Chris@630 356 m_columnPurposes[timecol] = ColumnEndTime;
Chris@1525 357 haveDurationOrEndTime = true;
Chris@630 358 } else {
Chris@630 359 m_columnPurposes[timecol] = ColumnDuration;
Chris@1525 360 haveDurationOrEndTime = true;
Chris@630 361 }
Chris@630 362 --valueCount;
Chris@630 363 }
Chris@630 364 }
Chris@630 365 }
Chris@630 366
Chris@1525 367 if (timingColumnCount > 1 || haveDurationOrEndTime) {
Chris@631 368 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 369 } else {
Chris@631 370 if (valueCount == 0) {
Chris@631 371 m_modelType = OneDimensionalModel;
Chris@631 372 } else if (valueCount == 1) {
Chris@631 373 m_modelType = TwoDimensionalModel;
Chris@631 374 } else {
Chris@631 375 m_modelType = ThreeDimensionalModel;
Chris@631 376 }
Chris@629 377 }
Chris@392 378
Chris@1362 379 SVDEBUG << "Estimated column purposes: ";
Chris@1362 380 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 381 SVDEBUG << int(m_columnPurposes[i]) << " ";
Chris@1362 382 }
Chris@1362 383 SVDEBUG << endl;
Chris@392 384
Chris@1362 385 SVDEBUG << "Estimated model type: " << m_modelType << endl;
Chris@1362 386 SVDEBUG << "Estimated timing type: " << m_timingType << endl;
Chris@1362 387 SVDEBUG << "Estimated units: " << m_timeUnits << endl;
Chris@392 388 }
Chris@392 389
Chris@1515 390 void
Chris@1515 391 CSVFormat::guessAudioSampleRange()
Chris@1515 392 {
Chris@1515 393 AudioSampleRange range = SampleRangeSigned1;
Chris@1515 394
Chris@1515 395 range = SampleRangeSigned1;
Chris@1515 396 bool knownSigned = false;
Chris@1515 397 bool knownNonIntegral = false;
Chris@1521 398
Chris@1521 399 SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of "
Chris@1521 400 << range << endl;
Chris@1515 401
Chris@1515 402 for (int i = 0; i < m_columnCount; ++i) {
Chris@1521 403 if (m_columnPurposes[i] != ColumnValue) {
Chris@1521 404 SVDEBUG << "... column " << i
Chris@1521 405 << " is not apparently a value, ignoring" << endl;
Chris@1521 406 continue;
Chris@1521 407 }
Chris@1515 408 if (!(m_columnQualities[i] & ColumnIntegral)) {
Chris@1515 409 knownNonIntegral = true;
Chris@1515 410 if (range == SampleRangeUnsigned255 ||
Chris@1515 411 range == SampleRangeSigned32767) {
Chris@1515 412 range = SampleRangeOther;
Chris@1515 413 }
Chris@1521 414 SVDEBUG << "... column " << i
Chris@1521 415 << " is non-integral, updating range to " << range << endl;
Chris@1515 416 }
Chris@1515 417 if (m_columnQualities[i] & ColumnLarge) {
Chris@1515 418 if (range == SampleRangeSigned1 ||
Chris@1515 419 range == SampleRangeUnsigned255) {
Chris@1515 420 if (knownNonIntegral) {
Chris@1515 421 range = SampleRangeOther;
Chris@1515 422 } else {
Chris@1515 423 range = SampleRangeSigned32767;
Chris@1515 424 }
Chris@1515 425 }
Chris@1521 426 SVDEBUG << "... column " << i << " is large, updating range to "
Chris@1521 427 << range << endl;
Chris@1515 428 }
Chris@1515 429 if (m_columnQualities[i] & ColumnSigned) {
Chris@1515 430 knownSigned = true;
Chris@1515 431 if (range == SampleRangeUnsigned255) {
Chris@1515 432 range = SampleRangeSigned32767;
Chris@1515 433 }
Chris@1521 434 SVDEBUG << "... column " << i << " is signed, updating range to "
Chris@1521 435 << range << endl;
Chris@1515 436 }
Chris@1515 437 if (!(m_columnQualities[i] & ColumnSmall)) {
Chris@1515 438 if (range == SampleRangeSigned1) {
Chris@1515 439 if (knownNonIntegral) {
Chris@1515 440 range = SampleRangeOther;
Chris@1515 441 } else if (knownSigned) {
Chris@1515 442 range = SampleRangeSigned32767;
Chris@1515 443 } else {
Chris@1515 444 range = SampleRangeUnsigned255;
Chris@1515 445 }
Chris@1515 446 }
Chris@1521 447 SVDEBUG << "... column " << i << " is not small, updating range to "
Chris@1521 448 << range << endl;
Chris@1515 449 }
Chris@1515 450 }
Chris@1515 451
Chris@1521 452 SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range "
Chris@1521 453 << range << endl;
Chris@1521 454
Chris@1515 455 m_audioSampleRange = range;
Chris@1515 456 }
Chris@1515 457
Chris@631 458 CSVFormat::ColumnPurpose
Chris@631 459 CSVFormat::getColumnPurpose(int i)
Chris@631 460 {
Chris@631 461 while (m_columnPurposes.size() <= i) {
Chris@631 462 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 463 }
Chris@631 464 return m_columnPurposes[i];
Chris@631 465 }
Chris@629 466
Chris@631 467 CSVFormat::ColumnPurpose
Chris@631 468 CSVFormat::getColumnPurpose(int i) const
Chris@631 469 {
Chris@668 470 if (m_columnPurposes.size() <= i) {
Chris@668 471 return ColumnUnknown;
Chris@668 472 }
Chris@631 473 return m_columnPurposes[i];
Chris@631 474 }
Chris@631 475
Chris@631 476 void
Chris@631 477 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 478 {
Chris@631 479 while (m_columnPurposes.size() <= i) {
Chris@631 480 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 481 }
Chris@631 482 m_columnPurposes[i] = p;
Chris@631 483 }
Chris@631 484
Chris@631 485
Chris@631 486
Chris@631 487