annotate data/fileio/CSVFormat.cpp @ 1523:c1b2eab6ac51

Win32 fix
author Chris Cannam
date Wed, 12 Sep 2018 18:49:32 +0100
parents 2d291eac9f21
children 64ef24ebb19c
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@1362 28 #include "base/Debug.h"
Chris@1362 29
Chris@629 30 CSVFormat::CSVFormat(QString path) :
Chris@629 31 m_separator(""),
Chris@392 32 m_sampleRate(44100),
Chris@392 33 m_windowSize(1024),
Chris@629 34 m_allowQuoting(true)
Chris@392 35 {
Chris@629 36 guessFormatFor(path);
Chris@629 37 }
Chris@629 38
Chris@629 39 void
Chris@629 40 CSVFormat::guessFormatFor(QString path)
Chris@629 41 {
Chris@629 42 m_modelType = TwoDimensionalModel;
Chris@629 43 m_timingType = ExplicitTiming;
Chris@629 44 m_timeUnits = TimeSeconds;
Chris@629 45
Chris@629 46 m_maxExampleCols = 0;
Chris@629 47 m_columnCount = 0;
Chris@629 48 m_variableColumnCount = false;
Chris@629 49
Chris@629 50 m_example.clear();
Chris@629 51 m_columnQualities.clear();
Chris@629 52 m_columnPurposes.clear();
Chris@629 53 m_prevValues.clear();
Chris@629 54
Chris@629 55 QFile file(path);
Chris@392 56 if (!file.exists()) return;
Chris@392 57 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 58
Chris@392 59 QTextStream in(&file);
Chris@392 60 in.seek(0);
Chris@392 61
Chris@629 62 int lineno = 0;
Chris@392 63
Chris@392 64 while (!in.atEnd()) {
Chris@392 65
Chris@392 66 // See comment about line endings in CSVFileReader::load()
Chris@392 67
Chris@392 68 QString chunk = in.readLine();
Chris@392 69 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 70
Chris@897 71 for (int li = 0; li < lines.size(); ++li) {
Chris@392 72
Chris@392 73 QString line = lines[li];
Chris@1512 74 if (line.startsWith("#") || line == "") {
Chris@1512 75 continue;
Chris@1512 76 }
Chris@392 77
Chris@629 78 guessQualities(line, lineno);
Chris@392 79
Chris@840 80 ++lineno;
Chris@629 81 }
Chris@840 82
Chris@1512 83 if (lineno >= 150) break;
Chris@629 84 }
Chris@392 85
Chris@629 86 guessPurposes();
Chris@1515 87 guessAudioSampleRange();
Chris@629 88 }
Chris@629 89
Chris@629 90 void
Chris@629 91 CSVFormat::guessSeparator(QString line)
Chris@629 92 {
Chris@629 93 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@897 94 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
Chris@629 95 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 96 m_separator = candidates[i];
Chris@1510 97 SVDEBUG << "Estimated column separator: '" << m_separator
Chris@1510 98 << "'" << endl;
Chris@629 99 return;
Chris@629 100 }
Chris@629 101 }
Chris@629 102 }
Chris@629 103
Chris@629 104 void
Chris@629 105 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 106 {
Chris@629 107 if (m_separator == "") guessSeparator(line);
Chris@629 108
Chris@1362 109 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
Chris@629 110
Chris@629 111 int cols = list.size();
Chris@991 112 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 113 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 114
Chris@629 115 // All columns are regarded as having these qualities until we see
Chris@629 116 // something that indicates otherwise:
Chris@629 117
Chris@629 118 ColumnQualities defaultQualities =
Chris@1512 119 ColumnNumeric | ColumnIntegral | ColumnSmall |
Chris@1512 120 ColumnIncreasing | ColumnNearEmpty;
Chris@629 121
Chris@629 122 for (int i = 0; i < cols; ++i) {
Chris@1429 123
Chris@629 124 while (m_columnQualities.size() <= i) {
Chris@629 125 m_columnQualities.push_back(defaultQualities);
Chris@629 126 m_prevValues.push_back(0.f);
Chris@629 127 }
Chris@629 128
Chris@629 129 QString s(list[i]);
Chris@629 130 bool ok = false;
Chris@629 131
Chris@629 132 ColumnQualities qualities = m_columnQualities[i];
Chris@629 133
Chris@1523 134 // Looks like this is defined on Windows
Chris@1523 135 #undef small
Chris@1523 136
Chris@629 137 bool numeric = (qualities & ColumnNumeric);
Chris@629 138 bool integral = (qualities & ColumnIntegral);
Chris@629 139 bool increasing = (qualities & ColumnIncreasing);
Chris@1512 140 bool small = (qualities & ColumnSmall);
Chris@629 141 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@1512 142 bool signd = (qualities & ColumnSigned); // also defaults to off
Chris@1021 143 bool emptyish = (qualities & ColumnNearEmpty);
Chris@629 144
Chris@1021 145 if (lineno > 1 && s.trimmed() != "") {
Chris@1021 146 emptyish = false;
Chris@1021 147 }
Chris@1021 148
Chris@629 149 float value = 0.f;
Chris@629 150
Chris@629 151 //!!! how to take into account headers?
Chris@629 152
Chris@629 153 if (numeric) {
Chris@629 154 value = s.toFloat(&ok);
Chris@629 155 if (!ok) {
Chris@629 156 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 157 }
Chris@629 158 if (ok) {
Chris@1512 159 if (lineno < 2 && value > 1000.f) {
Chris@1512 160 large = true;
Chris@1512 161 }
Chris@1512 162 if (value < 0.f) {
Chris@1512 163 signd = true;
Chris@1512 164 }
Chris@1512 165 if (value < -1.f || value > 1.f) {
Chris@1512 166 small = false;
Chris@1512 167 }
Chris@629 168 } else {
Chris@629 169 numeric = false;
Chris@629 170 }
Chris@629 171 }
Chris@629 172
Chris@629 173 if (numeric) {
Chris@629 174
Chris@629 175 if (integral) {
Chris@629 176 if (s.contains('.') || s.contains(',')) {
Chris@629 177 integral = false;
Chris@392 178 }
Chris@392 179 }
Chris@392 180
Chris@629 181 if (increasing) {
Chris@629 182 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 183 increasing = false;
Chris@392 184 }
Chris@392 185 }
Chris@392 186
Chris@629 187 m_prevValues[i] = value;
Chris@629 188 }
Chris@392 189
Chris@629 190 m_columnQualities[i] =
Chris@629 191 (numeric ? ColumnNumeric : 0) |
Chris@629 192 (integral ? ColumnIntegral : 0) |
Chris@629 193 (increasing ? ColumnIncreasing : 0) |
Chris@1512 194 (small ? ColumnSmall : 0) |
Chris@1021 195 (large ? ColumnLarge : 0) |
Chris@1512 196 (signd ? ColumnSigned : 0) |
Chris@1021 197 (emptyish ? ColumnNearEmpty : 0);
Chris@629 198 }
Chris@392 199
Chris@629 200 if (lineno < 10) {
Chris@629 201 m_example.push_back(list);
Chris@629 202 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 203 m_maxExampleCols = cols;
Chris@392 204 }
Chris@392 205 }
Chris@392 206
Chris@1362 207 if (lineno < 10) {
Chris@1362 208 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
Chris@1362 209 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 210 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1362 211 }
Chris@1362 212 SVDEBUG << endl;
Chris@1362 213 }
Chris@629 214 }
Chris@629 215
Chris@629 216 void
Chris@629 217 CSVFormat::guessPurposes()
Chris@629 218 {
Chris@629 219 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 220 m_timeUnits = CSVFormat::TimeWindows;
Chris@1429 221
Chris@629 222 int timingColumnCount = 0;
Chris@1021 223
Chris@1510 224 SVDEBUG << "Estimated column qualities overall: ";
Chris@1510 225 for (int i = 0; i < m_columnCount; ++i) {
Chris@1510 226 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1510 227 }
Chris@1510 228 SVDEBUG << endl;
Chris@1510 229
Chris@1021 230 // if our first column has zero or one entries in it and the rest
Chris@1021 231 // have more, then we'll default to ignoring the first column and
Chris@1021 232 // counting the next one as primary. (e.g. Sonic Annotator output
Chris@1021 233 // with filename at start of first column.)
Chris@1021 234
Chris@1021 235 int primaryColumnNo = 0;
Chris@1021 236
Chris@1021 237 if (m_columnCount >= 2) {
Chris@1021 238 if ( (m_columnQualities[0] & ColumnNearEmpty) &&
Chris@1021 239 !(m_columnQualities[1] & ColumnNearEmpty)) {
Chris@1021 240 primaryColumnNo = 1;
Chris@1021 241 }
Chris@1021 242 }
Chris@629 243
Chris@629 244 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 245
Chris@629 246 ColumnPurpose purpose = ColumnUnknown;
Chris@1021 247
Chris@1021 248 if (i < primaryColumnNo) {
Chris@1021 249 setColumnPurpose(i, purpose);
Chris@1021 250 continue;
Chris@1021 251 }
Chris@1021 252
Chris@1021 253 bool primary = (i == primaryColumnNo);
Chris@392 254
Chris@629 255 ColumnQualities qualities = m_columnQualities[i];
Chris@392 256
Chris@629 257 bool numeric = (qualities & ColumnNumeric);
Chris@629 258 bool integral = (qualities & ColumnIntegral);
Chris@629 259 bool increasing = (qualities & ColumnIncreasing);
Chris@629 260 bool large = (qualities & ColumnLarge);
Chris@629 261
Chris@629 262 bool timingColumn = (numeric && increasing);
Chris@629 263
Chris@629 264 if (timingColumn) {
Chris@629 265
Chris@629 266 ++timingColumnCount;
Chris@629 267
Chris@629 268 if (primary) {
Chris@629 269
Chris@629 270 purpose = ColumnStartTime;
Chris@629 271
Chris@629 272 m_timingType = ExplicitTiming;
Chris@629 273
Chris@629 274 if (integral && large) {
Chris@629 275 m_timeUnits = TimeAudioFrames;
Chris@629 276 } else {
Chris@629 277 m_timeUnits = TimeSeconds;
Chris@629 278 }
Chris@629 279
Chris@629 280 } else {
Chris@629 281
Chris@629 282 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 283 purpose = ColumnEndTime;
Chris@629 284 }
Chris@629 285 }
Chris@629 286 }
Chris@629 287
Chris@629 288 if (purpose == ColumnUnknown) {
Chris@629 289 if (numeric) {
Chris@629 290 purpose = ColumnValue;
Chris@629 291 } else {
Chris@629 292 purpose = ColumnLabel;
Chris@629 293 }
Chris@629 294 }
Chris@629 295
Chris@631 296 setColumnPurpose(i, purpose);
Chris@629 297 }
Chris@629 298
Chris@629 299 int valueCount = 0;
Chris@629 300 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 301 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 302 }
Chris@629 303
Chris@630 304 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 305 // If we have exactly two apparent value columns and only one
Chris@630 306 // timing column, but one value column is integral and the
Chris@630 307 // other is not, guess that whichever one matches the integral
Chris@630 308 // status of the time column is either duration or end time
Chris@630 309 if (m_timingType == ExplicitTiming) {
Chris@630 310 int a = -1, b = -1;
Chris@630 311 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 312 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 313 if (a == -1) a = i;
Chris@630 314 else b = i;
Chris@630 315 }
Chris@630 316 }
Chris@630 317 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 318 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 319 int timecol = a;
Chris@630 320 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 321 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 322 timecol = b;
Chris@630 323 }
Chris@630 324 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 325 // This shouldn't happen; should have been settled above
Chris@630 326 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 327 } else {
Chris@630 328 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 329 }
Chris@630 330 --valueCount;
Chris@630 331 }
Chris@630 332 }
Chris@630 333 }
Chris@630 334
Chris@631 335 if (timingColumnCount > 1) {
Chris@631 336 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 337 } else {
Chris@631 338 if (valueCount == 0) {
Chris@631 339 m_modelType = OneDimensionalModel;
Chris@631 340 } else if (valueCount == 1) {
Chris@631 341 m_modelType = TwoDimensionalModel;
Chris@631 342 } else {
Chris@631 343 m_modelType = ThreeDimensionalModel;
Chris@631 344 }
Chris@629 345 }
Chris@392 346
Chris@1362 347 SVDEBUG << "Estimated column purposes: ";
Chris@1362 348 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 349 SVDEBUG << int(m_columnPurposes[i]) << " ";
Chris@1362 350 }
Chris@1362 351 SVDEBUG << endl;
Chris@392 352
Chris@1362 353 SVDEBUG << "Estimated model type: " << m_modelType << endl;
Chris@1362 354 SVDEBUG << "Estimated timing type: " << m_timingType << endl;
Chris@1362 355 SVDEBUG << "Estimated units: " << m_timeUnits << endl;
Chris@392 356 }
Chris@392 357
Chris@1515 358 void
Chris@1515 359 CSVFormat::guessAudioSampleRange()
Chris@1515 360 {
Chris@1515 361 AudioSampleRange range = SampleRangeSigned1;
Chris@1515 362
Chris@1515 363 range = SampleRangeSigned1;
Chris@1515 364 bool knownSigned = false;
Chris@1515 365 bool knownNonIntegral = false;
Chris@1521 366
Chris@1521 367 SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of "
Chris@1521 368 << range << endl;
Chris@1515 369
Chris@1515 370 for (int i = 0; i < m_columnCount; ++i) {
Chris@1521 371 if (m_columnPurposes[i] != ColumnValue) {
Chris@1521 372 SVDEBUG << "... column " << i
Chris@1521 373 << " is not apparently a value, ignoring" << endl;
Chris@1521 374 continue;
Chris@1521 375 }
Chris@1515 376 if (!(m_columnQualities[i] & ColumnIntegral)) {
Chris@1515 377 knownNonIntegral = true;
Chris@1515 378 if (range == SampleRangeUnsigned255 ||
Chris@1515 379 range == SampleRangeSigned32767) {
Chris@1515 380 range = SampleRangeOther;
Chris@1515 381 }
Chris@1521 382 SVDEBUG << "... column " << i
Chris@1521 383 << " is non-integral, updating range to " << range << endl;
Chris@1515 384 }
Chris@1515 385 if (m_columnQualities[i] & ColumnLarge) {
Chris@1515 386 if (range == SampleRangeSigned1 ||
Chris@1515 387 range == SampleRangeUnsigned255) {
Chris@1515 388 if (knownNonIntegral) {
Chris@1515 389 range = SampleRangeOther;
Chris@1515 390 } else {
Chris@1515 391 range = SampleRangeSigned32767;
Chris@1515 392 }
Chris@1515 393 }
Chris@1521 394 SVDEBUG << "... column " << i << " is large, updating range to "
Chris@1521 395 << range << endl;
Chris@1515 396 }
Chris@1515 397 if (m_columnQualities[i] & ColumnSigned) {
Chris@1515 398 knownSigned = true;
Chris@1515 399 if (range == SampleRangeUnsigned255) {
Chris@1515 400 range = SampleRangeSigned32767;
Chris@1515 401 }
Chris@1521 402 SVDEBUG << "... column " << i << " is signed, updating range to "
Chris@1521 403 << range << endl;
Chris@1515 404 }
Chris@1515 405 if (!(m_columnQualities[i] & ColumnSmall)) {
Chris@1515 406 if (range == SampleRangeSigned1) {
Chris@1515 407 if (knownNonIntegral) {
Chris@1515 408 range = SampleRangeOther;
Chris@1515 409 } else if (knownSigned) {
Chris@1515 410 range = SampleRangeSigned32767;
Chris@1515 411 } else {
Chris@1515 412 range = SampleRangeUnsigned255;
Chris@1515 413 }
Chris@1515 414 }
Chris@1521 415 SVDEBUG << "... column " << i << " is not small, updating range to "
Chris@1521 416 << range << endl;
Chris@1515 417 }
Chris@1515 418 }
Chris@1515 419
Chris@1521 420 SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range "
Chris@1521 421 << range << endl;
Chris@1521 422
Chris@1515 423 m_audioSampleRange = range;
Chris@1515 424 }
Chris@1515 425
Chris@631 426 CSVFormat::ColumnPurpose
Chris@631 427 CSVFormat::getColumnPurpose(int i)
Chris@631 428 {
Chris@631 429 while (m_columnPurposes.size() <= i) {
Chris@631 430 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 431 }
Chris@631 432 return m_columnPurposes[i];
Chris@631 433 }
Chris@629 434
Chris@631 435 CSVFormat::ColumnPurpose
Chris@631 436 CSVFormat::getColumnPurpose(int i) const
Chris@631 437 {
Chris@668 438 if (m_columnPurposes.size() <= i) {
Chris@668 439 return ColumnUnknown;
Chris@668 440 }
Chris@631 441 return m_columnPurposes[i];
Chris@631 442 }
Chris@631 443
Chris@631 444 void
Chris@631 445 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 446 {
Chris@631 447 while (m_columnPurposes.size() <= i) {
Chris@631 448 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 449 }
Chris@631 450 m_columnPurposes[i] = p;
Chris@631 451 }
Chris@631 452
Chris@631 453
Chris@631 454
Chris@631 455