CSVFormat.cpp
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
2 
3 /*
4  Sonic Visualiser
5  An audio file viewer and annotation editor.
6  Centre for Digital Music, Queen Mary, University of London.
7  This file copyright 2006 Chris Cannam.
8 
9  This program is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 2 of the
12  License, or (at your option) any later version. See the file
13  COPYING included with this distribution for more information.
14 */
15 
16 #include "CSVFormat.h"
17 
18 #include "base/StringBits.h"
19 
20 #include <QFile>
21 #include <QString>
22 #include <QRegExp>
23 #include <QStringList>
24 #include <QTextStream>
25 
26 #include <iostream>
27 
28 #include "base/Debug.h"
29 
30 CSVFormat::CSVFormat(QString path) :
31  m_separator(""),
32  m_sampleRate(44100),
33  m_windowSize(1024),
34  m_headerStatus(HeaderUnknown),
35  m_allowQuoting(true),
36  m_maxExampleCols(0)
37 {
38  (void)guessFormatFor(path);
39 }
40 
41 bool
43 {
47 
48  m_maxExampleCols = 0;
49  m_columnCount = 0;
50  m_variableColumnCount = false;
51 
52  m_example.clear();
53  m_columnQualities.clear();
54  m_columnPurposes.clear();
55  m_prevValues.clear();
56 
57  QFile file(path);
58  if (!file.exists()) {
59  SVCERR << "CSVFormat::guessFormatFor(" << path
60  << "): File does not exist" << endl;
61  return false;
62  }
63  if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
64  SVCERR << "CSVFormat::guessFormatFor(" << path
65  << "): File could not be opened for reading" << endl;
66  return false;
67  }
68  SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl;
69 
70  QTextStream in(&file);
71  in.seek(0);
72 
73  int lineno = 0;
74 
75  while (!in.atEnd()) {
76 
77  // See comment about line endings in CSVFileReader::load()
78 
79  QString chunk = in.readLine();
80  QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
81 
82  for (int li = 0; li < lines.size(); ++li) {
83 
84  QString line = lines[li];
85  if (line.startsWith("#") || line == "") {
86  continue;
87  }
88 
89  guessQualities(line, lineno);
90 
91  ++lineno;
92  }
93 
94  if (lineno >= 150) break;
95  }
96 
97  guessPurposes();
99 
100  return true;
101 }
102 
103 void
105 {
106  QString candidates = "\t|,/: ";
107 
108  for (int i = 0; i < candidates.length(); ++i) {
109  auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
110  if (bits.size() >= 2) {
111  m_plausibleSeparators.insert(candidates[i]);
112  if (m_separator == "") {
113  m_separator = candidates[i];
114  SVDEBUG << "Estimated column separator: '" << m_separator
115  << "'" << endl;
116  }
117  }
118  }
119 }
120 
121 void
122 CSVFormat::guessQualities(QString line, int lineno)
123 {
124  guessSeparator(line);
125 
126  QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
127 
128  int cols = list.size();
129 
130  int firstLine = 0;
131  if (m_headerStatus == HeaderPresent) {
132  firstLine = 1;
133  }
134 
135  if (lineno == firstLine || (cols > m_columnCount)) {
136  m_columnCount = cols;
137  }
138  if (cols != m_columnCount) {
139  m_variableColumnCount = true;
140  }
141 
142  // All columns are regarded as having these qualities until we see
143  // something that indicates otherwise:
144 
145  ColumnQualities defaultQualities =
148 
149  for (int i = 0; i < cols; ++i) {
150 
151  SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
152 
153  if (m_columnQualities.find(i) == m_columnQualities.end()) {
154  m_columnQualities[i] = defaultQualities;
155  m_prevValues[i] = 0.f;
156  }
157 
158  QString s(list[i]);
159  bool ok = false;
160 
161  ColumnQualities qualities = m_columnQualities[i];
162 
163 // Looks like this is defined on Windows
164 #undef small
165 
166  bool numeric = (qualities & ColumnNumeric);
167  bool integral = (qualities & ColumnIntegral);
168  bool increasing = (qualities & ColumnIncreasing);
169  bool small = (qualities & ColumnSmall);
170  bool large = (qualities & ColumnLarge); // this one defaults to off
171  bool signd = (qualities & ColumnSigned); // also defaults to off
172  bool emptyish = (qualities & ColumnNearEmpty);
173 
174  if (s.trimmed() != "") {
175 
176  if (lineno > firstLine) {
177  emptyish = false;
178  }
179 
180  float value = 0.f;
181 
182  if (numeric) {
183  value = s.toFloat(&ok);
184  if (!ok) {
185  value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
186  }
187  if (ok) {
188  if (lineno < firstLine + 2 && value > 1000.f) {
189  large = true;
190  }
191  if (value < 0.f) {
192  signd = true;
193  }
194  if (value < -1.f || value > 1.f) {
195  small = false;
196  }
197  } else {
198  numeric = false;
199 
200  // If the column is not numeric, it can't be any of
201  // these things either
202  integral = false;
203  increasing = false;
204  small = false;
205  large = false;
206  signd = false;
207  }
208  }
209 
210  if (numeric) {
211 
212  if (integral) {
213  if (s.contains('.') || s.contains(',')) {
214  integral = false;
215  }
216  }
217 
218  if (increasing) {
219  if (lineno > firstLine && value <= m_prevValues[i]) {
220  increasing = false;
221  }
222  }
223 
224  m_prevValues[i] = value;
225  }
226  }
227 
228  m_columnQualities[i] =
229  (numeric ? ColumnNumeric : 0) |
230  (integral ? ColumnIntegral : 0) |
231  (increasing ? ColumnIncreasing : 0) |
232  (small ? ColumnSmall : 0) |
233  (large ? ColumnLarge : 0) |
234  (signd ? ColumnSigned : 0) |
235  (emptyish ? ColumnNearEmpty : 0);
236  }
237 
238  if (lineno == 0 && m_headerStatus == HeaderUnknown) {
239  // If we have at least one column, and every column has
240  // quality == ColumnNearEmpty, i.e. not empty and not numeric,
241  // then we probably have a header row
242  bool couldBeHeader = (cols > 0);
243  std::map<int, QString> headings;
244  for (int i = 0; i < cols; ++i) {
246  couldBeHeader = false;
247  } else {
248  headings[i] = list[i].trimmed().toLower();
249  }
250  }
251  if (couldBeHeader) {
253  m_columnHeadings = headings;
254  } else {
256  }
257  }
258 
259  if (lineno == 0 && m_headerStatus == HeaderPresent) {
260  // Start again with the qualities:
261  m_columnQualities.clear();
262  m_prevValues.clear();
263  }
264 
265  if (lineno < firstLine + 10) {
266  m_example.push_back(list);
267  if (lineno == 0 || cols > m_maxExampleCols) {
268  m_maxExampleCols = cols;
269  }
270  }
271 
272  if (lineno < firstLine + 10) {
273  SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
274  if (lineno == 0 && m_headerStatus == HeaderPresent &&
275  m_columnCount > 0 && m_columnQualities.empty()) {
276  SVDEBUG << "[whole line classified as a header row]";
277  } else {
278  for (int i = 0; i < cols; ++i) {
279  if (m_columnQualities.find(i) == m_columnQualities.end()) {
280  SVDEBUG << "(not set) ";
281  } else {
282  SVDEBUG << int(m_columnQualities[i]) << " ";
283  }
284  }
285  }
286  SVDEBUG << endl;
287  SVDEBUG << "Estimated header status: " << m_headerStatus << endl;
288  }
289 }
290 
291 void
293 {
296 
297  int timingColumnCount = 0;
298  bool haveDurationOrEndTime = false;
299 
300  SVDEBUG << "Estimated column qualities overall: ";
301  for (int i = 0; i < m_columnCount; ++i) {
302  if (m_columnQualities.find(i) == m_columnQualities.end()) {
303  SVDEBUG << "(not set) ";
304  } else {
305  SVDEBUG << int(m_columnQualities[i]) << " ";
306  }
307  }
308  SVDEBUG << endl;
309 
310  // if our first column has zero or one entries in it and the rest
311  // have more, then we'll default to ignoring the first column and
312  // counting the next one as primary. (e.g. Sonic Annotator output
313  // with filename at start of first column.)
314 
315  int primaryColumnNo = 0;
316 
317  if (m_columnCount >= 2) {
318  if ( (m_columnQualities[0] & ColumnNearEmpty) &&
319  !(m_columnQualities[1] & ColumnNearEmpty)) {
320  primaryColumnNo = 1;
321  }
322  }
323 
324  for (int i = 0; i < m_columnCount; ++i) {
325 
326  ColumnPurpose purpose = ColumnUnknown;
327 
328  if (i < primaryColumnNo) {
329  setColumnPurpose(i, purpose);
330  continue;
331  }
332 
333  bool primary = (i == primaryColumnNo);
334 
335  ColumnQualities qualities = m_columnQualities[i];
336 
337  bool numeric = (qualities & ColumnNumeric);
338  bool integral = (qualities & ColumnIntegral);
339  bool increasing = (qualities & ColumnIncreasing);
340  bool large = (qualities & ColumnLarge);
341 
342  bool timingColumn = (numeric && increasing);
343 
344  QString heading;
345  if (m_columnHeadings.find(i) != m_columnHeadings.end()) {
346  heading = m_columnHeadings[i];
347  }
348 
349  if (heading == "time" || heading == "frame" ||
350  heading == "duration" || heading == "endtime") {
351  timingColumn = true;
352  }
353 
354  if (heading == "value" || heading == "height" || heading == "label") {
355  timingColumn = false;
356  }
357 
358  if (timingColumn) {
359 
360  ++timingColumnCount;
361 
362  if (heading == "endtime") {
363 
364  purpose = ColumnEndTime;
365  haveDurationOrEndTime = true;
366 
367  } else if (heading == "duration") {
368 
369  purpose = ColumnDuration;
370  haveDurationOrEndTime = true;
371 
372  } else if (primary || heading == "time" || heading == "frame") {
373 
374  purpose = ColumnStartTime;
376 
377  if ((integral && large) || heading == "frame") {
379  } else {
381  }
382 
383  } else if (timingColumnCount == 2 &&
385  purpose = ColumnEndTime;
386  haveDurationOrEndTime = true;
387  }
388  }
389 
390  if (purpose == ColumnUnknown) {
391  if (heading == "label") {
392  purpose = ColumnLabel;
393  } else if (numeric || heading == "value" || heading == "height") {
394  purpose = ColumnValue;
395  } else {
396  purpose = ColumnLabel;
397  }
398  }
399 
400  setColumnPurpose(i, purpose);
401  }
402 
403  int valueCount = 0;
404  for (int i = 0; i < m_columnCount; ++i) {
405  if (m_columnPurposes[i] == ColumnValue) {
406  ++valueCount;
407  }
408  }
409 
410  if (valueCount == 2 && timingColumnCount == 1) {
411  // If we have exactly two apparent value columns and only one
412  // timing column, but one value column is integral and the
413  // other is not, guess that whichever one matches the integral
414  // status of the time column is either duration or end time
415  if (m_timingType == ExplicitTiming) {
416  int a = -1, b = -1;
417  for (int i = 0; i < m_columnCount; ++i) {
418  if (m_columnPurposes[i] == ColumnValue) {
419  if (a == -1) a = i;
420  else b = i;
421  }
422  }
423  if ((m_columnQualities[a] & ColumnIntegral) !=
424  (m_columnQualities[b] & ColumnIntegral)) {
425  int timecol = a;
426  if ((m_columnQualities[a] & ColumnIntegral) !=
427  (m_columnQualities[0] & ColumnIntegral)) {
428  timecol = b;
429  }
430  if (m_columnQualities[timecol] & ColumnIncreasing) {
431  // This shouldn't happen; should have been settled above
432  m_columnPurposes[timecol] = ColumnEndTime;
433  haveDurationOrEndTime = true;
434  } else {
435  m_columnPurposes[timecol] = ColumnDuration;
436  haveDurationOrEndTime = true;
437  }
438  --valueCount;
439  }
440  }
441  }
442 
443  if (timingColumnCount > 1 || haveDurationOrEndTime) {
445  } else {
446  if (valueCount == 0) {
448  } else if (valueCount == 1) {
450  } else {
452  }
453  }
454 
455  SVDEBUG << "Estimated column purposes: ";
456  for (int i = 0; i < m_columnCount; ++i) {
457  SVDEBUG << int(m_columnPurposes[i]) << " ";
458  }
459  SVDEBUG << endl;
460 
461  SVDEBUG << "Estimated model type: " << m_modelType << endl;
462  SVDEBUG << "Estimated timing type: " << m_timingType << endl;
463  SVDEBUG << "Estimated units: " << m_timeUnits << endl;
464 }
465 
466 void
468 {
470 
471  range = SampleRangeSigned1;
472  bool knownSigned = false;
473  bool knownNonIntegral = false;
474 
475  SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of "
476  << range << endl;
477 
478  for (int i = 0; i < m_columnCount; ++i) {
479  if (m_columnPurposes[i] != ColumnValue) {
480  SVDEBUG << "... column " << i
481  << " is not apparently a value, ignoring" << endl;
482  continue;
483  }
484  if (!(m_columnQualities[i] & ColumnIntegral)) {
485  knownNonIntegral = true;
486  if (range == SampleRangeUnsigned255 ||
487  range == SampleRangeSigned32767) {
488  range = SampleRangeOther;
489  }
490  SVDEBUG << "... column " << i
491  << " is non-integral, updating range to " << range << endl;
492  }
493  if (m_columnQualities[i] & ColumnLarge) {
494  if (range == SampleRangeSigned1 ||
495  range == SampleRangeUnsigned255) {
496  if (knownNonIntegral) {
497  range = SampleRangeOther;
498  } else {
499  range = SampleRangeSigned32767;
500  }
501  }
502  SVDEBUG << "... column " << i << " is large, updating range to "
503  << range << endl;
504  }
505  if (m_columnQualities[i] & ColumnSigned) {
506  knownSigned = true;
507  if (range == SampleRangeUnsigned255) {
508  range = SampleRangeSigned32767;
509  }
510  SVDEBUG << "... column " << i << " is signed, updating range to "
511  << range << endl;
512  }
513  if (!(m_columnQualities[i] & ColumnSmall)) {
514  if (range == SampleRangeSigned1) {
515  if (knownNonIntegral) {
516  range = SampleRangeOther;
517  } else if (knownSigned) {
518  range = SampleRangeSigned32767;
519  } else {
520  range = SampleRangeUnsigned255;
521  }
522  }
523  SVDEBUG << "... column " << i << " is not small, updating range to "
524  << range << endl;
525  }
526  }
527 
528  SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range "
529  << range << endl;
530 
531  m_audioSampleRange = range;
532 }
533 
534 QList<CSVFormat::ColumnPurpose>
536 {
537  QList<ColumnPurpose> purposes;
538  for (int i = 0; i < m_columnCount; ++i) {
539  purposes.push_back(getColumnPurpose(i));
540  }
541  return purposes;
542 }
543 
544 void
545 CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl)
546 {
547  m_columnPurposes.clear();
548  for (int i = 0; in_range_for(cl, i); ++i) {
549  m_columnPurposes[i] = cl[i];
550  }
551 }
552 
555 {
556  if (m_columnPurposes.find(i) == m_columnPurposes.end()) {
557  return ColumnUnknown;
558  } else {
559  return m_columnPurposes.at(i);
560  }
561 }
562 
563 void
565 {
566  m_columnPurposes[i] = p;
567 }
568 
569 QList<CSVFormat::ColumnQualities>
571 {
572  QList<ColumnQualities> qualities;
573  for (int i = 0; i < m_columnCount; ++i) {
574  if (m_columnQualities.find(i) == m_columnQualities.end()) {
575  qualities.push_back(0);
576  } else {
577  qualities.push_back(m_columnQualities.at(i));
578  }
579  }
580  return qualities;
581 }
CSVFormat()
Definition: CSVFormat.h:86
int m_maxExampleCols
Definition: CSVFormat.h:191
void guessSeparator(QString line)
Definition: CSVFormat.cpp:104
int m_columnCount
Definition: CSVFormat.h:177
AudioSampleRange m_audioSampleRange
Definition: CSVFormat.h:186
static QStringList split(QString s, QChar separator, bool quoted)
Split a string at the given separator character.
Definition: StringBits.cpp:160
std::set< QChar > m_plausibleSeparators
Definition: CSVFormat.h:172
bool guessFormatFor(QString path)
Guess the format of the given CSV file, setting the fields in this object accordingly.
Definition: CSVFormat.cpp:42
QList< ColumnPurpose > getColumnPurposes() const
Definition: CSVFormat.cpp:535
HeaderStatus m_headerStatus
Definition: CSVFormat.h:175
void guessQualities(QString line, int lineno)
Definition: CSVFormat.cpp:122
void guessPurposes()
Definition: CSVFormat.cpp:292
std::map< int, ColumnPurpose > m_columnPurposes
Definition: CSVFormat.h:181
ModelType m_modelType
Definition: CSVFormat.h:168
ColumnPurpose getColumnPurpose(int i) const
Definition: CSVFormat.cpp:554
std::map< int, ColumnQualities > m_columnQualities
Definition: CSVFormat.h:180
std::map< int, float > m_prevValues
Definition: CSVFormat.h:184
TimingType m_timingType
Definition: CSVFormat.h:169
QList< QStringList > m_example
Definition: CSVFormat.h:190
void setColumnPurpose(int i, ColumnPurpose p)
Definition: CSVFormat.cpp:564
bool in_range_for(const C &container, T i)
Check whether an integer index is in range for a container, avoiding overflows and signed/unsigned co...
Definition: BaseTypes.h:37
#define SVDEBUG
Definition: Debug.h:106
AudioSampleRange
Definition: CSVFormat.h:79
void setColumnPurposes(QList< ColumnPurpose > cl)
Definition: CSVFormat.cpp:545
QChar getSeparator() const
Definition: CSVFormat.h:134
#define SVCERR
Definition: Debug.h:109
std::map< int, QString > m_columnHeadings
Definition: CSVFormat.h:182
bool m_allowQuoting
Definition: CSVFormat.h:188
unsigned int ColumnQualities
Definition: CSVFormat.h:77
QString m_separator
Definition: CSVFormat.h:171
bool m_variableColumnCount
Definition: CSVFormat.h:178
void guessAudioSampleRange()
Definition: CSVFormat.cpp:467
TimeUnits m_timeUnits
Definition: CSVFormat.h:170
QList< ColumnQualities > getColumnQualities() const
Definition: CSVFormat.cpp:570
static double stringToDoubleLocaleFree(QString s, bool *ok=0)
Convert a string to a double using basic "C"-locale syntax, i.e.
Definition: StringBits.cpp:28