comparison data/fileio/CSVFormat.cpp @ 1527:710e6250a401 zoom

Merge from default branch
author Chris Cannam
date Mon, 17 Sep 2018 13:51:14 +0100
parents a92e94215863
children 9570ef94eaa3
comparison
equal deleted inserted replaced
1324:d4a28d1479a8 1527:710e6250a401
23 #include <QStringList> 23 #include <QStringList>
24 #include <QTextStream> 24 #include <QTextStream>
25 25
26 #include <iostream> 26 #include <iostream>
27 27
28 #include "base/Debug.h"
29
28 CSVFormat::CSVFormat(QString path) : 30 CSVFormat::CSVFormat(QString path) :
29 m_separator(""), 31 m_separator(""),
30 m_sampleRate(44100), 32 m_sampleRate(44100),
31 m_windowSize(1024), 33 m_windowSize(1024),
32 m_allowQuoting(true) 34 m_allowQuoting(true)
33 { 35 {
34 guessFormatFor(path); 36 (void)guessFormatFor(path);
35 } 37 }
36 38
37 void 39 bool
38 CSVFormat::guessFormatFor(QString path) 40 CSVFormat::guessFormatFor(QString path)
39 { 41 {
42 m_separator = ""; // to prompt guessing for it
43
40 m_modelType = TwoDimensionalModel; 44 m_modelType = TwoDimensionalModel;
41 m_timingType = ExplicitTiming; 45 m_timingType = ExplicitTiming;
42 m_timeUnits = TimeSeconds; 46 m_timeUnits = TimeSeconds;
43 47
44 m_maxExampleCols = 0; 48 m_maxExampleCols = 0;
49 m_columnQualities.clear(); 53 m_columnQualities.clear();
50 m_columnPurposes.clear(); 54 m_columnPurposes.clear();
51 m_prevValues.clear(); 55 m_prevValues.clear();
52 56
53 QFile file(path); 57 QFile file(path);
54 if (!file.exists()) return; 58 if (!file.exists()) {
55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; 59 SVCERR << "CSVFormat::guessFormatFor(" << path
60 << "): File does not exist" << endl;
61 return false;
62 }
63 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
64 SVCERR << "CSVFormat::guessFormatFor(" << path
65 << "): File could not be opened for reading" << endl;
66 return false;
67 }
68 SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl;
56 69
57 QTextStream in(&file); 70 QTextStream in(&file);
58 in.seek(0); 71 in.seek(0);
59 72
60 int lineno = 0; 73 int lineno = 0;
67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); 80 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
68 81
69 for (int li = 0; li < lines.size(); ++li) { 82 for (int li = 0; li < lines.size(); ++li) {
70 83
71 QString line = lines[li]; 84 QString line = lines[li];
72 if (line.startsWith("#") || line == "") continue; 85 if (line.startsWith("#") || line == "") {
86 continue;
87 }
73 88
74 guessQualities(line, lineno); 89 guessQualities(line, lineno);
75 90
76 ++lineno; 91 ++lineno;
77 } 92 }
78 93
79 if (lineno >= 50) break; 94 if (lineno >= 150) break;
80 } 95 }
81 96
82 guessPurposes(); 97 guessPurposes();
98 guessAudioSampleRange();
99
100 return true;
83 } 101 }
84 102
85 void 103 void
86 CSVFormat::guessSeparator(QString line) 104 CSVFormat::guessSeparator(QString line)
87 { 105 {
88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; 106 QString candidates = "\t|,/: ";
89 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) { 107
90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { 108 for (int i = 0; i < candidates.length(); ++i) {
109 auto bits = StringBits::split(line, candidates[i], m_allowQuoting);
110 if (bits.size() >= 2) {
111 SVDEBUG << "Successfully split the line into:" << endl;
112 for (auto b: bits) {
113 SVDEBUG << b << endl;
114 }
91 m_separator = candidates[i]; 115 m_separator = candidates[i];
116 SVDEBUG << "Estimated column separator: '" << m_separator
117 << "'" << endl;
92 return; 118 return;
93 } 119 }
94 } 120 }
95 m_separator = " ";
96 } 121 }
97 122
98 void 123 void
99 CSVFormat::guessQualities(QString line, int lineno) 124 CSVFormat::guessQualities(QString line, int lineno)
100 { 125 {
101 if (m_separator == "") guessSeparator(line); 126 if (m_separator == "") {
102 127 guessSeparator(line);
103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting); 128 }
129
130 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
104 131
105 int cols = list.size(); 132 int cols = list.size();
106 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; 133 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
107 if (cols != m_columnCount) m_variableColumnCount = true; 134 if (cols != m_columnCount) m_variableColumnCount = true;
108 135
109 // All columns are regarded as having these qualities until we see 136 // All columns are regarded as having these qualities until we see
110 // something that indicates otherwise: 137 // something that indicates otherwise:
111 138
112 ColumnQualities defaultQualities = 139 ColumnQualities defaultQualities =
113 ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty; 140 ColumnNumeric | ColumnIntegral | ColumnSmall |
141 ColumnIncreasing | ColumnNearEmpty;
114 142
115 for (int i = 0; i < cols; ++i) { 143 for (int i = 0; i < cols; ++i) {
116 144
117 while (m_columnQualities.size() <= i) { 145 while (m_columnQualities.size() <= i) {
118 m_columnQualities.push_back(defaultQualities); 146 m_columnQualities.push_back(defaultQualities);
119 m_prevValues.push_back(0.f); 147 m_prevValues.push_back(0.f);
120 } 148 }
121 149
122 QString s(list[i]); 150 QString s(list[i]);
123 bool ok = false; 151 bool ok = false;
124 152
125 ColumnQualities qualities = m_columnQualities[i]; 153 ColumnQualities qualities = m_columnQualities[i];
126 154
155 // Looks like this is defined on Windows
156 #undef small
157
127 bool numeric = (qualities & ColumnNumeric); 158 bool numeric = (qualities & ColumnNumeric);
128 bool integral = (qualities & ColumnIntegral); 159 bool integral = (qualities & ColumnIntegral);
129 bool increasing = (qualities & ColumnIncreasing); 160 bool increasing = (qualities & ColumnIncreasing);
161 bool small = (qualities & ColumnSmall);
130 bool large = (qualities & ColumnLarge); // this one defaults to off 162 bool large = (qualities & ColumnLarge); // this one defaults to off
163 bool signd = (qualities & ColumnSigned); // also defaults to off
131 bool emptyish = (qualities & ColumnNearEmpty); 164 bool emptyish = (qualities & ColumnNearEmpty);
132 165
133 if (lineno > 1 && s.trimmed() != "") { 166 if (lineno > 1 && s.trimmed() != "") {
134 emptyish = false; 167 emptyish = false;
135 } 168 }
142 value = s.toFloat(&ok); 175 value = s.toFloat(&ok);
143 if (!ok) { 176 if (!ok) {
144 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); 177 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
145 } 178 }
146 if (ok) { 179 if (ok) {
147 if (lineno < 2 && value > 1000.f) large = true; 180 if (lineno < 2 && value > 1000.f) {
181 large = true;
182 }
183 if (value < 0.f) {
184 signd = true;
185 }
186 if (value < -1.f || value > 1.f) {
187 small = false;
188 }
148 } else { 189 } else {
149 numeric = false; 190 numeric = false;
191
192 // If the column is not numeric, it can't be any of
193 // these things either
194 integral = false;
195 increasing = false;
196 small = false;
197 large = false;
198 signd = false;
150 } 199 }
151 } 200 }
152 201
153 if (numeric) { 202 if (numeric) {
154 203
164 } 213 }
165 } 214 }
166 215
167 m_prevValues[i] = value; 216 m_prevValues[i] = value;
168 } 217 }
169 218
170 m_columnQualities[i] = 219 m_columnQualities[i] =
171 (numeric ? ColumnNumeric : 0) | 220 (numeric ? ColumnNumeric : 0) |
172 (integral ? ColumnIntegral : 0) | 221 (integral ? ColumnIntegral : 0) |
173 (increasing ? ColumnIncreasing : 0) | 222 (increasing ? ColumnIncreasing : 0) |
223 (small ? ColumnSmall : 0) |
174 (large ? ColumnLarge : 0) | 224 (large ? ColumnLarge : 0) |
225 (signd ? ColumnSigned : 0) |
175 (emptyish ? ColumnNearEmpty : 0); 226 (emptyish ? ColumnNearEmpty : 0);
176 } 227 }
177 228
178 if (lineno < 10) { 229 if (lineno < 10) {
179 m_example.push_back(list); 230 m_example.push_back(list);
180 if (lineno == 0 || cols > m_maxExampleCols) { 231 if (lineno == 0 || cols > m_maxExampleCols) {
181 m_maxExampleCols = cols; 232 m_maxExampleCols = cols;
182 } 233 }
183 } 234 }
184 235
185 // cerr << "Estimated column qualities: "; 236 if (lineno < 10) {
186 // for (int i = 0; i < m_columnCount; ++i) { 237 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
187 // cerr << int(m_columnQualities[i]) << " "; 238 for (int i = 0; i < m_columnCount; ++i) {
188 // } 239 SVDEBUG << int(m_columnQualities[i]) << " ";
189 // cerr << endl; 240 }
241 SVDEBUG << endl;
242 }
190 } 243 }
191 244
192 void 245 void
193 CSVFormat::guessPurposes() 246 CSVFormat::guessPurposes()
194 { 247 {
195 m_timingType = CSVFormat::ImplicitTiming; 248 m_timingType = CSVFormat::ImplicitTiming;
196 m_timeUnits = CSVFormat::TimeWindows; 249 m_timeUnits = CSVFormat::TimeWindows;
197 250
198 int timingColumnCount = 0; 251 int timingColumnCount = 0;
252 bool haveDurationOrEndTime = false;
253
254 SVDEBUG << "Estimated column qualities overall: ";
255 for (int i = 0; i < m_columnCount; ++i) {
256 SVDEBUG << int(m_columnQualities[i]) << " ";
257 }
258 SVDEBUG << endl;
199 259
200 // if our first column has zero or one entries in it and the rest 260 // if our first column has zero or one entries in it and the rest
201 // have more, then we'll default to ignoring the first column and 261 // have more, then we'll default to ignoring the first column and
202 // counting the next one as primary. (e.g. Sonic Annotator output 262 // counting the next one as primary. (e.g. Sonic Annotator output
203 // with filename at start of first column.) 263 // with filename at start of first column.)
249 309
250 } else { 310 } else {
251 311
252 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { 312 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
253 purpose = ColumnEndTime; 313 purpose = ColumnEndTime;
314 haveDurationOrEndTime = true;
254 } 315 }
255 } 316 }
256 } 317 }
257 318
258 if (purpose == ColumnUnknown) { 319 if (purpose == ColumnUnknown) {
292 timecol = b; 353 timecol = b;
293 } 354 }
294 if (m_columnQualities[timecol] & ColumnIncreasing) { 355 if (m_columnQualities[timecol] & ColumnIncreasing) {
295 // This shouldn't happen; should have been settled above 356 // This shouldn't happen; should have been settled above
296 m_columnPurposes[timecol] = ColumnEndTime; 357 m_columnPurposes[timecol] = ColumnEndTime;
358 haveDurationOrEndTime = true;
297 } else { 359 } else {
298 m_columnPurposes[timecol] = ColumnDuration; 360 m_columnPurposes[timecol] = ColumnDuration;
361 haveDurationOrEndTime = true;
299 } 362 }
300 --valueCount; 363 --valueCount;
301 } 364 }
302 } 365 }
303 } 366 }
304 367
305 if (timingColumnCount > 1) { 368 if (timingColumnCount > 1 || haveDurationOrEndTime) {
306 m_modelType = TwoDimensionalModelWithDuration; 369 m_modelType = TwoDimensionalModelWithDuration;
307 } else { 370 } else {
308 if (valueCount == 0) { 371 if (valueCount == 0) {
309 m_modelType = OneDimensionalModel; 372 m_modelType = OneDimensionalModel;
310 } else if (valueCount == 1) { 373 } else if (valueCount == 1) {
312 } else { 375 } else {
313 m_modelType = ThreeDimensionalModel; 376 m_modelType = ThreeDimensionalModel;
314 } 377 }
315 } 378 }
316 379
317 // cerr << "Estimated column purposes: "; 380 SVDEBUG << "Estimated column purposes: ";
318 // for (int i = 0; i < m_columnCount; ++i) { 381 for (int i = 0; i < m_columnCount; ++i) {
319 // cerr << int(m_columnPurposes[i]) << " "; 382 SVDEBUG << int(m_columnPurposes[i]) << " ";
320 // } 383 }
321 // cerr << endl; 384 SVDEBUG << endl;
322 385
323 // cerr << "Estimated model type: " << m_modelType << endl; 386 SVDEBUG << "Estimated model type: " << m_modelType << endl;
324 // cerr << "Estimated timing type: " << m_timingType << endl; 387 SVDEBUG << "Estimated timing type: " << m_timingType << endl;
325 // cerr << "Estimated units: " << m_timeUnits << endl; 388 SVDEBUG << "Estimated units: " << m_timeUnits << endl;
389 }
390
391 void
392 CSVFormat::guessAudioSampleRange()
393 {
394 AudioSampleRange range = SampleRangeSigned1;
395
396 range = SampleRangeSigned1;
397 bool knownSigned = false;
398 bool knownNonIntegral = false;
399
400 SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of "
401 << range << endl;
402
403 for (int i = 0; i < m_columnCount; ++i) {
404 if (m_columnPurposes[i] != ColumnValue) {
405 SVDEBUG << "... column " << i
406 << " is not apparently a value, ignoring" << endl;
407 continue;
408 }
409 if (!(m_columnQualities[i] & ColumnIntegral)) {
410 knownNonIntegral = true;
411 if (range == SampleRangeUnsigned255 ||
412 range == SampleRangeSigned32767) {
413 range = SampleRangeOther;
414 }
415 SVDEBUG << "... column " << i
416 << " is non-integral, updating range to " << range << endl;
417 }
418 if (m_columnQualities[i] & ColumnLarge) {
419 if (range == SampleRangeSigned1 ||
420 range == SampleRangeUnsigned255) {
421 if (knownNonIntegral) {
422 range = SampleRangeOther;
423 } else {
424 range = SampleRangeSigned32767;
425 }
426 }
427 SVDEBUG << "... column " << i << " is large, updating range to "
428 << range << endl;
429 }
430 if (m_columnQualities[i] & ColumnSigned) {
431 knownSigned = true;
432 if (range == SampleRangeUnsigned255) {
433 range = SampleRangeSigned32767;
434 }
435 SVDEBUG << "... column " << i << " is signed, updating range to "
436 << range << endl;
437 }
438 if (!(m_columnQualities[i] & ColumnSmall)) {
439 if (range == SampleRangeSigned1) {
440 if (knownNonIntegral) {
441 range = SampleRangeOther;
442 } else if (knownSigned) {
443 range = SampleRangeSigned32767;
444 } else {
445 range = SampleRangeUnsigned255;
446 }
447 }
448 SVDEBUG << "... column " << i << " is not small, updating range to "
449 << range << endl;
450 }
451 }
452
453 SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range "
454 << range << endl;
455
456 m_audioSampleRange = range;
326 } 457 }
327 458
328 CSVFormat::ColumnPurpose 459 CSVFormat::ColumnPurpose
329 CSVFormat::getColumnPurpose(int i) 460 CSVFormat::getColumnPurpose(int i)
330 { 461 {