Mercurial > hg > svcore
comparison data/fileio/CSVFormat.cpp @ 1527:710e6250a401 (branch: zoom)
Merge from default branch
author | Chris Cannam |
---|---|
date | Mon, 17 Sep 2018 13:51:14 +0100 |
parents | a92e94215863 |
children | 9570ef94eaa3 |
comparison legend: equal, deleted, inserted, replaced
1324:d4a28d1479a8 | 1527:710e6250a401 |
---|---|
23 #include <QStringList> | 23 #include <QStringList> |
24 #include <QTextStream> | 24 #include <QTextStream> |
25 | 25 |
26 #include <iostream> | 26 #include <iostream> |
27 | 27 |
28 #include "base/Debug.h" | |
29 | |
28 CSVFormat::CSVFormat(QString path) : | 30 CSVFormat::CSVFormat(QString path) : |
29 m_separator(""), | 31 m_separator(""), |
30 m_sampleRate(44100), | 32 m_sampleRate(44100), |
31 m_windowSize(1024), | 33 m_windowSize(1024), |
32 m_allowQuoting(true) | 34 m_allowQuoting(true) |
33 { | 35 { |
34 guessFormatFor(path); | 36 (void)guessFormatFor(path); |
35 } | 37 } |
36 | 38 |
37 void | 39 bool |
38 CSVFormat::guessFormatFor(QString path) | 40 CSVFormat::guessFormatFor(QString path) |
39 { | 41 { |
42 m_separator = ""; // to prompt guessing for it | |
43 | |
40 m_modelType = TwoDimensionalModel; | 44 m_modelType = TwoDimensionalModel; |
41 m_timingType = ExplicitTiming; | 45 m_timingType = ExplicitTiming; |
42 m_timeUnits = TimeSeconds; | 46 m_timeUnits = TimeSeconds; |
43 | 47 |
44 m_maxExampleCols = 0; | 48 m_maxExampleCols = 0; |
49 m_columnQualities.clear(); | 53 m_columnQualities.clear(); |
50 m_columnPurposes.clear(); | 54 m_columnPurposes.clear(); |
51 m_prevValues.clear(); | 55 m_prevValues.clear(); |
52 | 56 |
53 QFile file(path); | 57 QFile file(path); |
54 if (!file.exists()) return; | 58 if (!file.exists()) { |
55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; | 59 SVCERR << "CSVFormat::guessFormatFor(" << path |
60 << "): File does not exist" << endl; | |
61 return false; | |
62 } | |
63 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { | |
64 SVCERR << "CSVFormat::guessFormatFor(" << path | |
65 << "): File could not be opened for reading" << endl; | |
66 return false; | |
67 } | |
68 SVDEBUG << "CSVFormat::guessFormatFor(" << path << ")" << endl; | |
56 | 69 |
57 QTextStream in(&file); | 70 QTextStream in(&file); |
58 in.seek(0); | 71 in.seek(0); |
59 | 72 |
60 int lineno = 0; | 73 int lineno = 0; |
67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); | 80 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); |
68 | 81 |
69 for (int li = 0; li < lines.size(); ++li) { | 82 for (int li = 0; li < lines.size(); ++li) { |
70 | 83 |
71 QString line = lines[li]; | 84 QString line = lines[li]; |
72 if (line.startsWith("#") || line == "") continue; | 85 if (line.startsWith("#") || line == "") { |
86 continue; | |
87 } | |
73 | 88 |
74 guessQualities(line, lineno); | 89 guessQualities(line, lineno); |
75 | 90 |
76 ++lineno; | 91 ++lineno; |
77 } | 92 } |
78 | 93 |
79 if (lineno >= 50) break; | 94 if (lineno >= 150) break; |
80 } | 95 } |
81 | 96 |
82 guessPurposes(); | 97 guessPurposes(); |
98 guessAudioSampleRange(); | |
99 | |
100 return true; | |
83 } | 101 } |
84 | 102 |
85 void | 103 void |
86 CSVFormat::guessSeparator(QString line) | 104 CSVFormat::guessSeparator(QString line) |
87 { | 105 { |
88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; | 106 QString candidates = "\t|,/: "; |
89 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) { | 107 |
90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { | 108 for (int i = 0; i < candidates.length(); ++i) { |
109 auto bits = StringBits::split(line, candidates[i], m_allowQuoting); | |
110 if (bits.size() >= 2) { | |
111 SVDEBUG << "Successfully split the line into:" << endl; | |
112 for (auto b: bits) { | |
113 SVDEBUG << b << endl; | |
114 } | |
91 m_separator = candidates[i]; | 115 m_separator = candidates[i]; |
116 SVDEBUG << "Estimated column separator: '" << m_separator | |
117 << "'" << endl; | |
92 return; | 118 return; |
93 } | 119 } |
94 } | 120 } |
95 m_separator = " "; | |
96 } | 121 } |
97 | 122 |
98 void | 123 void |
99 CSVFormat::guessQualities(QString line, int lineno) | 124 CSVFormat::guessQualities(QString line, int lineno) |
100 { | 125 { |
101 if (m_separator == "") guessSeparator(line); | 126 if (m_separator == "") { |
102 | 127 guessSeparator(line); |
103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting); | 128 } |
129 | |
130 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); | |
104 | 131 |
105 int cols = list.size(); | 132 int cols = list.size(); |
106 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; | 133 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; |
107 if (cols != m_columnCount) m_variableColumnCount = true; | 134 if (cols != m_columnCount) m_variableColumnCount = true; |
108 | 135 |
109 // All columns are regarded as having these qualities until we see | 136 // All columns are regarded as having these qualities until we see |
110 // something that indicates otherwise: | 137 // something that indicates otherwise: |
111 | 138 |
112 ColumnQualities defaultQualities = | 139 ColumnQualities defaultQualities = |
113 ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty; | 140 ColumnNumeric | ColumnIntegral | ColumnSmall | |
141 ColumnIncreasing | ColumnNearEmpty; | |
114 | 142 |
115 for (int i = 0; i < cols; ++i) { | 143 for (int i = 0; i < cols; ++i) { |
116 | 144 |
117 while (m_columnQualities.size() <= i) { | 145 while (m_columnQualities.size() <= i) { |
118 m_columnQualities.push_back(defaultQualities); | 146 m_columnQualities.push_back(defaultQualities); |
119 m_prevValues.push_back(0.f); | 147 m_prevValues.push_back(0.f); |
120 } | 148 } |
121 | 149 |
122 QString s(list[i]); | 150 QString s(list[i]); |
123 bool ok = false; | 151 bool ok = false; |
124 | 152 |
125 ColumnQualities qualities = m_columnQualities[i]; | 153 ColumnQualities qualities = m_columnQualities[i]; |
126 | 154 |
155 // Looks like this is defined on Windows | |
156 #undef small | |
157 | |
127 bool numeric = (qualities & ColumnNumeric); | 158 bool numeric = (qualities & ColumnNumeric); |
128 bool integral = (qualities & ColumnIntegral); | 159 bool integral = (qualities & ColumnIntegral); |
129 bool increasing = (qualities & ColumnIncreasing); | 160 bool increasing = (qualities & ColumnIncreasing); |
161 bool small = (qualities & ColumnSmall); | |
130 bool large = (qualities & ColumnLarge); // this one defaults to off | 162 bool large = (qualities & ColumnLarge); // this one defaults to off |
163 bool signd = (qualities & ColumnSigned); // also defaults to off | |
131 bool emptyish = (qualities & ColumnNearEmpty); | 164 bool emptyish = (qualities & ColumnNearEmpty); |
132 | 165 |
133 if (lineno > 1 && s.trimmed() != "") { | 166 if (lineno > 1 && s.trimmed() != "") { |
134 emptyish = false; | 167 emptyish = false; |
135 } | 168 } |
142 value = s.toFloat(&ok); | 175 value = s.toFloat(&ok); |
143 if (!ok) { | 176 if (!ok) { |
144 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); | 177 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); |
145 } | 178 } |
146 if (ok) { | 179 if (ok) { |
147 if (lineno < 2 && value > 1000.f) large = true; | 180 if (lineno < 2 && value > 1000.f) { |
181 large = true; | |
182 } | |
183 if (value < 0.f) { | |
184 signd = true; | |
185 } | |
186 if (value < -1.f || value > 1.f) { | |
187 small = false; | |
188 } | |
148 } else { | 189 } else { |
149 numeric = false; | 190 numeric = false; |
191 | |
192 // If the column is not numeric, it can't be any of | |
193 // these things either | |
194 integral = false; | |
195 increasing = false; | |
196 small = false; | |
197 large = false; | |
198 signd = false; | |
150 } | 199 } |
151 } | 200 } |
152 | 201 |
153 if (numeric) { | 202 if (numeric) { |
154 | 203 |
164 } | 213 } |
165 } | 214 } |
166 | 215 |
167 m_prevValues[i] = value; | 216 m_prevValues[i] = value; |
168 } | 217 } |
169 | 218 |
170 m_columnQualities[i] = | 219 m_columnQualities[i] = |
171 (numeric ? ColumnNumeric : 0) | | 220 (numeric ? ColumnNumeric : 0) | |
172 (integral ? ColumnIntegral : 0) | | 221 (integral ? ColumnIntegral : 0) | |
173 (increasing ? ColumnIncreasing : 0) | | 222 (increasing ? ColumnIncreasing : 0) | |
223 (small ? ColumnSmall : 0) | | |
174 (large ? ColumnLarge : 0) | | 224 (large ? ColumnLarge : 0) | |
225 (signd ? ColumnSigned : 0) | | |
175 (emptyish ? ColumnNearEmpty : 0); | 226 (emptyish ? ColumnNearEmpty : 0); |
176 } | 227 } |
177 | 228 |
178 if (lineno < 10) { | 229 if (lineno < 10) { |
179 m_example.push_back(list); | 230 m_example.push_back(list); |
180 if (lineno == 0 || cols > m_maxExampleCols) { | 231 if (lineno == 0 || cols > m_maxExampleCols) { |
181 m_maxExampleCols = cols; | 232 m_maxExampleCols = cols; |
182 } | 233 } |
183 } | 234 } |
184 | 235 |
185 // cerr << "Estimated column qualities: "; | 236 if (lineno < 10) { |
186 // for (int i = 0; i < m_columnCount; ++i) { | 237 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; |
187 // cerr << int(m_columnQualities[i]) << " "; | 238 for (int i = 0; i < m_columnCount; ++i) { |
188 // } | 239 SVDEBUG << int(m_columnQualities[i]) << " "; |
189 // cerr << endl; | 240 } |
241 SVDEBUG << endl; | |
242 } | |
190 } | 243 } |
191 | 244 |
192 void | 245 void |
193 CSVFormat::guessPurposes() | 246 CSVFormat::guessPurposes() |
194 { | 247 { |
195 m_timingType = CSVFormat::ImplicitTiming; | 248 m_timingType = CSVFormat::ImplicitTiming; |
196 m_timeUnits = CSVFormat::TimeWindows; | 249 m_timeUnits = CSVFormat::TimeWindows; |
197 | 250 |
198 int timingColumnCount = 0; | 251 int timingColumnCount = 0; |
252 bool haveDurationOrEndTime = false; | |
253 | |
254 SVDEBUG << "Estimated column qualities overall: "; | |
255 for (int i = 0; i < m_columnCount; ++i) { | |
256 SVDEBUG << int(m_columnQualities[i]) << " "; | |
257 } | |
258 SVDEBUG << endl; | |
199 | 259 |
200 // if our first column has zero or one entries in it and the rest | 260 // if our first column has zero or one entries in it and the rest |
201 // have more, then we'll default to ignoring the first column and | 261 // have more, then we'll default to ignoring the first column and |
202 // counting the next one as primary. (e.g. Sonic Annotator output | 262 // counting the next one as primary. (e.g. Sonic Annotator output |
203 // with filename at start of first column.) | 263 // with filename at start of first column.) |
249 | 309 |
250 } else { | 310 } else { |
251 | 311 |
252 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { | 312 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { |
253 purpose = ColumnEndTime; | 313 purpose = ColumnEndTime; |
314 haveDurationOrEndTime = true; | |
254 } | 315 } |
255 } | 316 } |
256 } | 317 } |
257 | 318 |
258 if (purpose == ColumnUnknown) { | 319 if (purpose == ColumnUnknown) { |
292 timecol = b; | 353 timecol = b; |
293 } | 354 } |
294 if (m_columnQualities[timecol] & ColumnIncreasing) { | 355 if (m_columnQualities[timecol] & ColumnIncreasing) { |
295 // This shouldn't happen; should have been settled above | 356 // This shouldn't happen; should have been settled above |
296 m_columnPurposes[timecol] = ColumnEndTime; | 357 m_columnPurposes[timecol] = ColumnEndTime; |
358 haveDurationOrEndTime = true; | |
297 } else { | 359 } else { |
298 m_columnPurposes[timecol] = ColumnDuration; | 360 m_columnPurposes[timecol] = ColumnDuration; |
361 haveDurationOrEndTime = true; | |
299 } | 362 } |
300 --valueCount; | 363 --valueCount; |
301 } | 364 } |
302 } | 365 } |
303 } | 366 } |
304 | 367 |
305 if (timingColumnCount > 1) { | 368 if (timingColumnCount > 1 || haveDurationOrEndTime) { |
306 m_modelType = TwoDimensionalModelWithDuration; | 369 m_modelType = TwoDimensionalModelWithDuration; |
307 } else { | 370 } else { |
308 if (valueCount == 0) { | 371 if (valueCount == 0) { |
309 m_modelType = OneDimensionalModel; | 372 m_modelType = OneDimensionalModel; |
310 } else if (valueCount == 1) { | 373 } else if (valueCount == 1) { |
312 } else { | 375 } else { |
313 m_modelType = ThreeDimensionalModel; | 376 m_modelType = ThreeDimensionalModel; |
314 } | 377 } |
315 } | 378 } |
316 | 379 |
317 // cerr << "Estimated column purposes: "; | 380 SVDEBUG << "Estimated column purposes: "; |
318 // for (int i = 0; i < m_columnCount; ++i) { | 381 for (int i = 0; i < m_columnCount; ++i) { |
319 // cerr << int(m_columnPurposes[i]) << " "; | 382 SVDEBUG << int(m_columnPurposes[i]) << " "; |
320 // } | 383 } |
321 // cerr << endl; | 384 SVDEBUG << endl; |
322 | 385 |
323 // cerr << "Estimated model type: " << m_modelType << endl; | 386 SVDEBUG << "Estimated model type: " << m_modelType << endl; |
324 // cerr << "Estimated timing type: " << m_timingType << endl; | 387 SVDEBUG << "Estimated timing type: " << m_timingType << endl; |
325 // cerr << "Estimated units: " << m_timeUnits << endl; | 388 SVDEBUG << "Estimated units: " << m_timeUnits << endl; |
389 } | |
390 | |
391 void | |
392 CSVFormat::guessAudioSampleRange() | |
393 { | |
394 AudioSampleRange range = SampleRangeSigned1; | |
395 | |
396 range = SampleRangeSigned1; | |
397 bool knownSigned = false; | |
398 bool knownNonIntegral = false; | |
399 | |
400 SVDEBUG << "CSVFormat::guessAudioSampleRange: starting with assumption of " | |
401 << range << endl; | |
402 | |
403 for (int i = 0; i < m_columnCount; ++i) { | |
404 if (m_columnPurposes[i] != ColumnValue) { | |
405 SVDEBUG << "... column " << i | |
406 << " is not apparently a value, ignoring" << endl; | |
407 continue; | |
408 } | |
409 if (!(m_columnQualities[i] & ColumnIntegral)) { | |
410 knownNonIntegral = true; | |
411 if (range == SampleRangeUnsigned255 || | |
412 range == SampleRangeSigned32767) { | |
413 range = SampleRangeOther; | |
414 } | |
415 SVDEBUG << "... column " << i | |
416 << " is non-integral, updating range to " << range << endl; | |
417 } | |
418 if (m_columnQualities[i] & ColumnLarge) { | |
419 if (range == SampleRangeSigned1 || | |
420 range == SampleRangeUnsigned255) { | |
421 if (knownNonIntegral) { | |
422 range = SampleRangeOther; | |
423 } else { | |
424 range = SampleRangeSigned32767; | |
425 } | |
426 } | |
427 SVDEBUG << "... column " << i << " is large, updating range to " | |
428 << range << endl; | |
429 } | |
430 if (m_columnQualities[i] & ColumnSigned) { | |
431 knownSigned = true; | |
432 if (range == SampleRangeUnsigned255) { | |
433 range = SampleRangeSigned32767; | |
434 } | |
435 SVDEBUG << "... column " << i << " is signed, updating range to " | |
436 << range << endl; | |
437 } | |
438 if (!(m_columnQualities[i] & ColumnSmall)) { | |
439 if (range == SampleRangeSigned1) { | |
440 if (knownNonIntegral) { | |
441 range = SampleRangeOther; | |
442 } else if (knownSigned) { | |
443 range = SampleRangeSigned32767; | |
444 } else { | |
445 range = SampleRangeUnsigned255; | |
446 } | |
447 } | |
448 SVDEBUG << "... column " << i << " is not small, updating range to " | |
449 << range << endl; | |
450 } | |
451 } | |
452 | |
453 SVDEBUG << "CSVFormat::guessAudioSampleRange: ended up with range " | |
454 << range << endl; | |
455 | |
456 m_audioSampleRange = range; | |
326 } | 457 } |
327 | 458 |
328 CSVFormat::ColumnPurpose | 459 CSVFormat::ColumnPurpose |
329 CSVFormat::getColumnPurpose(int i) | 460 CSVFormat::getColumnPurpose(int i) |
330 { | 461 { |