Mercurial > hg > svcore
comparison data/fileio/CSVFormat.cpp @ 1874:48f50a4a82ea
Merge from branch csv-import-headers
author | Chris Cannam |
---|---|
date | Thu, 18 Jun 2020 13:43:20 +0100 |
parents | bed42ce4d3ab |
children |
comparison
equal
deleted
inserted
replaced
1868:44dba7cd9ec3 | 1874:48f50a4a82ea |
---|---|
29 | 29 |
30 CSVFormat::CSVFormat(QString path) : | 30 CSVFormat::CSVFormat(QString path) : |
31 m_separator(""), | 31 m_separator(""), |
32 m_sampleRate(44100), | 32 m_sampleRate(44100), |
33 m_windowSize(1024), | 33 m_windowSize(1024), |
34 m_allowQuoting(true) | 34 m_headerStatus(HeaderUnknown), |
35 m_allowQuoting(true), | |
36 m_maxExampleCols(0) | |
35 { | 37 { |
36 (void)guessFormatFor(path); | 38 (void)guessFormatFor(path); |
37 } | 39 } |
38 | 40 |
39 bool | 41 bool |
122 guessSeparator(line); | 124 guessSeparator(line); |
123 | 125 |
124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); | 126 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); |
125 | 127 |
126 int cols = list.size(); | 128 int cols = list.size(); |
127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; | 129 |
128 if (cols != m_columnCount) m_variableColumnCount = true; | 130 int firstLine = 0; |
131 if (m_headerStatus == HeaderPresent) { | |
132 firstLine = 1; | |
133 } | |
134 | |
135 if (lineno == firstLine || (cols > m_columnCount)) { | |
136 m_columnCount = cols; | |
137 } | |
138 if (cols != m_columnCount) { | |
139 m_variableColumnCount = true; | |
140 } | |
129 | 141 |
130 // All columns are regarded as having these qualities until we see | 142 // All columns are regarded as having these qualities until we see |
131 // something that indicates otherwise: | 143 // something that indicates otherwise: |
132 | 144 |
133 ColumnQualities defaultQualities = | 145 ColumnQualities defaultQualities = |
135 ColumnIncreasing | ColumnNearEmpty; | 147 ColumnIncreasing | ColumnNearEmpty; |
136 | 148 |
137 for (int i = 0; i < cols; ++i) { | 149 for (int i = 0; i < cols; ++i) { |
138 | 150 |
139 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; | 151 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; |
140 | 152 |
141 while (m_columnQualities.size() <= i) { | 153 if (m_columnQualities.find(i) == m_columnQualities.end()) { |
142 m_columnQualities.push_back(defaultQualities); | 154 m_columnQualities[i] = defaultQualities; |
143 m_prevValues.push_back(0.f); | 155 m_prevValues[i] = 0.f; |
144 } | 156 } |
145 | 157 |
146 QString s(list[i]); | 158 QString s(list[i]); |
147 bool ok = false; | 159 bool ok = false; |
148 | 160 |
159 bool signd = (qualities & ColumnSigned); // also defaults to off | 171 bool signd = (qualities & ColumnSigned); // also defaults to off |
160 bool emptyish = (qualities & ColumnNearEmpty); | 172 bool emptyish = (qualities & ColumnNearEmpty); |
161 | 173 |
162 if (s.trimmed() != "") { | 174 if (s.trimmed() != "") { |
163 | 175 |
164 if (lineno > 1) { | 176 if (lineno > firstLine) { |
165 emptyish = false; | 177 emptyish = false; |
166 } | 178 } |
167 | 179 |
168 float value = 0.f; | 180 float value = 0.f; |
169 | |
170 //!!! how to take into account headers? | |
171 | 181 |
172 if (numeric) { | 182 if (numeric) { |
173 value = s.toFloat(&ok); | 183 value = s.toFloat(&ok); |
174 if (!ok) { | 184 if (!ok) { |
175 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); | 185 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); |
176 } | 186 } |
177 if (ok) { | 187 if (ok) { |
178 if (lineno < 2 && value > 1000.f) { | 188 if (lineno < firstLine + 2 && value > 1000.f) { |
179 large = true; | 189 large = true; |
180 } | 190 } |
181 if (value < 0.f) { | 191 if (value < 0.f) { |
182 signd = true; | 192 signd = true; |
183 } | 193 } |
204 integral = false; | 214 integral = false; |
205 } | 215 } |
206 } | 216 } |
207 | 217 |
208 if (increasing) { | 218 if (increasing) { |
209 if (lineno > 0 && value <= m_prevValues[i]) { | 219 if (lineno > firstLine && value <= m_prevValues[i]) { |
210 increasing = false; | 220 increasing = false; |
211 } | 221 } |
212 } | 222 } |
213 | 223 |
214 m_prevValues[i] = value; | 224 m_prevValues[i] = value; |
223 (large ? ColumnLarge : 0) | | 233 (large ? ColumnLarge : 0) | |
224 (signd ? ColumnSigned : 0) | | 234 (signd ? ColumnSigned : 0) | |
225 (emptyish ? ColumnNearEmpty : 0); | 235 (emptyish ? ColumnNearEmpty : 0); |
226 } | 236 } |
227 | 237 |
228 if (lineno < 10) { | 238 if (lineno == 0 && m_headerStatus == HeaderUnknown) { |
239 // If we have at least one column, and every column has | |
240 // quality == ColumnNearEmpty, i.e. not empty and not numeric, | |
241 // then we probably have a header row | |
242 bool couldBeHeader = (cols > 0); | |
243 std::map<int, QString> headings; | |
244 for (int i = 0; i < cols; ++i) { | |
245 if (m_columnQualities[i] != ColumnNearEmpty) { | |
246 couldBeHeader = false; | |
247 } else { | |
248 headings[i] = list[i].trimmed().toLower(); | |
249 } | |
250 } | |
251 if (couldBeHeader) { | |
252 m_headerStatus = HeaderPresent; | |
253 m_columnHeadings = headings; | |
254 } else { | |
255 m_headerStatus = HeaderAbsent; | |
256 } | |
257 } | |
258 | |
259 if (lineno == 0 && m_headerStatus == HeaderPresent) { | |
260 // Start again with the qualities: | |
261 m_columnQualities.clear(); | |
262 m_prevValues.clear(); | |
263 } | |
264 | |
265 if (lineno < firstLine + 10) { | |
229 m_example.push_back(list); | 266 m_example.push_back(list); |
230 if (lineno == 0 || cols > m_maxExampleCols) { | 267 if (lineno == 0 || cols > m_maxExampleCols) { |
231 m_maxExampleCols = cols; | 268 m_maxExampleCols = cols; |
232 } | 269 } |
233 } | 270 } |
234 | 271 |
235 if (lineno < 10) { | 272 if (lineno < firstLine + 10) { |
236 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; | 273 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; |
237 for (int i = 0; i < m_columnCount; ++i) { | 274 if (lineno == 0 && m_headerStatus == HeaderPresent && |
238 SVDEBUG << int(m_columnQualities[i]) << " "; | 275 m_columnCount > 0 && m_columnQualities.empty()) { |
276 SVDEBUG << "[whole line classified as a header row]"; | |
277 } else { | |
278 for (int i = 0; i < cols; ++i) { | |
279 if (m_columnQualities.find(i) == m_columnQualities.end()) { | |
280 SVDEBUG << "(not set) "; | |
281 } else { | |
282 SVDEBUG << int(m_columnQualities[i]) << " "; | |
283 } | |
284 } | |
239 } | 285 } |
240 SVDEBUG << endl; | 286 SVDEBUG << endl; |
287 SVDEBUG << "Estimated header status: " << m_headerStatus << endl; | |
241 } | 288 } |
242 } | 289 } |
243 | 290 |
244 void | 291 void |
245 CSVFormat::guessPurposes() | 292 CSVFormat::guessPurposes() |
250 int timingColumnCount = 0; | 297 int timingColumnCount = 0; |
251 bool haveDurationOrEndTime = false; | 298 bool haveDurationOrEndTime = false; |
252 | 299 |
253 SVDEBUG << "Estimated column qualities overall: "; | 300 SVDEBUG << "Estimated column qualities overall: "; |
254 for (int i = 0; i < m_columnCount; ++i) { | 301 for (int i = 0; i < m_columnCount; ++i) { |
255 SVDEBUG << int(m_columnQualities[i]) << " "; | 302 if (m_columnQualities.find(i) == m_columnQualities.end()) { |
303 SVDEBUG << "(not set) "; | |
304 } else { | |
305 SVDEBUG << int(m_columnQualities[i]) << " "; | |
306 } | |
256 } | 307 } |
257 SVDEBUG << endl; | 308 SVDEBUG << endl; |
258 | 309 |
259 // if our first column has zero or one entries in it and the rest | 310 // if our first column has zero or one entries in it and the rest |
260 // have more, then we'll default to ignoring the first column and | 311 // have more, then we'll default to ignoring the first column and |
288 bool increasing = (qualities & ColumnIncreasing); | 339 bool increasing = (qualities & ColumnIncreasing); |
289 bool large = (qualities & ColumnLarge); | 340 bool large = (qualities & ColumnLarge); |
290 | 341 |
291 bool timingColumn = (numeric && increasing); | 342 bool timingColumn = (numeric && increasing); |
292 | 343 |
344 QString heading; | |
345 if (m_columnHeadings.find(i) != m_columnHeadings.end()) { | |
346 heading = m_columnHeadings[i]; | |
347 } | |
348 | |
349 if (heading == "time" || heading == "frame" || | |
350 heading == "duration" || heading == "endtime") { | |
351 timingColumn = true; | |
352 } | |
353 | |
354 if (heading == "value" || heading == "height" || heading == "label") { | |
355 timingColumn = false; | |
356 } | |
357 | |
293 if (timingColumn) { | 358 if (timingColumn) { |
294 | 359 |
295 ++timingColumnCount; | 360 ++timingColumnCount; |
361 | |
362 if (heading == "endtime") { | |
363 | |
364 purpose = ColumnEndTime; | |
365 haveDurationOrEndTime = true; | |
366 | |
367 } else if (heading == "duration") { | |
368 | |
369 purpose = ColumnDuration; | |
370 haveDurationOrEndTime = true; | |
296 | 371 |
297 if (primary) { | 372 } else if (primary || heading == "time" || heading == "frame") { |
298 | 373 |
299 purpose = ColumnStartTime; | 374 purpose = ColumnStartTime; |
300 | |
301 m_timingType = ExplicitTiming; | 375 m_timingType = ExplicitTiming; |
302 | 376 |
303 if (integral && large) { | 377 if ((integral && large) || heading == "frame") { |
304 m_timeUnits = TimeAudioFrames; | 378 m_timeUnits = TimeAudioFrames; |
305 } else { | 379 } else { |
306 m_timeUnits = TimeSeconds; | 380 m_timeUnits = TimeSeconds; |
307 } | 381 } |
308 | 382 |
309 } else { | 383 } else if (timingColumnCount == 2 && |
310 | 384 m_timingType == ExplicitTiming) { |
311 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { | 385 purpose = ColumnEndTime; |
312 purpose = ColumnEndTime; | 386 haveDurationOrEndTime = true; |
313 haveDurationOrEndTime = true; | |
314 } | |
315 } | 387 } |
316 } | 388 } |
317 | 389 |
318 if (purpose == ColumnUnknown) { | 390 if (purpose == ColumnUnknown) { |
319 if (numeric) { | 391 if (heading == "label") { |
392 purpose = ColumnLabel; | |
393 } else if (numeric || heading == "value" || heading == "height") { | |
320 purpose = ColumnValue; | 394 purpose = ColumnValue; |
321 } else { | 395 } else { |
322 purpose = ColumnLabel; | 396 purpose = ColumnLabel; |
323 } | 397 } |
324 } | 398 } |
326 setColumnPurpose(i, purpose); | 400 setColumnPurpose(i, purpose); |
327 } | 401 } |
328 | 402 |
329 int valueCount = 0; | 403 int valueCount = 0; |
330 for (int i = 0; i < m_columnCount; ++i) { | 404 for (int i = 0; i < m_columnCount; ++i) { |
331 if (m_columnPurposes[i] == ColumnValue) ++valueCount; | 405 if (m_columnPurposes[i] == ColumnValue) { |
406 ++valueCount; | |
407 } | |
332 } | 408 } |
333 | 409 |
334 if (valueCount == 2 && timingColumnCount == 1) { | 410 if (valueCount == 2 && timingColumnCount == 1) { |
335 // If we have exactly two apparent value columns and only one | 411 // If we have exactly two apparent value columns and only one |
336 // timing column, but one value column is integral and the | 412 // timing column, but one value column is integral and the |
453 << range << endl; | 529 << range << endl; |
454 | 530 |
455 m_audioSampleRange = range; | 531 m_audioSampleRange = range; |
456 } | 532 } |
457 | 533 |
458 CSVFormat::ColumnPurpose | 534 QList<CSVFormat::ColumnPurpose> |
459 CSVFormat::getColumnPurpose(int i) | 535 CSVFormat::getColumnPurposes() const |
460 { | 536 { |
461 while (m_columnPurposes.size() <= i) { | 537 QList<ColumnPurpose> purposes; |
462 m_columnPurposes.push_back(ColumnUnknown); | 538 for (int i = 0; i < m_columnCount; ++i) { |
463 } | 539 purposes.push_back(getColumnPurpose(i)); |
464 return m_columnPurposes[i]; | 540 } |
541 return purposes; | |
542 } | |
543 | |
544 void | |
545 CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl) | |
546 { | |
547 m_columnPurposes.clear(); | |
548 for (int i = 0; in_range_for(cl, i); ++i) { | |
549 m_columnPurposes[i] = cl[i]; | |
550 } | |
465 } | 551 } |
466 | 552 |
467 CSVFormat::ColumnPurpose | 553 CSVFormat::ColumnPurpose |
468 CSVFormat::getColumnPurpose(int i) const | 554 CSVFormat::getColumnPurpose(int i) const |
469 { | 555 { |
470 if (m_columnPurposes.size() <= i) { | 556 if (m_columnPurposes.find(i) == m_columnPurposes.end()) { |
471 return ColumnUnknown; | 557 return ColumnUnknown; |
472 } | 558 } else { |
473 return m_columnPurposes[i]; | 559 return m_columnPurposes.at(i); |
560 } | |
474 } | 561 } |
475 | 562 |
476 void | 563 void |
477 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) | 564 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) |
478 { | 565 { |
479 while (m_columnPurposes.size() <= i) { | |
480 m_columnPurposes.push_back(ColumnUnknown); | |
481 } | |
482 m_columnPurposes[i] = p; | 566 m_columnPurposes[i] = p; |
483 } | 567 } |
484 | 568 |
485 | 569 QList<CSVFormat::ColumnQualities> |
486 | 570 CSVFormat::getColumnQualities() const |
487 | 571 { |
572 QList<ColumnQualities> qualities; | |
573 for (int i = 0; i < m_columnCount; ++i) { | |
574 if (m_columnQualities.find(i) == m_columnQualities.end()) { | |
575 qualities.push_back(0); | |
576 } else { | |
577 qualities.push_back(m_columnQualities.at(i)); | |
578 } | |
579 } | |
580 return qualities; | |
581 } |