Mercurial > hg > svcore
comparison data/fileio/CSVFormat.cpp @ 1870:1b8c4ee06f6d csv-import-headers
Detect presence of header row in CSV format guesser; use headings to inform our guesses about column purposes; test this
author | Chris Cannam |
---|---|
date | Wed, 17 Jun 2020 18:01:00 +0100 |
parents | bde22957545e |
children | bed42ce4d3ab |
comparison legend: equal | deleted | inserted | replaced
1868:44dba7cd9ec3 | 1870:1b8c4ee06f6d |
---|---|
29 | 29 |
30 CSVFormat::CSVFormat(QString path) : | 30 CSVFormat::CSVFormat(QString path) : |
31 m_separator(""), | 31 m_separator(""), |
32 m_sampleRate(44100), | 32 m_sampleRate(44100), |
33 m_windowSize(1024), | 33 m_windowSize(1024), |
34 m_allowQuoting(true) | 34 m_headerStatus(HeaderUnknown), |
35 m_allowQuoting(true), | |
36 m_maxExampleCols(0) | |
35 { | 37 { |
36 (void)guessFormatFor(path); | 38 (void)guessFormatFor(path); |
37 } | 39 } |
38 | 40 |
39 bool | 41 bool |
122 guessSeparator(line); | 124 guessSeparator(line); |
123 | 125 |
124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); | 126 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); |
125 | 127 |
126 int cols = list.size(); | 128 int cols = list.size(); |
127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; | 129 |
128 if (cols != m_columnCount) m_variableColumnCount = true; | 130 int firstLine = 0; |
131 if (m_headerStatus == HeaderPresent) { | |
132 firstLine = 1; | |
133 } | |
134 | |
135 if (lineno == firstLine || (cols > m_columnCount)) { | |
136 m_columnCount = cols; | |
137 } | |
138 if (cols != m_columnCount) { | |
139 m_variableColumnCount = true; | |
140 } | |
129 | 141 |
130 // All columns are regarded as having these qualities until we see | 142 // All columns are regarded as having these qualities until we see |
131 // something that indicates otherwise: | 143 // something that indicates otherwise: |
132 | 144 |
133 ColumnQualities defaultQualities = | 145 ColumnQualities defaultQualities = |
135 ColumnIncreasing | ColumnNearEmpty; | 147 ColumnIncreasing | ColumnNearEmpty; |
136 | 148 |
137 for (int i = 0; i < cols; ++i) { | 149 for (int i = 0; i < cols; ++i) { |
138 | 150 |
139 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; | 151 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; |
140 | 152 |
141 while (m_columnQualities.size() <= i) { | 153 if (m_columnQualities.find(i) == m_columnQualities.end()) { |
142 m_columnQualities.push_back(defaultQualities); | 154 m_columnQualities[i] = defaultQualities; |
143 m_prevValues.push_back(0.f); | 155 m_prevValues[i] = 0.f; |
144 } | 156 } |
145 | 157 |
146 QString s(list[i]); | 158 QString s(list[i]); |
147 bool ok = false; | 159 bool ok = false; |
148 | 160 |
159 bool signd = (qualities & ColumnSigned); // also defaults to off | 171 bool signd = (qualities & ColumnSigned); // also defaults to off |
160 bool emptyish = (qualities & ColumnNearEmpty); | 172 bool emptyish = (qualities & ColumnNearEmpty); |
161 | 173 |
162 if (s.trimmed() != "") { | 174 if (s.trimmed() != "") { |
163 | 175 |
164 if (lineno > 1) { | 176 if (lineno > firstLine) { |
165 emptyish = false; | 177 emptyish = false; |
166 } | 178 } |
167 | 179 |
168 float value = 0.f; | 180 float value = 0.f; |
169 | |
170 //!!! how to take into account headers? | |
171 | 181 |
172 if (numeric) { | 182 if (numeric) { |
173 value = s.toFloat(&ok); | 183 value = s.toFloat(&ok); |
174 if (!ok) { | 184 if (!ok) { |
175 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); | 185 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); |
176 } | 186 } |
177 if (ok) { | 187 if (ok) { |
178 if (lineno < 2 && value > 1000.f) { | 188 if (lineno < firstLine + 2 && value > 1000.f) { |
179 large = true; | 189 large = true; |
180 } | 190 } |
181 if (value < 0.f) { | 191 if (value < 0.f) { |
182 signd = true; | 192 signd = true; |
183 } | 193 } |
204 integral = false; | 214 integral = false; |
205 } | 215 } |
206 } | 216 } |
207 | 217 |
208 if (increasing) { | 218 if (increasing) { |
209 if (lineno > 0 && value <= m_prevValues[i]) { | 219 if (lineno > firstLine && value <= m_prevValues[i]) { |
210 increasing = false; | 220 increasing = false; |
211 } | 221 } |
212 } | 222 } |
213 | 223 |
214 m_prevValues[i] = value; | 224 m_prevValues[i] = value; |
223 (large ? ColumnLarge : 0) | | 233 (large ? ColumnLarge : 0) | |
224 (signd ? ColumnSigned : 0) | | 234 (signd ? ColumnSigned : 0) | |
225 (emptyish ? ColumnNearEmpty : 0); | 235 (emptyish ? ColumnNearEmpty : 0); |
226 } | 236 } |
227 | 237 |
228 if (lineno < 10) { | 238 if (lineno == 0 && m_headerStatus == HeaderUnknown) { |
239 // If we have at least one column, and every column has | |
240 // quality == ColumnNearEmpty, i.e. not empty and not numeric, | |
241 // then we probably have a header row | |
242 bool couldBeHeader = (cols > 0); | |
243 std::map<int, QString> headings; | |
244 for (int i = 0; i < cols; ++i) { | |
245 if (m_columnQualities[i] != ColumnNearEmpty) { | |
246 couldBeHeader = false; | |
247 } else { | |
248 headings[i] = list[i].trimmed().toLower(); | |
249 } | |
250 } | |
251 if (couldBeHeader) { | |
252 m_headerStatus = HeaderPresent; | |
253 m_columnHeadings = headings; | |
254 } else { | |
255 m_headerStatus = HeaderAbsent; | |
256 } | |
257 } | |
258 | |
259 if (lineno == 0 && m_headerStatus == HeaderPresent) { | |
260 // Start again with the qualities: | |
261 m_columnQualities.clear(); | |
262 m_prevValues.clear(); | |
263 } else if (lineno < firstLine + 10) { | |
264 // Not a header row, so add it to the example column output | |
229 m_example.push_back(list); | 265 m_example.push_back(list); |
230 if (lineno == 0 || cols > m_maxExampleCols) { | 266 if (lineno == firstLine || cols > m_maxExampleCols) { |
231 m_maxExampleCols = cols; | 267 m_maxExampleCols = cols; |
232 } | 268 } |
233 } | 269 } |
234 | 270 |
235 if (lineno < 10) { | 271 if (lineno < firstLine + 10) { |
236 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; | 272 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; |
237 for (int i = 0; i < m_columnCount; ++i) { | 273 if (lineno == 0 && m_headerStatus == HeaderPresent && |
238 SVDEBUG << int(m_columnQualities[i]) << " "; | 274 m_columnCount > 0 && m_columnQualities.empty()) { |
275 SVDEBUG << "[whole line classified as a header row]"; | |
276 } else { | |
277 for (int i = 0; i < cols; ++i) { | |
278 if (m_columnQualities.find(i) == m_columnQualities.end()) { | |
279 SVDEBUG << "(not set) "; | |
280 } else { | |
281 SVDEBUG << int(m_columnQualities[i]) << " "; | |
282 } | |
283 } | |
239 } | 284 } |
240 SVDEBUG << endl; | 285 SVDEBUG << endl; |
286 SVDEBUG << "Estimated header status: " << m_headerStatus << endl; | |
241 } | 287 } |
242 } | 288 } |
243 | 289 |
244 void | 290 void |
245 CSVFormat::guessPurposes() | 291 CSVFormat::guessPurposes() |
250 int timingColumnCount = 0; | 296 int timingColumnCount = 0; |
251 bool haveDurationOrEndTime = false; | 297 bool haveDurationOrEndTime = false; |
252 | 298 |
253 SVDEBUG << "Estimated column qualities overall: "; | 299 SVDEBUG << "Estimated column qualities overall: "; |
254 for (int i = 0; i < m_columnCount; ++i) { | 300 for (int i = 0; i < m_columnCount; ++i) { |
255 SVDEBUG << int(m_columnQualities[i]) << " "; | 301 if (m_columnQualities.find(i) == m_columnQualities.end()) { |
302 SVDEBUG << "(not set) "; | |
303 } else { | |
304 SVDEBUG << int(m_columnQualities[i]) << " "; | |
305 } | |
256 } | 306 } |
257 SVDEBUG << endl; | 307 SVDEBUG << endl; |
258 | 308 |
259 // if our first column has zero or one entries in it and the rest | 309 // if our first column has zero or one entries in it and the rest |
260 // have more, then we'll default to ignoring the first column and | 310 // have more, then we'll default to ignoring the first column and |
288 bool increasing = (qualities & ColumnIncreasing); | 338 bool increasing = (qualities & ColumnIncreasing); |
289 bool large = (qualities & ColumnLarge); | 339 bool large = (qualities & ColumnLarge); |
290 | 340 |
291 bool timingColumn = (numeric && increasing); | 341 bool timingColumn = (numeric && increasing); |
292 | 342 |
343 QString heading; | |
344 if (m_columnHeadings.find(i) != m_columnHeadings.end()) { | |
345 heading = m_columnHeadings[i]; | |
346 } | |
347 | |
348 if (heading == "time" || heading == "frame" || | |
349 heading == "duration" || heading == "endtime") { | |
350 timingColumn = true; | |
351 } | |
352 | |
353 if (heading == "value" || heading == "height" || heading == "label") { | |
354 timingColumn = false; | |
355 } | |
356 | |
293 if (timingColumn) { | 357 if (timingColumn) { |
294 | 358 |
295 ++timingColumnCount; | 359 ++timingColumnCount; |
360 | |
361 if (heading == "endtime") { | |
362 | |
363 purpose = ColumnEndTime; | |
364 haveDurationOrEndTime = true; | |
365 | |
366 } else if (heading == "duration") { | |
367 | |
368 purpose = ColumnDuration; | |
369 haveDurationOrEndTime = true; | |
296 | 370 |
297 if (primary) { | 371 } else if (primary || heading == "time" || heading == "frame") { |
298 | 372 |
299 purpose = ColumnStartTime; | 373 purpose = ColumnStartTime; |
300 | |
301 m_timingType = ExplicitTiming; | 374 m_timingType = ExplicitTiming; |
302 | 375 |
303 if (integral && large) { | 376 if ((integral && large) || heading == "frame") { |
304 m_timeUnits = TimeAudioFrames; | 377 m_timeUnits = TimeAudioFrames; |
305 } else { | 378 } else { |
306 m_timeUnits = TimeSeconds; | 379 m_timeUnits = TimeSeconds; |
307 } | 380 } |
308 | 381 |
309 } else { | 382 } else if (timingColumnCount == 2 && |
310 | 383 m_timingType == ExplicitTiming) { |
311 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { | 384 purpose = ColumnEndTime; |
312 purpose = ColumnEndTime; | 385 haveDurationOrEndTime = true; |
313 haveDurationOrEndTime = true; | |
314 } | |
315 } | 386 } |
316 } | 387 } |
317 | 388 |
318 if (purpose == ColumnUnknown) { | 389 if (purpose == ColumnUnknown) { |
319 if (numeric) { | 390 if (heading == "label") { |
391 purpose = ColumnLabel; | |
392 } else if (numeric || heading == "value" || heading == "height") { | |
320 purpose = ColumnValue; | 393 purpose = ColumnValue; |
321 } else { | 394 } else { |
322 purpose = ColumnLabel; | 395 purpose = ColumnLabel; |
323 } | 396 } |
324 } | 397 } |
326 setColumnPurpose(i, purpose); | 399 setColumnPurpose(i, purpose); |
327 } | 400 } |
328 | 401 |
329 int valueCount = 0; | 402 int valueCount = 0; |
330 for (int i = 0; i < m_columnCount; ++i) { | 403 for (int i = 0; i < m_columnCount; ++i) { |
331 if (m_columnPurposes[i] == ColumnValue) ++valueCount; | 404 if (m_columnPurposes[i] == ColumnValue) { |
405 ++valueCount; | |
406 } | |
332 } | 407 } |
333 | 408 |
334 if (valueCount == 2 && timingColumnCount == 1) { | 409 if (valueCount == 2 && timingColumnCount == 1) { |
335 // If we have exactly two apparent value columns and only one | 410 // If we have exactly two apparent value columns and only one |
336 // timing column, but one value column is integral and the | 411 // timing column, but one value column is integral and the |
453 << range << endl; | 528 << range << endl; |
454 | 529 |
455 m_audioSampleRange = range; | 530 m_audioSampleRange = range; |
456 } | 531 } |
457 | 532 |
458 CSVFormat::ColumnPurpose | 533 QList<CSVFormat::ColumnPurpose> |
459 CSVFormat::getColumnPurpose(int i) | 534 CSVFormat::getColumnPurposes() const |
460 { | 535 { |
461 while (m_columnPurposes.size() <= i) { | 536 QList<ColumnPurpose> purposes; |
462 m_columnPurposes.push_back(ColumnUnknown); | 537 for (int i = 0; i < m_columnCount; ++i) { |
463 } | 538 purposes.push_back(getColumnPurpose(i)); |
464 return m_columnPurposes[i]; | 539 } |
540 return purposes; | |
541 } | |
542 | |
543 void | |
544 CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl) | |
545 { | |
546 m_columnPurposes.clear(); | |
547 for (int i = 0; in_range_for(cl, i); ++i) { | |
548 m_columnPurposes[i] = cl[i]; | |
549 } | |
465 } | 550 } |
466 | 551 |
467 CSVFormat::ColumnPurpose | 552 CSVFormat::ColumnPurpose |
468 CSVFormat::getColumnPurpose(int i) const | 553 CSVFormat::getColumnPurpose(int i) const |
469 { | 554 { |
470 if (m_columnPurposes.size() <= i) { | 555 if (m_columnPurposes.find(i) == m_columnPurposes.end()) { |
471 return ColumnUnknown; | 556 return ColumnUnknown; |
472 } | 557 } else { |
473 return m_columnPurposes[i]; | 558 return m_columnPurposes.at(i); |
559 } | |
474 } | 560 } |
475 | 561 |
476 void | 562 void |
477 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) | 563 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) |
478 { | 564 { |
479 while (m_columnPurposes.size() <= i) { | |
480 m_columnPurposes.push_back(ColumnUnknown); | |
481 } | |
482 m_columnPurposes[i] = p; | 565 m_columnPurposes[i] = p; |
483 } | 566 } |
484 | 567 |
485 | 568 QList<CSVFormat::ColumnQualities> |
486 | 569 CSVFormat::getColumnQualities() const |
487 | 570 { |
571 QList<ColumnQualities> qualities; | |
572 for (int i = 0; i < m_columnCount; ++i) { | |
573 if (m_columnQualities.find(i) == m_columnQualities.end()) { | |
574 qualities.push_back(0); | |
575 } else { | |
576 qualities.push_back(m_columnQualities.at(i)); | |
577 } | |
578 } | |
579 return qualities; | |
580 } |