comparison data/fileio/CSVFormat.cpp @ 1870:1b8c4ee06f6d csv-import-headers

Detect presence of header row in CSV format guesser; use headings to inform our guesses about column purposes; test this
author Chris Cannam
date Wed, 17 Jun 2020 18:01:00 +0100
parents bde22957545e
children bed42ce4d3ab
comparison
equal deleted inserted replaced
1868:44dba7cd9ec3 1870:1b8c4ee06f6d
29 29
30 CSVFormat::CSVFormat(QString path) : 30 CSVFormat::CSVFormat(QString path) :
31 m_separator(""), 31 m_separator(""),
32 m_sampleRate(44100), 32 m_sampleRate(44100),
33 m_windowSize(1024), 33 m_windowSize(1024),
34 m_allowQuoting(true) 34 m_headerStatus(HeaderUnknown),
35 m_allowQuoting(true),
36 m_maxExampleCols(0)
35 { 37 {
36 (void)guessFormatFor(path); 38 (void)guessFormatFor(path);
37 } 39 }
38 40
39 bool 41 bool
122 guessSeparator(line); 124 guessSeparator(line);
123 125
124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); 126 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
125 127
126 int cols = list.size(); 128 int cols = list.size();
127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; 129
128 if (cols != m_columnCount) m_variableColumnCount = true; 130 int firstLine = 0;
131 if (m_headerStatus == HeaderPresent) {
132 firstLine = 1;
133 }
134
135 if (lineno == firstLine || (cols > m_columnCount)) {
136 m_columnCount = cols;
137 }
138 if (cols != m_columnCount) {
139 m_variableColumnCount = true;
140 }
129 141
130 // All columns are regarded as having these qualities until we see 142 // All columns are regarded as having these qualities until we see
131 // something that indicates otherwise: 143 // something that indicates otherwise:
132 144
133 ColumnQualities defaultQualities = 145 ColumnQualities defaultQualities =
135 ColumnIncreasing | ColumnNearEmpty; 147 ColumnIncreasing | ColumnNearEmpty;
136 148
137 for (int i = 0; i < cols; ++i) { 149 for (int i = 0; i < cols; ++i) {
138 150
139 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; 151 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
140 152
141 while (m_columnQualities.size() <= i) { 153 if (m_columnQualities.find(i) == m_columnQualities.end()) {
142 m_columnQualities.push_back(defaultQualities); 154 m_columnQualities[i] = defaultQualities;
143 m_prevValues.push_back(0.f); 155 m_prevValues[i] = 0.f;
144 } 156 }
145 157
146 QString s(list[i]); 158 QString s(list[i]);
147 bool ok = false; 159 bool ok = false;
148 160
159 bool signd = (qualities & ColumnSigned); // also defaults to off 171 bool signd = (qualities & ColumnSigned); // also defaults to off
160 bool emptyish = (qualities & ColumnNearEmpty); 172 bool emptyish = (qualities & ColumnNearEmpty);
161 173
162 if (s.trimmed() != "") { 174 if (s.trimmed() != "") {
163 175
164 if (lineno > 1) { 176 if (lineno > firstLine) {
165 emptyish = false; 177 emptyish = false;
166 } 178 }
167 179
168 float value = 0.f; 180 float value = 0.f;
169
170 //!!! how to take into account headers?
171 181
172 if (numeric) { 182 if (numeric) {
173 value = s.toFloat(&ok); 183 value = s.toFloat(&ok);
174 if (!ok) { 184 if (!ok) {
175 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); 185 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
176 } 186 }
177 if (ok) { 187 if (ok) {
178 if (lineno < 2 && value > 1000.f) { 188 if (lineno < firstLine + 2 && value > 1000.f) {
179 large = true; 189 large = true;
180 } 190 }
181 if (value < 0.f) { 191 if (value < 0.f) {
182 signd = true; 192 signd = true;
183 } 193 }
204 integral = false; 214 integral = false;
205 } 215 }
206 } 216 }
207 217
208 if (increasing) { 218 if (increasing) {
209 if (lineno > 0 && value <= m_prevValues[i]) { 219 if (lineno > firstLine && value <= m_prevValues[i]) {
210 increasing = false; 220 increasing = false;
211 } 221 }
212 } 222 }
213 223
214 m_prevValues[i] = value; 224 m_prevValues[i] = value;
223 (large ? ColumnLarge : 0) | 233 (large ? ColumnLarge : 0) |
224 (signd ? ColumnSigned : 0) | 234 (signd ? ColumnSigned : 0) |
225 (emptyish ? ColumnNearEmpty : 0); 235 (emptyish ? ColumnNearEmpty : 0);
226 } 236 }
227 237
228 if (lineno < 10) { 238 if (lineno == 0 && m_headerStatus == HeaderUnknown) {
239 // If we have at least one column, and every column has
240 // quality == ColumnNearEmpty, i.e. not empty and not numeric,
241 // then we probably have a header row
242 bool couldBeHeader = (cols > 0);
243 std::map<int, QString> headings;
244 for (int i = 0; i < cols; ++i) {
245 if (m_columnQualities[i] != ColumnNearEmpty) {
246 couldBeHeader = false;
247 } else {
248 headings[i] = list[i].trimmed().toLower();
249 }
250 }
251 if (couldBeHeader) {
252 m_headerStatus = HeaderPresent;
253 m_columnHeadings = headings;
254 } else {
255 m_headerStatus = HeaderAbsent;
256 }
257 }
258
259 if (lineno == 0 && m_headerStatus == HeaderPresent) {
260 // Start again with the qualities:
261 m_columnQualities.clear();
262 m_prevValues.clear();
263 } else if (lineno < firstLine + 10) {
264 // Not a header row, so add it to the example column output
229 m_example.push_back(list); 265 m_example.push_back(list);
230 if (lineno == 0 || cols > m_maxExampleCols) { 266 if (lineno == firstLine || cols > m_maxExampleCols) {
231 m_maxExampleCols = cols; 267 m_maxExampleCols = cols;
232 } 268 }
233 } 269 }
234 270
235 if (lineno < 10) { 271 if (lineno < firstLine + 10) {
236 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; 272 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
237 for (int i = 0; i < m_columnCount; ++i) { 273 if (lineno == 0 && m_headerStatus == HeaderPresent &&
238 SVDEBUG << int(m_columnQualities[i]) << " "; 274 m_columnCount > 0 && m_columnQualities.empty()) {
275 SVDEBUG << "[whole line classified as a header row]";
276 } else {
277 for (int i = 0; i < cols; ++i) {
278 if (m_columnQualities.find(i) == m_columnQualities.end()) {
279 SVDEBUG << "(not set) ";
280 } else {
281 SVDEBUG << int(m_columnQualities[i]) << " ";
282 }
283 }
239 } 284 }
240 SVDEBUG << endl; 285 SVDEBUG << endl;
286 SVDEBUG << "Estimated header status: " << m_headerStatus << endl;
241 } 287 }
242 } 288 }
243 289
244 void 290 void
245 CSVFormat::guessPurposes() 291 CSVFormat::guessPurposes()
250 int timingColumnCount = 0; 296 int timingColumnCount = 0;
251 bool haveDurationOrEndTime = false; 297 bool haveDurationOrEndTime = false;
252 298
253 SVDEBUG << "Estimated column qualities overall: "; 299 SVDEBUG << "Estimated column qualities overall: ";
254 for (int i = 0; i < m_columnCount; ++i) { 300 for (int i = 0; i < m_columnCount; ++i) {
255 SVDEBUG << int(m_columnQualities[i]) << " "; 301 if (m_columnQualities.find(i) == m_columnQualities.end()) {
302 SVDEBUG << "(not set) ";
303 } else {
304 SVDEBUG << int(m_columnQualities[i]) << " ";
305 }
256 } 306 }
257 SVDEBUG << endl; 307 SVDEBUG << endl;
258 308
259 // if our first column has zero or one entries in it and the rest 309 // if our first column has zero or one entries in it and the rest
260 // have more, then we'll default to ignoring the first column and 310 // have more, then we'll default to ignoring the first column and
288 bool increasing = (qualities & ColumnIncreasing); 338 bool increasing = (qualities & ColumnIncreasing);
289 bool large = (qualities & ColumnLarge); 339 bool large = (qualities & ColumnLarge);
290 340
291 bool timingColumn = (numeric && increasing); 341 bool timingColumn = (numeric && increasing);
292 342
343 QString heading;
344 if (m_columnHeadings.find(i) != m_columnHeadings.end()) {
345 heading = m_columnHeadings[i];
346 }
347
348 if (heading == "time" || heading == "frame" ||
349 heading == "duration" || heading == "endtime") {
350 timingColumn = true;
351 }
352
353 if (heading == "value" || heading == "height" || heading == "label") {
354 timingColumn = false;
355 }
356
293 if (timingColumn) { 357 if (timingColumn) {
294 358
295 ++timingColumnCount; 359 ++timingColumnCount;
360
361 if (heading == "endtime") {
362
363 purpose = ColumnEndTime;
364 haveDurationOrEndTime = true;
365
366 } else if (heading == "duration") {
367
368 purpose = ColumnDuration;
369 haveDurationOrEndTime = true;
296 370
297 if (primary) { 371 } else if (primary || heading == "time" || heading == "frame") {
298 372
299 purpose = ColumnStartTime; 373 purpose = ColumnStartTime;
300
301 m_timingType = ExplicitTiming; 374 m_timingType = ExplicitTiming;
302 375
303 if (integral && large) { 376 if ((integral && large) || heading == "frame") {
304 m_timeUnits = TimeAudioFrames; 377 m_timeUnits = TimeAudioFrames;
305 } else { 378 } else {
306 m_timeUnits = TimeSeconds; 379 m_timeUnits = TimeSeconds;
307 } 380 }
308 381
309 } else { 382 } else if (timingColumnCount == 2 &&
310 383 m_timingType == ExplicitTiming) {
311 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { 384 purpose = ColumnEndTime;
312 purpose = ColumnEndTime; 385 haveDurationOrEndTime = true;
313 haveDurationOrEndTime = true;
314 }
315 } 386 }
316 } 387 }
317 388
318 if (purpose == ColumnUnknown) { 389 if (purpose == ColumnUnknown) {
319 if (numeric) { 390 if (heading == "label") {
391 purpose = ColumnLabel;
392 } else if (numeric || heading == "value" || heading == "height") {
320 purpose = ColumnValue; 393 purpose = ColumnValue;
321 } else { 394 } else {
322 purpose = ColumnLabel; 395 purpose = ColumnLabel;
323 } 396 }
324 } 397 }
326 setColumnPurpose(i, purpose); 399 setColumnPurpose(i, purpose);
327 } 400 }
328 401
329 int valueCount = 0; 402 int valueCount = 0;
330 for (int i = 0; i < m_columnCount; ++i) { 403 for (int i = 0; i < m_columnCount; ++i) {
331 if (m_columnPurposes[i] == ColumnValue) ++valueCount; 404 if (m_columnPurposes[i] == ColumnValue) {
405 ++valueCount;
406 }
332 } 407 }
333 408
334 if (valueCount == 2 && timingColumnCount == 1) { 409 if (valueCount == 2 && timingColumnCount == 1) {
335 // If we have exactly two apparent value columns and only one 410 // If we have exactly two apparent value columns and only one
336 // timing column, but one value column is integral and the 411 // timing column, but one value column is integral and the
453 << range << endl; 528 << range << endl;
454 529
455 m_audioSampleRange = range; 530 m_audioSampleRange = range;
456 } 531 }
457 532
458 CSVFormat::ColumnPurpose 533 QList<CSVFormat::ColumnPurpose>
459 CSVFormat::getColumnPurpose(int i) 534 CSVFormat::getColumnPurposes() const
460 { 535 {
461 while (m_columnPurposes.size() <= i) { 536 QList<ColumnPurpose> purposes;
462 m_columnPurposes.push_back(ColumnUnknown); 537 for (int i = 0; i < m_columnCount; ++i) {
463 } 538 purposes.push_back(getColumnPurpose(i));
464 return m_columnPurposes[i]; 539 }
540 return purposes;
541 }
542
543 void
544 CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl)
545 {
546 m_columnPurposes.clear();
547 for (int i = 0; in_range_for(cl, i); ++i) {
548 m_columnPurposes[i] = cl[i];
549 }
465 } 550 }
466 551
467 CSVFormat::ColumnPurpose 552 CSVFormat::ColumnPurpose
468 CSVFormat::getColumnPurpose(int i) const 553 CSVFormat::getColumnPurpose(int i) const
469 { 554 {
470 if (m_columnPurposes.size() <= i) { 555 if (m_columnPurposes.find(i) == m_columnPurposes.end()) {
471 return ColumnUnknown; 556 return ColumnUnknown;
472 } 557 } else {
473 return m_columnPurposes[i]; 558 return m_columnPurposes.at(i);
559 }
474 } 560 }
475 561
476 void 562 void
477 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) 563 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
478 { 564 {
479 while (m_columnPurposes.size() <= i) {
480 m_columnPurposes.push_back(ColumnUnknown);
481 }
482 m_columnPurposes[i] = p; 565 m_columnPurposes[i] = p;
483 } 566 }
484 567
485 568 QList<CSVFormat::ColumnQualities>
486 569 CSVFormat::getColumnQualities() const
487 570 {
571 QList<ColumnQualities> qualities;
572 for (int i = 0; i < m_columnCount; ++i) {
573 if (m_columnQualities.find(i) == m_columnQualities.end()) {
574 qualities.push_back(0);
575 } else {
576 qualities.push_back(m_columnQualities.at(i));
577 }
578 }
579 return qualities;
580 }