comparison data/fileio/CSVFormat.cpp @ 1874:48f50a4a82ea

Merge from branch csv-import-headers
author Chris Cannam
date Thu, 18 Jun 2020 13:43:20 +0100
parents bed42ce4d3ab
children
comparison
equal deleted inserted replaced
1868:44dba7cd9ec3 1874:48f50a4a82ea
29 29
30 CSVFormat::CSVFormat(QString path) : 30 CSVFormat::CSVFormat(QString path) :
31 m_separator(""), 31 m_separator(""),
32 m_sampleRate(44100), 32 m_sampleRate(44100),
33 m_windowSize(1024), 33 m_windowSize(1024),
34 m_allowQuoting(true) 34 m_headerStatus(HeaderUnknown),
35 m_allowQuoting(true),
36 m_maxExampleCols(0)
35 { 37 {
36 (void)guessFormatFor(path); 38 (void)guessFormatFor(path);
37 } 39 }
38 40
39 bool 41 bool
122 guessSeparator(line); 124 guessSeparator(line);
123 125
124 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting); 126 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
125 127
126 int cols = list.size(); 128 int cols = list.size();
127 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols; 129
128 if (cols != m_columnCount) m_variableColumnCount = true; 130 int firstLine = 0;
131 if (m_headerStatus == HeaderPresent) {
132 firstLine = 1;
133 }
134
135 if (lineno == firstLine || (cols > m_columnCount)) {
136 m_columnCount = cols;
137 }
138 if (cols != m_columnCount) {
139 m_variableColumnCount = true;
140 }
129 141
130 // All columns are regarded as having these qualities until we see 142 // All columns are regarded as having these qualities until we see
131 // something that indicates otherwise: 143 // something that indicates otherwise:
132 144
133 ColumnQualities defaultQualities = 145 ColumnQualities defaultQualities =
135 ColumnIncreasing | ColumnNearEmpty; 147 ColumnIncreasing | ColumnNearEmpty;
136 148
137 for (int i = 0; i < cols; ++i) { 149 for (int i = 0; i < cols; ++i) {
138 150
139 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl; 151 SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
140 152
141 while (m_columnQualities.size() <= i) { 153 if (m_columnQualities.find(i) == m_columnQualities.end()) {
142 m_columnQualities.push_back(defaultQualities); 154 m_columnQualities[i] = defaultQualities;
143 m_prevValues.push_back(0.f); 155 m_prevValues[i] = 0.f;
144 } 156 }
145 157
146 QString s(list[i]); 158 QString s(list[i]);
147 bool ok = false; 159 bool ok = false;
148 160
159 bool signd = (qualities & ColumnSigned); // also defaults to off 171 bool signd = (qualities & ColumnSigned); // also defaults to off
160 bool emptyish = (qualities & ColumnNearEmpty); 172 bool emptyish = (qualities & ColumnNearEmpty);
161 173
162 if (s.trimmed() != "") { 174 if (s.trimmed() != "") {
163 175
164 if (lineno > 1) { 176 if (lineno > firstLine) {
165 emptyish = false; 177 emptyish = false;
166 } 178 }
167 179
168 float value = 0.f; 180 float value = 0.f;
169
170 //!!! how to take into account headers?
171 181
172 if (numeric) { 182 if (numeric) {
173 value = s.toFloat(&ok); 183 value = s.toFloat(&ok);
174 if (!ok) { 184 if (!ok) {
175 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); 185 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
176 } 186 }
177 if (ok) { 187 if (ok) {
178 if (lineno < 2 && value > 1000.f) { 188 if (lineno < firstLine + 2 && value > 1000.f) {
179 large = true; 189 large = true;
180 } 190 }
181 if (value < 0.f) { 191 if (value < 0.f) {
182 signd = true; 192 signd = true;
183 } 193 }
204 integral = false; 214 integral = false;
205 } 215 }
206 } 216 }
207 217
208 if (increasing) { 218 if (increasing) {
209 if (lineno > 0 && value <= m_prevValues[i]) { 219 if (lineno > firstLine && value <= m_prevValues[i]) {
210 increasing = false; 220 increasing = false;
211 } 221 }
212 } 222 }
213 223
214 m_prevValues[i] = value; 224 m_prevValues[i] = value;
223 (large ? ColumnLarge : 0) | 233 (large ? ColumnLarge : 0) |
224 (signd ? ColumnSigned : 0) | 234 (signd ? ColumnSigned : 0) |
225 (emptyish ? ColumnNearEmpty : 0); 235 (emptyish ? ColumnNearEmpty : 0);
226 } 236 }
227 237
228 if (lineno < 10) { 238 if (lineno == 0 && m_headerStatus == HeaderUnknown) {
239 // If we have at least one column, and every column has
240 // quality == ColumnNearEmpty, i.e. not empty and not numeric,
241 // then we probably have a header row
242 bool couldBeHeader = (cols > 0);
243 std::map<int, QString> headings;
244 for (int i = 0; i < cols; ++i) {
245 if (m_columnQualities[i] != ColumnNearEmpty) {
246 couldBeHeader = false;
247 } else {
248 headings[i] = list[i].trimmed().toLower();
249 }
250 }
251 if (couldBeHeader) {
252 m_headerStatus = HeaderPresent;
253 m_columnHeadings = headings;
254 } else {
255 m_headerStatus = HeaderAbsent;
256 }
257 }
258
259 if (lineno == 0 && m_headerStatus == HeaderPresent) {
260 // Start again with the qualities:
261 m_columnQualities.clear();
262 m_prevValues.clear();
263 }
264
265 if (lineno < firstLine + 10) {
229 m_example.push_back(list); 266 m_example.push_back(list);
230 if (lineno == 0 || cols > m_maxExampleCols) { 267 if (lineno == 0 || cols > m_maxExampleCols) {
231 m_maxExampleCols = cols; 268 m_maxExampleCols = cols;
232 } 269 }
233 } 270 }
234 271
235 if (lineno < 10) { 272 if (lineno < firstLine + 10) {
236 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): "; 273 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
237 for (int i = 0; i < m_columnCount; ++i) { 274 if (lineno == 0 && m_headerStatus == HeaderPresent &&
238 SVDEBUG << int(m_columnQualities[i]) << " "; 275 m_columnCount > 0 && m_columnQualities.empty()) {
276 SVDEBUG << "[whole line classified as a header row]";
277 } else {
278 for (int i = 0; i < cols; ++i) {
279 if (m_columnQualities.find(i) == m_columnQualities.end()) {
280 SVDEBUG << "(not set) ";
281 } else {
282 SVDEBUG << int(m_columnQualities[i]) << " ";
283 }
284 }
239 } 285 }
240 SVDEBUG << endl; 286 SVDEBUG << endl;
287 SVDEBUG << "Estimated header status: " << m_headerStatus << endl;
241 } 288 }
242 } 289 }
243 290
244 void 291 void
245 CSVFormat::guessPurposes() 292 CSVFormat::guessPurposes()
250 int timingColumnCount = 0; 297 int timingColumnCount = 0;
251 bool haveDurationOrEndTime = false; 298 bool haveDurationOrEndTime = false;
252 299
253 SVDEBUG << "Estimated column qualities overall: "; 300 SVDEBUG << "Estimated column qualities overall: ";
254 for (int i = 0; i < m_columnCount; ++i) { 301 for (int i = 0; i < m_columnCount; ++i) {
255 SVDEBUG << int(m_columnQualities[i]) << " "; 302 if (m_columnQualities.find(i) == m_columnQualities.end()) {
303 SVDEBUG << "(not set) ";
304 } else {
305 SVDEBUG << int(m_columnQualities[i]) << " ";
306 }
256 } 307 }
257 SVDEBUG << endl; 308 SVDEBUG << endl;
258 309
259 // if our first column has zero or one entries in it and the rest 310 // if our first column has zero or one entries in it and the rest
260 // have more, then we'll default to ignoring the first column and 311 // have more, then we'll default to ignoring the first column and
288 bool increasing = (qualities & ColumnIncreasing); 339 bool increasing = (qualities & ColumnIncreasing);
289 bool large = (qualities & ColumnLarge); 340 bool large = (qualities & ColumnLarge);
290 341
291 bool timingColumn = (numeric && increasing); 342 bool timingColumn = (numeric && increasing);
292 343
344 QString heading;
345 if (m_columnHeadings.find(i) != m_columnHeadings.end()) {
346 heading = m_columnHeadings[i];
347 }
348
349 if (heading == "time" || heading == "frame" ||
350 heading == "duration" || heading == "endtime") {
351 timingColumn = true;
352 }
353
354 if (heading == "value" || heading == "height" || heading == "label") {
355 timingColumn = false;
356 }
357
293 if (timingColumn) { 358 if (timingColumn) {
294 359
295 ++timingColumnCount; 360 ++timingColumnCount;
361
362 if (heading == "endtime") {
363
364 purpose = ColumnEndTime;
365 haveDurationOrEndTime = true;
366
367 } else if (heading == "duration") {
368
369 purpose = ColumnDuration;
370 haveDurationOrEndTime = true;
296 371
297 if (primary) { 372 } else if (primary || heading == "time" || heading == "frame") {
298 373
299 purpose = ColumnStartTime; 374 purpose = ColumnStartTime;
300
301 m_timingType = ExplicitTiming; 375 m_timingType = ExplicitTiming;
302 376
303 if (integral && large) { 377 if ((integral && large) || heading == "frame") {
304 m_timeUnits = TimeAudioFrames; 378 m_timeUnits = TimeAudioFrames;
305 } else { 379 } else {
306 m_timeUnits = TimeSeconds; 380 m_timeUnits = TimeSeconds;
307 } 381 }
308 382
309 } else { 383 } else if (timingColumnCount == 2 &&
310 384 m_timingType == ExplicitTiming) {
311 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { 385 purpose = ColumnEndTime;
312 purpose = ColumnEndTime; 386 haveDurationOrEndTime = true;
313 haveDurationOrEndTime = true;
314 }
315 } 387 }
316 } 388 }
317 389
318 if (purpose == ColumnUnknown) { 390 if (purpose == ColumnUnknown) {
319 if (numeric) { 391 if (heading == "label") {
392 purpose = ColumnLabel;
393 } else if (numeric || heading == "value" || heading == "height") {
320 purpose = ColumnValue; 394 purpose = ColumnValue;
321 } else { 395 } else {
322 purpose = ColumnLabel; 396 purpose = ColumnLabel;
323 } 397 }
324 } 398 }
326 setColumnPurpose(i, purpose); 400 setColumnPurpose(i, purpose);
327 } 401 }
328 402
329 int valueCount = 0; 403 int valueCount = 0;
330 for (int i = 0; i < m_columnCount; ++i) { 404 for (int i = 0; i < m_columnCount; ++i) {
331 if (m_columnPurposes[i] == ColumnValue) ++valueCount; 405 if (m_columnPurposes[i] == ColumnValue) {
406 ++valueCount;
407 }
332 } 408 }
333 409
334 if (valueCount == 2 && timingColumnCount == 1) { 410 if (valueCount == 2 && timingColumnCount == 1) {
335 // If we have exactly two apparent value columns and only one 411 // If we have exactly two apparent value columns and only one
336 // timing column, but one value column is integral and the 412 // timing column, but one value column is integral and the
453 << range << endl; 529 << range << endl;
454 530
455 m_audioSampleRange = range; 531 m_audioSampleRange = range;
456 } 532 }
457 533
458 CSVFormat::ColumnPurpose 534 QList<CSVFormat::ColumnPurpose>
459 CSVFormat::getColumnPurpose(int i) 535 CSVFormat::getColumnPurposes() const
460 { 536 {
461 while (m_columnPurposes.size() <= i) { 537 QList<ColumnPurpose> purposes;
462 m_columnPurposes.push_back(ColumnUnknown); 538 for (int i = 0; i < m_columnCount; ++i) {
463 } 539 purposes.push_back(getColumnPurpose(i));
464 return m_columnPurposes[i]; 540 }
541 return purposes;
542 }
543
544 void
545 CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl)
546 {
547 m_columnPurposes.clear();
548 for (int i = 0; in_range_for(cl, i); ++i) {
549 m_columnPurposes[i] = cl[i];
550 }
465 } 551 }
466 552
467 CSVFormat::ColumnPurpose 553 CSVFormat::ColumnPurpose
468 CSVFormat::getColumnPurpose(int i) const 554 CSVFormat::getColumnPurpose(int i) const
469 { 555 {
470 if (m_columnPurposes.size() <= i) { 556 if (m_columnPurposes.find(i) == m_columnPurposes.end()) {
471 return ColumnUnknown; 557 return ColumnUnknown;
472 } 558 } else {
473 return m_columnPurposes[i]; 559 return m_columnPurposes.at(i);
560 }
474 } 561 }
475 562
476 void 563 void
477 CSVFormat::setColumnPurpose(int i, ColumnPurpose p) 564 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
478 { 565 {
479 while (m_columnPurposes.size() <= i) {
480 m_columnPurposes.push_back(ColumnUnknown);
481 }
482 m_columnPurposes[i] = p; 566 m_columnPurposes[i] = p;
483 } 567 }
484 568
485 569 QList<CSVFormat::ColumnQualities>
486 570 CSVFormat::getColumnQualities() const
487 571 {
572 QList<ColumnQualities> qualities;
573 for (int i = 0; i < m_columnCount; ++i) {
574 if (m_columnQualities.find(i) == m_columnQualities.end()) {
575 qualities.push_back(0);
576 } else {
577 qualities.push_back(m_columnQualities.at(i));
578 }
579 }
580 return qualities;
581 }