annotate data/fileio/CSVFormat.cpp @ 1434:0684c6698e3f streaming-csv-writer

Added utility function for splitting a model selection into chunks and writing to a stream.
author Lucas Thompson <dev@lucas.im>
date Tue, 17 Apr 2018 10:03:49 +0100
parents 1bf38a4b91c4
children 48e9f538e6e9
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@1362 28 #include "base/Debug.h"
Chris@1362 29
Chris@629 30 CSVFormat::CSVFormat(QString path) :
Chris@629 31 m_separator(""),
Chris@392 32 m_sampleRate(44100),
Chris@392 33 m_windowSize(1024),
Chris@629 34 m_allowQuoting(true)
Chris@392 35 {
Chris@629 36 guessFormatFor(path);
Chris@629 37 }
Chris@629 38
Chris@629 39 void
Chris@629 40 CSVFormat::guessFormatFor(QString path)
Chris@629 41 {
Chris@629 42 m_modelType = TwoDimensionalModel;
Chris@629 43 m_timingType = ExplicitTiming;
Chris@629 44 m_timeUnits = TimeSeconds;
Chris@629 45
Chris@629 46 m_maxExampleCols = 0;
Chris@629 47 m_columnCount = 0;
Chris@629 48 m_variableColumnCount = false;
Chris@629 49
Chris@629 50 m_example.clear();
Chris@629 51 m_columnQualities.clear();
Chris@629 52 m_columnPurposes.clear();
Chris@629 53 m_prevValues.clear();
Chris@629 54
Chris@629 55 QFile file(path);
Chris@392 56 if (!file.exists()) return;
Chris@392 57 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 58
Chris@392 59 QTextStream in(&file);
Chris@392 60 in.seek(0);
Chris@392 61
Chris@629 62 int lineno = 0;
Chris@392 63
Chris@392 64 while (!in.atEnd()) {
Chris@392 65
Chris@392 66 // See comment about line endings in CSVFileReader::load()
Chris@392 67
Chris@392 68 QString chunk = in.readLine();
Chris@392 69 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 70
Chris@897 71 for (int li = 0; li < lines.size(); ++li) {
Chris@392 72
Chris@392 73 QString line = lines[li];
Chris@629 74 if (line.startsWith("#") || line == "") continue;
Chris@392 75
Chris@629 76 guessQualities(line, lineno);
Chris@392 77
Chris@840 78 ++lineno;
Chris@629 79 }
Chris@840 80
Chris@840 81 if (lineno >= 50) break;
Chris@629 82 }
Chris@392 83
Chris@629 84 guessPurposes();
Chris@629 85 }
Chris@629 86
Chris@629 87 void
Chris@629 88 CSVFormat::guessSeparator(QString line)
Chris@629 89 {
Chris@629 90 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@897 91 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
Chris@629 92 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 93 m_separator = candidates[i];
Chris@629 94 return;
Chris@629 95 }
Chris@629 96 }
Chris@629 97 }
Chris@629 98
Chris@629 99 void
Chris@629 100 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 101 {
Chris@629 102 if (m_separator == "") guessSeparator(line);
Chris@629 103
Chris@1362 104 QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
Chris@629 105
Chris@629 106 int cols = list.size();
Chris@991 107 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 108 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 109
Chris@629 110 // All columns are regarded as having these qualities until we see
Chris@629 111 // something that indicates otherwise:
Chris@629 112
Chris@629 113 ColumnQualities defaultQualities =
Chris@1021 114 ColumnNumeric | ColumnIntegral | ColumnIncreasing | ColumnNearEmpty;
Chris@629 115
Chris@629 116 for (int i = 0; i < cols; ++i) {
Chris@629 117
Chris@629 118 while (m_columnQualities.size() <= i) {
Chris@629 119 m_columnQualities.push_back(defaultQualities);
Chris@629 120 m_prevValues.push_back(0.f);
Chris@629 121 }
Chris@629 122
Chris@629 123 QString s(list[i]);
Chris@629 124 bool ok = false;
Chris@629 125
Chris@629 126 ColumnQualities qualities = m_columnQualities[i];
Chris@629 127
Chris@629 128 bool numeric = (qualities & ColumnNumeric);
Chris@629 129 bool integral = (qualities & ColumnIntegral);
Chris@629 130 bool increasing = (qualities & ColumnIncreasing);
Chris@629 131 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@1021 132 bool emptyish = (qualities & ColumnNearEmpty);
Chris@629 133
Chris@1021 134 if (lineno > 1 && s.trimmed() != "") {
Chris@1021 135 emptyish = false;
Chris@1021 136 }
Chris@1021 137
Chris@629 138 float value = 0.f;
Chris@629 139
Chris@629 140 //!!! how to take into account headers?
Chris@629 141
Chris@629 142 if (numeric) {
Chris@629 143 value = s.toFloat(&ok);
Chris@629 144 if (!ok) {
Chris@629 145 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 146 }
Chris@629 147 if (ok) {
Chris@629 148 if (lineno < 2 && value > 1000.f) large = true;
Chris@629 149 } else {
Chris@629 150 numeric = false;
Chris@629 151 }
Chris@629 152 }
Chris@629 153
Chris@629 154 if (numeric) {
Chris@629 155
Chris@629 156 if (integral) {
Chris@629 157 if (s.contains('.') || s.contains(',')) {
Chris@629 158 integral = false;
Chris@392 159 }
Chris@392 160 }
Chris@392 161
Chris@629 162 if (increasing) {
Chris@629 163 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 164 increasing = false;
Chris@392 165 }
Chris@392 166 }
Chris@392 167
Chris@629 168 m_prevValues[i] = value;
Chris@629 169 }
Chris@392 170
Chris@629 171 m_columnQualities[i] =
Chris@629 172 (numeric ? ColumnNumeric : 0) |
Chris@629 173 (integral ? ColumnIntegral : 0) |
Chris@629 174 (increasing ? ColumnIncreasing : 0) |
Chris@1021 175 (large ? ColumnLarge : 0) |
Chris@1021 176 (emptyish ? ColumnNearEmpty : 0);
Chris@629 177 }
Chris@392 178
Chris@629 179 if (lineno < 10) {
Chris@629 180 m_example.push_back(list);
Chris@629 181 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 182 m_maxExampleCols = cols;
Chris@392 183 }
Chris@392 184 }
Chris@392 185
Chris@1362 186 if (lineno < 10) {
Chris@1362 187 SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
Chris@1362 188 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 189 SVDEBUG << int(m_columnQualities[i]) << " ";
Chris@1362 190 }
Chris@1362 191 SVDEBUG << endl;
Chris@1362 192 }
Chris@629 193 }
Chris@629 194
Chris@629 195 void
Chris@629 196 CSVFormat::guessPurposes()
Chris@629 197 {
Chris@629 198 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 199 m_timeUnits = CSVFormat::TimeWindows;
Chris@392 200
Chris@629 201 int timingColumnCount = 0;
Chris@1021 202
Chris@1021 203 // if our first column has zero or one entries in it and the rest
Chris@1021 204 // have more, then we'll default to ignoring the first column and
Chris@1021 205 // counting the next one as primary. (e.g. Sonic Annotator output
Chris@1021 206 // with filename at start of first column.)
Chris@1021 207
Chris@1021 208 int primaryColumnNo = 0;
Chris@1021 209
Chris@1021 210 if (m_columnCount >= 2) {
Chris@1021 211 if ( (m_columnQualities[0] & ColumnNearEmpty) &&
Chris@1021 212 !(m_columnQualities[1] & ColumnNearEmpty)) {
Chris@1021 213 primaryColumnNo = 1;
Chris@1021 214 }
Chris@1021 215 }
Chris@629 216
Chris@629 217 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 218
Chris@629 219 ColumnPurpose purpose = ColumnUnknown;
Chris@1021 220
Chris@1021 221 if (i < primaryColumnNo) {
Chris@1021 222 setColumnPurpose(i, purpose);
Chris@1021 223 continue;
Chris@1021 224 }
Chris@1021 225
Chris@1021 226 bool primary = (i == primaryColumnNo);
Chris@392 227
Chris@629 228 ColumnQualities qualities = m_columnQualities[i];
Chris@392 229
Chris@629 230 bool numeric = (qualities & ColumnNumeric);
Chris@629 231 bool integral = (qualities & ColumnIntegral);
Chris@629 232 bool increasing = (qualities & ColumnIncreasing);
Chris@629 233 bool large = (qualities & ColumnLarge);
Chris@629 234
Chris@629 235 bool timingColumn = (numeric && increasing);
Chris@629 236
Chris@629 237 if (timingColumn) {
Chris@629 238
Chris@629 239 ++timingColumnCount;
Chris@629 240
Chris@629 241 if (primary) {
Chris@629 242
Chris@629 243 purpose = ColumnStartTime;
Chris@629 244
Chris@629 245 m_timingType = ExplicitTiming;
Chris@629 246
Chris@629 247 if (integral && large) {
Chris@629 248 m_timeUnits = TimeAudioFrames;
Chris@629 249 } else {
Chris@629 250 m_timeUnits = TimeSeconds;
Chris@629 251 }
Chris@629 252
Chris@629 253 } else {
Chris@629 254
Chris@629 255 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 256 purpose = ColumnEndTime;
Chris@629 257 }
Chris@629 258 }
Chris@629 259 }
Chris@629 260
Chris@629 261 if (purpose == ColumnUnknown) {
Chris@629 262 if (numeric) {
Chris@629 263 purpose = ColumnValue;
Chris@629 264 } else {
Chris@629 265 purpose = ColumnLabel;
Chris@629 266 }
Chris@629 267 }
Chris@629 268
Chris@631 269 setColumnPurpose(i, purpose);
Chris@629 270 }
Chris@629 271
Chris@629 272 int valueCount = 0;
Chris@629 273 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 274 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 275 }
Chris@629 276
Chris@630 277 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 278 // If we have exactly two apparent value columns and only one
Chris@630 279 // timing column, but one value column is integral and the
Chris@630 280 // other is not, guess that whichever one matches the integral
Chris@630 281 // status of the time column is either duration or end time
Chris@630 282 if (m_timingType == ExplicitTiming) {
Chris@630 283 int a = -1, b = -1;
Chris@630 284 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 285 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 286 if (a == -1) a = i;
Chris@630 287 else b = i;
Chris@630 288 }
Chris@630 289 }
Chris@630 290 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 291 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 292 int timecol = a;
Chris@630 293 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 294 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 295 timecol = b;
Chris@630 296 }
Chris@630 297 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 298 // This shouldn't happen; should have been settled above
Chris@630 299 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 300 } else {
Chris@630 301 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 302 }
Chris@630 303 --valueCount;
Chris@630 304 }
Chris@630 305 }
Chris@630 306 }
Chris@630 307
Chris@631 308 if (timingColumnCount > 1) {
Chris@631 309 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 310 } else {
Chris@631 311 if (valueCount == 0) {
Chris@631 312 m_modelType = OneDimensionalModel;
Chris@631 313 } else if (valueCount == 1) {
Chris@631 314 m_modelType = TwoDimensionalModel;
Chris@631 315 } else {
Chris@631 316 m_modelType = ThreeDimensionalModel;
Chris@631 317 }
Chris@629 318 }
Chris@392 319
Chris@1362 320 SVDEBUG << "Estimated column purposes: ";
Chris@1362 321 for (int i = 0; i < m_columnCount; ++i) {
Chris@1362 322 SVDEBUG << int(m_columnPurposes[i]) << " ";
Chris@1362 323 }
Chris@1362 324 SVDEBUG << endl;
Chris@392 325
Chris@1362 326 SVDEBUG << "Estimated model type: " << m_modelType << endl;
Chris@1362 327 SVDEBUG << "Estimated timing type: " << m_timingType << endl;
Chris@1362 328 SVDEBUG << "Estimated units: " << m_timeUnits << endl;
Chris@392 329 }
Chris@392 330
Chris@631 331 CSVFormat::ColumnPurpose
Chris@631 332 CSVFormat::getColumnPurpose(int i)
Chris@631 333 {
Chris@631 334 while (m_columnPurposes.size() <= i) {
Chris@631 335 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 336 }
Chris@631 337 return m_columnPurposes[i];
Chris@631 338 }
Chris@629 339
Chris@631 340 CSVFormat::ColumnPurpose
Chris@631 341 CSVFormat::getColumnPurpose(int i) const
Chris@631 342 {
Chris@668 343 if (m_columnPurposes.size() <= i) {
Chris@668 344 return ColumnUnknown;
Chris@668 345 }
Chris@631 346 return m_columnPurposes[i];
Chris@631 347 }
Chris@631 348
Chris@631 349 void
Chris@631 350 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 351 {
Chris@631 352 while (m_columnPurposes.size() <= i) {
Chris@631 353 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 354 }
Chris@631 355 m_columnPurposes[i] = p;
Chris@631 356 }
Chris@631 357
Chris@631 358
Chris@631 359
Chris@631 360