annotate data/fileio/CSVFormat.cpp @ 840:e50a8fee6752

Fix error in break condition for csv file analysis -- was making csv file type guessing very slow for large files
author Chris Cannam
date Mon, 04 Nov 2013 15:47:46 +0000
parents 611a4fa14dde
children e802e550a1f2
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@629 28 CSVFormat::CSVFormat(QString path) :
Chris@629 29 m_separator(""),
Chris@392 30 m_sampleRate(44100),
Chris@392 31 m_windowSize(1024),
Chris@629 32 m_allowQuoting(true)
Chris@392 33 {
Chris@629 34 guessFormatFor(path);
Chris@629 35 }
Chris@629 36
Chris@629 37 void
Chris@629 38 CSVFormat::guessFormatFor(QString path)
Chris@629 39 {
Chris@629 40 m_modelType = TwoDimensionalModel;
Chris@629 41 m_timingType = ExplicitTiming;
Chris@629 42 m_timeUnits = TimeSeconds;
Chris@629 43
Chris@629 44 m_maxExampleCols = 0;
Chris@629 45 m_columnCount = 0;
Chris@629 46 m_variableColumnCount = false;
Chris@629 47
Chris@629 48 m_example.clear();
Chris@629 49 m_columnQualities.clear();
Chris@629 50 m_columnPurposes.clear();
Chris@629 51 m_prevValues.clear();
Chris@629 52
Chris@629 53 QFile file(path);
Chris@392 54 if (!file.exists()) return;
Chris@392 55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 56
Chris@392 57 QTextStream in(&file);
Chris@392 58 in.seek(0);
Chris@392 59
Chris@629 60 int lineno = 0;
Chris@392 61
Chris@392 62 while (!in.atEnd()) {
Chris@392 63
Chris@392 64 // See comment about line endings in CSVFileReader::load()
Chris@392 65
Chris@392 66 QString chunk = in.readLine();
Chris@392 67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 68
Chris@392 69 for (size_t li = 0; li < lines.size(); ++li) {
Chris@392 70
Chris@392 71 QString line = lines[li];
Chris@629 72 if (line.startsWith("#") || line == "") continue;
Chris@392 73
Chris@629 74 guessQualities(line, lineno);
Chris@392 75
Chris@840 76 ++lineno;
Chris@629 77 }
Chris@840 78
Chris@840 79 if (lineno >= 50) break;
Chris@629 80 }
Chris@392 81
Chris@629 82 guessPurposes();
Chris@629 83 }
Chris@629 84
Chris@629 85 void
Chris@629 86 CSVFormat::guessSeparator(QString line)
Chris@629 87 {
Chris@629 88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@629 89 for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) {
Chris@629 90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 91 m_separator = candidates[i];
Chris@629 92 return;
Chris@629 93 }
Chris@629 94 }
Chris@629 95 m_separator = " ";
Chris@629 96 }
Chris@629 97
Chris@629 98 void
Chris@629 99 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 100 {
Chris@629 101 if (m_separator == "") guessSeparator(line);
Chris@629 102
Chris@629 103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
Chris@629 104
Chris@629 105 int cols = list.size();
Chris@629 106 if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
Chris@629 107 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 108
Chris@629 109 // All columns are regarded as having these qualities until we see
Chris@629 110 // something that indicates otherwise:
Chris@629 111
Chris@629 112 ColumnQualities defaultQualities =
Chris@629 113 ColumnNumeric | ColumnIntegral | ColumnIncreasing;
Chris@629 114
Chris@629 115 for (int i = 0; i < cols; ++i) {
Chris@629 116
Chris@629 117 while (m_columnQualities.size() <= i) {
Chris@629 118 m_columnQualities.push_back(defaultQualities);
Chris@629 119 m_prevValues.push_back(0.f);
Chris@629 120 }
Chris@629 121
Chris@629 122 QString s(list[i]);
Chris@629 123 bool ok = false;
Chris@629 124
Chris@629 125 ColumnQualities qualities = m_columnQualities[i];
Chris@629 126
Chris@629 127 bool numeric = (qualities & ColumnNumeric);
Chris@629 128 bool integral = (qualities & ColumnIntegral);
Chris@629 129 bool increasing = (qualities & ColumnIncreasing);
Chris@629 130 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@629 131
Chris@629 132 float value = 0.f;
Chris@629 133
Chris@629 134 //!!! how to take into account headers?
Chris@629 135
Chris@629 136 if (numeric) {
Chris@629 137 value = s.toFloat(&ok);
Chris@629 138 if (!ok) {
Chris@629 139 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 140 }
Chris@629 141 if (ok) {
Chris@629 142 if (lineno < 2 && value > 1000.f) large = true;
Chris@629 143 } else {
Chris@629 144 numeric = false;
Chris@629 145 }
Chris@629 146 }
Chris@629 147
Chris@629 148 if (numeric) {
Chris@629 149
Chris@629 150 if (integral) {
Chris@629 151 if (s.contains('.') || s.contains(',')) {
Chris@629 152 integral = false;
Chris@392 153 }
Chris@392 154 }
Chris@392 155
Chris@629 156 if (increasing) {
Chris@629 157 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 158 increasing = false;
Chris@392 159 }
Chris@392 160 }
Chris@392 161
Chris@629 162 m_prevValues[i] = value;
Chris@629 163 }
Chris@392 164
Chris@629 165 m_columnQualities[i] =
Chris@629 166 (numeric ? ColumnNumeric : 0) |
Chris@629 167 (integral ? ColumnIntegral : 0) |
Chris@629 168 (increasing ? ColumnIncreasing : 0) |
Chris@629 169 (large ? ColumnLarge : 0);
Chris@629 170 }
Chris@392 171
Chris@629 172 if (lineno < 10) {
Chris@629 173 m_example.push_back(list);
Chris@629 174 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 175 m_maxExampleCols = cols;
Chris@392 176 }
Chris@392 177 }
Chris@392 178
Chris@676 179 // std::cerr << "Estimated column qualities: ";
Chris@676 180 // for (int i = 0; i < m_columnCount; ++i) {
Chris@676 181 // std::cerr << int(m_columnQualities[i]) << " ";
Chris@676 182 // }
Chris@676 183 // std::cerr << std::endl;
Chris@629 184 }
Chris@629 185
Chris@629 186 void
Chris@629 187 CSVFormat::guessPurposes()
Chris@629 188 {
Chris@629 189 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 190 m_timeUnits = CSVFormat::TimeWindows;
Chris@392 191
Chris@629 192 int timingColumnCount = 0;
Chris@629 193
Chris@629 194 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 195
Chris@629 196 ColumnPurpose purpose = ColumnUnknown;
Chris@629 197 bool primary = (i == 0);
Chris@392 198
Chris@629 199 ColumnQualities qualities = m_columnQualities[i];
Chris@392 200
Chris@629 201 bool numeric = (qualities & ColumnNumeric);
Chris@629 202 bool integral = (qualities & ColumnIntegral);
Chris@629 203 bool increasing = (qualities & ColumnIncreasing);
Chris@629 204 bool large = (qualities & ColumnLarge);
Chris@629 205
Chris@629 206 bool timingColumn = (numeric && increasing);
Chris@629 207
Chris@629 208 if (timingColumn) {
Chris@629 209
Chris@629 210 ++timingColumnCount;
Chris@629 211
Chris@629 212 if (primary) {
Chris@629 213
Chris@629 214 purpose = ColumnStartTime;
Chris@629 215
Chris@629 216 m_timingType = ExplicitTiming;
Chris@629 217
Chris@629 218 if (integral && large) {
Chris@629 219 m_timeUnits = TimeAudioFrames;
Chris@629 220 } else {
Chris@629 221 m_timeUnits = TimeSeconds;
Chris@629 222 }
Chris@629 223
Chris@629 224 } else {
Chris@629 225
Chris@629 226 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 227 purpose = ColumnEndTime;
Chris@629 228 }
Chris@629 229 }
Chris@629 230 }
Chris@629 231
Chris@629 232 if (purpose == ColumnUnknown) {
Chris@629 233 if (numeric) {
Chris@629 234 purpose = ColumnValue;
Chris@629 235 } else {
Chris@629 236 purpose = ColumnLabel;
Chris@629 237 }
Chris@629 238 }
Chris@629 239
Chris@631 240 setColumnPurpose(i, purpose);
Chris@629 241 }
Chris@629 242
Chris@629 243 int valueCount = 0;
Chris@629 244 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 245 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 246 }
Chris@629 247
Chris@630 248 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 249 // If we have exactly two apparent value columns and only one
Chris@630 250 // timing column, but one value column is integral and the
Chris@630 251 // other is not, guess that whichever one matches the integral
Chris@630 252 // status of the time column is either duration or end time
Chris@630 253 if (m_timingType == ExplicitTiming) {
Chris@630 254 int a = -1, b = -1;
Chris@630 255 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 256 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 257 if (a == -1) a = i;
Chris@630 258 else b = i;
Chris@630 259 }
Chris@630 260 }
Chris@630 261 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 262 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 263 int timecol = a;
Chris@630 264 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 265 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 266 timecol = b;
Chris@630 267 }
Chris@630 268 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 269 // This shouldn't happen; should have been settled above
Chris@630 270 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 271 } else {
Chris@630 272 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 273 }
Chris@630 274 --valueCount;
Chris@630 275 }
Chris@630 276 }
Chris@630 277 }
Chris@630 278
Chris@631 279 if (timingColumnCount > 1) {
Chris@631 280 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 281 } else {
Chris@631 282 if (valueCount == 0) {
Chris@631 283 m_modelType = OneDimensionalModel;
Chris@631 284 } else if (valueCount == 1) {
Chris@631 285 m_modelType = TwoDimensionalModel;
Chris@631 286 } else {
Chris@631 287 m_modelType = ThreeDimensionalModel;
Chris@631 288 }
Chris@629 289 }
Chris@392 290
Chris@676 291 // std::cerr << "Estimated column purposes: ";
Chris@676 292 // for (int i = 0; i < m_columnCount; ++i) {
Chris@676 293 // std::cerr << int(m_columnPurposes[i]) << " ";
Chris@676 294 // }
Chris@676 295 // std::cerr << std::endl;
Chris@392 296
Chris@676 297 // std::cerr << "Estimated model type: " << m_modelType << std::endl;
Chris@676 298 // std::cerr << "Estimated timing type: " << m_timingType << std::endl;
Chris@676 299 // std::cerr << "Estimated units: " << m_timeUnits << std::endl;
Chris@392 300 }
Chris@392 301
Chris@631 302 CSVFormat::ColumnPurpose
Chris@631 303 CSVFormat::getColumnPurpose(int i)
Chris@631 304 {
Chris@631 305 while (m_columnPurposes.size() <= i) {
Chris@631 306 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 307 }
Chris@631 308 return m_columnPurposes[i];
Chris@631 309 }
Chris@629 310
Chris@631 311 CSVFormat::ColumnPurpose
Chris@631 312 CSVFormat::getColumnPurpose(int i) const
Chris@631 313 {
Chris@668 314 if (m_columnPurposes.size() <= i) {
Chris@668 315 return ColumnUnknown;
Chris@668 316 }
Chris@631 317 return m_columnPurposes[i];
Chris@631 318 }
Chris@631 319
Chris@631 320 void
Chris@631 321 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 322 {
Chris@631 323 while (m_columnPurposes.size() <= i) {
Chris@631 324 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 325 }
Chris@631 326 m_columnPurposes[i] = p;
Chris@631 327 }
Chris@631 328
Chris@631 329
Chris@631 330
Chris@631 331