annotate data/fileio/CSVFormat.cpp @ 631:3a5ee4b6c9ad

* Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns.
author Chris Cannam
date Mon, 19 Jul 2010 17:08:56 +0000
parents 11a664058dd8
children ad7c96620886
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@629 28 CSVFormat::CSVFormat(QString path) :
Chris@629 29 m_separator(""),
Chris@392 30 m_sampleRate(44100),
Chris@392 31 m_windowSize(1024),
Chris@629 32 m_allowQuoting(true)
Chris@392 33 {
Chris@629 34 guessFormatFor(path);
Chris@629 35 }
Chris@629 36
Chris@629 37 void
Chris@629 38 CSVFormat::guessFormatFor(QString path)
Chris@629 39 {
Chris@629 40 m_modelType = TwoDimensionalModel;
Chris@629 41 m_timingType = ExplicitTiming;
Chris@629 42 m_timeUnits = TimeSeconds;
Chris@629 43
Chris@629 44 m_maxExampleCols = 0;
Chris@629 45 m_columnCount = 0;
Chris@629 46 m_variableColumnCount = false;
Chris@629 47
Chris@629 48 m_example.clear();
Chris@629 49 m_columnQualities.clear();
Chris@629 50 m_columnPurposes.clear();
Chris@629 51 m_prevValues.clear();
Chris@629 52
Chris@629 53 QFile file(path);
Chris@392 54 if (!file.exists()) return;
Chris@392 55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 56
Chris@392 57 QTextStream in(&file);
Chris@392 58 in.seek(0);
Chris@392 59
Chris@629 60 int lineno = 0;
Chris@392 61
Chris@392 62 while (!in.atEnd()) {
Chris@392 63
Chris@392 64 // See comment about line endings in CSVFileReader::load()
Chris@392 65
Chris@392 66 QString chunk = in.readLine();
Chris@392 67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 68
Chris@392 69 for (size_t li = 0; li < lines.size(); ++li) {
Chris@392 70
Chris@392 71 QString line = lines[li];
Chris@629 72 if (line.startsWith("#") || line == "") continue;
Chris@392 73
Chris@629 74 guessQualities(line, lineno);
Chris@392 75
Chris@629 76 if (++lineno == 50) break;
Chris@629 77 }
Chris@629 78 }
Chris@392 79
Chris@629 80 guessPurposes();
Chris@629 81 }
Chris@629 82
Chris@629 83 void
Chris@629 84 CSVFormat::guessSeparator(QString line)
Chris@629 85 {
Chris@629 86 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@629 87 for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) {
Chris@629 88 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 89 m_separator = candidates[i];
Chris@629 90 return;
Chris@629 91 }
Chris@629 92 }
Chris@629 93 m_separator = " ";
Chris@629 94 }
Chris@629 95
Chris@629 96 void
Chris@629 97 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 98 {
Chris@629 99 if (m_separator == "") guessSeparator(line);
Chris@629 100
Chris@629 101 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
Chris@629 102
Chris@629 103 int cols = list.size();
Chris@629 104 if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
Chris@629 105 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 106
Chris@629 107 // All columns are regarded as having these qualities until we see
Chris@629 108 // something that indicates otherwise:
Chris@629 109
Chris@629 110 ColumnQualities defaultQualities =
Chris@629 111 ColumnNumeric | ColumnIntegral | ColumnIncreasing;
Chris@629 112
Chris@629 113 for (int i = 0; i < cols; ++i) {
Chris@629 114
Chris@629 115 while (m_columnQualities.size() <= i) {
Chris@629 116 m_columnQualities.push_back(defaultQualities);
Chris@629 117 m_prevValues.push_back(0.f);
Chris@629 118 }
Chris@629 119
Chris@629 120 QString s(list[i]);
Chris@629 121 bool ok = false;
Chris@629 122
Chris@629 123 ColumnQualities qualities = m_columnQualities[i];
Chris@629 124
Chris@629 125 bool numeric = (qualities & ColumnNumeric);
Chris@629 126 bool integral = (qualities & ColumnIntegral);
Chris@629 127 bool increasing = (qualities & ColumnIncreasing);
Chris@629 128 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@629 129
Chris@629 130 float value = 0.f;
Chris@629 131
Chris@629 132 //!!! how to take into account headers?
Chris@629 133
Chris@629 134 if (numeric) {
Chris@629 135 value = s.toFloat(&ok);
Chris@629 136 if (!ok) {
Chris@629 137 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 138 }
Chris@629 139 if (ok) {
Chris@629 140 if (lineno < 2 && value > 1000.f) large = true;
Chris@629 141 } else {
Chris@629 142 numeric = false;
Chris@629 143 }
Chris@629 144 }
Chris@629 145
Chris@629 146 if (numeric) {
Chris@629 147
Chris@629 148 if (integral) {
Chris@629 149 if (s.contains('.') || s.contains(',')) {
Chris@629 150 integral = false;
Chris@392 151 }
Chris@392 152 }
Chris@392 153
Chris@629 154 if (increasing) {
Chris@629 155 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 156 increasing = false;
Chris@392 157 }
Chris@392 158 }
Chris@392 159
Chris@629 160 m_prevValues[i] = value;
Chris@629 161 }
Chris@392 162
Chris@629 163 m_columnQualities[i] =
Chris@629 164 (numeric ? ColumnNumeric : 0) |
Chris@629 165 (integral ? ColumnIntegral : 0) |
Chris@629 166 (increasing ? ColumnIncreasing : 0) |
Chris@629 167 (large ? ColumnLarge : 0);
Chris@629 168 }
Chris@392 169
Chris@629 170 if (lineno < 10) {
Chris@629 171 m_example.push_back(list);
Chris@629 172 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 173 m_maxExampleCols = cols;
Chris@392 174 }
Chris@392 175 }
Chris@392 176
Chris@629 177 std::cerr << "Estimated column qualities: ";
Chris@629 178 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 179 std::cerr << int(m_columnQualities[i]) << " ";
Chris@629 180 }
Chris@629 181 std::cerr << std::endl;
Chris@629 182 }
Chris@629 183
Chris@629 184 void
Chris@629 185 CSVFormat::guessPurposes()
Chris@629 186 {
Chris@629 187 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 188 m_timeUnits = CSVFormat::TimeWindows;
Chris@392 189
Chris@629 190 int timingColumnCount = 0;
Chris@629 191
Chris@629 192 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 193
Chris@629 194 ColumnPurpose purpose = ColumnUnknown;
Chris@629 195 bool primary = (i == 0);
Chris@392 196
Chris@629 197 ColumnQualities qualities = m_columnQualities[i];
Chris@392 198
Chris@629 199 bool numeric = (qualities & ColumnNumeric);
Chris@629 200 bool integral = (qualities & ColumnIntegral);
Chris@629 201 bool increasing = (qualities & ColumnIncreasing);
Chris@629 202 bool large = (qualities & ColumnLarge);
Chris@629 203
Chris@629 204 bool timingColumn = (numeric && increasing);
Chris@629 205
Chris@629 206 if (timingColumn) {
Chris@629 207
Chris@629 208 ++timingColumnCount;
Chris@629 209
Chris@629 210 if (primary) {
Chris@629 211
Chris@629 212 purpose = ColumnStartTime;
Chris@629 213
Chris@629 214 m_timingType = ExplicitTiming;
Chris@629 215
Chris@629 216 if (integral && large) {
Chris@629 217 m_timeUnits = TimeAudioFrames;
Chris@629 218 } else {
Chris@629 219 m_timeUnits = TimeSeconds;
Chris@629 220 }
Chris@629 221
Chris@629 222 } else {
Chris@629 223
Chris@629 224 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 225 purpose = ColumnEndTime;
Chris@629 226 }
Chris@629 227 }
Chris@629 228 }
Chris@629 229
Chris@629 230 if (purpose == ColumnUnknown) {
Chris@629 231 if (numeric) {
Chris@629 232 purpose = ColumnValue;
Chris@629 233 } else {
Chris@629 234 purpose = ColumnLabel;
Chris@629 235 }
Chris@629 236 }
Chris@629 237
Chris@631 238 setColumnPurpose(i, purpose);
Chris@629 239 }
Chris@629 240
Chris@629 241 int valueCount = 0;
Chris@629 242 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 243 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 244 }
Chris@629 245
Chris@630 246 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 247 // If we have exactly two apparent value columns and only one
Chris@630 248 // timing column, but one value column is integral and the
Chris@630 249 // other is not, guess that whichever one matches the integral
Chris@630 250 // status of the time column is either duration or end time
Chris@630 251 if (m_timingType == ExplicitTiming) {
Chris@630 252 int a = -1, b = -1;
Chris@630 253 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 254 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 255 if (a == -1) a = i;
Chris@630 256 else b = i;
Chris@630 257 }
Chris@630 258 }
Chris@630 259 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 260 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 261 int timecol = a;
Chris@630 262 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 263 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 264 timecol = b;
Chris@630 265 }
Chris@630 266 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 267 // This shouldn't happen; should have been settled above
Chris@630 268 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 269 } else {
Chris@630 270 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 271 }
Chris@630 272 --valueCount;
Chris@630 273 }
Chris@630 274 }
Chris@630 275 }
Chris@630 276
Chris@631 277 if (timingColumnCount > 1) {
Chris@631 278 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 279 } else {
Chris@631 280 if (valueCount == 0) {
Chris@631 281 m_modelType = OneDimensionalModel;
Chris@631 282 } else if (valueCount == 1) {
Chris@631 283 m_modelType = TwoDimensionalModel;
Chris@631 284 } else {
Chris@631 285 m_modelType = ThreeDimensionalModel;
Chris@631 286 }
Chris@629 287 }
Chris@392 288
Chris@629 289 std::cerr << "Estimated column purposes: ";
Chris@629 290 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 291 std::cerr << int(m_columnPurposes[i]) << " ";
Chris@392 292 }
Chris@629 293 std::cerr << std::endl;
Chris@392 294
Chris@392 295 std::cerr << "Estimated model type: " << m_modelType << std::endl;
Chris@392 296 std::cerr << "Estimated timing type: " << m_timingType << std::endl;
Chris@392 297 std::cerr << "Estimated units: " << m_timeUnits << std::endl;
Chris@392 298 }
Chris@392 299
Chris@631 300 CSVFormat::ColumnPurpose
Chris@631 301 CSVFormat::getColumnPurpose(int i)
Chris@631 302 {
Chris@631 303 while (m_columnPurposes.size() <= i) {
Chris@631 304 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 305 }
Chris@631 306 return m_columnPurposes[i];
Chris@631 307 }
Chris@629 308
Chris@631 309 CSVFormat::ColumnPurpose
Chris@631 310 CSVFormat::getColumnPurpose(int i) const
Chris@631 311 {
Chris@631 312 return m_columnPurposes[i];
Chris@631 313 }
Chris@631 314
Chris@631 315 void
Chris@631 316 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 317 {
Chris@631 318 while (m_columnPurposes.size() <= i) {
Chris@631 319 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 320 }
Chris@631 321 m_columnPurposes[i] = p;
Chris@631 322 }
Chris@631 323
Chris@631 324
Chris@631 325
Chris@631 326