annotate data/fileio/CSVFormat.cpp @ 1008:d9e0e59a1581

When using an aggregate model to pass data to a transform, zero-pad the shorter input to the duration of the longer rather than truncating the longer. (This is better behaviour for e.g. MATCH, and in any case the code was previously truncating incorrectly and ending up with garbage data at the end.)
author Chris Cannam
date Fri, 14 Nov 2014 13:51:33 +0000
parents 1974859baba5
children 1888ca033a84
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@392 16 #include "CSVFormat.h"
Chris@392 17
Chris@629 18 #include "base/StringBits.h"
Chris@629 19
Chris@392 20 #include <QFile>
Chris@392 21 #include <QString>
Chris@392 22 #include <QRegExp>
Chris@392 23 #include <QStringList>
Chris@392 24 #include <QTextStream>
Chris@392 25
Chris@392 26 #include <iostream>
Chris@392 27
Chris@629 28 CSVFormat::CSVFormat(QString path) :
Chris@629 29 m_separator(""),
Chris@392 30 m_sampleRate(44100),
Chris@392 31 m_windowSize(1024),
Chris@629 32 m_allowQuoting(true)
Chris@392 33 {
Chris@629 34 guessFormatFor(path);
Chris@629 35 }
Chris@629 36
Chris@629 37 void
Chris@629 38 CSVFormat::guessFormatFor(QString path)
Chris@629 39 {
Chris@629 40 m_modelType = TwoDimensionalModel;
Chris@629 41 m_timingType = ExplicitTiming;
Chris@629 42 m_timeUnits = TimeSeconds;
Chris@629 43
Chris@629 44 m_maxExampleCols = 0;
Chris@629 45 m_columnCount = 0;
Chris@629 46 m_variableColumnCount = false;
Chris@629 47
Chris@629 48 m_example.clear();
Chris@629 49 m_columnQualities.clear();
Chris@629 50 m_columnPurposes.clear();
Chris@629 51 m_prevValues.clear();
Chris@629 52
Chris@629 53 QFile file(path);
Chris@392 54 if (!file.exists()) return;
Chris@392 55 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
Chris@392 56
Chris@392 57 QTextStream in(&file);
Chris@392 58 in.seek(0);
Chris@392 59
Chris@629 60 int lineno = 0;
Chris@392 61
Chris@392 62 while (!in.atEnd()) {
Chris@392 63
Chris@392 64 // See comment about line endings in CSVFileReader::load()
Chris@392 65
Chris@392 66 QString chunk = in.readLine();
Chris@392 67 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@392 68
Chris@897 69 for (int li = 0; li < lines.size(); ++li) {
Chris@392 70
Chris@392 71 QString line = lines[li];
Chris@629 72 if (line.startsWith("#") || line == "") continue;
Chris@392 73
Chris@629 74 guessQualities(line, lineno);
Chris@392 75
Chris@840 76 ++lineno;
Chris@629 77 }
Chris@840 78
Chris@840 79 if (lineno >= 50) break;
Chris@629 80 }
Chris@392 81
Chris@629 82 guessPurposes();
Chris@629 83 }
Chris@629 84
Chris@629 85 void
Chris@629 86 CSVFormat::guessSeparator(QString line)
Chris@629 87 {
Chris@629 88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
Chris@897 89 for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
Chris@629 90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
Chris@629 91 m_separator = candidates[i];
Chris@629 92 return;
Chris@629 93 }
Chris@629 94 }
Chris@629 95 m_separator = " ";
Chris@629 96 }
Chris@629 97
Chris@629 98 void
Chris@629 99 CSVFormat::guessQualities(QString line, int lineno)
Chris@629 100 {
Chris@629 101 if (m_separator == "") guessSeparator(line);
Chris@629 102
Chris@629 103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
Chris@629 104
Chris@629 105 int cols = list.size();
Chris@991 106 if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
Chris@629 107 if (cols != m_columnCount) m_variableColumnCount = true;
Chris@629 108
Chris@629 109 // All columns are regarded as having these qualities until we see
Chris@629 110 // something that indicates otherwise:
Chris@629 111
Chris@629 112 ColumnQualities defaultQualities =
Chris@629 113 ColumnNumeric | ColumnIntegral | ColumnIncreasing;
Chris@629 114
Chris@629 115 for (int i = 0; i < cols; ++i) {
Chris@629 116
Chris@629 117 while (m_columnQualities.size() <= i) {
Chris@629 118 m_columnQualities.push_back(defaultQualities);
Chris@629 119 m_prevValues.push_back(0.f);
Chris@629 120 }
Chris@629 121
Chris@629 122 QString s(list[i]);
Chris@629 123 bool ok = false;
Chris@629 124
Chris@629 125 ColumnQualities qualities = m_columnQualities[i];
Chris@629 126
Chris@629 127 bool numeric = (qualities & ColumnNumeric);
Chris@629 128 bool integral = (qualities & ColumnIntegral);
Chris@629 129 bool increasing = (qualities & ColumnIncreasing);
Chris@629 130 bool large = (qualities & ColumnLarge); // this one defaults to off
Chris@629 131
Chris@629 132 float value = 0.f;
Chris@629 133
Chris@629 134 //!!! how to take into account headers?
Chris@629 135
Chris@629 136 if (numeric) {
Chris@629 137 value = s.toFloat(&ok);
Chris@629 138 if (!ok) {
Chris@629 139 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
Chris@629 140 }
Chris@629 141 if (ok) {
Chris@629 142 if (lineno < 2 && value > 1000.f) large = true;
Chris@629 143 } else {
Chris@629 144 numeric = false;
Chris@629 145 }
Chris@629 146 }
Chris@629 147
Chris@629 148 if (numeric) {
Chris@629 149
Chris@629 150 if (integral) {
Chris@629 151 if (s.contains('.') || s.contains(',')) {
Chris@629 152 integral = false;
Chris@392 153 }
Chris@392 154 }
Chris@392 155
Chris@629 156 if (increasing) {
Chris@629 157 if (lineno > 0 && value <= m_prevValues[i]) {
Chris@629 158 increasing = false;
Chris@392 159 }
Chris@392 160 }
Chris@392 161
Chris@629 162 m_prevValues[i] = value;
Chris@629 163 }
Chris@392 164
Chris@629 165 m_columnQualities[i] =
Chris@629 166 (numeric ? ColumnNumeric : 0) |
Chris@629 167 (integral ? ColumnIntegral : 0) |
Chris@629 168 (increasing ? ColumnIncreasing : 0) |
Chris@629 169 (large ? ColumnLarge : 0);
Chris@629 170 }
Chris@392 171
Chris@629 172 if (lineno < 10) {
Chris@629 173 m_example.push_back(list);
Chris@629 174 if (lineno == 0 || cols > m_maxExampleCols) {
Chris@629 175 m_maxExampleCols = cols;
Chris@392 176 }
Chris@392 177 }
Chris@392 178
Chris@843 179 // cerr << "Estimated column qualities: ";
Chris@676 180 // for (int i = 0; i < m_columnCount; ++i) {
Chris@843 181 // cerr << int(m_columnQualities[i]) << " ";
Chris@676 182 // }
Chris@843 183 // cerr << endl;
Chris@629 184 }
Chris@629 185
Chris@629 186 void
Chris@629 187 CSVFormat::guessPurposes()
Chris@629 188 {
Chris@629 189 m_timingType = CSVFormat::ImplicitTiming;
Chris@629 190 m_timeUnits = CSVFormat::TimeWindows;
Chris@392 191
Chris@629 192 int timingColumnCount = 0;
Chris@629 193
Chris@629 194 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 195
Chris@629 196 ColumnPurpose purpose = ColumnUnknown;
Chris@629 197 bool primary = (i == 0);
Chris@392 198
Chris@629 199 ColumnQualities qualities = m_columnQualities[i];
Chris@392 200
Chris@629 201 bool numeric = (qualities & ColumnNumeric);
Chris@629 202 bool integral = (qualities & ColumnIntegral);
Chris@629 203 bool increasing = (qualities & ColumnIncreasing);
Chris@629 204 bool large = (qualities & ColumnLarge);
Chris@629 205
Chris@629 206 bool timingColumn = (numeric && increasing);
Chris@629 207
Chris@629 208 if (timingColumn) {
Chris@629 209
Chris@629 210 ++timingColumnCount;
Chris@629 211
Chris@629 212 if (primary) {
Chris@629 213
Chris@629 214 purpose = ColumnStartTime;
Chris@629 215
Chris@629 216 m_timingType = ExplicitTiming;
Chris@629 217
Chris@629 218 if (integral && large) {
Chris@629 219 m_timeUnits = TimeAudioFrames;
Chris@629 220 } else {
Chris@629 221 m_timeUnits = TimeSeconds;
Chris@629 222 }
Chris@629 223
Chris@629 224 } else {
Chris@629 225
Chris@629 226 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
Chris@629 227 purpose = ColumnEndTime;
Chris@629 228 }
Chris@629 229 }
Chris@629 230 }
Chris@629 231
Chris@629 232 if (purpose == ColumnUnknown) {
Chris@629 233 if (numeric) {
Chris@629 234 purpose = ColumnValue;
Chris@629 235 } else {
Chris@629 236 purpose = ColumnLabel;
Chris@629 237 }
Chris@629 238 }
Chris@629 239
Chris@631 240 setColumnPurpose(i, purpose);
Chris@629 241 }
Chris@629 242
Chris@629 243 int valueCount = 0;
Chris@629 244 for (int i = 0; i < m_columnCount; ++i) {
Chris@629 245 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
Chris@629 246 }
Chris@629 247
Chris@630 248 if (valueCount == 2 && timingColumnCount == 1) {
Chris@630 249 // If we have exactly two apparent value columns and only one
Chris@630 250 // timing column, but one value column is integral and the
Chris@630 251 // other is not, guess that whichever one matches the integral
Chris@630 252 // status of the time column is either duration or end time
Chris@630 253 if (m_timingType == ExplicitTiming) {
Chris@630 254 int a = -1, b = -1;
Chris@630 255 for (int i = 0; i < m_columnCount; ++i) {
Chris@630 256 if (m_columnPurposes[i] == ColumnValue) {
Chris@630 257 if (a == -1) a = i;
Chris@630 258 else b = i;
Chris@630 259 }
Chris@630 260 }
Chris@630 261 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 262 (m_columnQualities[b] & ColumnIntegral)) {
Chris@630 263 int timecol = a;
Chris@630 264 if ((m_columnQualities[a] & ColumnIntegral) !=
Chris@630 265 (m_columnQualities[0] & ColumnIntegral)) {
Chris@630 266 timecol = b;
Chris@630 267 }
Chris@630 268 if (m_columnQualities[timecol] & ColumnIncreasing) {
Chris@630 269 // This shouldn't happen; should have been settled above
Chris@630 270 m_columnPurposes[timecol] = ColumnEndTime;
Chris@630 271 } else {
Chris@630 272 m_columnPurposes[timecol] = ColumnDuration;
Chris@630 273 }
Chris@630 274 --valueCount;
Chris@630 275 }
Chris@630 276 }
Chris@630 277 }
Chris@630 278
Chris@631 279 if (timingColumnCount > 1) {
Chris@631 280 m_modelType = TwoDimensionalModelWithDuration;
Chris@392 281 } else {
Chris@631 282 if (valueCount == 0) {
Chris@631 283 m_modelType = OneDimensionalModel;
Chris@631 284 } else if (valueCount == 1) {
Chris@631 285 m_modelType = TwoDimensionalModel;
Chris@631 286 } else {
Chris@631 287 m_modelType = ThreeDimensionalModel;
Chris@631 288 }
Chris@629 289 }
Chris@392 290
Chris@843 291 // cerr << "Estimated column purposes: ";
Chris@676 292 // for (int i = 0; i < m_columnCount; ++i) {
Chris@843 293 // cerr << int(m_columnPurposes[i]) << " ";
Chris@676 294 // }
Chris@843 295 // cerr << endl;
Chris@392 296
Chris@843 297 // cerr << "Estimated model type: " << m_modelType << endl;
Chris@843 298 // cerr << "Estimated timing type: " << m_timingType << endl;
Chris@843 299 // cerr << "Estimated units: " << m_timeUnits << endl;
Chris@392 300 }
Chris@392 301
Chris@631 302 CSVFormat::ColumnPurpose
Chris@631 303 CSVFormat::getColumnPurpose(int i)
Chris@631 304 {
Chris@631 305 while (m_columnPurposes.size() <= i) {
Chris@631 306 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 307 }
Chris@631 308 return m_columnPurposes[i];
Chris@631 309 }
Chris@629 310
Chris@631 311 CSVFormat::ColumnPurpose
Chris@631 312 CSVFormat::getColumnPurpose(int i) const
Chris@631 313 {
Chris@668 314 if (m_columnPurposes.size() <= i) {
Chris@668 315 return ColumnUnknown;
Chris@668 316 }
Chris@631 317 return m_columnPurposes[i];
Chris@631 318 }
Chris@631 319
Chris@631 320 void
Chris@631 321 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
Chris@631 322 {
Chris@631 323 while (m_columnPurposes.size() <= i) {
Chris@631 324 m_columnPurposes.push_back(ColumnUnknown);
Chris@631 325 }
Chris@631 326 m_columnPurposes[i] = p;
Chris@631 327 }
Chris@631 328
Chris@631 329
Chris@631 330
Chris@631 331