annotate data/fileio/CSVFileReader.cpp @ 630:11a664058dd8

* Start revamping the CSV import dialog so as to show a "purpose" for each column. These are estimated from the file now, but changing them does not actually do anything yet.
author Chris Cannam
date Fri, 16 Jul 2010 16:51:39 +0000
parents 001db550bd48
children 3a5ee4b6c9ad
rev   line source
Chris@148 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@148 2
Chris@148 3 /*
Chris@148 4 Sonic Visualiser
Chris@148 5 An audio file viewer and annotation editor.
Chris@148 6 Centre for Digital Music, Queen Mary, University of London.
Chris@148 7 This file copyright 2006 Chris Cannam.
Chris@148 8
Chris@148 9 This program is free software; you can redistribute it and/or
Chris@148 10 modify it under the terms of the GNU General Public License as
Chris@148 11 published by the Free Software Foundation; either version 2 of the
Chris@148 12 License, or (at your option) any later version. See the file
Chris@148 13 COPYING included with this distribution for more information.
Chris@148 14 */
Chris@148 15
Chris@148 16 #include "CSVFileReader.h"
Chris@148 17
Chris@150 18 #include "model/Model.h"
Chris@148 19 #include "base/RealTime.h"
Chris@148 20 #include "model/SparseOneDimensionalModel.h"
Chris@148 21 #include "model/SparseTimeValueModel.h"
Chris@152 22 #include "model/EditableDenseThreeDimensionalModel.h"
Chris@628 23 #include "model/RegionModel.h"
Chris@308 24 #include "DataFileReaderFactory.h"
Chris@148 25
Chris@148 26 #include <QFile>
Chris@148 27 #include <QString>
Chris@148 28 #include <QRegExp>
Chris@148 29 #include <QStringList>
Chris@148 30 #include <QTextStream>
Chris@148 31
Chris@148 32 #include <iostream>
Chris@628 33 #include <map>
Chris@148 34
Chris@392 35 CSVFileReader::CSVFileReader(QString path, CSVFormat format,
Chris@392 36 size_t mainModelSampleRate) :
Chris@392 37 m_format(format),
Chris@148 38 m_file(0),
Chris@148 39 m_mainModelSampleRate(mainModelSampleRate)
Chris@148 40 {
Chris@148 41 m_file = new QFile(path);
Chris@148 42 bool good = false;
Chris@148 43
Chris@148 44 if (!m_file->exists()) {
Chris@148 45 m_error = QFile::tr("File \"%1\" does not exist").arg(path);
Chris@148 46 } else if (!m_file->open(QIODevice::ReadOnly | QIODevice::Text)) {
Chris@148 47 m_error = QFile::tr("Failed to open file \"%1\"").arg(path);
Chris@148 48 } else {
Chris@148 49 good = true;
Chris@148 50 }
Chris@148 51
Chris@148 52 if (!good) {
Chris@148 53 delete m_file;
Chris@148 54 m_file = 0;
Chris@148 55 }
Chris@148 56 }
Chris@148 57
Chris@148 58 CSVFileReader::~CSVFileReader()
Chris@148 59 {
Chris@148 60 std::cerr << "CSVFileReader::~CSVFileReader: file is " << m_file << std::endl;
Chris@148 61
Chris@148 62 if (m_file) {
Chris@148 63 std::cerr << "CSVFileReader::CSVFileReader: Closing file" << std::endl;
Chris@148 64 m_file->close();
Chris@148 65 }
Chris@148 66 delete m_file;
Chris@148 67 }
Chris@148 68
Chris@148 69 bool
Chris@148 70 CSVFileReader::isOK() const
Chris@148 71 {
Chris@148 72 return (m_file != 0);
Chris@148 73 }
Chris@148 74
Chris@148 75 QString
Chris@148 76 CSVFileReader::getError() const
Chris@148 77 {
Chris@148 78 return m_error;
Chris@148 79 }
Chris@148 80
Chris@148 81 Model *
Chris@148 82 CSVFileReader::load() const
Chris@148 83 {
Chris@148 84 if (!m_file) return 0;
Chris@392 85 /*!!!
Chris@148 86 CSVFormatDialog *dialog = new CSVFormatDialog
Chris@148 87 (0, m_file, m_mainModelSampleRate);
Chris@148 88
Chris@148 89 if (dialog->exec() == QDialog::Rejected) {
Chris@148 90 delete dialog;
Chris@308 91 throw DataFileReaderFactory::ImportCancelled;
Chris@148 92 }
Chris@392 93 */
Chris@148 94
Chris@628 95 CSVFormat::ModelType modelType = m_format.getModelType();
Chris@392 96 CSVFormat::TimingType timingType = m_format.getTimingType();
Chris@628 97 CSVFormat::DurationType durationType = m_format.getDurationType();
Chris@628 98 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
Chris@392 99 QString separator = m_format.getSeparator();
Chris@392 100 QString::SplitBehavior behaviour = m_format.getSplitBehaviour();
Chris@392 101 size_t sampleRate = m_format.getSampleRate();
Chris@392 102 size_t windowSize = m_format.getWindowSize();
Chris@148 103
Chris@392 104 if (timingType == CSVFormat::ExplicitTiming) {
Chris@611 105 if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@611 106 // This will be overridden later if more than one line
Chris@611 107 // appears in our file, but we want to choose a default
Chris@611 108 // that's likely to be visible
Chris@611 109 windowSize = 1024;
Chris@611 110 } else {
Chris@611 111 windowSize = 1;
Chris@611 112 }
Chris@392 113 if (timeUnits == CSVFormat::TimeSeconds) {
Chris@148 114 sampleRate = m_mainModelSampleRate;
Chris@148 115 }
Chris@148 116 }
Chris@148 117
Chris@148 118 SparseOneDimensionalModel *model1 = 0;
Chris@148 119 SparseTimeValueModel *model2 = 0;
Chris@628 120 RegionModel *model2a = 0;
Chris@152 121 EditableDenseThreeDimensionalModel *model3 = 0;
Chris@148 122 Model *model = 0;
Chris@148 123
Chris@148 124 QTextStream in(m_file);
Chris@148 125 in.seek(0);
Chris@148 126
Chris@148 127 unsigned int warnings = 0, warnLimit = 10;
Chris@148 128 unsigned int lineno = 0;
Chris@148 129
Chris@148 130 float min = 0.0, max = 0.0;
Chris@148 131
Chris@148 132 size_t frameNo = 0;
Chris@628 133 size_t duration = 0;
Chris@611 134 size_t startFrame = 0; // for calculation of dense model resolution
Chris@148 135
Chris@628 136 std::map<QString, float> labelValueMap;
Chris@628 137 float syntheticMax = 0.f;
Chris@628 138
Chris@148 139 while (!in.atEnd()) {
Chris@148 140
Chris@283 141 // QTextStream's readLine doesn't cope with old-style Mac
Chris@283 142 // CR-only line endings. Why did they bother making the class
Chris@283 143 // cope with more than one sort of line ending, if it still
Chris@283 144 // can't be configured to cope with all the common sorts?
Chris@148 145
Chris@283 146 // For the time being we'll deal with this case (which is
Chris@283 147 // relatively uncommon for us, but still necessary to handle)
Chris@283 148 // by reading the entire file using a single readLine, and
Chris@283 149 // splitting it. For CR and CR/LF line endings this will just
Chris@283 150 // read a line at a time, and that's obviously OK.
Chris@148 151
Chris@283 152 QString chunk = in.readLine();
Chris@283 153 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@283 154
Chris@283 155 for (size_t li = 0; li < lines.size(); ++li) {
Chris@148 156
Chris@283 157 QString line = lines[li];
Chris@148 158
Chris@283 159 if (line.startsWith("#")) continue;
Chris@283 160
Chris@390 161 QStringList list = line.split(separator, behaviour);
Chris@283 162
Chris@283 163 if (!model) {
Chris@283 164
Chris@283 165 switch (modelType) {
Chris@283 166
Chris@392 167 case CSVFormat::OneDimensionalModel:
Chris@283 168 model1 = new SparseOneDimensionalModel(sampleRate, windowSize);
Chris@283 169 model = model1;
Chris@283 170 break;
Chris@148 171
Chris@392 172 case CSVFormat::TwoDimensionalModel:
Chris@283 173 model2 = new SparseTimeValueModel(sampleRate, windowSize, false);
Chris@283 174 model = model2;
Chris@283 175 break;
Chris@148 176
Chris@628 177 case CSVFormat::TwoDimensionalModelWithDuration:
Chris@628 178 model2a = new RegionModel(sampleRate, windowSize, false);
Chris@628 179 model = model2a;
Chris@628 180 break;
Chris@628 181
Chris@392 182 case CSVFormat::ThreeDimensionalModel:
Chris@535 183 model3 = new EditableDenseThreeDimensionalModel
Chris@535 184 (sampleRate,
Chris@535 185 windowSize,
Chris@535 186 list.size(),
Chris@535 187 EditableDenseThreeDimensionalModel::NoCompression);
Chris@283 188 model = model3;
Chris@283 189 break;
Chris@283 190 }
Chris@283 191 }
Chris@148 192
Chris@283 193 QStringList tidyList;
Chris@390 194 QRegExp nonNumericRx("[^0-9eE.,+-]");
Chris@148 195
Chris@628 196 float value = 0.f;
Chris@628 197
Chris@283 198 for (int i = 0; i < list.size(); ++i) {
Chris@148 199
Chris@283 200 QString s(list[i].trimmed());
Chris@148 201
Chris@283 202 if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) {
Chris@283 203 s = s.mid(1, s.length() - 2);
Chris@283 204 } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) {
Chris@283 205 s = s.mid(1, s.length() - 2);
Chris@283 206 }
Chris@148 207
Chris@628 208 if (timingType == CSVFormat::ExplicitTiming) {
Chris@148 209
Chris@628 210 size_t calculatedFrame = 0;
Chris@628 211
Chris@628 212 if (i == 0 ||
Chris@628 213 (i == 1 &&
Chris@628 214 modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
Chris@628 215
Chris@628 216 bool ok = false;
Chris@628 217 QString numeric = s;
Chris@628 218 numeric.remove(nonNumericRx);
Chris@628 219
Chris@628 220 if (timeUnits == CSVFormat::TimeSeconds) {
Chris@628 221
Chris@628 222 double time = numeric.toDouble(&ok);
Chris@628 223 calculatedFrame = int(time * sampleRate + 0.5);
Chris@628 224
Chris@628 225 } else {
Chris@628 226
Chris@628 227 calculatedFrame = numeric.toInt(&ok);
Chris@628 228
Chris@628 229 if (timeUnits == CSVFormat::TimeWindows) {
Chris@628 230 calculatedFrame *= windowSize;
Chris@628 231 }
Chris@628 232 }
Chris@628 233
Chris@628 234 if (!ok) {
Chris@628 235 if (warnings < warnLimit) {
Chris@628 236 std::cerr << "WARNING: CSVFileReader::load: "
Chris@628 237 << "Bad time format (\"" << s.toStdString()
Chris@628 238 << "\") in data line "
Chris@628 239 << lineno+1 << ":" << std::endl;
Chris@628 240 std::cerr << line.toStdString() << std::endl;
Chris@628 241 } else if (warnings == warnLimit) {
Chris@628 242 std::cerr << "WARNING: Too many warnings" << std::endl;
Chris@628 243 }
Chris@628 244 ++warnings;
Chris@628 245 }
Chris@628 246
Chris@628 247 if (i == 0) frameNo = calculatedFrame;
Chris@628 248 else {
Chris@628 249 if (durationType == CSVFormat::EndTimes) {
Chris@628 250 duration = calculatedFrame - frameNo;
Chris@628 251 } else {
Chris@628 252 duration = calculatedFrame;
Chris@628 253 }
Chris@628 254 }
Chris@628 255
Chris@628 256 continue;
Chris@628 257 }
Chris@628 258 }
Chris@628 259
Chris@628 260 if ((i == 1 &&
Chris@628 261 modelType == CSVFormat::TwoDimensionalModel) ||
Chris@628 262 (i == 2 &&
Chris@628 263 modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
Chris@283 264 bool ok = false;
Chris@628 265 value = s.toFloat(&ok);
Chris@628 266 if (!ok) {
Chris@628 267 // cf. RDFImporter::fillModel
Chris@628 268 if (labelValueMap.find(s) == labelValueMap.end()) {
Chris@628 269 syntheticMax = syntheticMax + 1.f;
Chris@628 270 labelValueMap[s] = syntheticMax;
Chris@628 271 }
Chris@628 272 value = labelValueMap[s];
Chris@628 273 } else {
Chris@628 274 if (value > syntheticMax) syntheticMax = value;
Chris@628 275 }
Chris@628 276 if (i + 1 == list.size()) {
Chris@628 277 // keep text around for use as label (none other given)
Chris@628 278 tidyList.push_back(s);
Chris@628 279 }
Chris@628 280 continue;
Chris@628 281 }
Chris@148 282
Chris@628 283 tidyList.push_back(s);
Chris@283 284 }
Chris@148 285
Chris@392 286 if (modelType == CSVFormat::OneDimensionalModel) {
Chris@148 287
Chris@283 288 SparseOneDimensionalModel::Point point
Chris@283 289 (frameNo,
Chris@283 290 tidyList.size() > 0 ? tidyList[tidyList.size()-1] :
Chris@491 291 QString("%1").arg(lineno+1));
Chris@148 292
Chris@283 293 model1->addPoint(point);
Chris@148 294
Chris@392 295 } else if (modelType == CSVFormat::TwoDimensionalModel) {
Chris@148 296
Chris@283 297 SparseTimeValueModel::Point point
Chris@283 298 (frameNo,
Chris@628 299 value,
Chris@628 300 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
Chris@148 301
Chris@283 302 model2->addPoint(point);
Chris@148 303
Chris@628 304 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
Chris@628 305
Chris@628 306 RegionModel::Point point
Chris@628 307 (frameNo,
Chris@628 308 value,
Chris@628 309 duration,
Chris@628 310 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
Chris@628 311
Chris@628 312 model2a->addPoint(point);
Chris@628 313
Chris@392 314 } else if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@148 315
Chris@283 316 DenseThreeDimensionalModel::Column values;
Chris@148 317
Chris@283 318 for (int i = 0; i < tidyList.size(); ++i) {
Chris@148 319
Chris@283 320 bool ok = false;
Chris@283 321 float value = list[i].toFloat(&ok);
Chris@611 322
Chris@611 323 if (i > 0 || timingType != CSVFormat::ExplicitTiming) {
Chris@611 324 values.push_back(value);
Chris@611 325 }
Chris@148 326
Chris@611 327 bool firstEver = (lineno == 0 && i == 0);
Chris@611 328
Chris@611 329 if (firstEver || value < min) min = value;
Chris@611 330 if (firstEver || value > max) max = value;
Chris@611 331
Chris@611 332 if (firstEver) {
Chris@611 333 startFrame = frameNo;
Chris@611 334 model3->setStartFrame(startFrame);
Chris@611 335 } else if (lineno == 1 &&
Chris@611 336 timingType == CSVFormat::ExplicitTiming) {
Chris@611 337 model3->setResolution(frameNo - startFrame);
Chris@611 338 }
Chris@148 339
Chris@283 340 if (!ok) {
Chris@283 341 if (warnings < warnLimit) {
Chris@283 342 std::cerr << "WARNING: CSVFileReader::load: "
Chris@390 343 << "Non-numeric value \""
Chris@390 344 << list[i].toStdString()
Chris@491 345 << "\" in data line " << lineno+1
Chris@283 346 << ":" << std::endl;
Chris@283 347 std::cerr << line.toStdString() << std::endl;
Chris@283 348 ++warnings;
Chris@283 349 } else if (warnings == warnLimit) {
Chris@390 350 // std::cerr << "WARNING: Too many warnings" << std::endl;
Chris@283 351 }
Chris@283 352 }
Chris@283 353 }
Chris@148 354
Chris@390 355 // std::cerr << "Setting bin values for count " << lineno << ", frame "
Chris@390 356 // << frameNo << ", time " << RealTime::frame2RealTime(frameNo, sampleRate) << std::endl;
Chris@148 357
Chris@611 358 model3->setColumn(lineno, values);
Chris@283 359 }
Chris@148 360
Chris@283 361 ++lineno;
Chris@392 362 if (timingType == CSVFormat::ImplicitTiming ||
Chris@283 363 list.size() == 0) {
Chris@283 364 frameNo += windowSize;
Chris@283 365 }
Chris@283 366 }
Chris@148 367 }
Chris@148 368
Chris@392 369 if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@148 370 model3->setMinimumLevel(min);
Chris@148 371 model3->setMaximumLevel(max);
Chris@148 372 }
Chris@148 373
Chris@148 374 return model;
Chris@148 375 }
Chris@148 376