annotate data/fileio/CSVFileReader.cpp @ 631:3a5ee4b6c9ad

* Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns.
author Chris Cannam
date Mon, 19 Jul 2010 17:08:56 +0000
parents 001db550bd48
children 611a4fa14dde
rev   line source
Chris@148 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@148 2
Chris@148 3 /*
Chris@148 4 Sonic Visualiser
Chris@148 5 An audio file viewer and annotation editor.
Chris@148 6 Centre for Digital Music, Queen Mary, University of London.
Chris@148 7 This file copyright 2006 Chris Cannam.
Chris@148 8
Chris@148 9 This program is free software; you can redistribute it and/or
Chris@148 10 modify it under the terms of the GNU General Public License as
Chris@148 11 published by the Free Software Foundation; either version 2 of the
Chris@148 12 License, or (at your option) any later version. See the file
Chris@148 13 COPYING included with this distribution for more information.
Chris@148 14 */
Chris@148 15
Chris@148 16 #include "CSVFileReader.h"
Chris@148 17
Chris@150 18 #include "model/Model.h"
Chris@148 19 #include "base/RealTime.h"
Chris@631 20 #include "base/StringBits.h"
Chris@148 21 #include "model/SparseOneDimensionalModel.h"
Chris@148 22 #include "model/SparseTimeValueModel.h"
Chris@152 23 #include "model/EditableDenseThreeDimensionalModel.h"
Chris@628 24 #include "model/RegionModel.h"
Chris@308 25 #include "DataFileReaderFactory.h"
Chris@148 26
Chris@148 27 #include <QFile>
Chris@148 28 #include <QString>
Chris@148 29 #include <QRegExp>
Chris@148 30 #include <QStringList>
Chris@148 31 #include <QTextStream>
Chris@148 32
Chris@148 33 #include <iostream>
Chris@628 34 #include <map>
Chris@148 35
Chris@392 36 CSVFileReader::CSVFileReader(QString path, CSVFormat format,
Chris@392 37 size_t mainModelSampleRate) :
Chris@392 38 m_format(format),
Chris@148 39 m_file(0),
Chris@631 40 m_warnings(0),
Chris@148 41 m_mainModelSampleRate(mainModelSampleRate)
Chris@148 42 {
Chris@148 43 m_file = new QFile(path);
Chris@148 44 bool good = false;
Chris@148 45
Chris@148 46 if (!m_file->exists()) {
Chris@148 47 m_error = QFile::tr("File \"%1\" does not exist").arg(path);
Chris@148 48 } else if (!m_file->open(QIODevice::ReadOnly | QIODevice::Text)) {
Chris@148 49 m_error = QFile::tr("Failed to open file \"%1\"").arg(path);
Chris@148 50 } else {
Chris@148 51 good = true;
Chris@148 52 }
Chris@148 53
Chris@148 54 if (!good) {
Chris@148 55 delete m_file;
Chris@148 56 m_file = 0;
Chris@148 57 }
Chris@148 58 }
Chris@148 59
Chris@148 60 CSVFileReader::~CSVFileReader()
Chris@148 61 {
Chris@148 62 std::cerr << "CSVFileReader::~CSVFileReader: file is " << m_file << std::endl;
Chris@148 63
Chris@148 64 if (m_file) {
Chris@148 65 std::cerr << "CSVFileReader::CSVFileReader: Closing file" << std::endl;
Chris@148 66 m_file->close();
Chris@148 67 }
Chris@148 68 delete m_file;
Chris@148 69 }
Chris@148 70
Chris@148 71 bool
Chris@148 72 CSVFileReader::isOK() const
Chris@148 73 {
Chris@148 74 return (m_file != 0);
Chris@148 75 }
Chris@148 76
Chris@148 77 QString
Chris@148 78 CSVFileReader::getError() const
Chris@148 79 {
Chris@148 80 return m_error;
Chris@148 81 }
Chris@148 82
Chris@631 83 size_t
Chris@631 84 CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate,
Chris@631 85 size_t windowSize) const
Chris@631 86 {
Chris@631 87 QRegExp nonNumericRx("[^0-9eE.,+-]");
Chris@631 88 unsigned int warnLimit = 10;
Chris@631 89
Chris@631 90 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
Chris@631 91
Chris@631 92 size_t calculatedFrame = 0;
Chris@631 93
Chris@631 94 bool ok = false;
Chris@631 95 QString numeric = s;
Chris@631 96 numeric.remove(nonNumericRx);
Chris@631 97
Chris@631 98 if (timeUnits == CSVFormat::TimeSeconds) {
Chris@631 99
Chris@631 100 double time = numeric.toDouble(&ok);
Chris@631 101 if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
Chris@631 102 calculatedFrame = int(time * sampleRate + 0.5);
Chris@631 103
Chris@631 104 } else {
Chris@631 105
Chris@631 106 long n = numeric.toLong(&ok);
Chris@631 107 if (n >= 0) calculatedFrame = n;
Chris@631 108
Chris@631 109 if (timeUnits == CSVFormat::TimeWindows) {
Chris@631 110 calculatedFrame *= windowSize;
Chris@631 111 }
Chris@631 112 }
Chris@631 113
Chris@631 114 if (!ok) {
Chris@631 115 if (m_warnings < warnLimit) {
Chris@631 116 std::cerr << "WARNING: CSVFileReader::load: "
Chris@631 117 << "Bad time format (\"" << s.toStdString()
Chris@631 118 << "\") in data line "
Chris@631 119 << lineno+1 << std::endl;
Chris@631 120 } else if (m_warnings == warnLimit) {
Chris@631 121 std::cerr << "WARNING: Too many warnings" << std::endl;
Chris@631 122 }
Chris@631 123 ++m_warnings;
Chris@631 124 }
Chris@631 125
Chris@631 126 return calculatedFrame;
Chris@631 127 }
Chris@631 128
Chris@148 129 Model *
Chris@148 130 CSVFileReader::load() const
Chris@148 131 {
Chris@148 132 if (!m_file) return 0;
Chris@148 133
Chris@628 134 CSVFormat::ModelType modelType = m_format.getModelType();
Chris@392 135 CSVFormat::TimingType timingType = m_format.getTimingType();
Chris@628 136 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
Chris@392 137 size_t sampleRate = m_format.getSampleRate();
Chris@392 138 size_t windowSize = m_format.getWindowSize();
Chris@631 139 QChar separator = m_format.getSeparator();
Chris@631 140 bool allowQuoting = m_format.getAllowQuoting();
Chris@148 141
Chris@392 142 if (timingType == CSVFormat::ExplicitTiming) {
Chris@611 143 if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@611 144 // This will be overridden later if more than one line
Chris@611 145 // appears in our file, but we want to choose a default
Chris@611 146 // that's likely to be visible
Chris@611 147 windowSize = 1024;
Chris@611 148 } else {
Chris@611 149 windowSize = 1;
Chris@611 150 }
Chris@392 151 if (timeUnits == CSVFormat::TimeSeconds) {
Chris@148 152 sampleRate = m_mainModelSampleRate;
Chris@148 153 }
Chris@148 154 }
Chris@148 155
Chris@148 156 SparseOneDimensionalModel *model1 = 0;
Chris@148 157 SparseTimeValueModel *model2 = 0;
Chris@628 158 RegionModel *model2a = 0;
Chris@152 159 EditableDenseThreeDimensionalModel *model3 = 0;
Chris@148 160 Model *model = 0;
Chris@148 161
Chris@148 162 QTextStream in(m_file);
Chris@148 163 in.seek(0);
Chris@148 164
Chris@148 165 unsigned int warnings = 0, warnLimit = 10;
Chris@148 166 unsigned int lineno = 0;
Chris@148 167
Chris@148 168 float min = 0.0, max = 0.0;
Chris@148 169
Chris@148 170 size_t frameNo = 0;
Chris@628 171 size_t duration = 0;
Chris@631 172 size_t endFrame = 0;
Chris@631 173
Chris@631 174 bool haveAnyValue = false;
Chris@631 175 bool haveEndTime = false;
Chris@631 176
Chris@611 177 size_t startFrame = 0; // for calculation of dense model resolution
Chris@631 178 bool firstEverValue = true;
Chris@148 179
Chris@631 180 std::map<QString, int> labelCountMap;
Chris@631 181
Chris@148 182 while (!in.atEnd()) {
Chris@148 183
Chris@283 184 // QTextStream's readLine doesn't cope with old-style Mac
Chris@283 185 // CR-only line endings. Why did they bother making the class
Chris@283 186 // cope with more than one sort of line ending, if it still
Chris@283 187 // can't be configured to cope with all the common sorts?
Chris@148 188
Chris@283 189 // For the time being we'll deal with this case (which is
Chris@283 190 // relatively uncommon for us, but still necessary to handle)
Chris@283 191 // by reading the entire file using a single readLine, and
Chris@283 192 // splitting it. For CR and CR/LF line endings this will just
Chris@283 193 // read a line at a time, and that's obviously OK.
Chris@148 194
Chris@283 195 QString chunk = in.readLine();
Chris@283 196 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@283 197
Chris@283 198 for (size_t li = 0; li < lines.size(); ++li) {
Chris@148 199
Chris@283 200 QString line = lines[li];
Chris@148 201
Chris@283 202 if (line.startsWith("#")) continue;
Chris@283 203
Chris@631 204 QStringList list = StringBits::split(line, separator, allowQuoting);
Chris@283 205 if (!model) {
Chris@283 206
Chris@283 207 switch (modelType) {
Chris@283 208
Chris@392 209 case CSVFormat::OneDimensionalModel:
Chris@283 210 model1 = new SparseOneDimensionalModel(sampleRate, windowSize);
Chris@283 211 model = model1;
Chris@283 212 break;
Chris@148 213
Chris@392 214 case CSVFormat::TwoDimensionalModel:
Chris@283 215 model2 = new SparseTimeValueModel(sampleRate, windowSize, false);
Chris@283 216 model = model2;
Chris@283 217 break;
Chris@148 218
Chris@628 219 case CSVFormat::TwoDimensionalModelWithDuration:
Chris@628 220 model2a = new RegionModel(sampleRate, windowSize, false);
Chris@628 221 model = model2a;
Chris@628 222 break;
Chris@628 223
Chris@392 224 case CSVFormat::ThreeDimensionalModel:
Chris@535 225 model3 = new EditableDenseThreeDimensionalModel
Chris@535 226 (sampleRate,
Chris@535 227 windowSize,
Chris@535 228 list.size(),
Chris@535 229 EditableDenseThreeDimensionalModel::NoCompression);
Chris@283 230 model = model3;
Chris@283 231 break;
Chris@283 232 }
Chris@283 233 }
Chris@148 234
Chris@631 235 float value = 0.f;
Chris@631 236 QString label = "";
Chris@148 237
Chris@631 238 duration = 0.f;
Chris@631 239 haveEndTime = false;
Chris@628 240
Chris@283 241 for (int i = 0; i < list.size(); ++i) {
Chris@148 242
Chris@631 243 QString s = list[i];
Chris@631 244
Chris@631 245 CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i);
Chris@631 246
Chris@631 247 switch (purpose) {
Chris@631 248
Chris@631 249 case CSVFormat::ColumnUnknown:
Chris@631 250 break;
Chris@631 251
Chris@631 252 case CSVFormat::ColumnStartTime:
Chris@631 253 frameNo = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 254 break;
Chris@631 255
Chris@631 256 case CSVFormat::ColumnEndTime:
Chris@631 257 endFrame = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 258 haveEndTime = true;
Chris@631 259 break;
Chris@631 260
Chris@631 261 case CSVFormat::ColumnDuration:
Chris@631 262 duration = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 263 break;
Chris@631 264
Chris@631 265 case CSVFormat::ColumnValue:
Chris@631 266 value = s.toFloat();
Chris@631 267 haveAnyValue = true;
Chris@631 268 break;
Chris@631 269
Chris@631 270 case CSVFormat::ColumnLabel:
Chris@631 271 label = s;
Chris@631 272 ++labelCountMap[label];
Chris@631 273 break;
Chris@283 274 }
Chris@631 275 }
Chris@148 276
Chris@631 277 if (haveEndTime) { // ... calculate duration now all cols read
Chris@631 278 if (endFrame > frameNo) {
Chris@631 279 duration = endFrame - frameNo;
Chris@628 280 }
Chris@283 281 }
Chris@148 282
Chris@392 283 if (modelType == CSVFormat::OneDimensionalModel) {
Chris@148 284
Chris@631 285 SparseOneDimensionalModel::Point point(frameNo, label);
Chris@283 286 model1->addPoint(point);
Chris@148 287
Chris@392 288 } else if (modelType == CSVFormat::TwoDimensionalModel) {
Chris@148 289
Chris@631 290 SparseTimeValueModel::Point point(frameNo, value, label);
Chris@283 291 model2->addPoint(point);
Chris@148 292
Chris@628 293 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
Chris@628 294
Chris@631 295 RegionModel::Point point(frameNo, value, duration, label);
Chris@628 296 model2a->addPoint(point);
Chris@628 297
Chris@392 298 } else if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@148 299
Chris@283 300 DenseThreeDimensionalModel::Column values;
Chris@148 301
Chris@631 302 for (int i = 0; i < list.size(); ++i) {
Chris@148 303
Chris@283 304 bool ok = false;
Chris@283 305 float value = list[i].toFloat(&ok);
Chris@611 306
Chris@631 307 if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) {
Chris@611 308 values.push_back(value);
Chris@611 309 }
Chris@148 310
Chris@631 311 if (firstEverValue || value < min) min = value;
Chris@631 312 if (firstEverValue || value > max) max = value;
Chris@611 313
Chris@631 314 if (firstEverValue) {
Chris@611 315 startFrame = frameNo;
Chris@611 316 model3->setStartFrame(startFrame);
Chris@611 317 } else if (lineno == 1 &&
Chris@611 318 timingType == CSVFormat::ExplicitTiming) {
Chris@611 319 model3->setResolution(frameNo - startFrame);
Chris@611 320 }
Chris@631 321
Chris@631 322 firstEverValue = false;
Chris@148 323
Chris@283 324 if (!ok) {
Chris@283 325 if (warnings < warnLimit) {
Chris@283 326 std::cerr << "WARNING: CSVFileReader::load: "
Chris@390 327 << "Non-numeric value \""
Chris@390 328 << list[i].toStdString()
Chris@491 329 << "\" in data line " << lineno+1
Chris@283 330 << ":" << std::endl;
Chris@283 331 std::cerr << line.toStdString() << std::endl;
Chris@283 332 ++warnings;
Chris@283 333 } else if (warnings == warnLimit) {
Chris@390 334 // std::cerr << "WARNING: Too many warnings" << std::endl;
Chris@283 335 }
Chris@283 336 }
Chris@283 337 }
Chris@148 338
Chris@390 339 // std::cerr << "Setting bin values for count " << lineno << ", frame "
Chris@390 340 // << frameNo << ", time " << RealTime::frame2RealTime(frameNo, sampleRate) << std::endl;
Chris@148 341
Chris@611 342 model3->setColumn(lineno, values);
Chris@283 343 }
Chris@148 344
Chris@283 345 ++lineno;
Chris@392 346 if (timingType == CSVFormat::ImplicitTiming ||
Chris@283 347 list.size() == 0) {
Chris@283 348 frameNo += windowSize;
Chris@283 349 }
Chris@283 350 }
Chris@148 351 }
Chris@148 352
Chris@631 353 if (!haveAnyValue) {
Chris@631 354 if (model2a) {
Chris@631 355 // assign values for regions based on label frequency; we
Chris@631 356 // have this in our labelCountMap, sort of
Chris@631 357
Chris@631 358 std::map<int, std::map<QString, float> > countLabelValueMap;
Chris@631 359 for (std::map<QString, int>::iterator i = labelCountMap.begin();
Chris@631 360 i != labelCountMap.end(); ++i) {
Chris@631 361 countLabelValueMap[i->second][i->first] = 0.f;
Chris@631 362 }
Chris@631 363
Chris@631 364 float v = 0.f;
Chris@631 365 for (std::map<int, std::map<QString, float> >::iterator i =
Chris@631 366 countLabelValueMap.end(); i != countLabelValueMap.begin(); ) {
Chris@631 367 --i;
Chris@631 368 for (std::map<QString, float>::iterator j = i->second.begin();
Chris@631 369 j != i->second.end(); ++j) {
Chris@631 370 j->second = v;
Chris@631 371 v = v + 1.f;
Chris@631 372 }
Chris@631 373 }
Chris@631 374
Chris@631 375 std::map<RegionModel::Point, RegionModel::Point,
Chris@631 376 RegionModel::Point::Comparator> pointMap;
Chris@631 377 for (RegionModel::PointList::const_iterator i =
Chris@631 378 model2a->getPoints().begin();
Chris@631 379 i != model2a->getPoints().end(); ++i) {
Chris@631 380 RegionModel::Point p(*i);
Chris@631 381 v = countLabelValueMap[labelCountMap[p.label]][p.label];
Chris@631 382 RegionModel::Point pp(p.frame, v, p.duration, p.label);
Chris@631 383 pointMap[p] = pp;
Chris@631 384 }
Chris@631 385
Chris@631 386 for (std::map<RegionModel::Point, RegionModel::Point>::iterator i =
Chris@631 387 pointMap.begin(); i != pointMap.end(); ++i) {
Chris@631 388 model2a->deletePoint(i->first);
Chris@631 389 model2a->addPoint(i->second);
Chris@631 390 }
Chris@631 391 }
Chris@631 392 }
Chris@631 393
Chris@392 394 if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@148 395 model3->setMinimumLevel(min);
Chris@148 396 model3->setMaximumLevel(max);
Chris@148 397 }
Chris@148 398
Chris@148 399 return model;
Chris@148 400 }
Chris@148 401