annotate data/fileio/CSVFileReader.cpp @ 1078:ce82bcdc95d0

Fail upfront if the file is going to be too large. We expect the caller to split up large data sets into several MatrixFiles
author Chris Cannam
date Wed, 10 Jun 2015 13:10:26 +0100
parents 26cf6d5251ec
children ed207f89aaef e22bfe8ca248
rev   line source
Chris@148 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@148 2
Chris@148 3 /*
Chris@148 4 Sonic Visualiser
Chris@148 5 An audio file viewer and annotation editor.
Chris@148 6 Centre for Digital Music, Queen Mary, University of London.
Chris@148 7 This file copyright 2006 Chris Cannam.
Chris@148 8
Chris@148 9 This program is free software; you can redistribute it and/or
Chris@148 10 modify it under the terms of the GNU General Public License as
Chris@148 11 published by the Free Software Foundation; either version 2 of the
Chris@148 12 License, or (at your option) any later version. See the file
Chris@148 13 COPYING included with this distribution for more information.
Chris@148 14 */
Chris@148 15
Chris@148 16 #include "CSVFileReader.h"
Chris@148 17
Chris@150 18 #include "model/Model.h"
Chris@148 19 #include "base/RealTime.h"
Chris@631 20 #include "base/StringBits.h"
Chris@148 21 #include "model/SparseOneDimensionalModel.h"
Chris@148 22 #include "model/SparseTimeValueModel.h"
Chris@152 23 #include "model/EditableDenseThreeDimensionalModel.h"
Chris@628 24 #include "model/RegionModel.h"
Chris@897 25 #include "model/NoteModel.h"
Chris@308 26 #include "DataFileReaderFactory.h"
Chris@148 27
Chris@148 28 #include <QFile>
Chris@1030 29 #include <QFileInfo>
Chris@148 30 #include <QString>
Chris@148 31 #include <QRegExp>
Chris@148 32 #include <QStringList>
Chris@148 33 #include <QTextStream>
Chris@148 34
Chris@148 35 #include <iostream>
Chris@628 36 #include <map>
Chris@148 37
Chris@392 38 CSVFileReader::CSVFileReader(QString path, CSVFormat format,
Chris@1047 39 sv_samplerate_t mainModelSampleRate) :
Chris@392 40 m_format(format),
Chris@1009 41 m_device(0),
Chris@1009 42 m_ownDevice(true),
Chris@631 43 m_warnings(0),
Chris@148 44 m_mainModelSampleRate(mainModelSampleRate)
Chris@148 45 {
Chris@1009 46 QFile *file = new QFile(path);
Chris@148 47 bool good = false;
Chris@148 48
Chris@1009 49 if (!file->exists()) {
Chris@148 50 m_error = QFile::tr("File \"%1\" does not exist").arg(path);
Chris@1009 51 } else if (!file->open(QIODevice::ReadOnly | QIODevice::Text)) {
Chris@148 52 m_error = QFile::tr("Failed to open file \"%1\"").arg(path);
Chris@148 53 } else {
Chris@148 54 good = true;
Chris@148 55 }
Chris@148 56
Chris@1009 57 if (good) {
Chris@1009 58 m_device = file;
Chris@1030 59 m_filename = QFileInfo(path).fileName();
Chris@1009 60 } else {
Chris@1009 61 delete file;
Chris@148 62 }
Chris@148 63 }
Chris@148 64
Chris@1009 65 CSVFileReader::CSVFileReader(QIODevice *device, CSVFormat format,
Chris@1047 66 sv_samplerate_t mainModelSampleRate) :
Chris@1009 67 m_format(format),
Chris@1009 68 m_device(device),
Chris@1009 69 m_ownDevice(false),
Chris@1009 70 m_warnings(0),
Chris@1009 71 m_mainModelSampleRate(mainModelSampleRate)
Chris@1009 72 {
Chris@1009 73 }
Chris@1009 74
Chris@148 75 CSVFileReader::~CSVFileReader()
Chris@148 76 {
Chris@1009 77 SVDEBUG << "CSVFileReader::~CSVFileReader: device is " << m_device << endl;
Chris@148 78
Chris@1009 79 if (m_device && m_ownDevice) {
Chris@1009 80 SVDEBUG << "CSVFileReader::CSVFileReader: Closing device" << endl;
Chris@1009 81 m_device->close();
Chris@1009 82 delete m_device;
Chris@148 83 }
Chris@148 84 }
Chris@148 85
Chris@148 86 bool
Chris@148 87 CSVFileReader::isOK() const
Chris@148 88 {
Chris@1009 89 return (m_device != 0);
Chris@148 90 }
Chris@148 91
Chris@148 92 QString
Chris@148 93 CSVFileReader::getError() const
Chris@148 94 {
Chris@148 95 return m_error;
Chris@148 96 }
Chris@148 97
Chris@1038 98 sv_frame_t
Chris@1047 99 CSVFileReader::convertTimeValue(QString s, int lineno,
Chris@1047 100 sv_samplerate_t sampleRate,
Chris@929 101 int windowSize) const
Chris@631 102 {
Chris@631 103 QRegExp nonNumericRx("[^0-9eE.,+-]");
Chris@897 104 int warnLimit = 10;
Chris@631 105
Chris@631 106 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
Chris@631 107
Chris@1038 108 sv_frame_t calculatedFrame = 0;
Chris@631 109
Chris@631 110 bool ok = false;
Chris@631 111 QString numeric = s;
Chris@631 112 numeric.remove(nonNumericRx);
Chris@631 113
Chris@631 114 if (timeUnits == CSVFormat::TimeSeconds) {
Chris@631 115
Chris@631 116 double time = numeric.toDouble(&ok);
Chris@631 117 if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
Chris@1038 118 calculatedFrame = sv_frame_t(time * sampleRate + 0.5);
Chris@990 119
Chris@990 120 } else if (timeUnits == CSVFormat::TimeMilliseconds) {
Chris@990 121
Chris@990 122 double time = numeric.toDouble(&ok);
Chris@990 123 if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
Chris@1038 124 calculatedFrame = sv_frame_t((time / 1000.0) * sampleRate + 0.5);
Chris@631 125
Chris@631 126 } else {
Chris@631 127
Chris@631 128 long n = numeric.toLong(&ok);
Chris@631 129 if (n >= 0) calculatedFrame = n;
Chris@631 130
Chris@631 131 if (timeUnits == CSVFormat::TimeWindows) {
Chris@631 132 calculatedFrame *= windowSize;
Chris@631 133 }
Chris@631 134 }
Chris@631 135
Chris@631 136 if (!ok) {
Chris@631 137 if (m_warnings < warnLimit) {
Chris@843 138 cerr << "WARNING: CSVFileReader::load: "
Chris@844 139 << "Bad time format (\"" << s
Chris@631 140 << "\") in data line "
Chris@843 141 << lineno+1 << endl;
Chris@631 142 } else if (m_warnings == warnLimit) {
Chris@843 143 cerr << "WARNING: Too many warnings" << endl;
Chris@631 144 }
Chris@631 145 ++m_warnings;
Chris@631 146 }
Chris@631 147
Chris@631 148 return calculatedFrame;
Chris@631 149 }
Chris@631 150
Chris@148 151 Model *
Chris@148 152 CSVFileReader::load() const
Chris@148 153 {
Chris@1009 154 if (!m_device) return 0;
Chris@148 155
Chris@628 156 CSVFormat::ModelType modelType = m_format.getModelType();
Chris@392 157 CSVFormat::TimingType timingType = m_format.getTimingType();
Chris@628 158 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
Chris@1047 159 sv_samplerate_t sampleRate = m_format.getSampleRate();
Chris@929 160 int windowSize = m_format.getWindowSize();
Chris@631 161 QChar separator = m_format.getSeparator();
Chris@631 162 bool allowQuoting = m_format.getAllowQuoting();
Chris@148 163
Chris@392 164 if (timingType == CSVFormat::ExplicitTiming) {
Chris@611 165 if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@611 166 // This will be overridden later if more than one line
Chris@611 167 // appears in our file, but we want to choose a default
Chris@611 168 // that's likely to be visible
Chris@611 169 windowSize = 1024;
Chris@611 170 } else {
Chris@611 171 windowSize = 1;
Chris@611 172 }
Chris@990 173 if (timeUnits == CSVFormat::TimeSeconds ||
Chris@990 174 timeUnits == CSVFormat::TimeMilliseconds) {
Chris@148 175 sampleRate = m_mainModelSampleRate;
Chris@148 176 }
Chris@148 177 }
Chris@148 178
Chris@148 179 SparseOneDimensionalModel *model1 = 0;
Chris@148 180 SparseTimeValueModel *model2 = 0;
Chris@628 181 RegionModel *model2a = 0;
Chris@897 182 NoteModel *model2b = 0;
Chris@152 183 EditableDenseThreeDimensionalModel *model3 = 0;
Chris@148 184 Model *model = 0;
Chris@148 185
Chris@1009 186 QTextStream in(m_device);
Chris@148 187
Chris@148 188 unsigned int warnings = 0, warnLimit = 10;
Chris@148 189 unsigned int lineno = 0;
Chris@148 190
Chris@148 191 float min = 0.0, max = 0.0;
Chris@148 192
Chris@1038 193 sv_frame_t frameNo = 0;
Chris@1038 194 sv_frame_t duration = 0;
Chris@1038 195 sv_frame_t endFrame = 0;
Chris@631 196
Chris@631 197 bool haveAnyValue = false;
Chris@631 198 bool haveEndTime = false;
Chris@897 199 bool pitchLooksLikeMIDI = true;
Chris@631 200
Chris@1038 201 sv_frame_t startFrame = 0; // for calculation of dense model resolution
Chris@631 202 bool firstEverValue = true;
Chris@148 203
Chris@631 204 std::map<QString, int> labelCountMap;
Chris@631 205
Chris@676 206 int valueColumns = 0;
Chris@676 207 for (int i = 0; i < m_format.getColumnCount(); ++i) {
Chris@676 208 if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) {
Chris@676 209 ++valueColumns;
Chris@676 210 }
Chris@676 211 }
Chris@676 212
Chris@148 213 while (!in.atEnd()) {
Chris@148 214
Chris@283 215 // QTextStream's readLine doesn't cope with old-style Mac
Chris@283 216 // CR-only line endings. Why did they bother making the class
Chris@283 217 // cope with more than one sort of line ending, if it still
Chris@283 218 // can't be configured to cope with all the common sorts?
Chris@148 219
Chris@283 220 // For the time being we'll deal with this case (which is
Chris@283 221 // relatively uncommon for us, but still necessary to handle)
Chris@283 222 // by reading the entire file using a single readLine, and
Chris@283 223 // splitting it. For CR and CR/LF line endings this will just
Chris@283 224 // read a line at a time, and that's obviously OK.
Chris@148 225
Chris@283 226 QString chunk = in.readLine();
Chris@283 227 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
Chris@283 228
Chris@897 229 for (int li = 0; li < lines.size(); ++li) {
Chris@148 230
Chris@283 231 QString line = lines[li];
Chris@1009 232
Chris@283 233 if (line.startsWith("#")) continue;
Chris@283 234
Chris@631 235 QStringList list = StringBits::split(line, separator, allowQuoting);
Chris@283 236 if (!model) {
Chris@283 237
Chris@283 238 switch (modelType) {
Chris@283 239
Chris@392 240 case CSVFormat::OneDimensionalModel:
Chris@283 241 model1 = new SparseOneDimensionalModel(sampleRate, windowSize);
Chris@283 242 model = model1;
Chris@283 243 break;
Chris@148 244
Chris@392 245 case CSVFormat::TwoDimensionalModel:
Chris@283 246 model2 = new SparseTimeValueModel(sampleRate, windowSize, false);
Chris@283 247 model = model2;
Chris@283 248 break;
Chris@148 249
Chris@628 250 case CSVFormat::TwoDimensionalModelWithDuration:
Chris@628 251 model2a = new RegionModel(sampleRate, windowSize, false);
Chris@628 252 model = model2a;
Chris@628 253 break;
Chris@628 254
Chris@897 255 case CSVFormat::TwoDimensionalModelWithDurationAndPitch:
Chris@897 256 model2b = new NoteModel(sampleRate, windowSize, false);
Chris@897 257 model = model2b;
Chris@897 258 break;
Chris@897 259
Chris@392 260 case CSVFormat::ThreeDimensionalModel:
Chris@535 261 model3 = new EditableDenseThreeDimensionalModel
Chris@535 262 (sampleRate,
Chris@535 263 windowSize,
Chris@676 264 valueColumns,
Chris@535 265 EditableDenseThreeDimensionalModel::NoCompression);
Chris@283 266 model = model3;
Chris@283 267 break;
Chris@283 268 }
Chris@1030 269
Chris@1030 270 if (model) {
Chris@1030 271 if (m_filename != "") {
Chris@1030 272 model->setObjectName(m_filename);
Chris@1030 273 }
Chris@1030 274 }
Chris@283 275 }
Chris@148 276
Chris@631 277 float value = 0.f;
Chris@897 278 float pitch = 0.f;
Chris@631 279 QString label = "";
Chris@148 280
Chris@631 281 duration = 0.f;
Chris@631 282 haveEndTime = false;
Chris@628 283
Chris@283 284 for (int i = 0; i < list.size(); ++i) {
Chris@148 285
Chris@631 286 QString s = list[i];
Chris@631 287
Chris@631 288 CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i);
Chris@631 289
Chris@631 290 switch (purpose) {
Chris@631 291
Chris@631 292 case CSVFormat::ColumnUnknown:
Chris@631 293 break;
Chris@631 294
Chris@631 295 case CSVFormat::ColumnStartTime:
Chris@631 296 frameNo = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 297 break;
Chris@631 298
Chris@631 299 case CSVFormat::ColumnEndTime:
Chris@631 300 endFrame = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 301 haveEndTime = true;
Chris@631 302 break;
Chris@631 303
Chris@631 304 case CSVFormat::ColumnDuration:
Chris@631 305 duration = convertTimeValue(s, lineno, sampleRate, windowSize);
Chris@631 306 break;
Chris@631 307
Chris@631 308 case CSVFormat::ColumnValue:
Chris@631 309 value = s.toFloat();
Chris@631 310 haveAnyValue = true;
Chris@631 311 break;
Chris@631 312
Chris@897 313 case CSVFormat::ColumnPitch:
Chris@897 314 pitch = s.toFloat();
Chris@897 315 if (pitch < 0.f || pitch > 127.f) {
Chris@897 316 pitchLooksLikeMIDI = false;
Chris@897 317 }
Chris@897 318 break;
Chris@897 319
Chris@631 320 case CSVFormat::ColumnLabel:
Chris@631 321 label = s;
Chris@631 322 ++labelCountMap[label];
Chris@631 323 break;
Chris@283 324 }
Chris@631 325 }
Chris@148 326
Chris@631 327 if (haveEndTime) { // ... calculate duration now all cols read
Chris@631 328 if (endFrame > frameNo) {
Chris@631 329 duration = endFrame - frameNo;
Chris@628 330 }
Chris@283 331 }
Chris@148 332
Chris@392 333 if (modelType == CSVFormat::OneDimensionalModel) {
Chris@148 334
Chris@631 335 SparseOneDimensionalModel::Point point(frameNo, label);
Chris@283 336 model1->addPoint(point);
Chris@148 337
Chris@392 338 } else if (modelType == CSVFormat::TwoDimensionalModel) {
Chris@148 339
Chris@631 340 SparseTimeValueModel::Point point(frameNo, value, label);
Chris@283 341 model2->addPoint(point);
Chris@148 342
Chris@628 343 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
Chris@628 344
Chris@631 345 RegionModel::Point point(frameNo, value, duration, label);
Chris@628 346 model2a->addPoint(point);
Chris@628 347
Chris@897 348 } else if (modelType == CSVFormat::TwoDimensionalModelWithDurationAndPitch) {
Chris@897 349
Chris@897 350 float level = ((value >= 0.f && value <= 1.f) ? value : 1.f);
Chris@897 351 NoteModel::Point point(frameNo, pitch, duration, level, label);
Chris@897 352 model2b->addPoint(point);
Chris@897 353
Chris@392 354 } else if (modelType == CSVFormat::ThreeDimensionalModel) {
Chris@148 355
Chris@283 356 DenseThreeDimensionalModel::Column values;
Chris@148 357
Chris@631 358 for (int i = 0; i < list.size(); ++i) {
Chris@148 359
Chris@676 360 if (m_format.getColumnPurpose(i) != CSVFormat::ColumnValue) {
Chris@676 361 continue;
Chris@676 362 }
Chris@676 363
Chris@283 364 bool ok = false;
Chris@283 365 float value = list[i].toFloat(&ok);
Chris@611 366
Chris@676 367 values.push_back(value);
Chris@148 368
Chris@631 369 if (firstEverValue || value < min) min = value;
Chris@631 370 if (firstEverValue || value > max) max = value;
Chris@676 371
Chris@631 372 if (firstEverValue) {
Chris@611 373 startFrame = frameNo;
Chris@611 374 model3->setStartFrame(startFrame);
Chris@611 375 } else if (lineno == 1 &&
Chris@611 376 timingType == CSVFormat::ExplicitTiming) {
Chris@1038 377 model3->setResolution(int(frameNo - startFrame));
Chris@611 378 }
Chris@631 379
Chris@631 380 firstEverValue = false;
Chris@148 381
Chris@283 382 if (!ok) {
Chris@283 383 if (warnings < warnLimit) {
Chris@843 384 cerr << "WARNING: CSVFileReader::load: "
Chris@390 385 << "Non-numeric value \""
Chris@844 386 << list[i]
Chris@491 387 << "\" in data line " << lineno+1
Chris@843 388 << ":" << endl;
Chris@843 389 cerr << line << endl;
Chris@283 390 ++warnings;
Chris@283 391 } else if (warnings == warnLimit) {
Chris@843 392 // cerr << "WARNING: Too many warnings" << endl;
Chris@283 393 }
Chris@283 394 }
Chris@283 395 }
Chris@148 396
Chris@690 397 // SVDEBUG << "Setting bin values for count " << lineno << ", frame "
Chris@687 398 // << frameNo << ", time " << RealTime::frame2RealTime(frameNo, sampleRate) << endl;
Chris@148 399
Chris@611 400 model3->setColumn(lineno, values);
Chris@283 401 }
Chris@148 402
Chris@283 403 ++lineno;
Chris@392 404 if (timingType == CSVFormat::ImplicitTiming ||
Chris@283 405 list.size() == 0) {
Chris@283 406 frameNo += windowSize;
Chris@283 407 }
Chris@283 408 }
Chris@148 409 }
Chris@148 410
Chris@631 411 if (!haveAnyValue) {
Chris@631 412 if (model2a) {
Chris@631 413 // assign values for regions based on label frequency; we
Chris@631 414 // have this in our labelCountMap, sort of
Chris@631 415
Chris@631 416 std::map<int, std::map<QString, float> > countLabelValueMap;
Chris@631 417 for (std::map<QString, int>::iterator i = labelCountMap.begin();
Chris@631 418 i != labelCountMap.end(); ++i) {
Chris@631 419 countLabelValueMap[i->second][i->first] = 0.f;
Chris@631 420 }
Chris@631 421
Chris@631 422 float v = 0.f;
Chris@631 423 for (std::map<int, std::map<QString, float> >::iterator i =
Chris@631 424 countLabelValueMap.end(); i != countLabelValueMap.begin(); ) {
Chris@631 425 --i;
Chris@631 426 for (std::map<QString, float>::iterator j = i->second.begin();
Chris@631 427 j != i->second.end(); ++j) {
Chris@631 428 j->second = v;
Chris@631 429 v = v + 1.f;
Chris@631 430 }
Chris@631 431 }
Chris@631 432
Chris@631 433 std::map<RegionModel::Point, RegionModel::Point,
Chris@631 434 RegionModel::Point::Comparator> pointMap;
Chris@631 435 for (RegionModel::PointList::const_iterator i =
Chris@631 436 model2a->getPoints().begin();
Chris@631 437 i != model2a->getPoints().end(); ++i) {
Chris@631 438 RegionModel::Point p(*i);
Chris@631 439 v = countLabelValueMap[labelCountMap[p.label]][p.label];
Chris@631 440 RegionModel::Point pp(p.frame, v, p.duration, p.label);
Chris@631 441 pointMap[p] = pp;
Chris@631 442 }
Chris@631 443
Chris@631 444 for (std::map<RegionModel::Point, RegionModel::Point>::iterator i =
Chris@631 445 pointMap.begin(); i != pointMap.end(); ++i) {
Chris@631 446 model2a->deletePoint(i->first);
Chris@631 447 model2a->addPoint(i->second);
Chris@631 448 }
Chris@631 449 }
Chris@631 450 }
Chris@631 451
Chris@897 452 if (model2b) {
Chris@897 453 if (pitchLooksLikeMIDI) {
Chris@897 454 model2b->setScaleUnits("MIDI Pitch");
Chris@897 455 } else {
Chris@897 456 model2b->setScaleUnits("Hz");
Chris@897 457 }
Chris@897 458 }
Chris@897 459
Chris@961 460 if (model3) {
Chris@148 461 model3->setMinimumLevel(min);
Chris@148 462 model3->setMaximumLevel(max);
Chris@148 463 }
Chris@148 464
Chris@148 465 return model;
Chris@148 466 }
Chris@148 467