# HG changeset patch # User Chris Cannam # Date 1279559336 0 # Node ID 3a5ee4b6c9ad5464a48f360cc6fd33d6298b3deb # Parent 11a664058dd85c71b4b3b167000d027c4aaf3547 * Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns. diff -r 11a664058dd8 -r 3a5ee4b6c9ad data/fileio/CSVFileReader.cpp --- a/data/fileio/CSVFileReader.cpp Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFileReader.cpp Mon Jul 19 17:08:56 2010 +0000 @@ -17,6 +17,7 @@ #include "model/Model.h" #include "base/RealTime.h" +#include "base/StringBits.h" #include "model/SparseOneDimensionalModel.h" #include "model/SparseTimeValueModel.h" #include "model/EditableDenseThreeDimensionalModel.h" @@ -36,6 +37,7 @@ size_t mainModelSampleRate) : m_format(format), m_file(0), + m_warnings(0), m_mainModelSampleRate(mainModelSampleRate) { m_file = new QFile(path); @@ -78,28 +80,64 @@ return m_error; } +size_t +CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate, + size_t windowSize) const +{ + QRegExp nonNumericRx("[^0-9eE.,+-]"); + unsigned int warnLimit = 10; + + CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); + + size_t calculatedFrame = 0; + + bool ok = false; + QString numeric = s; + numeric.remove(nonNumericRx); + + if (timeUnits == CSVFormat::TimeSeconds) { + + double time = numeric.toDouble(&ok); + if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok); + calculatedFrame = int(time * sampleRate + 0.5); + + } else { + + long n = numeric.toLong(&ok); + if (n >= 0) calculatedFrame = n; + + if (timeUnits == CSVFormat::TimeWindows) { + calculatedFrame *= windowSize; + } + } + + if (!ok) { + if (m_warnings < warnLimit) { + std::cerr << "WARNING: CSVFileReader::load: " + << "Bad time format (\"" << s.toStdString() + << "\") in data line " + << lineno+1 << std::endl; + } else if (m_warnings == warnLimit) { + std::cerr << "WARNING: Too many warnings" << std::endl; + } + ++m_warnings; + } + + return calculatedFrame; +} + Model * CSVFileReader::load() const { if (!m_file) return 0; -/*!!! - CSVFormatDialog *dialog = new CSVFormatDialog - (0, m_file, m_mainModelSampleRate); - - if (dialog->exec() == QDialog::Rejected) { - delete dialog; - throw DataFileReaderFactory::ImportCancelled; - } -*/ CSVFormat::ModelType modelType = m_format.getModelType(); CSVFormat::TimingType timingType = m_format.getTimingType(); - CSVFormat::DurationType durationType = m_format.getDurationType(); CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); - QString separator = m_format.getSeparator(); - QString::SplitBehavior behaviour = m_format.getSplitBehaviour(); size_t sampleRate = m_format.getSampleRate(); size_t windowSize = m_format.getWindowSize(); + QChar separator = m_format.getSeparator(); + bool allowQuoting = m_format.getAllowQuoting(); if (timingType == CSVFormat::ExplicitTiming) { if (modelType == CSVFormat::ThreeDimensionalModel) { @@ -131,11 +169,16 @@ size_t frameNo = 0; size_t duration = 0; + size_t endFrame = 0; + + bool haveAnyValue = false; + bool haveEndTime = false; + size_t startFrame = 0; // for calculation of dense model resolution + bool firstEverValue = true; - std::map labelValueMap; - float syntheticMax = 0.f; - + std::map labelCountMap; + while (!in.atEnd()) { // QTextStream's readLine doesn't cope with old-style Mac @@ -158,8 +201,7 @@ if (line.startsWith("#")) continue; - QStringList list = line.split(separator, behaviour); - + QStringList list = StringBits::split(line, separator, allowQuoting); if (!model) { switch (modelType) { @@ -190,152 +232,94 @@ } } - QStringList tidyList; - QRegExp nonNumericRx("[^0-9eE.,+-]"); + float value = 0.f; + QString label = ""; - float value = 0.f; + duration = 0.f; + haveEndTime = false; for (int i = 0; i < list.size(); ++i) { - - QString s(list[i].trimmed()); - if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { - s = s.mid(1, s.length() - 2); - } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { - s = s.mid(1, s.length() - 2); + QString s = list[i]; + + CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i); + + switch (purpose) { + + case CSVFormat::ColumnUnknown: + break; + + case CSVFormat::ColumnStartTime: + frameNo = convertTimeValue(s, lineno, sampleRate, windowSize); + break; + + case CSVFormat::ColumnEndTime: + endFrame = convertTimeValue(s, lineno, sampleRate, windowSize); + haveEndTime = true; + break; + + case CSVFormat::ColumnDuration: + duration = convertTimeValue(s, lineno, sampleRate, windowSize); + break; + + case CSVFormat::ColumnValue: + value = s.toFloat(); + haveAnyValue = true; + break; + + case CSVFormat::ColumnLabel: + label = s; + ++labelCountMap[label]; + break; } + } - if (timingType == CSVFormat::ExplicitTiming) { - - size_t calculatedFrame = 0; - - if (i == 0 || - (i == 1 && - modelType == CSVFormat::TwoDimensionalModelWithDuration)) { - - bool ok = false; - QString numeric = s; - numeric.remove(nonNumericRx); - - if (timeUnits == CSVFormat::TimeSeconds) { - - double time = numeric.toDouble(&ok); - calculatedFrame = int(time * sampleRate + 0.5); - - } else { - - calculatedFrame = numeric.toInt(&ok); - - if (timeUnits == CSVFormat::TimeWindows) { - calculatedFrame *= windowSize; - } - } - - if (!ok) { - if (warnings < warnLimit) { - std::cerr << "WARNING: CSVFileReader::load: " - << "Bad time format (\"" << s.toStdString() - << "\") in data line " - << lineno+1 << ":" << std::endl; - std::cerr << line.toStdString() << std::endl; - } else if (warnings == warnLimit) { - std::cerr << "WARNING: Too many warnings" << std::endl; - } - ++warnings; - } - - if (i == 0) frameNo = calculatedFrame; - else { - if (durationType == CSVFormat::EndTimes) { - duration = calculatedFrame - frameNo; - } else { - duration = calculatedFrame; - } - } - - continue; - } + if (haveEndTime) { // ... calculate duration now all cols read + if (endFrame > frameNo) { + duration = endFrame - frameNo; } - - if ((i == 1 && - modelType == CSVFormat::TwoDimensionalModel) || - (i == 2 && - modelType == CSVFormat::TwoDimensionalModelWithDuration)) { - bool ok = false; - value = s.toFloat(&ok); - if (!ok) { - // cf. RDFImporter::fillModel - if (labelValueMap.find(s) == labelValueMap.end()) { - syntheticMax = syntheticMax + 1.f; - labelValueMap[s] = syntheticMax; - } - value = labelValueMap[s]; - } else { - if (value > syntheticMax) syntheticMax = value; - } - if (i + 1 == list.size()) { - // keep text around for use as label (none other given) - tidyList.push_back(s); - } - continue; - } - - tidyList.push_back(s); } if (modelType == CSVFormat::OneDimensionalModel) { - SparseOneDimensionalModel::Point point - (frameNo, - tidyList.size() > 0 ? tidyList[tidyList.size()-1] : - QString("%1").arg(lineno+1)); - + SparseOneDimensionalModel::Point point(frameNo, label); model1->addPoint(point); } else if (modelType == CSVFormat::TwoDimensionalModel) { - SparseTimeValueModel::Point point - (frameNo, - value, - tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); - + SparseTimeValueModel::Point point(frameNo, value, label); model2->addPoint(point); } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) { - RegionModel::Point point - (frameNo, - value, - duration, - tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); - + RegionModel::Point point(frameNo, value, duration, label); model2a->addPoint(point); } else if (modelType == CSVFormat::ThreeDimensionalModel) { DenseThreeDimensionalModel::Column values; - for (int i = 0; i < tidyList.size(); ++i) { + for (int i = 0; i < list.size(); ++i) { bool ok = false; float value = list[i].toFloat(&ok); - if (i > 0 || timingType != CSVFormat::ExplicitTiming) { + if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) { values.push_back(value); } - bool firstEver = (lineno == 0 && i == 0); + if (firstEverValue || value < min) min = value; + if (firstEverValue || value > max) max = value; - if (firstEver || value < min) min = value; - if (firstEver || value > max) max = value; - - if (firstEver) { + if (firstEverValue) { startFrame = frameNo; model3->setStartFrame(startFrame); } else if (lineno == 1 && timingType == CSVFormat::ExplicitTiming) { model3->setResolution(frameNo - startFrame); } + + firstEverValue = false; if (!ok) { if (warnings < warnLimit) { @@ -366,6 +350,47 @@ } } + if (!haveAnyValue) { + if (model2a) { + // assign values for regions based on label frequency; we + // have this in our labelCountMap, sort of + + std::map > countLabelValueMap; + for (std::map::iterator i = labelCountMap.begin(); + i != labelCountMap.end(); ++i) { + countLabelValueMap[i->second][i->first] = 0.f; + } + + float v = 0.f; + for (std::map >::iterator i = + countLabelValueMap.end(); i != countLabelValueMap.begin(); ) { + --i; + for (std::map::iterator j = i->second.begin(); + j != i->second.end(); ++j) { + j->second = v; + v = v + 1.f; + } + } + + std::map pointMap; + for (RegionModel::PointList::const_iterator i = + model2a->getPoints().begin(); + i != model2a->getPoints().end(); ++i) { + RegionModel::Point p(*i); + v = countLabelValueMap[labelCountMap[p.label]][p.label]; + RegionModel::Point pp(p.frame, v, p.duration, p.label); + pointMap[p] = pp; + } + + for (std::map::iterator i = + pointMap.begin(); i != pointMap.end(); ++i) { + model2a->deletePoint(i->first); + model2a->addPoint(i->second); + } + } + } + if (modelType == CSVFormat::ThreeDimensionalModel) { model3->setMinimumLevel(min); model3->setMaximumLevel(max); diff -r 11a664058dd8 -r 3a5ee4b6c9ad data/fileio/CSVFileReader.h --- a/data/fileio/CSVFileReader.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFileReader.h Mon Jul 19 17:08:56 2010 +0000 @@ -39,7 +39,11 @@ CSVFormat m_format; QFile *m_file; QString m_error; + mutable int m_warnings; size_t m_mainModelSampleRate; + + size_t convertTimeValue(QString, int lineno, size_t sampleRate, + size_t windowSize) const; }; diff -r 11a664058dd8 -r 3a5ee4b6c9ad data/fileio/CSVFormat.cpp --- a/data/fileio/CSVFormat.cpp Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFormat.cpp Mon Jul 19 17:08:56 2010 +0000 @@ -39,9 +39,7 @@ { m_modelType = TwoDimensionalModel; m_timingType = ExplicitTiming; - m_durationType = Durations; m_timeUnits = TimeSeconds; - m_behaviour = QString::KeepEmptyParts; m_maxExampleCols = 0; m_columnCount = 0; @@ -186,10 +184,6 @@ void CSVFormat::guessPurposes() { - while (m_columnPurposes.size() <= m_columnCount) { - m_columnPurposes.push_back(ColumnUnknown); - } - m_timingType = CSVFormat::ImplicitTiming; m_timeUnits = CSVFormat::TimeWindows; @@ -229,7 +223,6 @@ if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { purpose = ColumnEndTime; - m_durationType = EndTimes; } } } @@ -242,7 +235,7 @@ } } - m_columnPurposes[i] = purpose; + setColumnPurpose(i, purpose); } int valueCount = 0; @@ -281,12 +274,16 @@ } } - if (valueCount == 0) { - m_modelType = OneDimensionalModel; - } else if (valueCount == 1) { - m_modelType = TwoDimensionalModel; + if (timingColumnCount > 1) { + m_modelType = TwoDimensionalModelWithDuration; } else { - m_modelType = ThreeDimensionalModel; + if (valueCount == 0) { + m_modelType = OneDimensionalModel; + } else if (valueCount == 1) { + m_modelType = TwoDimensionalModel; + } else { + m_modelType = ThreeDimensionalModel; + } } std::cerr << "Estimated column purposes: "; @@ -297,8 +294,33 @@ std::cerr << "Estimated model type: " << m_modelType << std::endl; std::cerr << "Estimated timing type: " << m_timingType << std::endl; - std::cerr << "Estimated duration type: " << m_durationType << std::endl; std::cerr << "Estimated units: " << m_timeUnits << std::endl; } +CSVFormat::ColumnPurpose +CSVFormat::getColumnPurpose(int i) +{ + while (m_columnPurposes.size() <= i) { + m_columnPurposes.push_back(ColumnUnknown); + } + return m_columnPurposes[i]; +} +CSVFormat::ColumnPurpose +CSVFormat::getColumnPurpose(int i) const +{ + return m_columnPurposes[i]; +} + +void +CSVFormat::setColumnPurpose(int i, ColumnPurpose p) +{ + while (m_columnPurposes.size() <= i) { + m_columnPurposes.push_back(ColumnUnknown); + } + m_columnPurposes[i] = p; +} + + + + diff -r 11a664058dd8 -r 3a5ee4b6c9ad data/fileio/CSVFormat.h --- a/data/fileio/CSVFormat.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFormat.h Mon Jul 19 17:08:56 2010 +0000 @@ -34,11 +34,6 @@ ImplicitTiming }; - enum DurationType { - Durations, - EndTimes - }; - enum TimeUnits { TimeSeconds, TimeAudioFrames, @@ -65,14 +60,12 @@ CSVFormat() : // arbitrary defaults m_modelType(TwoDimensionalModel), m_timingType(ExplicitTiming), - m_durationType(Durations), m_timeUnits(TimeSeconds), m_separator(","), m_sampleRate(44100), m_windowSize(1024), m_columnCount(0), m_variableColumnCount(false), - m_behaviour(QString::KeepEmptyParts), m_allowQuoting(true), m_maxExampleCols(0) { } @@ -90,43 +83,42 @@ ModelType getModelType() const { return m_modelType; } TimingType getTimingType() const { return m_timingType; } - DurationType getDurationType() const { return m_durationType; } TimeUnits getTimeUnits() const { return m_timeUnits; } - QString getSeparator() const { return m_separator; } size_t getSampleRate() const { return m_sampleRate; } size_t getWindowSize() const { return m_windowSize; } int getColumnCount() const { return m_columnCount; } - - QString::SplitBehavior getSplitBehaviour() const { return m_behaviour; } - QList getColumnPurposes() const { return m_columnPurposes; } + bool getAllowQuoting() const { return m_allowQuoting; } + QChar getSeparator() const { + if (m_separator == "") return ' '; + else return m_separator[0]; + } - ColumnPurpose getColumnPurpose(int i) { return m_columnPurposes[i]; } - void setModelType(ModelType t) { m_modelType = t; } void setTimingType(TimingType t) { m_timingType = t; } - void setDurationType(DurationType t) { m_durationType = t; } void setTimeUnits(TimeUnits t) { m_timeUnits = t; } - void setSeparator(QString s) { m_separator = s; } + void setSeparator(QChar s) { m_separator = s; } void setSampleRate(size_t r) { m_sampleRate = r; } void setWindowSize(size_t s) { m_windowSize = s; } void setColumnCount(int c) { m_columnCount = c; } + void setAllowQuoting(bool q) { m_allowQuoting = q; } - void setSplitBehaviour(QString::SplitBehavior b) { m_behaviour = b; } + QList getColumnPurposes() const { return m_columnPurposes; } void setColumnPurposes(QList cl) { m_columnPurposes = cl; } + + ColumnPurpose getColumnPurpose(int i); + ColumnPurpose getColumnPurpose(int i) const; + void setColumnPurpose(int i, ColumnPurpose p); - void setColumnPurpose(int i, ColumnPurpose p) { m_columnPurposes[i] = p; } - // read-only; only valid if format has been guessed: QList getColumnQualities() const { return m_columnQualities; } // read-only; only valid if format has been guessed: QList getExample() const { return m_example; } int getMaxExampleCols() const { return m_maxExampleCols; } - + protected: ModelType m_modelType; TimingType m_timingType; - DurationType m_durationType; TimeUnits m_timeUnits; QString m_separator; size_t m_sampleRate; @@ -140,7 +132,6 @@ QList m_prevValues; - QString::SplitBehavior m_behaviour; bool m_allowQuoting; QList m_example; diff -r 11a664058dd8 -r 3a5ee4b6c9ad data/model/RegionModel.h --- a/data/model/RegionModel.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/model/RegionModel.h Mon Jul 19 17:08:56 2010 +0000 @@ -36,6 +36,7 @@ struct RegionRec { public: + RegionRec() : frame(0), value(0.f), duration(0) { } RegionRec(long _frame) : frame(_frame), value(0.0f), duration(0) { } RegionRec(long _frame, float _value, size_t _duration, QString _label) : frame(_frame), value(_value), duration(_duration), label(_label) { }