Mercurial > hg > svcore
changeset 631:3a5ee4b6c9ad
* Complete the overhaul of CSV file import; now you can pick the purpose for
each column in the file, and SV should do the rest. The most significant
practical improvement here is that we can now handle files in which time
and duration do not necessarily appear in known columns.
author | Chris Cannam |
---|---|
date | Mon, 19 Jul 2010 17:08:56 +0000 |
parents | 11a664058dd8 |
children | a4b8ad0f1a8f |
files | data/fileio/CSVFileReader.cpp data/fileio/CSVFileReader.h data/fileio/CSVFormat.cpp data/fileio/CSVFormat.h data/model/RegionModel.h |
diffstat | 5 files changed, 203 insertions(+), 160 deletions(-) [+] |
line wrap: on
line diff
--- a/data/fileio/CSVFileReader.cpp Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFileReader.cpp Mon Jul 19 17:08:56 2010 +0000 @@ -17,6 +17,7 @@ #include "model/Model.h" #include "base/RealTime.h" +#include "base/StringBits.h" #include "model/SparseOneDimensionalModel.h" #include "model/SparseTimeValueModel.h" #include "model/EditableDenseThreeDimensionalModel.h" @@ -36,6 +37,7 @@ size_t mainModelSampleRate) : m_format(format), m_file(0), + m_warnings(0), m_mainModelSampleRate(mainModelSampleRate) { m_file = new QFile(path); @@ -78,28 +80,64 @@ return m_error; } +size_t +CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate, + size_t windowSize) const +{ + QRegExp nonNumericRx("[^0-9eE.,+-]"); + unsigned int warnLimit = 10; + + CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); + + size_t calculatedFrame = 0; + + bool ok = false; + QString numeric = s; + numeric.remove(nonNumericRx); + + if (timeUnits == CSVFormat::TimeSeconds) { + + double time = numeric.toDouble(&ok); + if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok); + calculatedFrame = int(time * sampleRate + 0.5); + + } else { + + long n = numeric.toLong(&ok); + if (n >= 0) calculatedFrame = n; + + if (timeUnits == CSVFormat::TimeWindows) { + calculatedFrame *= windowSize; + } + } + + if (!ok) { + if (m_warnings < warnLimit) { + std::cerr << "WARNING: CSVFileReader::load: " + << "Bad time format (\"" << s.toStdString() + << "\") in data line " + << lineno+1 << std::endl; + } else if (m_warnings == warnLimit) { + std::cerr << "WARNING: Too many warnings" << std::endl; + } + ++m_warnings; + } + + return calculatedFrame; +} + Model * CSVFileReader::load() const { if (!m_file) return 0; -/*!!! - CSVFormatDialog *dialog = new CSVFormatDialog - (0, m_file, m_mainModelSampleRate); - - if (dialog->exec() == QDialog::Rejected) { - delete dialog; - throw DataFileReaderFactory::ImportCancelled; - } -*/ CSVFormat::ModelType modelType = m_format.getModelType(); CSVFormat::TimingType timingType = m_format.getTimingType(); - CSVFormat::DurationType durationType = m_format.getDurationType(); CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); - QString separator = m_format.getSeparator(); - QString::SplitBehavior behaviour = m_format.getSplitBehaviour(); size_t sampleRate = m_format.getSampleRate(); size_t windowSize = m_format.getWindowSize(); + QChar separator = m_format.getSeparator(); + bool allowQuoting = m_format.getAllowQuoting(); if (timingType == CSVFormat::ExplicitTiming) { if (modelType == CSVFormat::ThreeDimensionalModel) { @@ -131,11 +169,16 @@ size_t frameNo = 0; size_t duration = 0; + size_t endFrame = 0; + + bool haveAnyValue = false; + bool haveEndTime = false; + size_t startFrame = 0; // for calculation of dense model resolution + bool firstEverValue = true; - std::map<QString, float> labelValueMap; - float syntheticMax = 0.f; - + std::map<QString, int> labelCountMap; + while (!in.atEnd()) { // QTextStream's readLine doesn't cope with old-style Mac @@ -158,8 +201,7 @@ if (line.startsWith("#")) continue; - QStringList list = line.split(separator, behaviour); - + QStringList list = StringBits::split(line, separator, allowQuoting); if (!model) { switch (modelType) { @@ -190,152 +232,94 @@ } } - QStringList tidyList; - QRegExp nonNumericRx("[^0-9eE.,+-]"); + float value = 0.f; + QString label = ""; - float value = 0.f; + duration = 0.f; + haveEndTime = false; for (int i = 0; i < list.size(); ++i) { - - QString s(list[i].trimmed()); - if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { - s = s.mid(1, s.length() - 2); - } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { - s = s.mid(1, s.length() - 2); + QString s = list[i]; + + CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i); + + switch (purpose) { + + case CSVFormat::ColumnUnknown: + break; + + case CSVFormat::ColumnStartTime: + frameNo = convertTimeValue(s, lineno, sampleRate, windowSize); + break; + + case CSVFormat::ColumnEndTime: + endFrame = convertTimeValue(s, lineno, sampleRate, windowSize); + haveEndTime = true; + break; + + case CSVFormat::ColumnDuration: + duration = convertTimeValue(s, lineno, sampleRate, windowSize); + break; + + case CSVFormat::ColumnValue: + value = s.toFloat(); + haveAnyValue = true; + break; + + case CSVFormat::ColumnLabel: + label = s; + ++labelCountMap[label]; + break; } + } - if (timingType == CSVFormat::ExplicitTiming) { - - size_t calculatedFrame = 0; - - if (i == 0 || - (i == 1 && - modelType == CSVFormat::TwoDimensionalModelWithDuration)) { - - bool ok = false; - QString numeric = s; - numeric.remove(nonNumericRx); - - if (timeUnits == CSVFormat::TimeSeconds) { - - double time = numeric.toDouble(&ok); - calculatedFrame = int(time * sampleRate + 0.5); - - } else { - - calculatedFrame = numeric.toInt(&ok); - - if (timeUnits == CSVFormat::TimeWindows) { - calculatedFrame *= windowSize; - } - } - - if (!ok) { - if (warnings < warnLimit) { - std::cerr << "WARNING: CSVFileReader::load: " - << "Bad time format (\"" << s.toStdString() - << "\") in data line " - << lineno+1 << ":" << std::endl; - std::cerr << line.toStdString() << std::endl; - } else if (warnings == warnLimit) { - std::cerr << "WARNING: Too many warnings" << std::endl; - } - ++warnings; - } - - if (i == 0) frameNo = calculatedFrame; - else { - if (durationType == CSVFormat::EndTimes) { - duration = calculatedFrame - frameNo; - } else { - duration = calculatedFrame; - } - } - - continue; - } + if (haveEndTime) { // ... calculate duration now all cols read + if (endFrame > frameNo) { + duration = endFrame - frameNo; } - - if ((i == 1 && - modelType == CSVFormat::TwoDimensionalModel) || - (i == 2 && - modelType == CSVFormat::TwoDimensionalModelWithDuration)) { - bool ok = false; - value = s.toFloat(&ok); - if (!ok) { - // cf. RDFImporter::fillModel - if (labelValueMap.find(s) == labelValueMap.end()) { - syntheticMax = syntheticMax + 1.f; - labelValueMap[s] = syntheticMax; - } - value = labelValueMap[s]; - } else { - if (value > syntheticMax) syntheticMax = value; - } - if (i + 1 == list.size()) { - // keep text around for use as label (none other given) - tidyList.push_back(s); - } - continue; - } - - tidyList.push_back(s); } if (modelType == CSVFormat::OneDimensionalModel) { - SparseOneDimensionalModel::Point point - (frameNo, - tidyList.size() > 0 ? tidyList[tidyList.size()-1] : - QString("%1").arg(lineno+1)); - + SparseOneDimensionalModel::Point point(frameNo, label); model1->addPoint(point); } else if (modelType == CSVFormat::TwoDimensionalModel) { - SparseTimeValueModel::Point point - (frameNo, - value, - tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); - + SparseTimeValueModel::Point point(frameNo, value, label); model2->addPoint(point); } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) { - RegionModel::Point point - (frameNo, - value, - duration, - tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); - + RegionModel::Point point(frameNo, value, duration, label); model2a->addPoint(point); } else if (modelType == CSVFormat::ThreeDimensionalModel) { DenseThreeDimensionalModel::Column values; - for (int i = 0; i < tidyList.size(); ++i) { + for (int i = 0; i < list.size(); ++i) { bool ok = false; float value = list[i].toFloat(&ok); - if (i > 0 || timingType != CSVFormat::ExplicitTiming) { + if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) { values.push_back(value); } - bool firstEver = (lineno == 0 && i == 0); + if (firstEverValue || value < min) min = value; + if (firstEverValue || value > max) max = value; - if (firstEver || value < min) min = value; - if (firstEver || value > max) max = value; - - if (firstEver) { + if (firstEverValue) { startFrame = frameNo; model3->setStartFrame(startFrame); } else if (lineno == 1 && timingType == CSVFormat::ExplicitTiming) { model3->setResolution(frameNo - startFrame); } + + firstEverValue = false; if (!ok) { if (warnings < warnLimit) { @@ -366,6 +350,47 @@ } } + if (!haveAnyValue) { + if (model2a) { + // assign values for regions based on label frequency; we + // have this in our labelCountMap, sort of + + std::map<int, std::map<QString, float> > countLabelValueMap; + for (std::map<QString, int>::iterator i = labelCountMap.begin(); + i != labelCountMap.end(); ++i) { + countLabelValueMap[i->second][i->first] = 0.f; + } + + float v = 0.f; + for (std::map<int, std::map<QString, float> >::iterator i = + countLabelValueMap.end(); i != countLabelValueMap.begin(); ) { + --i; + for (std::map<QString, float>::iterator j = i->second.begin(); + j != i->second.end(); ++j) { + j->second = v; + v = v + 1.f; + } + } + + std::map<RegionModel::Point, RegionModel::Point, + RegionModel::Point::Comparator> pointMap; + for (RegionModel::PointList::const_iterator i = + model2a->getPoints().begin(); + i != model2a->getPoints().end(); ++i) { + RegionModel::Point p(*i); + v = countLabelValueMap[labelCountMap[p.label]][p.label]; + RegionModel::Point pp(p.frame, v, p.duration, p.label); + pointMap[p] = pp; + } + + for (std::map<RegionModel::Point, RegionModel::Point>::iterator i = + pointMap.begin(); i != pointMap.end(); ++i) { + model2a->deletePoint(i->first); + model2a->addPoint(i->second); + } + } + } + if (modelType == CSVFormat::ThreeDimensionalModel) { model3->setMinimumLevel(min); model3->setMaximumLevel(max);
--- a/data/fileio/CSVFileReader.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFileReader.h Mon Jul 19 17:08:56 2010 +0000 @@ -39,7 +39,11 @@ CSVFormat m_format; QFile *m_file; QString m_error; + mutable int m_warnings; size_t m_mainModelSampleRate; + + size_t convertTimeValue(QString, int lineno, size_t sampleRate, + size_t windowSize) const; };
--- a/data/fileio/CSVFormat.cpp Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFormat.cpp Mon Jul 19 17:08:56 2010 +0000 @@ -39,9 +39,7 @@ { m_modelType = TwoDimensionalModel; m_timingType = ExplicitTiming; - m_durationType = Durations; m_timeUnits = TimeSeconds; - m_behaviour = QString::KeepEmptyParts; m_maxExampleCols = 0; m_columnCount = 0; @@ -186,10 +184,6 @@ void CSVFormat::guessPurposes() { - while (m_columnPurposes.size() <= m_columnCount) { - m_columnPurposes.push_back(ColumnUnknown); - } - m_timingType = CSVFormat::ImplicitTiming; m_timeUnits = CSVFormat::TimeWindows; @@ -229,7 +223,6 @@ if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { purpose = ColumnEndTime; - m_durationType = EndTimes; } } } @@ -242,7 +235,7 @@ } } - m_columnPurposes[i] = purpose; + setColumnPurpose(i, purpose); } int valueCount = 0; @@ -281,12 +274,16 @@ } } - if (valueCount == 0) { - m_modelType = OneDimensionalModel; - } else if (valueCount == 1) { - m_modelType = TwoDimensionalModel; + if (timingColumnCount > 1) { + m_modelType = TwoDimensionalModelWithDuration; } else { - m_modelType = ThreeDimensionalModel; + if (valueCount == 0) { + m_modelType = OneDimensionalModel; + } else if (valueCount == 1) { + m_modelType = TwoDimensionalModel; + } else { + m_modelType = ThreeDimensionalModel; + } } std::cerr << "Estimated column purposes: "; @@ -297,8 +294,33 @@ std::cerr << "Estimated model type: " << m_modelType << std::endl; std::cerr << "Estimated timing type: " << m_timingType << std::endl; - std::cerr << "Estimated duration type: " << m_durationType << std::endl; std::cerr << "Estimated units: " << m_timeUnits << std::endl; } +CSVFormat::ColumnPurpose +CSVFormat::getColumnPurpose(int i) +{ + while (m_columnPurposes.size() <= i) { + m_columnPurposes.push_back(ColumnUnknown); + } + return m_columnPurposes[i]; +} +CSVFormat::ColumnPurpose +CSVFormat::getColumnPurpose(int i) const +{ + return m_columnPurposes[i]; +} + +void +CSVFormat::setColumnPurpose(int i, ColumnPurpose p) +{ + while (m_columnPurposes.size() <= i) { + m_columnPurposes.push_back(ColumnUnknown); + } + m_columnPurposes[i] = p; +} + + + +
--- a/data/fileio/CSVFormat.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/fileio/CSVFormat.h Mon Jul 19 17:08:56 2010 +0000 @@ -34,11 +34,6 @@ ImplicitTiming }; - enum DurationType { - Durations, - EndTimes - }; - enum TimeUnits { TimeSeconds, TimeAudioFrames, @@ -65,14 +60,12 @@ CSVFormat() : // arbitrary defaults m_modelType(TwoDimensionalModel), m_timingType(ExplicitTiming), - m_durationType(Durations), m_timeUnits(TimeSeconds), m_separator(","), m_sampleRate(44100), m_windowSize(1024), m_columnCount(0), m_variableColumnCount(false), - m_behaviour(QString::KeepEmptyParts), m_allowQuoting(true), m_maxExampleCols(0) { } @@ -90,43 +83,42 @@ ModelType getModelType() const { return m_modelType; } TimingType getTimingType() const { return m_timingType; } - DurationType getDurationType() const { return m_durationType; } TimeUnits getTimeUnits() const { return m_timeUnits; } - QString getSeparator() const { return m_separator; } size_t getSampleRate() const { return m_sampleRate; } size_t getWindowSize() const { return m_windowSize; } int getColumnCount() const { return m_columnCount; } - - QString::SplitBehavior getSplitBehaviour() const { return m_behaviour; } - QList<ColumnPurpose> getColumnPurposes() const { return m_columnPurposes; } + bool getAllowQuoting() const { return m_allowQuoting; } + QChar getSeparator() const { + if (m_separator == "") return ' '; + else return m_separator[0]; + } - ColumnPurpose getColumnPurpose(int i) { return m_columnPurposes[i]; } - void setModelType(ModelType t) { m_modelType = t; } void setTimingType(TimingType t) { m_timingType = t; } - void setDurationType(DurationType t) { m_durationType = t; } void setTimeUnits(TimeUnits t) { m_timeUnits = t; } - void setSeparator(QString s) { m_separator = s; } + void setSeparator(QChar s) { m_separator = s; } void setSampleRate(size_t r) { m_sampleRate = r; } void setWindowSize(size_t s) { m_windowSize = s; } void setColumnCount(int c) { m_columnCount = c; } + void setAllowQuoting(bool q) { m_allowQuoting = q; } - void setSplitBehaviour(QString::SplitBehavior b) { m_behaviour = b; } + QList<ColumnPurpose> getColumnPurposes() const { return m_columnPurposes; } void setColumnPurposes(QList<ColumnPurpose> cl) { m_columnPurposes = cl; } + + ColumnPurpose getColumnPurpose(int i); + ColumnPurpose getColumnPurpose(int i) const; + void setColumnPurpose(int i, ColumnPurpose p); - void setColumnPurpose(int i, ColumnPurpose p) { m_columnPurposes[i] = p; } - // read-only; only valid if format has been guessed: QList<ColumnQualities> getColumnQualities() const { return m_columnQualities; } // read-only; only valid if format has been guessed: QList<QStringList> getExample() const { return m_example; } int getMaxExampleCols() const { return m_maxExampleCols; } - + protected: ModelType m_modelType; TimingType m_timingType; - DurationType m_durationType; TimeUnits m_timeUnits; QString m_separator; size_t m_sampleRate; @@ -140,7 +132,6 @@ QList<float> m_prevValues; - QString::SplitBehavior m_behaviour; bool m_allowQuoting; QList<QStringList> m_example;
--- a/data/model/RegionModel.h Fri Jul 16 16:51:39 2010 +0000 +++ b/data/model/RegionModel.h Mon Jul 19 17:08:56 2010 +0000 @@ -36,6 +36,7 @@ struct RegionRec { public: + RegionRec() : frame(0), value(0.f), duration(0) { } RegionRec(long _frame) : frame(_frame), value(0.0f), duration(0) { } RegionRec(long _frame, float _value, size_t _duration, QString _label) : frame(_frame), value(_value), duration(_duration), label(_label) { }