changeset 631:3a5ee4b6c9ad

* Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns.
author Chris Cannam
date Mon, 19 Jul 2010 17:08:56 +0000
parents 11a664058dd8
children a4b8ad0f1a8f
files data/fileio/CSVFileReader.cpp data/fileio/CSVFileReader.h data/fileio/CSVFormat.cpp data/fileio/CSVFormat.h data/model/RegionModel.h
diffstat 5 files changed, 203 insertions(+), 160 deletions(-) [+]
line wrap: on
line diff
--- a/data/fileio/CSVFileReader.cpp	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/fileio/CSVFileReader.cpp	Mon Jul 19 17:08:56 2010 +0000
@@ -17,6 +17,7 @@
 
 #include "model/Model.h"
 #include "base/RealTime.h"
+#include "base/StringBits.h"
 #include "model/SparseOneDimensionalModel.h"
 #include "model/SparseTimeValueModel.h"
 #include "model/EditableDenseThreeDimensionalModel.h"
@@ -36,6 +37,7 @@
                              size_t mainModelSampleRate) :
     m_format(format),
     m_file(0),
+    m_warnings(0),
     m_mainModelSampleRate(mainModelSampleRate)
 {
     m_file = new QFile(path);
@@ -78,28 +80,64 @@
     return m_error;
 }
 
+// Convert one time field from a CSV row into an audio frame count, using the
+// time units configured in m_format. Seconds are scaled by sampleRate and
+// rounded to the nearest frame; any other unit is read as an integer count,
+// multiplied by windowSize when the units are TimeWindows. Negative times give
+// frame 0. A field that fails to parse is reported on stderr (the message shows
+// lineno+1) until m_warnings reaches warnLimit; the frame is still returned.
+size_t
+CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate,
+                                size_t windowSize) const
+{
+    QRegExp nonNumericRx("[^0-9eE.,+-]"); // any character that cannot appear in a number
+    const int warnLimit = 10; // signed, to match the type of m_warnings
+    CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
+    size_t calculatedFrame = 0;
+    bool ok = false;
+    QString numeric = s;
+    numeric.remove(nonNumericRx); // parse only the numeric characters of the field
+
+    if (timeUnits == CSVFormat::TimeSeconds) {
+        double time = numeric.toDouble(&ok);
+        if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
+        // Clamp at zero, no int cast: negatives wrapped to huge frames, long times overflowed.
+        calculatedFrame = (time > 0.0) ? size_t(time * sampleRate + 0.5) : 0;
+    } else {
+        long n = numeric.toLong(&ok);
+        if (n >= 0) calculatedFrame = n; // negative counts are left at frame 0
+        if (timeUnits == CSVFormat::TimeWindows) {
+            calculatedFrame *= windowSize;
+        }
+    }
+
+    if (!ok) {
+        if (m_warnings < warnLimit) {
+            std::cerr << "WARNING: CSVFileReader::load: "
+                      << "Bad time format (\"" << s.toStdString()
+                      << "\") in data line "
+                      << lineno+1 << std::endl;
+        } else if (m_warnings == warnLimit) {
+            std::cerr << "WARNING: Too many warnings" << std::endl;
+        }
+        ++m_warnings;
+    }
+
+    return calculatedFrame;
+}
+
 Model *
 CSVFileReader::load() const
 {
     if (!m_file) return 0;
-/*!!!
-    CSVFormatDialog *dialog = new CSVFormatDialog
-	(0, m_file, m_mainModelSampleRate);
-
-    if (dialog->exec() == QDialog::Rejected) {
-	delete dialog;
-        throw DataFileReaderFactory::ImportCancelled;
-    }
-*/
 
     CSVFormat::ModelType modelType = m_format.getModelType();
     CSVFormat::TimingType timingType = m_format.getTimingType();
-    CSVFormat::DurationType durationType = m_format.getDurationType();
     CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
-    QString separator = m_format.getSeparator();
-    QString::SplitBehavior behaviour = m_format.getSplitBehaviour();
     size_t sampleRate = m_format.getSampleRate();
     size_t windowSize = m_format.getWindowSize();
+    QChar separator = m_format.getSeparator();
+    bool allowQuoting = m_format.getAllowQuoting();
 
     if (timingType == CSVFormat::ExplicitTiming) {
         if (modelType == CSVFormat::ThreeDimensionalModel) {
@@ -131,11 +169,16 @@
 
     size_t frameNo = 0;
     size_t duration = 0;
+    size_t endFrame = 0;
+
+    bool haveAnyValue = false;
+    bool haveEndTime = false;
+
     size_t startFrame = 0; // for calculation of dense model resolution
+    bool firstEverValue = true;
 
-    std::map<QString, float> labelValueMap;
-    float syntheticMax = 0.f;
-
+    std::map<QString, int> labelCountMap;
+    
     while (!in.atEnd()) {
 
         // QTextStream's readLine doesn't cope with old-style Mac
@@ -158,8 +201,7 @@
 
             if (line.startsWith("#")) continue;
 
-            QStringList list = line.split(separator, behaviour);
-
+            QStringList list = StringBits::split(line, separator, allowQuoting);
             if (!model) {
 
                 switch (modelType) {
@@ -190,152 +232,94 @@
                 }
             }
 
-            QStringList tidyList;
-            QRegExp nonNumericRx("[^0-9eE.,+-]");
+            float value = 0.f;
+            QString label = "";
 
-            float value = 0.f;
+            duration = 0.f;
+            haveEndTime = false;
 
             for (int i = 0; i < list.size(); ++i) {
-	    
-                QString s(list[i].trimmed());
 
-                if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) {
-                    s = s.mid(1, s.length() - 2);
-                } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) {
-                    s = s.mid(1, s.length() - 2);
+                QString s = list[i];
+
+                CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i);
+
+                switch (purpose) {
+
+                case CSVFormat::ColumnUnknown:
+                    break;
+
+                case CSVFormat::ColumnStartTime:
+                    frameNo = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    break;
+                
+                case CSVFormat::ColumnEndTime:
+                    endFrame = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    haveEndTime = true;
+                    break;
+
+                case CSVFormat::ColumnDuration:
+                    duration = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    break;
+
+                case CSVFormat::ColumnValue:
+                    value = s.toFloat();
+                    haveAnyValue = true;
+                    break;
+
+                case CSVFormat::ColumnLabel:
+                    label = s;
+                    ++labelCountMap[label];
+                    break;
                 }
+            }
 
-                if (timingType == CSVFormat::ExplicitTiming) {
-
-                    size_t calculatedFrame = 0;
-
-                    if (i == 0 ||
-                        (i == 1 &&
-                         modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
-
-                        bool ok = false;
-                        QString numeric = s;
-                        numeric.remove(nonNumericRx);
-
-                        if (timeUnits == CSVFormat::TimeSeconds) {
-
-                            double time = numeric.toDouble(&ok);
-                            calculatedFrame = int(time * sampleRate + 0.5);
-
-                        } else {
-
-                            calculatedFrame = numeric.toInt(&ok);
-
-                            if (timeUnits == CSVFormat::TimeWindows) {
-                                calculatedFrame *= windowSize;
-                            }
-                        }
-			       
-                        if (!ok) {
-                            if (warnings < warnLimit) {
-                                std::cerr << "WARNING: CSVFileReader::load: "
-                                          << "Bad time format (\"" << s.toStdString()
-                                          << "\") in data line "
-                                          << lineno+1 << ":" << std::endl;
-                                std::cerr << line.toStdString() << std::endl;
-                            } else if (warnings == warnLimit) {
-                                std::cerr << "WARNING: Too many warnings" << std::endl;
-                            }
-                            ++warnings;
-                        }
-
-                        if (i == 0) frameNo = calculatedFrame;
-                        else {
-                            if (durationType == CSVFormat::EndTimes) {
-                                duration = calculatedFrame - frameNo;
-                            } else {
-                                duration = calculatedFrame;
-                            }
-                        }
-
-                        continue;
-                    }
+            if (haveEndTime) { // ... calculate duration now all cols read
+                if (endFrame > frameNo) {
+                    duration = endFrame - frameNo;
                 }
-
-                if ((i == 1 &&
-                     modelType == CSVFormat::TwoDimensionalModel) ||
-                    (i == 2 &&
-                     modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
-                    bool ok = false;
-                    value = s.toFloat(&ok);
-                    if (!ok) {
-                        // cf. RDFImporter::fillModel
-                        if (labelValueMap.find(s) == labelValueMap.end()) {
-                            syntheticMax = syntheticMax + 1.f;
-                            labelValueMap[s] = syntheticMax;
-                        }
-                        value = labelValueMap[s];
-                    } else {
-                        if (value > syntheticMax) syntheticMax = value;
-                    }
-                    if (i + 1 == list.size()) {
-                        // keep text around for use as label (none other given)
-                        tidyList.push_back(s);
-                    }
-                    continue;
-                }
-
-                tidyList.push_back(s);
             }
 
             if (modelType == CSVFormat::OneDimensionalModel) {
 	    
-                SparseOneDimensionalModel::Point point
-                    (frameNo,
-                     tidyList.size() > 0 ? tidyList[tidyList.size()-1] :
-                     QString("%1").arg(lineno+1));
-
+                SparseOneDimensionalModel::Point point(frameNo, label);
                 model1->addPoint(point);
 
             } else if (modelType == CSVFormat::TwoDimensionalModel) {
 
-                SparseTimeValueModel::Point point
-                    (frameNo,
-                     value,
-                     tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
-
+                SparseTimeValueModel::Point point(frameNo, value, label);
                 model2->addPoint(point);
 
             } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
 
-                RegionModel::Point point
-                    (frameNo,
-                     value,
-                     duration,
-                     tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
-
+                RegionModel::Point point(frameNo, value, duration, label);
                 model2a->addPoint(point);
 
             } else if (modelType == CSVFormat::ThreeDimensionalModel) {
 
                 DenseThreeDimensionalModel::Column values;
 
-                for (int i = 0; i < tidyList.size(); ++i) {
+                for (int i = 0; i < list.size(); ++i) {
 
                     bool ok = false;
                     float value = list[i].toFloat(&ok);
 
-                    if (i > 0 || timingType != CSVFormat::ExplicitTiming) {
+                    if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) {
                         values.push_back(value);
                     }
 	    
-                    bool firstEver = (lineno == 0 && i == 0);
+                    if (firstEverValue || value < min) min = value;
+                    if (firstEverValue || value > max) max = value;
 
-                    if (firstEver || value < min) min = value;
-                    if (firstEver || value > max) max = value;
-
-                    if (firstEver) {
+                    if (firstEverValue) {
                         startFrame = frameNo;
                         model3->setStartFrame(startFrame);
                     } else if (lineno == 1 &&
                                timingType == CSVFormat::ExplicitTiming) {
                         model3->setResolution(frameNo - startFrame);
                     }
+                    
+                    firstEverValue = false;
 
                     if (!ok) {
                         if (warnings < warnLimit) {
@@ -366,6 +350,47 @@
         }
     }
 
+    if (!haveAnyValue) {
+        if (model2a) {
+            // assign values for regions based on label frequency; we
+            // have this in our labelCountMap, sort of
+
+            std::map<int, std::map<QString, float> > countLabelValueMap;
+            for (std::map<QString, int>::iterator i = labelCountMap.begin();
+                 i != labelCountMap.end(); ++i) {
+                countLabelValueMap[i->second][i->first] = 0.f;
+            }
+
+            float v = 0.f;
+            for (std::map<int, std::map<QString, float> >::iterator i =
+                     countLabelValueMap.end(); i != countLabelValueMap.begin(); ) {
+                --i;
+                for (std::map<QString, float>::iterator j = i->second.begin();
+                     j != i->second.end(); ++j) {
+                    j->second = v;
+                    v = v + 1.f;
+                }
+            }
+
+            std::map<RegionModel::Point, RegionModel::Point,
+                RegionModel::Point::Comparator> pointMap;
+            for (RegionModel::PointList::const_iterator i =
+                     model2a->getPoints().begin();
+                 i != model2a->getPoints().end(); ++i) {
+                RegionModel::Point p(*i);
+                v = countLabelValueMap[labelCountMap[p.label]][p.label];
+                RegionModel::Point pp(p.frame, v, p.duration, p.label);
+                pointMap[p] = pp;
+            }
+
+            for (std::map<RegionModel::Point, RegionModel::Point>::iterator i = 
+                     pointMap.begin(); i != pointMap.end(); ++i) {
+                model2a->deletePoint(i->first);
+                model2a->addPoint(i->second);
+            }
+        }
+    }
+                
     if (modelType == CSVFormat::ThreeDimensionalModel) {
 	model3->setMinimumLevel(min);
 	model3->setMaximumLevel(max);
--- a/data/fileio/CSVFileReader.h	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/fileio/CSVFileReader.h	Mon Jul 19 17:08:56 2010 +0000
@@ -39,7 +39,11 @@
     CSVFormat m_format;
     QFile *m_file;
     QString m_error;
+    mutable int m_warnings;
     size_t m_mainModelSampleRate;
+
+    size_t convertTimeValue(QString, int lineno, size_t sampleRate,
+                            size_t windowSize) const;
 };
 
 
--- a/data/fileio/CSVFormat.cpp	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/fileio/CSVFormat.cpp	Mon Jul 19 17:08:56 2010 +0000
@@ -39,9 +39,7 @@
 {
     m_modelType = TwoDimensionalModel;
     m_timingType = ExplicitTiming;
-    m_durationType = Durations;
     m_timeUnits = TimeSeconds;
-    m_behaviour = QString::KeepEmptyParts;
 
     m_maxExampleCols = 0;
     m_columnCount = 0;
@@ -186,10 +184,6 @@
 void
 CSVFormat::guessPurposes()
 {
-    while (m_columnPurposes.size() <= m_columnCount) {
-        m_columnPurposes.push_back(ColumnUnknown);
-    }
-
     m_timingType = CSVFormat::ImplicitTiming;
     m_timeUnits = CSVFormat::TimeWindows;
 	
@@ -229,7 +223,6 @@
 
                 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
                     purpose = ColumnEndTime;
-                    m_durationType = EndTimes;
                 }
             }
         }
@@ -242,7 +235,7 @@
             }
         }
 
-        m_columnPurposes[i] = purpose;
+        setColumnPurpose(i, purpose);
     }            
 
     int valueCount = 0;
@@ -281,12 +274,16 @@
         }
     }
 
-    if (valueCount == 0) {
-        m_modelType = OneDimensionalModel;
-    } else if (valueCount == 1) {
-        m_modelType = TwoDimensionalModel;
+    if (timingColumnCount > 1) {
+        m_modelType = TwoDimensionalModelWithDuration;
     } else {
-        m_modelType = ThreeDimensionalModel;
+        if (valueCount == 0) {
+            m_modelType = OneDimensionalModel;
+        } else if (valueCount == 1) {
+            m_modelType = TwoDimensionalModel;
+        } else {
+            m_modelType = ThreeDimensionalModel;
+        }
     }
 
     std::cerr << "Estimated column purposes: ";
@@ -297,8 +294,33 @@
 
     std::cerr << "Estimated model type: " << m_modelType << std::endl;
     std::cerr << "Estimated timing type: " << m_timingType << std::endl;
-    std::cerr << "Estimated duration type: " << m_durationType << std::endl;
     std::cerr << "Estimated units: " << m_timeUnits << std::endl;
 }
 
+CSVFormat::ColumnPurpose
+CSVFormat::getColumnPurpose(int i) // non-const overload: may extend m_columnPurposes
+{
+    while (m_columnPurposes.size() <= i) { // pad with ColumnUnknown until index i exists
+        m_columnPurposes.push_back(ColumnUnknown);
+    }
+    return m_columnPurposes[i]; // ColumnUnknown if the entry was only just created by the padding
+}
 
+CSVFormat::ColumnPurpose
+CSVFormat::getColumnPurpose(int i) const
+{   // Read-only lookup: the list cannot be padded here, so an out-of-range i gives ColumnUnknown.
+    return m_columnPurposes.value(i, ColumnUnknown);
+}
+
+void
+CSVFormat::setColumnPurpose(int i, ColumnPurpose p) // records purpose p for column index i
+{
+    while (m_columnPurposes.size() <= i) { // pad with ColumnUnknown so that index i exists
+        m_columnPurposes.push_back(ColumnUnknown);
+    }
+    m_columnPurposes[i] = p;
+}
+
+
+
+
--- a/data/fileio/CSVFormat.h	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/fileio/CSVFormat.h	Mon Jul 19 17:08:56 2010 +0000
@@ -34,11 +34,6 @@
 	ImplicitTiming
     };
 
-    enum DurationType {
-        Durations,
-        EndTimes
-    };
-    
     enum TimeUnits {
 	TimeSeconds,
 	TimeAudioFrames,
@@ -65,14 +60,12 @@
     CSVFormat() : // arbitrary defaults
         m_modelType(TwoDimensionalModel),
         m_timingType(ExplicitTiming),
-        m_durationType(Durations),
         m_timeUnits(TimeSeconds),
         m_separator(","),
         m_sampleRate(44100),
         m_windowSize(1024),
         m_columnCount(0),
         m_variableColumnCount(false),
-        m_behaviour(QString::KeepEmptyParts),
         m_allowQuoting(true),
         m_maxExampleCols(0)
     { }
@@ -90,43 +83,42 @@
  
     ModelType    getModelType()     const { return m_modelType;     }
     TimingType   getTimingType()    const { return m_timingType;    }
-    DurationType getDurationType()  const { return m_durationType;  }
     TimeUnits    getTimeUnits()     const { return m_timeUnits;     }
-    QString      getSeparator()     const { return m_separator;     }
     size_t       getSampleRate()    const { return m_sampleRate;    }
     size_t       getWindowSize()    const { return m_windowSize;    }
     int          getColumnCount()   const { return m_columnCount;   }
-    
-    QString::SplitBehavior getSplitBehaviour() const { return m_behaviour; }
-    QList<ColumnPurpose> getColumnPurposes() const { return m_columnPurposes; }
+    bool         getAllowQuoting()  const { return m_allowQuoting;  }
+    QChar        getSeparator()     const { // only the first character of m_separator is returned
+        if (m_separator == "") return ' '; // empty separator string: fall back to a single space
+        else return m_separator[0];
+    }
 
-    ColumnPurpose getColumnPurpose(int i) { return m_columnPurposes[i]; }
-	
     void setModelType(ModelType t)        { m_modelType    = t; }
     void setTimingType(TimingType t)      { m_timingType   = t; }
-    void setDurationType(DurationType t)  { m_durationType = t; }
     void setTimeUnits(TimeUnits t)        { m_timeUnits    = t; }
-    void setSeparator(QString s)          { m_separator    = s; }
+    void setSeparator(QChar s)            { m_separator    = s; }
     void setSampleRate(size_t r)          { m_sampleRate   = r; }
     void setWindowSize(size_t s)          { m_windowSize   = s; }
     void setColumnCount(int c)            { m_columnCount  = c; }
+    void setAllowQuoting(bool q)          { m_allowQuoting = q; }
 
-    void setSplitBehaviour(QString::SplitBehavior b) { m_behaviour = b; }
+    QList<ColumnPurpose> getColumnPurposes() const { return m_columnPurposes; }
     void setColumnPurposes(QList<ColumnPurpose> cl) { m_columnPurposes = cl; }
+
+    ColumnPurpose getColumnPurpose(int i);
+    ColumnPurpose getColumnPurpose(int i) const;
+    void setColumnPurpose(int i, ColumnPurpose p);
     
-    void setColumnPurpose(int i, ColumnPurpose p) { m_columnPurposes[i] = p; }
-
     // read-only; only valid if format has been guessed:
     QList<ColumnQualities> getColumnQualities() const { return m_columnQualities; }
 
     // read-only; only valid if format has been guessed:
     QList<QStringList> getExample() const { return m_example; }
     int getMaxExampleCols() const { return m_maxExampleCols; }
-
+	
 protected:
     ModelType    m_modelType;
     TimingType   m_timingType;
-    DurationType m_durationType;
     TimeUnits    m_timeUnits;
     QString      m_separator;
     size_t       m_sampleRate;
@@ -140,7 +132,6 @@
 
     QList<float> m_prevValues;
 
-    QString::SplitBehavior m_behaviour;
     bool m_allowQuoting;
 
     QList<QStringList> m_example;
--- a/data/model/RegionModel.h	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/model/RegionModel.h	Mon Jul 19 17:08:56 2010 +0000
@@ -36,6 +36,7 @@
 struct RegionRec
 {
 public:
+    RegionRec() : frame(0), value(0.f), duration(0) { }
     RegionRec(long _frame) : frame(_frame), value(0.0f), duration(0) { }
     RegionRec(long _frame, float _value, size_t _duration, QString _label) :
 	frame(_frame), value(_value), duration(_duration), label(_label) { }