diff data/fileio/CSVFileReader.cpp @ 631:3a5ee4b6c9ad

* Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns.
author Chris Cannam
date Mon, 19 Jul 2010 17:08:56 +0000
parents 001db550bd48
children 611a4fa14dde
line wrap: on
line diff
--- a/data/fileio/CSVFileReader.cpp	Fri Jul 16 16:51:39 2010 +0000
+++ b/data/fileio/CSVFileReader.cpp	Mon Jul 19 17:08:56 2010 +0000
@@ -17,6 +17,7 @@
 
 #include "model/Model.h"
 #include "base/RealTime.h"
+#include "base/StringBits.h"
 #include "model/SparseOneDimensionalModel.h"
 #include "model/SparseTimeValueModel.h"
 #include "model/EditableDenseThreeDimensionalModel.h"
@@ -36,6 +37,7 @@
                              size_t mainModelSampleRate) :
     m_format(format),
     m_file(0),
+    m_warnings(0),
     m_mainModelSampleRate(mainModelSampleRate)
 {
     m_file = new QFile(path);
@@ -78,28 +80,64 @@
     return m_error;
 }
 
+// Convert the text of one time-valued CSV field into a frame count,
+// according to the time units configured in m_format.  Negative times
+// yield frame 0.  Text that does not parse cleanly is reported on stderr;
+// only the first warnLimit such warnings are printed individually.
+size_t
+CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate,
+                                size_t windowSize) const
+{
+    QRegExp nonNumericRx("[^0-9eE.,+-]");
+    unsigned int warnLimit = 10;
+    CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
+    size_t calculatedFrame = 0;
+
+    // Strip every character that cannot be part of a number, then parse.
+    bool ok = false;
+    QString numeric = s;
+    numeric.remove(nonNumericRx);
+    if (timeUnits == CSVFormat::TimeSeconds) {
+        double time = numeric.toDouble(&ok);
+        if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
+        // Round to the nearest frame.  Convert straight to size_t rather
+        // than via int (which can overflow for large frame numbers), and
+        // leave non-positive times at 0 as the integer branch below does.
+        if (time > 0.0) calculatedFrame = size_t(time * sampleRate + 0.5);
+    } else {
+        long n = numeric.toLong(&ok);
+        if (n >= 0) calculatedFrame = n;
+        if (timeUnits == CSVFormat::TimeWindows) {
+            calculatedFrame *= windowSize;
+        }
+    }
+
+    if (!ok) {
+        if (m_warnings < warnLimit) {
+            std::cerr << "WARNING: CSVFileReader::load: "
+                      << "Bad time format (\"" << s.toStdString()
+                      << "\") in data line "
+                      << lineno+1 << std::endl;
+        } else if (m_warnings == warnLimit) {
+            std::cerr << "WARNING: Too many warnings" << std::endl;
+        }
+        ++m_warnings;
+    }
+    return calculatedFrame;
+}
+
 Model *
 CSVFileReader::load() const
 {
     if (!m_file) return 0;
-/*!!!
-    CSVFormatDialog *dialog = new CSVFormatDialog
-	(0, m_file, m_mainModelSampleRate);
-
-    if (dialog->exec() == QDialog::Rejected) {
-	delete dialog;
-        throw DataFileReaderFactory::ImportCancelled;
-    }
-*/
 
     CSVFormat::ModelType modelType = m_format.getModelType();
     CSVFormat::TimingType timingType = m_format.getTimingType();
-    CSVFormat::DurationType durationType = m_format.getDurationType();
     CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
-    QString separator = m_format.getSeparator();
-    QString::SplitBehavior behaviour = m_format.getSplitBehaviour();
     size_t sampleRate = m_format.getSampleRate();
     size_t windowSize = m_format.getWindowSize();
+    QChar separator = m_format.getSeparator();
+    bool allowQuoting = m_format.getAllowQuoting();
 
     if (timingType == CSVFormat::ExplicitTiming) {
         if (modelType == CSVFormat::ThreeDimensionalModel) {
@@ -131,11 +169,16 @@
 
     size_t frameNo = 0;
     size_t duration = 0;
+    size_t endFrame = 0;
+
+    bool haveAnyValue = false;
+    bool haveEndTime = false;
+
     size_t startFrame = 0; // for calculation of dense model resolution
+    bool firstEverValue = true;
 
-    std::map<QString, float> labelValueMap;
-    float syntheticMax = 0.f;
-
+    std::map<QString, int> labelCountMap;
+    
     while (!in.atEnd()) {
 
         // QTextStream's readLine doesn't cope with old-style Mac
@@ -158,8 +201,7 @@
 
             if (line.startsWith("#")) continue;
 
-            QStringList list = line.split(separator, behaviour);
-
+            QStringList list = StringBits::split(line, separator, allowQuoting);
             if (!model) {
 
                 switch (modelType) {
@@ -190,152 +232,94 @@
                 }
             }
 
-            QStringList tidyList;
-            QRegExp nonNumericRx("[^0-9eE.,+-]");
+            float value = 0.f;
+            QString label = "";
 
-            float value = 0.f;
+            duration = 0.f;
+            haveEndTime = false;
 
             for (int i = 0; i < list.size(); ++i) {
-	    
-                QString s(list[i].trimmed());
 
-                if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) {
-                    s = s.mid(1, s.length() - 2);
-                } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) {
-                    s = s.mid(1, s.length() - 2);
+                QString s = list[i];
+
+                CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i);
+
+                switch (purpose) {
+
+                case CSVFormat::ColumnUnknown:
+                    break;
+
+                case CSVFormat::ColumnStartTime:
+                    frameNo = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    break;
+                
+                case CSVFormat::ColumnEndTime:
+                    endFrame = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    haveEndTime = true;
+                    break;
+
+                case CSVFormat::ColumnDuration:
+                    duration = convertTimeValue(s, lineno, sampleRate, windowSize);
+                    break;
+
+                case CSVFormat::ColumnValue:
+                    value = s.toFloat();
+                    haveAnyValue = true;
+                    break;
+
+                case CSVFormat::ColumnLabel:
+                    label = s;
+                    ++labelCountMap[label];
+                    break;
                 }
+            }
 
-                if (timingType == CSVFormat::ExplicitTiming) {
-
-                    size_t calculatedFrame = 0;
-
-                    if (i == 0 ||
-                        (i == 1 &&
-                         modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
-
-                        bool ok = false;
-                        QString numeric = s;
-                        numeric.remove(nonNumericRx);
-
-                        if (timeUnits == CSVFormat::TimeSeconds) {
-
-                            double time = numeric.toDouble(&ok);
-                            calculatedFrame = int(time * sampleRate + 0.5);
-
-                        } else {
-
-                            calculatedFrame = numeric.toInt(&ok);
-
-                            if (timeUnits == CSVFormat::TimeWindows) {
-                                calculatedFrame *= windowSize;
-                            }
-                        }
-			       
-                        if (!ok) {
-                            if (warnings < warnLimit) {
-                                std::cerr << "WARNING: CSVFileReader::load: "
-                                          << "Bad time format (\"" << s.toStdString()
-                                          << "\") in data line "
-                                          << lineno+1 << ":" << std::endl;
-                                std::cerr << line.toStdString() << std::endl;
-                            } else if (warnings == warnLimit) {
-                                std::cerr << "WARNING: Too many warnings" << std::endl;
-                            }
-                            ++warnings;
-                        }
-
-                        if (i == 0) frameNo = calculatedFrame;
-                        else {
-                            if (durationType == CSVFormat::EndTimes) {
-                                duration = calculatedFrame - frameNo;
-                            } else {
-                                duration = calculatedFrame;
-                            }
-                        }
-
-                        continue;
-                    }
+            if (haveEndTime) { // ... calculate duration now all cols read
+                if (endFrame > frameNo) {
+                    duration = endFrame - frameNo;
                 }
-
-                if ((i == 1 &&
-                     modelType == CSVFormat::TwoDimensionalModel) ||
-                    (i == 2 &&
-                     modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
-                    bool ok = false;
-                    value = s.toFloat(&ok);
-                    if (!ok) {
-                        // cf. RDFImporter::fillModel
-                        if (labelValueMap.find(s) == labelValueMap.end()) {
-                            syntheticMax = syntheticMax + 1.f;
-                            labelValueMap[s] = syntheticMax;
-                        }
-                        value = labelValueMap[s];
-                    } else {
-                        if (value > syntheticMax) syntheticMax = value;
-                    }
-                    if (i + 1 == list.size()) {
-                        // keep text around for use as label (none other given)
-                        tidyList.push_back(s);
-                    }
-                    continue;
-                }
-
-                tidyList.push_back(s);
             }
 
             if (modelType == CSVFormat::OneDimensionalModel) {
 	    
-                SparseOneDimensionalModel::Point point
-                    (frameNo,
-                     tidyList.size() > 0 ? tidyList[tidyList.size()-1] :
-                     QString("%1").arg(lineno+1));
-
+                SparseOneDimensionalModel::Point point(frameNo, label);
                 model1->addPoint(point);
 
             } else if (modelType == CSVFormat::TwoDimensionalModel) {
 
-                SparseTimeValueModel::Point point
-                    (frameNo,
-                     value,
-                     tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
-
+                SparseTimeValueModel::Point point(frameNo, value, label);
                 model2->addPoint(point);
 
             } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
 
-                RegionModel::Point point
-                    (frameNo,
-                     value,
-                     duration,
-                     tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
-
+                RegionModel::Point point(frameNo, value, duration, label);
                 model2a->addPoint(point);
 
             } else if (modelType == CSVFormat::ThreeDimensionalModel) {
 
                 DenseThreeDimensionalModel::Column values;
 
-                for (int i = 0; i < tidyList.size(); ++i) {
+                for (int i = 0; i < list.size(); ++i) {
 
                     bool ok = false;
                     float value = list[i].toFloat(&ok);
 
-                    if (i > 0 || timingType != CSVFormat::ExplicitTiming) {
+                    if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) {
                         values.push_back(value);
                     }
 	    
-                    bool firstEver = (lineno == 0 && i == 0);
+                    if (firstEverValue || value < min) min = value;
+                    if (firstEverValue || value > max) max = value;
 
-                    if (firstEver || value < min) min = value;
-                    if (firstEver || value > max) max = value;
-
-                    if (firstEver) {
+                    if (firstEverValue) {
                         startFrame = frameNo;
                         model3->setStartFrame(startFrame);
                     } else if (lineno == 1 &&
                                timingType == CSVFormat::ExplicitTiming) {
                         model3->setResolution(frameNo - startFrame);
                     }
+                    
+                    firstEverValue = false;
 
                     if (!ok) {
                         if (warnings < warnLimit) {
@@ -366,6 +350,47 @@
         }
     }
 
+    if (!haveAnyValue) {
+        if (model2a) {
+            // assign values for regions based on label frequency; we
+            // have this in our labelCountMap, sort of
+
+            std::map<int, std::map<QString, float> > countLabelValueMap;
+            for (std::map<QString, int>::iterator i = labelCountMap.begin();
+                 i != labelCountMap.end(); ++i) {
+                countLabelValueMap[i->second][i->first] = 0.f;
+            }
+
+            float v = 0.f;
+            for (std::map<int, std::map<QString, float> >::iterator i =
+                     countLabelValueMap.end(); i != countLabelValueMap.begin(); ) {
+                --i;
+                for (std::map<QString, float>::iterator j = i->second.begin();
+                     j != i->second.end(); ++j) {
+                    j->second = v;
+                    v = v + 1.f;
+                }
+            }
+
+            std::map<RegionModel::Point, RegionModel::Point,
+                RegionModel::Point::Comparator> pointMap;
+            for (RegionModel::PointList::const_iterator i =
+                     model2a->getPoints().begin();
+                 i != model2a->getPoints().end(); ++i) {
+                RegionModel::Point p(*i);
+                v = countLabelValueMap[labelCountMap[p.label]][p.label];
+                RegionModel::Point pp(p.frame, v, p.duration, p.label);
+                pointMap[p] = pp;
+            }
+
+            for (std::map<RegionModel::Point, RegionModel::Point>::iterator i = 
+                     pointMap.begin(); i != pointMap.end(); ++i) {
+                model2a->deletePoint(i->first);
+                model2a->addPoint(i->second);
+            }
+        }
+    }
+                
     if (modelType == CSVFormat::ThreeDimensionalModel) {
 	model3->setMinimumLevel(min);
 	model3->setMaximumLevel(max);