diff data/fileio/CSVFormat.cpp @ 629:35499d48a5d1

* Start overhauling CSV parser to associate purposes with columns en route to its guesses; add some string manipulation code
author Chris Cannam
date Thu, 15 Jul 2010 15:27:21 +0000
parents 001db550bd48
children 11a664058dd8
line wrap: on
line diff
--- a/data/fileio/CSVFormat.cpp	Thu Jul 08 14:22:28 2010 +0000
+++ b/data/fileio/CSVFormat.cpp	Thu Jul 15 15:27:21 2010 +0000
@@ -15,6 +15,8 @@
 
 #include "CSVFormat.h"
 
+#include "base/StringBits.h"
+
 #include <QFile>
 #include <QString>
 #include <QRegExp>
@@ -23,39 +25,41 @@
 
 #include <iostream>
 
-CSVFormat::CSVFormat(QString filename) :
-    m_modelType(TwoDimensionalModel),
-    m_timingType(ExplicitTiming),
-    m_durationType(Durations),
-    m_timeUnits(TimeSeconds),
-    m_separator(","),
+CSVFormat::CSVFormat(QString path) :
+    m_separator(""),
     m_sampleRate(44100),
     m_windowSize(1024),
-    m_behaviour(QString::KeepEmptyParts),
-    m_maxExampleCols(0)
+    m_allowQuoting(true)
 {
-    QFile file(filename);
+    guessFormatFor(path);
+}
+
+void
+CSVFormat::guessFormatFor(QString path)
+{
+    m_modelType = TwoDimensionalModel;
+    m_timingType = ExplicitTiming;
+    m_durationType = Durations;
+    m_timeUnits = TimeSeconds;
+    m_behaviour = QString::KeepEmptyParts;
+
+    m_maxExampleCols = 0;
+    m_columnCount = 0;
+    m_variableColumnCount = false;
+
+    m_example.clear();
+    m_columnQualities.clear();
+    m_columnPurposes.clear();
+    m_prevValues.clear();
+
+    QFile file(path);
     if (!file.exists()) return;
     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
 
     QTextStream in(&file);
     in.seek(0);
 
-    unsigned int lineno = 0;
-
-    bool nonIncreasingPrimaries = false;
-    bool nonIncreasingSecondaries = false;
-    bool nonNumericPrimaries = false;
-    bool floatPrimaries = false;
-    bool variableItemCount = false;
-    int itemCount = 1;
-    int earliestNonNumericItem = -1;
-
-    float prevPrimary = 0.0;
-    float prevSecondary = 0.0;
-
-    m_maxExampleCols = 0;
-    m_separator = "";
+    int lineno = 0;
 
     while (!in.atEnd()) {
 
@@ -67,148 +71,198 @@
         for (size_t li = 0; li < lines.size(); ++li) {
 
             QString line = lines[li];
+            if (line.startsWith("#") || line == "") continue;
 
-            if (line.startsWith("#")) continue;
+            guessQualities(line, lineno);
 
-            m_behaviour = QString::KeepEmptyParts;
+            if (++lineno == 50) break;
+        }
+    }
 
-            if (m_separator == "") {
-                //!!! to do: ask the user
-                if (line.split(",").size() >= 2) m_separator = ",";
-                else if (line.split("\t").size() >= 2) m_separator = "\t";
-                else if (line.split("|").size() >= 2) m_separator = "|";
-                else if (line.split("/").size() >= 2) m_separator = "/";
-                else if (line.split(":").size() >= 2) m_separator = ":";
-                else {
-                    m_separator = " ";
-                    m_behaviour = QString::SkipEmptyParts;
+    guessPurposes();
+}
+
+void
+CSVFormat::guessSeparator(QString line)
+{
+    char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
+    for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) {
+        if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
+            m_separator = candidates[i];
+            return;
+        }
+    }
+    m_separator = " ";
+}
+
+void
+CSVFormat::guessQualities(QString line, int lineno)
+{
+    if (m_separator == "") guessSeparator(line);
+
+    QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
+
+    int cols = list.size();
+    if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
+    if (cols != m_columnCount) m_variableColumnCount = true;
+
+    // All columns are regarded as having these qualities until we see
+    // something that indicates otherwise:
+
+    ColumnQualities defaultQualities =
+        ColumnNumeric | ColumnIntegral | ColumnIncreasing;
+    
+    for (int i = 0; i < cols; ++i) {
+	    
+        while (m_columnQualities.size() <= i) {
+            m_columnQualities.push_back(defaultQualities);
+            m_prevValues.push_back(0.f);
+        }
+
+        QString s(list[i]);
+        bool ok = false;
+
+        ColumnQualities qualities = m_columnQualities[i];
+
+        bool numeric    = (qualities & ColumnNumeric);
+        bool integral   = (qualities & ColumnIntegral);
+        bool increasing = (qualities & ColumnIncreasing);
+        bool large      = (qualities & ColumnLarge); // this one defaults to off
+
+        float value = 0.f;
+
+        //!!! how to take into account headers?
+
+        if (numeric) {
+            value = s.toFloat(&ok);
+            if (!ok) {
+                value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
+            }
+            if (ok) {
+                if (lineno < 2 && value > 1000.f) large = true;
+            } else {
+                numeric = false;
+            }
+        }
+
+        if (numeric) {
+
+            if (integral) {
+                if (s.contains('.') || s.contains(',')) {
+                    integral = false;
                 }
             }
 
-//            std::cerr << "separator = \"" << m_separator.toStdString() << "\"" << std::endl;
-
-            QStringList list = line.split(m_separator, m_behaviour);
-            QStringList tidyList;
-
-            for (int i = 0; i < list.size(); ++i) {
-	    
-                QString s(list[i]);
-                bool numeric = false;
-
-                if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) {
-                    s = s.mid(1, s.length() - 2);
-                } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) {
-                    s = s.mid(1, s.length() - 2);
-                } else {
-                    float f = s.toFloat(&numeric);
-//                    std::cerr << "converted \"" << s.toStdString() << "\" to float, got " << f << " and success = " << numeric << std::endl;
-                }
-
-                tidyList.push_back(s);
-
-                if (lineno == 0 || (list.size() < itemCount)) {
-                    itemCount = list.size();
-                } else {
-                    if (itemCount != list.size()) {
-                        variableItemCount = true;
-                    }
-                }
-	    
-                if (i == 0) { // primary
-
-                    if (numeric) {
-
-                        float primary = s.toFloat();
-
-                        if (lineno > 0 && primary <= prevPrimary) {
-                            nonIncreasingPrimaries = true;
-                        }
-
-                        if (s.contains(".") || s.contains(",")) {
-                            floatPrimaries = true;
-                        }
-
-                        prevPrimary = primary;
-
-                    } else {
-                        nonNumericPrimaries = true;
-                    }
-                } else { // secondary
-
-                    if (!numeric) {
-                        if (earliestNonNumericItem < 0 ||
-                            i < earliestNonNumericItem) {
-                            earliestNonNumericItem = i;
-                        }
-                    } else if (i == 1) {
-                        float secondary = s.toFloat();
-                        if (lineno > 0 && secondary <= prevSecondary) {
-                            nonIncreasingSecondaries = true;
-                        }
-                        prevSecondary = secondary;
-                    }
+            if (increasing) {
+                if (lineno > 0 && value <= m_prevValues[i]) {
+                    increasing = false;
                 }
             }
 
-            if (lineno < 10) {
-                m_example.push_back(tidyList);
-                if (lineno == 0 || tidyList.size() > m_maxExampleCols) {
-                    m_maxExampleCols = tidyList.size();
-                }
-            }
+            m_prevValues[i] = value;
+        }
 
-            ++lineno;
+        m_columnQualities[i] =
+            (numeric    ? ColumnNumeric : 0) |
+            (integral   ? ColumnIntegral : 0) |
+            (increasing ? ColumnIncreasing : 0) |
+            (large      ? ColumnLarge : 0);
+    }
 
-            if (lineno == 50) break;
+    if (lineno < 10) {
+        m_example.push_back(list);
+        if (lineno == 0 || cols > m_maxExampleCols) {
+            m_maxExampleCols = cols;
         }
     }
 
-    if (nonNumericPrimaries || nonIncreasingPrimaries) {
+    std::cerr << "Estimated column qualities: ";
+    for (int i = 0; i < m_columnCount; ++i) {
+        std::cerr << int(m_columnQualities[i]) << " ";
+    }
+    std::cerr << std::endl;
+}
+
+void
+CSVFormat::guessPurposes()
+{
+    while (m_columnPurposes.size() <= m_columnCount) {
+        m_columnPurposes.push_back(ColumnUnknown);
+    }
+
+    m_timingType = CSVFormat::ImplicitTiming;
+    m_timeUnits = CSVFormat::TimeWindows;
 	
-	// Primaries are probably not a series of times
+    int timingColumnCount = 0;
+    
+    for (int i = 0; i < m_columnCount; ++i) {
+        
+        ColumnPurpose purpose = ColumnUnknown;
+        bool primary = (i == 0);
 
-	m_timingType = CSVFormat::ImplicitTiming;
-	m_timeUnits = CSVFormat::TimeWindows;
-	
-	if (nonNumericPrimaries) {
-	    m_modelType = CSVFormat::OneDimensionalModel;
-	} else if (itemCount == 1 || variableItemCount ||
-		   (earliestNonNumericItem != -1)) {
-	    m_modelType = CSVFormat::TwoDimensionalModel;
-	} else {
-	    m_modelType = CSVFormat::ThreeDimensionalModel;
-	}
+        ColumnQualities qualities = m_columnQualities[i];
 
+        bool numeric    = (qualities & ColumnNumeric);
+        bool integral   = (qualities & ColumnIntegral);
+        bool increasing = (qualities & ColumnIncreasing);
+        bool large      = (qualities & ColumnLarge);
+
+        bool timingColumn = (numeric && increasing);
+
+        if (timingColumn) {
+
+            ++timingColumnCount;
+                              
+            if (primary) {
+
+                purpose = ColumnStartTime;
+
+                m_timingType = ExplicitTiming;
+
+                if (integral && large) {
+                    m_timeUnits = TimeAudioFrames;
+                } else {
+                    m_timeUnits = TimeSeconds;
+                }
+
+            } else {
+
+                if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
+                    purpose = ColumnEndTime;
+                    m_durationType = EndTimes;
+                }
+            }
+        }
+
+        if (purpose == ColumnUnknown) {
+            if (numeric) {
+                purpose = ColumnValue;
+            } else {
+                purpose = ColumnLabel;
+            }
+        }
+
+        m_columnPurposes[i] = purpose;
+    }            
+
+    int valueCount = 0;
+    for (int i = 0; i < m_columnCount; ++i) {
+        if (m_columnPurposes[i] == ColumnValue) ++valueCount;
+    }
+
+    if (valueCount == 0) {
+        m_modelType = OneDimensionalModel;
+    } else if (valueCount == 1) {
+        m_modelType = TwoDimensionalModel;
     } else {
+        m_modelType = ThreeDimensionalModel;
+    }
 
-	// Increasing numeric primaries -- likely to be time
-
-	m_timingType = CSVFormat::ExplicitTiming;
-
-	if (floatPrimaries) {
-	    m_timeUnits = CSVFormat::TimeSeconds;
-	} else {
-	    m_timeUnits = CSVFormat::TimeAudioFrames;
-	}
-
-	if (itemCount == 1) {
-	    m_modelType = CSVFormat::OneDimensionalModel;
-	} else if (variableItemCount || (earliestNonNumericItem != -1)) {
-	    if (earliestNonNumericItem != -1 && earliestNonNumericItem < 2) {
-		m_modelType = CSVFormat::OneDimensionalModel;
-	    } else {
-		m_modelType = CSVFormat::TwoDimensionalModel;
-	    }
-	} else {
-	    m_modelType = CSVFormat::ThreeDimensionalModel;
-	}
-
-        if (nonIncreasingSecondaries) {
-            m_durationType = Durations;
-        } else {
-            m_durationType = EndTimes;
-        }
+    std::cerr << "Estimated column purposes: ";
+    for (int i = 0; i < m_columnCount; ++i) {
+        std::cerr << int(m_columnPurposes[i]) << " ";
     }
+    std::cerr << std::endl;
 
     std::cerr << "Estimated model type: " << m_modelType << std::endl;
     std::cerr << "Estimated timing type: " << m_timingType << std::endl;
@@ -216,3 +270,4 @@
     std::cerr << "Estimated units: " << m_timeUnits << std::endl;
 }
 
+