diff data/fileio/CSVFormat.cpp @ 1870:1b8c4ee06f6d csv-import-headers

Detect presence of header row in CSV format guesser; use headings to inform our guesses about column purposes; test this
author Chris Cannam
date Wed, 17 Jun 2020 18:01:00 +0100
parents bde22957545e
children bed42ce4d3ab
line wrap: on
line diff
--- a/data/fileio/CSVFormat.cpp	Tue Jun 16 15:15:57 2020 +0100
+++ b/data/fileio/CSVFormat.cpp	Wed Jun 17 18:01:00 2020 +0100
@@ -31,7 +31,9 @@
     m_separator(""),
     m_sampleRate(44100),
     m_windowSize(1024),
-    m_allowQuoting(true)
+    m_headerStatus(HeaderUnknown),
+    m_allowQuoting(true),
+    m_maxExampleCols(0)
 {
     (void)guessFormatFor(path);
 }
@@ -124,8 +126,18 @@
     QStringList list = StringBits::split(line, getSeparator(), m_allowQuoting);
 
     int cols = list.size();
-    if (lineno == 0 || (cols > m_columnCount)) m_columnCount = cols;
-    if (cols != m_columnCount) m_variableColumnCount = true;
+
+    int firstLine = 0;
+    if (m_headerStatus == HeaderPresent) {
+        firstLine = 1;
+    }
+    
+    if (lineno == firstLine || (cols > m_columnCount)) {
+        m_columnCount = cols;
+    }
+    if (cols != m_columnCount) {
+        m_variableColumnCount = true;
+    }
 
     // All columns are regarded as having these qualities until we see
     // something that indicates otherwise:
@@ -137,10 +149,10 @@
     for (int i = 0; i < cols; ++i) {
 
         SVDEBUG << "line no " << lineno << ": column " << i << " contains: \"" << list[i] << "\"" << endl;
-        
-        while (m_columnQualities.size() <= i) {
-            m_columnQualities.push_back(defaultQualities);
-            m_prevValues.push_back(0.f);
+
+        if (m_columnQualities.find(i) == m_columnQualities.end()) {
+            m_columnQualities[i] = defaultQualities;
+            m_prevValues[i] = 0.f;
         }
 
         QString s(list[i]);
@@ -161,21 +173,19 @@
 
         if (s.trimmed() != "") {
         
-            if (lineno > 1) {
+            if (lineno > firstLine) {
                 emptyish = false;
             }
         
             float value = 0.f;
 
-            //!!! how to take into account headers?
-
             if (numeric) {
                 value = s.toFloat(&ok);
                 if (!ok) {
                     value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
                 }
                 if (ok) {
-                    if (lineno < 2 && value > 1000.f) {
+                    if (lineno < firstLine + 2 && value > 1000.f) {
                         large = true;
                     }
                     if (value < 0.f) {
@@ -206,7 +216,7 @@
                 }
 
                 if (increasing) {
-                    if (lineno > 0 && value <= m_prevValues[i]) {
+                    if (lineno > firstLine && value <= m_prevValues[i]) {
                         increasing = false;
                     }
                 }
@@ -225,19 +235,55 @@
             (emptyish   ? ColumnNearEmpty : 0);
     }
 
-    if (lineno < 10) {
+    if (lineno == 0 && m_headerStatus == HeaderUnknown) {
+        // If we have at least one column, and every column has
+        // quality == ColumnNearEmpty only (i.e. its cell is non-empty
+        // and non-numeric), then we probably have a header row
+        bool couldBeHeader = (cols > 0);
+        std::map<int, QString> headings;
+        for (int i = 0; i < cols; ++i) {
+            if (m_columnQualities[i] != ColumnNearEmpty) {
+                couldBeHeader = false;
+            } else {
+                headings[i] = list[i].trimmed().toLower();
+            }
+        }
+        if (couldBeHeader) {
+            m_headerStatus = HeaderPresent;
+            m_columnHeadings = headings;
+        } else {
+            m_headerStatus = HeaderAbsent;
+        }
+    }
+
+    if (lineno == 0 && m_headerStatus == HeaderPresent) {
+        // Start again with the qualities:
+        m_columnQualities.clear();
+        m_prevValues.clear();
+    } else if (lineno < firstLine + 10) {
+        // Not a header row, so add it to the example column output
         m_example.push_back(list);
-        if (lineno == 0 || cols > m_maxExampleCols) {
+        if (lineno == firstLine || cols > m_maxExampleCols) {
             m_maxExampleCols = cols;
         }
     }
 
-    if (lineno < 10) {
+    if (lineno < firstLine + 10) {
         SVDEBUG << "Estimated column qualities for line " << lineno << " (reporting up to first 10): ";
-        for (int i = 0; i < m_columnCount; ++i) {
-            SVDEBUG << int(m_columnQualities[i]) << " ";
+        if (lineno == 0 && m_headerStatus == HeaderPresent &&
+            m_columnCount > 0 && m_columnQualities.empty()) {
+            SVDEBUG << "[whole line classified as a header row]";
+        } else {
+            for (int i = 0; i < cols; ++i) {
+                if (m_columnQualities.find(i) == m_columnQualities.end()) {
+                    SVDEBUG << "(not set) ";
+                } else {
+                    SVDEBUG << int(m_columnQualities[i]) << " ";
+                }
+            }
         }
         SVDEBUG << endl;
+        SVDEBUG << "Estimated header status: " << m_headerStatus << endl;
     }
 }
 
@@ -252,7 +298,11 @@
 
     SVDEBUG << "Estimated column qualities overall: ";
     for (int i = 0; i < m_columnCount; ++i) {
-        SVDEBUG << int(m_columnQualities[i]) << " ";
+        if (m_columnQualities.find(i) == m_columnQualities.end()) {
+            SVDEBUG << "(not set) ";
+        } else {
+            SVDEBUG << int(m_columnQualities[i]) << " ";
+        }
     }
     SVDEBUG << endl;
 
@@ -290,33 +340,56 @@
 
         bool timingColumn = (numeric && increasing);
 
+        QString heading;
+        if (m_columnHeadings.find(i) != m_columnHeadings.end()) {
+            heading = m_columnHeadings[i];
+        }
+        
+        if (heading == "time" || heading == "frame" ||
+            heading == "duration" || heading == "endtime") {
+            timingColumn = true;
+        }
+
+        if (heading == "value" || heading == "height" || heading == "label") {
+            timingColumn = false;
+        }
+        
         if (timingColumn) {
 
             ++timingColumnCount;
+
+            if (heading == "endtime") {
+
+                purpose = ColumnEndTime;
+                haveDurationOrEndTime = true;
+
+            } else if (heading == "duration") {
+
+                purpose = ColumnDuration;
+                haveDurationOrEndTime = true;
                               
-            if (primary) {
+            } else if (primary || heading == "time" || heading == "frame") {
 
                 purpose = ColumnStartTime;
-
                 m_timingType = ExplicitTiming;
 
-                if (integral && large) {
+                if ((integral && large) || heading == "frame") {
                     m_timeUnits = TimeAudioFrames;
                 } else {
                     m_timeUnits = TimeSeconds;
                 }
 
-            } else {
-
-                if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
-                    purpose = ColumnEndTime;
-                    haveDurationOrEndTime = true;
-                }
+            } else if (timingColumnCount == 2 &&
+                       m_timingType == ExplicitTiming) {
+                purpose = ColumnEndTime;
+                haveDurationOrEndTime = true;
             }
         }
 
         if (purpose == ColumnUnknown) {
-            if (numeric) {
+            if (heading == "label") {
+                purpose = ColumnLabel;
+            } else if (numeric || heading == "value" || heading == "height") {
                 purpose = ColumnValue;
             } else {
                 purpose = ColumnLabel;
@@ -328,7 +401,9 @@
 
     int valueCount = 0;
     for (int i = 0; i < m_columnCount; ++i) {
-        if (m_columnPurposes[i] == ColumnValue) ++valueCount;
+        if (m_columnPurposes[i] == ColumnValue) {
+            ++valueCount;
+        }
     }
 
     if (valueCount == 2 && timingColumnCount == 1) {
@@ -455,33 +530,51 @@
     m_audioSampleRange = range;
 }
 
-CSVFormat::ColumnPurpose
-CSVFormat::getColumnPurpose(int i)
+QList<CSVFormat::ColumnPurpose>
+CSVFormat::getColumnPurposes() const
 {
-    while (m_columnPurposes.size() <= i) {
-        m_columnPurposes.push_back(ColumnUnknown);
+    QList<ColumnPurpose> purposes;
+    for (int i = 0; i < m_columnCount; ++i) {
+        purposes.push_back(getColumnPurpose(i));
     }
-    return m_columnPurposes[i];
+    return purposes;
+}
+
+void
+CSVFormat::setColumnPurposes(QList<ColumnPurpose> cl)
+{
+    m_columnPurposes.clear();
+    for (int i = 0; in_range_for(cl, i); ++i) {
+        m_columnPurposes[i] = cl[i];
+    }
 }
 
 CSVFormat::ColumnPurpose
 CSVFormat::getColumnPurpose(int i) const
 {
-    if (m_columnPurposes.size() <= i) {
+    if (m_columnPurposes.find(i) == m_columnPurposes.end()) {
         return ColumnUnknown;
+    } else {
+        return m_columnPurposes.at(i);
     }
-    return m_columnPurposes[i];
 }
 
 void
 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
 {
-    while (m_columnPurposes.size() <= i) {
-        m_columnPurposes.push_back(ColumnUnknown);
-    }
     m_columnPurposes[i] = p;
 }
 
-
-
-
+QList<CSVFormat::ColumnQualities>
+CSVFormat::getColumnQualities() const
+{
+    QList<ColumnQualities> qualities;
+    for (int i = 0; i < m_columnCount; ++i) {
+        if (m_columnQualities.find(i) == m_columnQualities.end()) {
+            qualities.push_back(0);
+        } else {
+            qualities.push_back(m_columnQualities.at(i));
+        }
+    }
+    return qualities;
+}