view data/fileio/CSVFormat.cpp @ 668:ad7c96620886

* Fix crash when importing CSV file with varying number of columns per row
author Chris Cannam
date Mon, 07 Feb 2011 21:14:09 +0000
parents 3a5ee4b6c9ad
children 611a4fa14dde
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Sonic Visualiser
    An audio file viewer and annotation editor.
    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2006 Chris Cannam.
    
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#include "CSVFormat.h"

#include "base/StringBits.h"

#include <QFile>
#include <QString>
#include <QRegExp>
#include <QStringList>
#include <QTextStream>

#include <iostream>

CSVFormat::CSVFormat(QString path) :
    m_separator(""),
    m_sampleRate(44100),
    m_windowSize(1024),
    m_allowQuoting(true)
{
    guessFormatFor(path);
}

void
CSVFormat::guessFormatFor(QString path)
{
    m_modelType = TwoDimensionalModel;
    m_timingType = ExplicitTiming;
    m_timeUnits = TimeSeconds;

    m_maxExampleCols = 0;
    m_columnCount = 0;
    m_variableColumnCount = false;

    m_example.clear();
    m_columnQualities.clear();
    m_columnPurposes.clear();
    m_prevValues.clear();

    QFile file(path);
    if (!file.exists()) return;
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;

    QTextStream in(&file);
    in.seek(0);

    int lineno = 0;

    while (!in.atEnd()) {

        // See comment about line endings in CSVFileReader::load() 

        QString chunk = in.readLine();
        QStringList lines = chunk.split('\r', QString::SkipEmptyParts);

        for (size_t li = 0; li < lines.size(); ++li) {

            QString line = lines[li];
            if (line.startsWith("#") || line == "") continue;

            guessQualities(line, lineno);

            if (++lineno == 50) break;
        }
    }

    guessPurposes();
}

void
CSVFormat::guessSeparator(QString line)
{
    char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
    for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) {
        if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
            m_separator = candidates[i];
            return;
        }
    }
    m_separator = " ";
}

void
CSVFormat::guessQualities(QString line, int lineno)
{
    if (m_separator == "") guessSeparator(line);

    QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);

    int cols = list.size();
    if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
    if (cols != m_columnCount) m_variableColumnCount = true;

    // All columns are regarded as having these qualities until we see
    // something that indicates otherwise:

    ColumnQualities defaultQualities =
        ColumnNumeric | ColumnIntegral | ColumnIncreasing;
    
    for (int i = 0; i < cols; ++i) {
	    
        while (m_columnQualities.size() <= i) {
            m_columnQualities.push_back(defaultQualities);
            m_prevValues.push_back(0.f);
        }

        QString s(list[i]);
        bool ok = false;

        ColumnQualities qualities = m_columnQualities[i];

        bool numeric    = (qualities & ColumnNumeric);
        bool integral   = (qualities & ColumnIntegral);
        bool increasing = (qualities & ColumnIncreasing);
        bool large      = (qualities & ColumnLarge); // this one defaults to off

        float value = 0.f;

        //!!! how to take into account headers?

        if (numeric) {
            value = s.toFloat(&ok);
            if (!ok) {
                value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
            }
            if (ok) {
                if (lineno < 2 && value > 1000.f) large = true;
            } else {
                numeric = false;
            }
        }

        if (numeric) {

            if (integral) {
                if (s.contains('.') || s.contains(',')) {
                    integral = false;
                }
            }

            if (increasing) {
                if (lineno > 0 && value <= m_prevValues[i]) {
                    increasing = false;
                }
            }

            m_prevValues[i] = value;
        }

        m_columnQualities[i] =
            (numeric    ? ColumnNumeric : 0) |
            (integral   ? ColumnIntegral : 0) |
            (increasing ? ColumnIncreasing : 0) |
            (large      ? ColumnLarge : 0);
    }

    if (lineno < 10) {
        m_example.push_back(list);
        if (lineno == 0 || cols > m_maxExampleCols) {
            m_maxExampleCols = cols;
        }
    }

    std::cerr << "Estimated column qualities: ";
    for (int i = 0; i < m_columnCount; ++i) {
        std::cerr << int(m_columnQualities[i]) << " ";
    }
    std::cerr << std::endl;
}

void
CSVFormat::guessPurposes()
{
    m_timingType = CSVFormat::ImplicitTiming;
    m_timeUnits = CSVFormat::TimeWindows;
	
    int timingColumnCount = 0;
    
    for (int i = 0; i < m_columnCount; ++i) {
        
        ColumnPurpose purpose = ColumnUnknown;
        bool primary = (i == 0);

        ColumnQualities qualities = m_columnQualities[i];

        bool numeric    = (qualities & ColumnNumeric);
        bool integral   = (qualities & ColumnIntegral);
        bool increasing = (qualities & ColumnIncreasing);
        bool large      = (qualities & ColumnLarge);

        bool timingColumn = (numeric && increasing);

        if (timingColumn) {

            ++timingColumnCount;
                              
            if (primary) {

                purpose = ColumnStartTime;

                m_timingType = ExplicitTiming;

                if (integral && large) {
                    m_timeUnits = TimeAudioFrames;
                } else {
                    m_timeUnits = TimeSeconds;
                }

            } else {

                if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
                    purpose = ColumnEndTime;
                }
            }
        }

        if (purpose == ColumnUnknown) {
            if (numeric) {
                purpose = ColumnValue;
            } else {
                purpose = ColumnLabel;
            }
        }

        setColumnPurpose(i, purpose);
    }            

    int valueCount = 0;
    for (int i = 0; i < m_columnCount; ++i) {
        if (m_columnPurposes[i] == ColumnValue) ++valueCount;
    }

    if (valueCount == 2 && timingColumnCount == 1) {
        // If we have exactly two apparent value columns and only one
        // timing column, but one value column is integral and the
        // other is not, guess that whichever one matches the integral
        // status of the time column is either duration or end time
        if (m_timingType == ExplicitTiming) {
            int a = -1, b = -1;
            for (int i = 0; i < m_columnCount; ++i) {
                if (m_columnPurposes[i] == ColumnValue) {
                    if (a == -1) a = i;
                    else b = i;
                }
            }
            if ((m_columnQualities[a] & ColumnIntegral) !=
                (m_columnQualities[b] & ColumnIntegral)) {
                int timecol = a;
                if ((m_columnQualities[a] & ColumnIntegral) !=
                    (m_columnQualities[0] & ColumnIntegral)) {
                    timecol = b;
                }
                if (m_columnQualities[timecol] & ColumnIncreasing) {
                    // This shouldn't happen; should have been settled above
                    m_columnPurposes[timecol] = ColumnEndTime;
                } else {
                    m_columnPurposes[timecol] = ColumnDuration;
                }
                --valueCount;
            }
        }
    }

    if (timingColumnCount > 1) {
        m_modelType = TwoDimensionalModelWithDuration;
    } else {
        if (valueCount == 0) {
            m_modelType = OneDimensionalModel;
        } else if (valueCount == 1) {
            m_modelType = TwoDimensionalModel;
        } else {
            m_modelType = ThreeDimensionalModel;
        }
    }

    std::cerr << "Estimated column purposes: ";
    for (int i = 0; i < m_columnCount; ++i) {
        std::cerr << int(m_columnPurposes[i]) << " ";
    }
    std::cerr << std::endl;

    std::cerr << "Estimated model type: " << m_modelType << std::endl;
    std::cerr << "Estimated timing type: " << m_timingType << std::endl;
    std::cerr << "Estimated units: " << m_timeUnits << std::endl;
}

CSVFormat::ColumnPurpose
CSVFormat::getColumnPurpose(int i)
{
    while (m_columnPurposes.size() <= i) {
        m_columnPurposes.push_back(ColumnUnknown);
    }
    return m_columnPurposes[i];
}

CSVFormat::ColumnPurpose
CSVFormat::getColumnPurpose(int i) const
{
    if (m_columnPurposes.size() <= i) {
        return ColumnUnknown;
    }
    return m_columnPurposes[i];
}

void
CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
{
    while (m_columnPurposes.size() <= i) {
        m_columnPurposes.push_back(ColumnUnknown);
    }
    m_columnPurposes[i] = p;
}