annotate data/fileio/CSVFormat.h @ 1870:1b8c4ee06f6d csv-import-headers

Detect presence of header row in CSV format guesser; use headings to inform our guesses about column purposes; test this
author Chris Cannam
date Wed, 17 Jun 2020 18:01:00 +0100
parents f0ffc88a36b3
children
rev   line source
Chris@392 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@392 2
Chris@392 3 /*
Chris@392 4 Sonic Visualiser
Chris@392 5 An audio file viewer and annotation editor.
Chris@392 6 Centre for Digital Music, Queen Mary, University of London.
Chris@392 7 This file copyright 2006 Chris Cannam.
Chris@392 8
Chris@392 9 This program is free software; you can redistribute it and/or
Chris@392 10 modify it under the terms of the GNU General Public License as
Chris@392 11 published by the Free Software Foundation; either version 2 of the
Chris@392 12 License, or (at your option) any later version. See the file
Chris@392 13 COPYING included with this distribution for more information.
Chris@392 14 */
Chris@392 15
Chris@1362 16 #ifndef SV_CSV_FORMAT_H
Chris@1362 17 #define SV_CSV_FORMAT_H
Chris@392 18
Chris@392 19 #include <QString>
Chris@392 20 #include <QStringList>
Chris@392 21
Chris@1585 22 #include <set>
Chris@1870 23 #include <map>
Chris@1585 24
Chris@1047 25 #include "base/BaseTypes.h"
Chris@1047 26
Chris@392 27 class CSVFormat // Describes how a CSV file is to be interpreted; fields can be set explicitly or guessed from a file (see guessFormatFor)
Chris@392 28 {
Chris@392 29 public:
Chris@392 30 enum ModelType { // kind of model the CSV content is treated as
Chris@1429 31 OneDimensionalModel,
Chris@1429 32 TwoDimensionalModel,
Chris@628 33 TwoDimensionalModelWithDuration,
Chris@897 34 TwoDimensionalModelWithDurationAndPitch,
Chris@1793 35 TwoDimensionalModelWithDurationAndExtent,
Chris@1488 36 ThreeDimensionalModel,
Chris@1488 37 WaveFileModel // never chosen by guessFormatFor (see its documentation below)
Chris@392 38 };
Chris@392 39
Chris@392 40 enum TimingType { // NOTE(review): presumably whether times are read from a column (explicit) or implied by row position (implicit) - confirm in the .cpp
Chris@1429 41 ExplicitTiming,
Chris@1429 42 ImplicitTiming
Chris@392 43 };
Chris@628 44
Chris@392 45 enum TimeUnits { // units in which time values are expressed
Chris@1429 46 TimeSeconds,
Chris@990 47 TimeMilliseconds,
Chris@1429 48 TimeAudioFrames,
Chris@1429 49 TimeWindows, // NOTE(review): presumably counted in units of the window size (m_windowSize) - confirm
Chris@392 50 };
Chris@392 51
Chris@629 52 enum ColumnPurpose { // role assigned to an individual column (see getColumnPurpose / setColumnPurpose)
Chris@629 53 ColumnUnknown,
Chris@629 54 ColumnStartTime,
Chris@629 55 ColumnEndTime,
Chris@629 56 ColumnDuration,
Chris@629 57 ColumnValue,
Chris@897 58 ColumnPitch,
Chris@629 59 ColumnLabel
Chris@629 60 };
Chris@629 61
Chris@1870 62 enum HeaderStatus { // whether the file has a header row, as far as is known
Chris@1870 63 HeaderUnknown = 0,
Chris@1870 64 HeaderAbsent = 1,
Chris@1870 65 HeaderPresent = 2
Chris@1870 66 };
Chris@1870 67
Chris@629 68 enum ColumnQuality { // properties observed of a column in the sampled rows; each value is a distinct bit
Chris@1512 69 ColumnNumeric = 1, // No non-numeric values were seen in sample
Chris@1512 70 ColumnIntegral = 2, // All sampled values were integers
Chris@1512 71 ColumnIncreasing = 4, // Sampled values were monotonically increasing
Chris@1512 72 ColumnSmall = 8, // All sampled values had magnitude < 1
Chris@1512 73 ColumnLarge = 16, // Values "quickly" grew to over 1000
Chris@1512 74 ColumnSigned = 32, // Some negative values were seen
Chris@1512 75 ColumnNearEmpty = 64, // Nothing in this column beyond first row
Chris@629 76 };
Chris@629 77 typedef unsigned int ColumnQualities; // NOTE(review): presumably a bitwise OR of ColumnQuality values - confirm
Chris@392 78
Chris@1515 79 enum AudioSampleRange { // value range of audio sample data
Chris@1515 80 SampleRangeSigned1 = 0, // -1 .. 1
Chris@1515 81 SampleRangeUnsigned255, // 0 .. 255
Chris@1515 82 SampleRangeSigned32767, // -32768 .. 32767
Chris@1515 83 SampleRangeOther // Other/unknown: Normalise on load
Chris@1515 84 };
Chris@1515 85
Chris@392 86 CSVFormat() : // arbitrary defaults; the body is empty, so no file is read and nothing is guessed
Chris@392 87 m_modelType(TwoDimensionalModel),
Chris@392 88 m_timingType(ExplicitTiming),
Chris@392 89 m_timeUnits(TimeSeconds),
Chris@1585 90 m_separator(""), // empty: no separator chosen yet (guessFormatFor guesses one in that case)
Chris@392 91 m_sampleRate(44100),
Chris@392 92 m_windowSize(1024),
Chris@1870 93 m_headerStatus(HeaderUnknown),
Chris@629 94 m_columnCount(0),
Chris@629 95 m_variableColumnCount(false),
Chris@1516 96 m_audioSampleRange(SampleRangeOther),
Chris@629 97 m_allowQuoting(true),
Chris@629 98 m_maxExampleCols(0)
Chris@392 99 { }
Chris@629 100
Chris@629 101 CSVFormat(QString path); // guess format. NOTE(review): not declared explicit, so a QString converts implicitly to a CSVFormat
Chris@629 102
Chris@629 103 /**
Chris@629 104 * Guess the format of the given CSV file, setting the fields in
Chris@629 105 * this object accordingly. If the current separator is the empty
Chris@629 106 * string, the separator character will also be guessed; otherwise
Chris@629 107 * the current separator will be used. The other properties of
Chris@629 108 * this object will be set according to guesses from the file.
Chris@1524 109 *
Chris@1524 110 * The properties that are guessed from the file contents are:
Chris@1524 111 * separator, column count, variable-column-count flag, audio
Chris@1524 112 * sample range, timing type, time units, column qualities, column
Chris@1524 113 * purposes, and model type. The sample rate and window size
Chris@1524 114 * cannot be guessed and will not be changed by this function.
Chris@1524 115 * Note also that this function will never guess WaveFileModel for
Chris@1524 116 * the model type. NOTE(review): header status is presumably also guessed since rev 1870 - confirm and add it to the list above.
Chris@1524 117 *
Chris@1524 118 * Return false if there is some fundamental error, e.g. the file
Chris@1524 119 * could not be opened at all. Return true otherwise. Note that
Chris@1524 120 * this function returns true even if the file doesn't appear to
Chris@1524 121 * make much sense as a data format.
Chris@629 122 */
Chris@1524 123 bool guessFormatFor(QString path);
Chris@628 124
Chris@628 125 ModelType getModelType() const { return m_modelType; } // the getters below return the stored settings unchanged
Chris@628 126 TimingType getTimingType() const { return m_timingType; }
Chris@628 127 TimeUnits getTimeUnits() const { return m_timeUnits; }
Chris@1047 128 sv_samplerate_t getSampleRate() const { return m_sampleRate; }
Chris@929 129 int getWindowSize() const { return m_windowSize; }
Chris@630 130 int getColumnCount() const { return m_columnCount; }
Chris@1516 131 AudioSampleRange getAudioSampleRange() const { return m_audioSampleRange; }
Chris@631 132 bool getAllowQuoting() const { return m_allowQuoting; }
Chris@1870 133 HeaderStatus getHeaderStatus() const { return m_headerStatus; }
Chris@631 134 QChar getSeparator() const { // a comma is reported when no separator has been chosen
Chris@1585 135 if (m_separator == "") return ',';
Chris@631 136 else return m_separator[0]; // otherwise the first character of the stored string
Chris@631 137 }
Chris@1585 138 // set rather than QSet to ensure a fixed order
Chris@1585 139 std::set<QChar> getPlausibleSeparators() const {
Chris@1585 140 return m_plausibleSeparators;
Chris@1585 141 }
Chris@630 142
Chris@628 143 void setModelType(ModelType t) { m_modelType = t; } // the setters below store the value with no validation
Chris@628 144 void setTimingType(TimingType t) { m_timingType = t; }
Chris@628 145 void setTimeUnits(TimeUnits t) { m_timeUnits = t; }
Chris@631 146 void setSeparator(QChar s) { m_separator = s; } // stored as a one-character string
Chris@1047 147 void setSampleRate(sv_samplerate_t r) { m_sampleRate = r; }
Chris@1009 148 void setWindowSize(int s) { m_windowSize = s; }
Chris@630 149 void setColumnCount(int c) { m_columnCount = c; }
Chris@1516 150 void setAudioSampleRange(AudioSampleRange r) { m_audioSampleRange = r; }
Chris@631 151 void setAllowQuoting(bool q) { m_allowQuoting = q; }
Chris@1870 152 void setHeaderStatus(HeaderStatus s) { m_headerStatus = s; }
Chris@392 153
Chris@1870 154 QList<ColumnPurpose> getColumnPurposes() const; // declared only; the definition is not in this header
Chris@1870 155 void setColumnPurposes(QList<ColumnPurpose> cl);
Chris@631 156
Chris@631 157 ColumnPurpose getColumnPurpose(int i) const; // purpose of column i; behaviour for an unset or out-of-range i is not visible here
Chris@631 158 void setColumnPurpose(int i, ColumnPurpose p);
Chris@392 159
Chris@1870 160 // only valid if format has been guessed:
Chris@1870 161 QList<ColumnQualities> getColumnQualities() const;
Chris@629 162
Chris@1870 163 // only valid if format has been guessed:
Chris@1870 164 QList<QStringList> getExample() const { return m_example; }
Chris@392 165 int getMaxExampleCols() const { return m_maxExampleCols; }
Chris@1429 166
Chris@392 167 protected:
Chris@628 168 ModelType m_modelType;
Chris@628 169 TimingType m_timingType;
Chris@628 170 TimeUnits m_timeUnits;
Chris@1585 171 QString m_separator; // "" or a single char - basically QChar option
Chris@1585 172 std::set<QChar> m_plausibleSeparators; // returned by getPlausibleSeparators()
Chris@1047 173 sv_samplerate_t m_sampleRate;
Chris@929 174 int m_windowSize;
Chris@1870 175 HeaderStatus m_headerStatus;
Chris@392 176
Chris@629 177 int m_columnCount;
Chris@629 178 bool m_variableColumnCount; // the variable-column-count flag named in the guessFormatFor documentation; no accessor is declared for it
Chris@629 179
Chris@1870 180 std::map<int, ColumnQualities> m_columnQualities; // NOTE(review): the int key of these three maps is presumably the column index - confirm
Chris@1870 181 std::map<int, ColumnPurpose> m_columnPurposes;
Chris@1870 182 std::map<int, QString> m_columnHeadings;
Chris@629 183
Chris@1870 184 std::map<int, float> m_prevValues; // NOTE(review): use not visible in this header; presumably the previous value seen per column while guessing qualities - confirm
Chris@1870 185
Chris@1515 186 AudioSampleRange m_audioSampleRange;
Chris@1515 187
Chris@629 188 bool m_allowQuoting;
Chris@392 189
Chris@392 190 QList<QStringList> m_example; // returned by getExample(); only valid if format has been guessed
Chris@392 191 int m_maxExampleCols;
Chris@629 192
Chris@629 193 void guessSeparator(QString line); // the four guess* helpers are declared only; NOTE(review): presumably called from guessFormatFor - confirm
Chris@629 194 void guessQualities(QString line, int lineno);
Chris@629 195 void guessPurposes();
Chris@1515 196 void guessAudioSampleRange();
Chris@392 197 };
Chris@392 198
Chris@392 199 #endif