comparison data/fileio/CSVFormat.cpp @ 629:35499d48a5d1

* Start overhauling CSV parser to associate purposes with columns en route to its guesses; add some string manipulation code
author Chris Cannam
date Thu, 15 Jul 2010 15:27:21 +0000
parents 001db550bd48
children 11a664058dd8
comparison
equal deleted inserted replaced
628:001db550bd48 629:35499d48a5d1
13 COPYING included with this distribution for more information. 13 COPYING included with this distribution for more information.
14 */ 14 */
15 15
16 #include "CSVFormat.h" 16 #include "CSVFormat.h"
17 17
18 #include "base/StringBits.h"
19
18 #include <QFile> 20 #include <QFile>
19 #include <QString> 21 #include <QString>
20 #include <QRegExp> 22 #include <QRegExp>
21 #include <QStringList> 23 #include <QStringList>
22 #include <QTextStream> 24 #include <QTextStream>
23 25
24 #include <iostream> 26 #include <iostream>
25 27
26 CSVFormat::CSVFormat(QString filename) : 28 CSVFormat::CSVFormat(QString path) :
27 m_modelType(TwoDimensionalModel), 29 m_separator(""),
28 m_timingType(ExplicitTiming),
29 m_durationType(Durations),
30 m_timeUnits(TimeSeconds),
31 m_separator(","),
32 m_sampleRate(44100), 30 m_sampleRate(44100),
33 m_windowSize(1024), 31 m_windowSize(1024),
34 m_behaviour(QString::KeepEmptyParts), 32 m_allowQuoting(true)
35 m_maxExampleCols(0) 33 {
36 { 34 guessFormatFor(path);
37 QFile file(filename); 35 }
36
37 void
38 CSVFormat::guessFormatFor(QString path)
39 {
40 m_modelType = TwoDimensionalModel;
41 m_timingType = ExplicitTiming;
42 m_durationType = Durations;
43 m_timeUnits = TimeSeconds;
44 m_behaviour = QString::KeepEmptyParts;
45
46 m_maxExampleCols = 0;
47 m_columnCount = 0;
48 m_variableColumnCount = false;
49
50 m_example.clear();
51 m_columnQualities.clear();
52 m_columnPurposes.clear();
53 m_prevValues.clear();
54
55 QFile file(path);
38 if (!file.exists()) return; 56 if (!file.exists()) return;
39 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; 57 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
40 58
41 QTextStream in(&file); 59 QTextStream in(&file);
42 in.seek(0); 60 in.seek(0);
43 61
44 unsigned int lineno = 0; 62 int lineno = 0;
45
46 bool nonIncreasingPrimaries = false;
47 bool nonIncreasingSecondaries = false;
48 bool nonNumericPrimaries = false;
49 bool floatPrimaries = false;
50 bool variableItemCount = false;
51 int itemCount = 1;
52 int earliestNonNumericItem = -1;
53
54 float prevPrimary = 0.0;
55 float prevSecondary = 0.0;
56
57 m_maxExampleCols = 0;
58 m_separator = "";
59 63
60 while (!in.atEnd()) { 64 while (!in.atEnd()) {
61 65
62 // See comment about line endings in CSVFileReader::load() 66 // See comment about line endings in CSVFileReader::load()
63 67
65 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); 69 QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
66 70
67 for (size_t li = 0; li < lines.size(); ++li) { 71 for (size_t li = 0; li < lines.size(); ++li) {
68 72
69 QString line = lines[li]; 73 QString line = lines[li];
70 74 if (line.startsWith("#") || line == "") continue;
71 if (line.startsWith("#")) continue; 75
72 76 guessQualities(line, lineno);
73 m_behaviour = QString::KeepEmptyParts; 77
74 78 if (++lineno == 50) break;
75 if (m_separator == "") { 79 }
76 //!!! to do: ask the user 80 }
77 if (line.split(",").size() >= 2) m_separator = ","; 81
78 else if (line.split("\t").size() >= 2) m_separator = "\t"; 82 guessPurposes();
79 else if (line.split("|").size() >= 2) m_separator = "|"; 83 }
80 else if (line.split("/").size() >= 2) m_separator = "/"; 84
81 else if (line.split(":").size() >= 2) m_separator = ":"; 85 void
82 else { 86 CSVFormat::guessSeparator(QString line)
83 m_separator = " "; 87 {
84 m_behaviour = QString::SkipEmptyParts; 88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
85 } 89 for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) {
86 } 90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
87 91 m_separator = candidates[i];
88 // std::cerr << "separator = \"" << m_separator.toStdString() << "\"" << std::endl; 92 return;
89 93 }
90 QStringList list = line.split(m_separator, m_behaviour); 94 }
91 QStringList tidyList; 95 m_separator = " ";
92 96 }
93 for (int i = 0; i < list.size(); ++i) { 97
98 void
99 CSVFormat::guessQualities(QString line, int lineno)
100 {
101 if (m_separator == "") guessSeparator(line);
102
103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
104
105 int cols = list.size();
106 if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
107 if (cols != m_columnCount) m_variableColumnCount = true;
108
109 // All columns are regarded as having these qualities until we see
110 // something that indicates otherwise:
111
112 ColumnQualities defaultQualities =
113 ColumnNumeric | ColumnIntegral | ColumnIncreasing;
114
115 for (int i = 0; i < cols; ++i) {
94 116
95 QString s(list[i]); 117 while (m_columnQualities.size() <= i) {
96 bool numeric = false; 118 m_columnQualities.push_back(defaultQualities);
97 119 m_prevValues.push_back(0.f);
98 if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { 120 }
99 s = s.mid(1, s.length() - 2); 121
100 } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { 122 QString s(list[i]);
101 s = s.mid(1, s.length() - 2); 123 bool ok = false;
124
125 ColumnQualities qualities = m_columnQualities[i];
126
127 bool numeric = (qualities & ColumnNumeric);
128 bool integral = (qualities & ColumnIntegral);
129 bool increasing = (qualities & ColumnIncreasing);
130 bool large = (qualities & ColumnLarge); // this one defaults to off
131
132 float value = 0.f;
133
134 //!!! how to take into account headers?
135
136 if (numeric) {
137 value = s.toFloat(&ok);
138 if (!ok) {
139 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
140 }
141 if (ok) {
142 if (lineno < 2 && value > 1000.f) large = true;
143 } else {
144 numeric = false;
145 }
146 }
147
148 if (numeric) {
149
150 if (integral) {
151 if (s.contains('.') || s.contains(',')) {
152 integral = false;
153 }
154 }
155
156 if (increasing) {
157 if (lineno > 0 && value <= m_prevValues[i]) {
158 increasing = false;
159 }
160 }
161
162 m_prevValues[i] = value;
163 }
164
165 m_columnQualities[i] =
166 (numeric ? ColumnNumeric : 0) |
167 (integral ? ColumnIntegral : 0) |
168 (increasing ? ColumnIncreasing : 0) |
169 (large ? ColumnLarge : 0);
170 }
171
172 if (lineno < 10) {
173 m_example.push_back(list);
174 if (lineno == 0 || cols > m_maxExampleCols) {
175 m_maxExampleCols = cols;
176 }
177 }
178
179 std::cerr << "Estimated column qualities: ";
180 for (int i = 0; i < m_columnCount; ++i) {
181 std::cerr << int(m_columnQualities[i]) << " ";
182 }
183 std::cerr << std::endl;
184 }
185
186 void
187 CSVFormat::guessPurposes()
188 {
189 while (m_columnPurposes.size() <= m_columnCount) {
190 m_columnPurposes.push_back(ColumnUnknown);
191 }
192
193 m_timingType = CSVFormat::ImplicitTiming;
194 m_timeUnits = CSVFormat::TimeWindows;
195
196 int timingColumnCount = 0;
197
198 for (int i = 0; i < m_columnCount; ++i) {
199
200 ColumnPurpose purpose = ColumnUnknown;
201 bool primary = (i == 0);
202
203 ColumnQualities qualities = m_columnQualities[i];
204
205 bool numeric = (qualities & ColumnNumeric);
206 bool integral = (qualities & ColumnIntegral);
207 bool increasing = (qualities & ColumnIncreasing);
208 bool large = (qualities & ColumnLarge);
209
210 bool timingColumn = (numeric && increasing);
211
212 if (timingColumn) {
213
214 ++timingColumnCount;
215
216 if (primary) {
217
218 purpose = ColumnStartTime;
219
220 m_timingType = ExplicitTiming;
221
222 if (integral && large) {
223 m_timeUnits = TimeAudioFrames;
102 } else { 224 } else {
103 float f = s.toFloat(&numeric); 225 m_timeUnits = TimeSeconds;
104 // std::cerr << "converted \"" << s.toStdString() << "\" to float, got " << f << " and success = " << numeric << std::endl; 226 }
105 } 227
106 228 } else {
107 tidyList.push_back(s); 229
108 230 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
109 if (lineno == 0 || (list.size() < itemCount)) { 231 purpose = ColumnEndTime;
110 itemCount = list.size(); 232 m_durationType = EndTimes;
111 } else { 233 }
112 if (itemCount != list.size()) { 234 }
113 variableItemCount = true; 235 }
114 } 236
115 } 237 if (purpose == ColumnUnknown) {
116 238 if (numeric) {
117 if (i == 0) { // primary 239 purpose = ColumnValue;
118 240 } else {
119 if (numeric) { 241 purpose = ColumnLabel;
120 242 }
121 float primary = s.toFloat(); 243 }
122 244
123 if (lineno > 0 && primary <= prevPrimary) { 245 m_columnPurposes[i] = purpose;
124 nonIncreasingPrimaries = true; 246 }
125 } 247
126 248 int valueCount = 0;
127 if (s.contains(".") || s.contains(",")) { 249 for (int i = 0; i < m_columnCount; ++i) {
128 floatPrimaries = true; 250 if (m_columnPurposes[i] == ColumnValue) ++valueCount;
129 } 251 }
130 252
131 prevPrimary = primary; 253 if (valueCount == 0) {
132 254 m_modelType = OneDimensionalModel;
133 } else { 255 } else if (valueCount == 1) {
134 nonNumericPrimaries = true; 256 m_modelType = TwoDimensionalModel;
135 }
136 } else { // secondary
137
138 if (!numeric) {
139 if (earliestNonNumericItem < 0 ||
140 i < earliestNonNumericItem) {
141 earliestNonNumericItem = i;
142 }
143 } else if (i == 1) {
144 float secondary = s.toFloat();
145 if (lineno > 0 && secondary <= prevSecondary) {
146 nonIncreasingSecondaries = true;
147 }
148 prevSecondary = secondary;
149 }
150 }
151 }
152
153 if (lineno < 10) {
154 m_example.push_back(tidyList);
155 if (lineno == 0 || tidyList.size() > m_maxExampleCols) {
156 m_maxExampleCols = tidyList.size();
157 }
158 }
159
160 ++lineno;
161
162 if (lineno == 50) break;
163 }
164 }
165
166 if (nonNumericPrimaries || nonIncreasingPrimaries) {
167
168 // Primaries are probably not a series of times
169
170 m_timingType = CSVFormat::ImplicitTiming;
171 m_timeUnits = CSVFormat::TimeWindows;
172
173 if (nonNumericPrimaries) {
174 m_modelType = CSVFormat::OneDimensionalModel;
175 } else if (itemCount == 1 || variableItemCount ||
176 (earliestNonNumericItem != -1)) {
177 m_modelType = CSVFormat::TwoDimensionalModel;
178 } else {
179 m_modelType = CSVFormat::ThreeDimensionalModel;
180 }
181
182 } else { 257 } else {
183 258 m_modelType = ThreeDimensionalModel;
184 // Increasing numeric primaries -- likely to be time 259 }
185 260
186 m_timingType = CSVFormat::ExplicitTiming; 261 std::cerr << "Estimated column purposes: ";
187 262 for (int i = 0; i < m_columnCount; ++i) {
188 if (floatPrimaries) { 263 std::cerr << int(m_columnPurposes[i]) << " ";
189 m_timeUnits = CSVFormat::TimeSeconds; 264 }
190 } else { 265 std::cerr << std::endl;
191 m_timeUnits = CSVFormat::TimeAudioFrames;
192 }
193
194 if (itemCount == 1) {
195 m_modelType = CSVFormat::OneDimensionalModel;
196 } else if (variableItemCount || (earliestNonNumericItem != -1)) {
197 if (earliestNonNumericItem != -1 && earliestNonNumericItem < 2) {
198 m_modelType = CSVFormat::OneDimensionalModel;
199 } else {
200 m_modelType = CSVFormat::TwoDimensionalModel;
201 }
202 } else {
203 m_modelType = CSVFormat::ThreeDimensionalModel;
204 }
205
206 if (nonIncreasingSecondaries) {
207 m_durationType = Durations;
208 } else {
209 m_durationType = EndTimes;
210 }
211 }
212 266
213 std::cerr << "Estimated model type: " << m_modelType << std::endl; 267 std::cerr << "Estimated model type: " << m_modelType << std::endl;
214 std::cerr << "Estimated timing type: " << m_timingType << std::endl; 268 std::cerr << "Estimated timing type: " << m_timingType << std::endl;
215 std::cerr << "Estimated duration type: " << m_durationType << std::endl; 269 std::cerr << "Estimated duration type: " << m_durationType << std::endl;
216 std::cerr << "Estimated units: " << m_timeUnits << std::endl; 270 std::cerr << "Estimated units: " << m_timeUnits << std::endl;
217 } 271 }
218 272
273