Mercurial > hg > svcore
comparison data/fileio/CSVFormat.cpp @ 629:35499d48a5d1
* Start overhauling CSV parser to associate purposes with columns en route to its guesses; add some string manipulation code
author | Chris Cannam |
---|---|
date | Thu, 15 Jul 2010 15:27:21 +0000 |
parents | 001db550bd48 |
children | 11a664058dd8 |
comparison
equal
deleted
inserted
replaced
628:001db550bd48 | 629:35499d48a5d1 |
---|---|
13 COPYING included with this distribution for more information. | 13 COPYING included with this distribution for more information. |
14 */ | 14 */ |
15 | 15 |
16 #include "CSVFormat.h" | 16 #include "CSVFormat.h" |
17 | 17 |
18 #include "base/StringBits.h" | |
19 | |
18 #include <QFile> | 20 #include <QFile> |
19 #include <QString> | 21 #include <QString> |
20 #include <QRegExp> | 22 #include <QRegExp> |
21 #include <QStringList> | 23 #include <QStringList> |
22 #include <QTextStream> | 24 #include <QTextStream> |
23 | 25 |
24 #include <iostream> | 26 #include <iostream> |
25 | 27 |
26 CSVFormat::CSVFormat(QString filename) : | 28 CSVFormat::CSVFormat(QString path) : |
27 m_modelType(TwoDimensionalModel), | 29 m_separator(""), |
28 m_timingType(ExplicitTiming), | |
29 m_durationType(Durations), | |
30 m_timeUnits(TimeSeconds), | |
31 m_separator(","), | |
32 m_sampleRate(44100), | 30 m_sampleRate(44100), |
33 m_windowSize(1024), | 31 m_windowSize(1024), |
34 m_behaviour(QString::KeepEmptyParts), | 32 m_allowQuoting(true) |
35 m_maxExampleCols(0) | 33 { |
36 { | 34 guessFormatFor(path); |
37 QFile file(filename); | 35 } |
36 | |
37 void | |
38 CSVFormat::guessFormatFor(QString path) | |
39 { | |
40 m_modelType = TwoDimensionalModel; | |
41 m_timingType = ExplicitTiming; | |
42 m_durationType = Durations; | |
43 m_timeUnits = TimeSeconds; | |
44 m_behaviour = QString::KeepEmptyParts; | |
45 | |
46 m_maxExampleCols = 0; | |
47 m_columnCount = 0; | |
48 m_variableColumnCount = false; | |
49 | |
50 m_example.clear(); | |
51 m_columnQualities.clear(); | |
52 m_columnPurposes.clear(); | |
53 m_prevValues.clear(); | |
54 | |
55 QFile file(path); | |
38 if (!file.exists()) return; | 56 if (!file.exists()) return; |
39 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; | 57 if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return; |
40 | 58 |
41 QTextStream in(&file); | 59 QTextStream in(&file); |
42 in.seek(0); | 60 in.seek(0); |
43 | 61 |
44 unsigned int lineno = 0; | 62 int lineno = 0; |
45 | |
46 bool nonIncreasingPrimaries = false; | |
47 bool nonIncreasingSecondaries = false; | |
48 bool nonNumericPrimaries = false; | |
49 bool floatPrimaries = false; | |
50 bool variableItemCount = false; | |
51 int itemCount = 1; | |
52 int earliestNonNumericItem = -1; | |
53 | |
54 float prevPrimary = 0.0; | |
55 float prevSecondary = 0.0; | |
56 | |
57 m_maxExampleCols = 0; | |
58 m_separator = ""; | |
59 | 63 |
60 while (!in.atEnd()) { | 64 while (!in.atEnd()) { |
61 | 65 |
62 // See comment about line endings in CSVFileReader::load() | 66 // See comment about line endings in CSVFileReader::load() |
63 | 67 |
64 QString chunk = in.readLine(); | 68 QString chunk = in.readLine(); |
65 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); | 69 QStringList lines = chunk.split('\r', QString::SkipEmptyParts); |
66 | 70 |
67 for (size_t li = 0; li < lines.size(); ++li) { | 71 for (size_t li = 0; li < lines.size(); ++li) { |
68 | 72 |
69 QString line = lines[li]; | 73 QString line = lines[li]; |
70 | 74 if (line.startsWith("#") || line == "") continue; |
71 if (line.startsWith("#")) continue; | 75 |
72 | 76 guessQualities(line, lineno); |
73 m_behaviour = QString::KeepEmptyParts; | 77 |
74 | 78 if (++lineno == 50) break; |
75 if (m_separator == "") { | 79 } |
76 //!!! to do: ask the user | 80 } |
77 if (line.split(",").size() >= 2) m_separator = ","; | 81 |
78 else if (line.split("\t").size() >= 2) m_separator = "\t"; | 82 guessPurposes(); |
79 else if (line.split("|").size() >= 2) m_separator = "|"; | 83 } |
80 else if (line.split("/").size() >= 2) m_separator = "/"; | 84 |
81 else if (line.split(":").size() >= 2) m_separator = ":"; | 85 void |
82 else { | 86 CSVFormat::guessSeparator(QString line) |
83 m_separator = " "; | 87 { |
84 m_behaviour = QString::SkipEmptyParts; | 88 char candidates[] = { ',', '\t', ' ', '|', '/', ':' }; |
85 } | 89 for (int i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i) { |
86 } | 90 if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) { |
87 | 91 m_separator = candidates[i]; |
88 // std::cerr << "separator = \"" << m_separator.toStdString() << "\"" << std::endl; | 92 return; |
89 | 93 } |
90 QStringList list = line.split(m_separator, m_behaviour); | 94 } |
91 QStringList tidyList; | 95 m_separator = " "; |
92 | 96 } |
93 for (int i = 0; i < list.size(); ++i) { | 97 |
98 void | |
99 CSVFormat::guessQualities(QString line, int lineno) | |
100 { | |
101 if (m_separator == "") guessSeparator(line); | |
102 | |
103 QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting); | |
104 | |
105 int cols = list.size(); | |
106 if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols; | |
107 if (cols != m_columnCount) m_variableColumnCount = true; | |
108 | |
109 // All columns are regarded as having these qualities until we see | |
110 // something that indicates otherwise: | |
111 | |
112 ColumnQualities defaultQualities = | |
113 ColumnNumeric | ColumnIntegral | ColumnIncreasing; | |
114 | |
115 for (int i = 0; i < cols; ++i) { | |
94 | 116 |
95 QString s(list[i]); | 117 while (m_columnQualities.size() <= i) { |
96 bool numeric = false; | 118 m_columnQualities.push_back(defaultQualities); |
97 | 119 m_prevValues.push_back(0.f); |
98 if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { | 120 } |
99 s = s.mid(1, s.length() - 2); | 121 |
100 } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { | 122 QString s(list[i]); |
101 s = s.mid(1, s.length() - 2); | 123 bool ok = false; |
124 | |
125 ColumnQualities qualities = m_columnQualities[i]; | |
126 | |
127 bool numeric = (qualities & ColumnNumeric); | |
128 bool integral = (qualities & ColumnIntegral); | |
129 bool increasing = (qualities & ColumnIncreasing); | |
130 bool large = (qualities & ColumnLarge); // this one defaults to off | |
131 | |
132 float value = 0.f; | |
133 | |
134 //!!! how to take into account headers? | |
135 | |
136 if (numeric) { | |
137 value = s.toFloat(&ok); | |
138 if (!ok) { | |
139 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok); | |
140 } | |
141 if (ok) { | |
142 if (lineno < 2 && value > 1000.f) large = true; | |
143 } else { | |
144 numeric = false; | |
145 } | |
146 } | |
147 | |
148 if (numeric) { | |
149 | |
150 if (integral) { | |
151 if (s.contains('.') || s.contains(',')) { | |
152 integral = false; | |
153 } | |
154 } | |
155 | |
156 if (increasing) { | |
157 if (lineno > 0 && value <= m_prevValues[i]) { | |
158 increasing = false; | |
159 } | |
160 } | |
161 | |
162 m_prevValues[i] = value; | |
163 } | |
164 | |
165 m_columnQualities[i] = | |
166 (numeric ? ColumnNumeric : 0) | | |
167 (integral ? ColumnIntegral : 0) | | |
168 (increasing ? ColumnIncreasing : 0) | | |
169 (large ? ColumnLarge : 0); | |
170 } | |
171 | |
172 if (lineno < 10) { | |
173 m_example.push_back(list); | |
174 if (lineno == 0 || cols > m_maxExampleCols) { | |
175 m_maxExampleCols = cols; | |
176 } | |
177 } | |
178 | |
179 std::cerr << "Estimated column qualities: "; | |
180 for (int i = 0; i < m_columnCount; ++i) { | |
181 std::cerr << int(m_columnQualities[i]) << " "; | |
182 } | |
183 std::cerr << std::endl; | |
184 } | |
185 | |
186 void | |
187 CSVFormat::guessPurposes() | |
188 { | |
189 while (m_columnPurposes.size() <= m_columnCount) { | |
190 m_columnPurposes.push_back(ColumnUnknown); | |
191 } | |
192 | |
193 m_timingType = CSVFormat::ImplicitTiming; | |
194 m_timeUnits = CSVFormat::TimeWindows; | |
195 | |
196 int timingColumnCount = 0; | |
197 | |
198 for (int i = 0; i < m_columnCount; ++i) { | |
199 | |
200 ColumnPurpose purpose = ColumnUnknown; | |
201 bool primary = (i == 0); | |
202 | |
203 ColumnQualities qualities = m_columnQualities[i]; | |
204 | |
205 bool numeric = (qualities & ColumnNumeric); | |
206 bool integral = (qualities & ColumnIntegral); | |
207 bool increasing = (qualities & ColumnIncreasing); | |
208 bool large = (qualities & ColumnLarge); | |
209 | |
210 bool timingColumn = (numeric && increasing); | |
211 | |
212 if (timingColumn) { | |
213 | |
214 ++timingColumnCount; | |
215 | |
216 if (primary) { | |
217 | |
218 purpose = ColumnStartTime; | |
219 | |
220 m_timingType = ExplicitTiming; | |
221 | |
222 if (integral && large) { | |
223 m_timeUnits = TimeAudioFrames; | |
102 } else { | 224 } else { |
103 float f = s.toFloat(&numeric); | 225 m_timeUnits = TimeSeconds; |
104 // std::cerr << "converted \"" << s.toStdString() << "\" to float, got " << f << " and success = " << numeric << std::endl; | 226 } |
105 } | 227 |
106 | 228 } else { |
107 tidyList.push_back(s); | 229 |
108 | 230 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) { |
109 if (lineno == 0 || (list.size() < itemCount)) { | 231 purpose = ColumnEndTime; |
110 itemCount = list.size(); | 232 m_durationType = EndTimes; |
111 } else { | 233 } |
112 if (itemCount != list.size()) { | 234 } |
113 variableItemCount = true; | 235 } |
114 } | 236 |
115 } | 237 if (purpose == ColumnUnknown) { |
116 | 238 if (numeric) { |
117 if (i == 0) { // primary | 239 purpose = ColumnValue; |
118 | 240 } else { |
119 if (numeric) { | 241 purpose = ColumnLabel; |
120 | 242 } |
121 float primary = s.toFloat(); | 243 } |
122 | 244 |
123 if (lineno > 0 && primary <= prevPrimary) { | 245 m_columnPurposes[i] = purpose; |
124 nonIncreasingPrimaries = true; | 246 } |
125 } | 247 |
126 | 248 int valueCount = 0; |
127 if (s.contains(".") || s.contains(",")) { | 249 for (int i = 0; i < m_columnCount; ++i) { |
128 floatPrimaries = true; | 250 if (m_columnPurposes[i] == ColumnValue) ++valueCount; |
129 } | 251 } |
130 | 252 |
131 prevPrimary = primary; | 253 if (valueCount == 0) { |
132 | 254 m_modelType = OneDimensionalModel; |
133 } else { | 255 } else if (valueCount == 1) { |
134 nonNumericPrimaries = true; | 256 m_modelType = TwoDimensionalModel; |
135 } | |
136 } else { // secondary | |
137 | |
138 if (!numeric) { | |
139 if (earliestNonNumericItem < 0 || | |
140 i < earliestNonNumericItem) { | |
141 earliestNonNumericItem = i; | |
142 } | |
143 } else if (i == 1) { | |
144 float secondary = s.toFloat(); | |
145 if (lineno > 0 && secondary <= prevSecondary) { | |
146 nonIncreasingSecondaries = true; | |
147 } | |
148 prevSecondary = secondary; | |
149 } | |
150 } | |
151 } | |
152 | |
153 if (lineno < 10) { | |
154 m_example.push_back(tidyList); | |
155 if (lineno == 0 || tidyList.size() > m_maxExampleCols) { | |
156 m_maxExampleCols = tidyList.size(); | |
157 } | |
158 } | |
159 | |
160 ++lineno; | |
161 | |
162 if (lineno == 50) break; | |
163 } | |
164 } | |
165 | |
166 if (nonNumericPrimaries || nonIncreasingPrimaries) { | |
167 | |
168 // Primaries are probably not a series of times | |
169 | |
170 m_timingType = CSVFormat::ImplicitTiming; | |
171 m_timeUnits = CSVFormat::TimeWindows; | |
172 | |
173 if (nonNumericPrimaries) { | |
174 m_modelType = CSVFormat::OneDimensionalModel; | |
175 } else if (itemCount == 1 || variableItemCount || | |
176 (earliestNonNumericItem != -1)) { | |
177 m_modelType = CSVFormat::TwoDimensionalModel; | |
178 } else { | |
179 m_modelType = CSVFormat::ThreeDimensionalModel; | |
180 } | |
181 | |
182 } else { | 257 } else { |
183 | 258 m_modelType = ThreeDimensionalModel; |
184 // Increasing numeric primaries -- likely to be time | 259 } |
185 | 260 |
186 m_timingType = CSVFormat::ExplicitTiming; | 261 std::cerr << "Estimated column purposes: "; |
187 | 262 for (int i = 0; i < m_columnCount; ++i) { |
188 if (floatPrimaries) { | 263 std::cerr << int(m_columnPurposes[i]) << " "; |
189 m_timeUnits = CSVFormat::TimeSeconds; | 264 } |
190 } else { | 265 std::cerr << std::endl; |
191 m_timeUnits = CSVFormat::TimeAudioFrames; | |
192 } | |
193 | |
194 if (itemCount == 1) { | |
195 m_modelType = CSVFormat::OneDimensionalModel; | |
196 } else if (variableItemCount || (earliestNonNumericItem != -1)) { | |
197 if (earliestNonNumericItem != -1 && earliestNonNumericItem < 2) { | |
198 m_modelType = CSVFormat::OneDimensionalModel; | |
199 } else { | |
200 m_modelType = CSVFormat::TwoDimensionalModel; | |
201 } | |
202 } else { | |
203 m_modelType = CSVFormat::ThreeDimensionalModel; | |
204 } | |
205 | |
206 if (nonIncreasingSecondaries) { | |
207 m_durationType = Durations; | |
208 } else { | |
209 m_durationType = EndTimes; | |
210 } | |
211 } | |
212 | 266 |
213 std::cerr << "Estimated model type: " << m_modelType << std::endl; | 267 std::cerr << "Estimated model type: " << m_modelType << std::endl; |
214 std::cerr << "Estimated timing type: " << m_timingType << std::endl; | 268 std::cerr << "Estimated timing type: " << m_timingType << std::endl; |
215 std::cerr << "Estimated duration type: " << m_durationType << std::endl; | 269 std::cerr << "Estimated duration type: " << m_durationType << std::endl; |
216 std::cerr << "Estimated units: " << m_timeUnits << std::endl; | 270 std::cerr << "Estimated units: " << m_timeUnits << std::endl; |
217 } | 271 } |
218 | 272 |
273 |