Mercurial > hg > svcore
comparison data/fileio/CSVFileReader.cpp @ 631:3a5ee4b6c9ad
* Complete the overhaul of CSV file import; now you can pick the purpose for
each column in the file, and SV (Sonic Visualiser) should do the rest. The
most significant practical improvement here is that we can now handle files
in which time and duration do not necessarily appear in known columns.
author | Chris Cannam |
---|---|
date | Mon, 19 Jul 2010 17:08:56 +0000 |
parents | 001db550bd48 |
children | 611a4fa14dde |
comparison
equal
deleted
inserted
replaced
630:11a664058dd8 | 631:3a5ee4b6c9ad |
---|---|
15 | 15 |
16 #include "CSVFileReader.h" | 16 #include "CSVFileReader.h" |
17 | 17 |
18 #include "model/Model.h" | 18 #include "model/Model.h" |
19 #include "base/RealTime.h" | 19 #include "base/RealTime.h" |
20 #include "base/StringBits.h" | |
20 #include "model/SparseOneDimensionalModel.h" | 21 #include "model/SparseOneDimensionalModel.h" |
21 #include "model/SparseTimeValueModel.h" | 22 #include "model/SparseTimeValueModel.h" |
22 #include "model/EditableDenseThreeDimensionalModel.h" | 23 #include "model/EditableDenseThreeDimensionalModel.h" |
23 #include "model/RegionModel.h" | 24 #include "model/RegionModel.h" |
24 #include "DataFileReaderFactory.h" | 25 #include "DataFileReaderFactory.h" |
34 | 35 |
35 CSVFileReader::CSVFileReader(QString path, CSVFormat format, | 36 CSVFileReader::CSVFileReader(QString path, CSVFormat format, |
36 size_t mainModelSampleRate) : | 37 size_t mainModelSampleRate) : |
37 m_format(format), | 38 m_format(format), |
38 m_file(0), | 39 m_file(0), |
40 m_warnings(0), | |
39 m_mainModelSampleRate(mainModelSampleRate) | 41 m_mainModelSampleRate(mainModelSampleRate) |
40 { | 42 { |
41 m_file = new QFile(path); | 43 m_file = new QFile(path); |
42 bool good = false; | 44 bool good = false; |
43 | 45 |
76 CSVFileReader::getError() const | 78 CSVFileReader::getError() const |
77 { | 79 { |
78 return m_error; | 80 return m_error; |
79 } | 81 } |
80 | 82 |
83 size_t | |
84 CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate, | |
85 size_t windowSize) const | |
86 { | |
87 QRegExp nonNumericRx("[^0-9eE.,+-]"); | |
88 unsigned int warnLimit = 10; | |
89 | |
90 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); | |
91 | |
92 size_t calculatedFrame = 0; | |
93 | |
94 bool ok = false; | |
95 QString numeric = s; | |
96 numeric.remove(nonNumericRx); | |
97 | |
98 if (timeUnits == CSVFormat::TimeSeconds) { | |
99 | |
100 double time = numeric.toDouble(&ok); | |
101 if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok); | |
102 calculatedFrame = int(time * sampleRate + 0.5); | |
103 | |
104 } else { | |
105 | |
106 long n = numeric.toLong(&ok); | |
107 if (n >= 0) calculatedFrame = n; | |
108 | |
109 if (timeUnits == CSVFormat::TimeWindows) { | |
110 calculatedFrame *= windowSize; | |
111 } | |
112 } | |
113 | |
114 if (!ok) { | |
115 if (m_warnings < warnLimit) { | |
116 std::cerr << "WARNING: CSVFileReader::load: " | |
117 << "Bad time format (\"" << s.toStdString() | |
118 << "\") in data line " | |
119 << lineno+1 << std::endl; | |
120 } else if (m_warnings == warnLimit) { | |
121 std::cerr << "WARNING: Too many warnings" << std::endl; | |
122 } | |
123 ++m_warnings; | |
124 } | |
125 | |
126 return calculatedFrame; | |
127 } | |
128 | |
81 Model * | 129 Model * |
82 CSVFileReader::load() const | 130 CSVFileReader::load() const |
83 { | 131 { |
84 if (!m_file) return 0; | 132 if (!m_file) return 0; |
85 /*!!! | |
86 CSVFormatDialog *dialog = new CSVFormatDialog | |
87 (0, m_file, m_mainModelSampleRate); | |
88 | |
89 if (dialog->exec() == QDialog::Rejected) { | |
90 delete dialog; | |
91 throw DataFileReaderFactory::ImportCancelled; | |
92 } | |
93 */ | |
94 | 133 |
95 CSVFormat::ModelType modelType = m_format.getModelType(); | 134 CSVFormat::ModelType modelType = m_format.getModelType(); |
96 CSVFormat::TimingType timingType = m_format.getTimingType(); | 135 CSVFormat::TimingType timingType = m_format.getTimingType(); |
97 CSVFormat::DurationType durationType = m_format.getDurationType(); | |
98 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); | 136 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); |
99 QString separator = m_format.getSeparator(); | |
100 QString::SplitBehavior behaviour = m_format.getSplitBehaviour(); | |
101 size_t sampleRate = m_format.getSampleRate(); | 137 size_t sampleRate = m_format.getSampleRate(); |
102 size_t windowSize = m_format.getWindowSize(); | 138 size_t windowSize = m_format.getWindowSize(); |
139 QChar separator = m_format.getSeparator(); | |
140 bool allowQuoting = m_format.getAllowQuoting(); | |
103 | 141 |
104 if (timingType == CSVFormat::ExplicitTiming) { | 142 if (timingType == CSVFormat::ExplicitTiming) { |
105 if (modelType == CSVFormat::ThreeDimensionalModel) { | 143 if (modelType == CSVFormat::ThreeDimensionalModel) { |
106 // This will be overridden later if more than one line | 144 // This will be overridden later if more than one line |
107 // appears in our file, but we want to choose a default | 145 // appears in our file, but we want to choose a default |
129 | 167 |
130 float min = 0.0, max = 0.0; | 168 float min = 0.0, max = 0.0; |
131 | 169 |
132 size_t frameNo = 0; | 170 size_t frameNo = 0; |
133 size_t duration = 0; | 171 size_t duration = 0; |
172 size_t endFrame = 0; | |
173 | |
174 bool haveAnyValue = false; | |
175 bool haveEndTime = false; | |
176 | |
134 size_t startFrame = 0; // for calculation of dense model resolution | 177 size_t startFrame = 0; // for calculation of dense model resolution |
135 | 178 bool firstEverValue = true; |
136 std::map<QString, float> labelValueMap; | 179 |
137 float syntheticMax = 0.f; | 180 std::map<QString, int> labelCountMap; |
138 | 181 |
139 while (!in.atEnd()) { | 182 while (!in.atEnd()) { |
140 | 183 |
141 // QTextStream's readLine doesn't cope with old-style Mac | 184 // QTextStream's readLine doesn't cope with old-style Mac |
142 // CR-only line endings. Why did they bother making the class | 185 // CR-only line endings. Why did they bother making the class |
143 // cope with more than one sort of line ending, if it still | 186 // cope with more than one sort of line ending, if it still |
156 | 199 |
157 QString line = lines[li]; | 200 QString line = lines[li]; |
158 | 201 |
159 if (line.startsWith("#")) continue; | 202 if (line.startsWith("#")) continue; |
160 | 203 |
161 QStringList list = line.split(separator, behaviour); | 204 QStringList list = StringBits::split(line, separator, allowQuoting); |
162 | |
163 if (!model) { | 205 if (!model) { |
164 | 206 |
165 switch (modelType) { | 207 switch (modelType) { |
166 | 208 |
167 case CSVFormat::OneDimensionalModel: | 209 case CSVFormat::OneDimensionalModel: |
188 model = model3; | 230 model = model3; |
189 break; | 231 break; |
190 } | 232 } |
191 } | 233 } |
192 | 234 |
193 QStringList tidyList; | |
194 QRegExp nonNumericRx("[^0-9eE.,+-]"); | |
195 | |
196 float value = 0.f; | 235 float value = 0.f; |
236 QString label = ""; | |
237 | |
238 duration = 0.f; | |
239 haveEndTime = false; | |
197 | 240 |
198 for (int i = 0; i < list.size(); ++i) { | 241 for (int i = 0; i < list.size(); ++i) { |
199 | 242 |
200 QString s(list[i].trimmed()); | 243 QString s = list[i]; |
201 | 244 |
202 if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { | 245 CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i); |
203 s = s.mid(1, s.length() - 2); | 246 |
204 } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { | 247 switch (purpose) { |
205 s = s.mid(1, s.length() - 2); | 248 |
249 case CSVFormat::ColumnUnknown: | |
250 break; | |
251 | |
252 case CSVFormat::ColumnStartTime: | |
253 frameNo = convertTimeValue(s, lineno, sampleRate, windowSize); | |
254 break; | |
255 | |
256 case CSVFormat::ColumnEndTime: | |
257 endFrame = convertTimeValue(s, lineno, sampleRate, windowSize); | |
258 haveEndTime = true; | |
259 break; | |
260 | |
261 case CSVFormat::ColumnDuration: | |
262 duration = convertTimeValue(s, lineno, sampleRate, windowSize); | |
263 break; | |
264 | |
265 case CSVFormat::ColumnValue: | |
266 value = s.toFloat(); | |
267 haveAnyValue = true; | |
268 break; | |
269 | |
270 case CSVFormat::ColumnLabel: | |
271 label = s; | |
272 ++labelCountMap[label]; | |
273 break; | |
206 } | 274 } |
207 | 275 } |
208 if (timingType == CSVFormat::ExplicitTiming) { | 276 |
209 | 277 if (haveEndTime) { // ... calculate duration now all cols read |
210 size_t calculatedFrame = 0; | 278 if (endFrame > frameNo) { |
211 | 279 duration = endFrame - frameNo; |
212 if (i == 0 || | |
213 (i == 1 && | |
214 modelType == CSVFormat::TwoDimensionalModelWithDuration)) { | |
215 | |
216 bool ok = false; | |
217 QString numeric = s; | |
218 numeric.remove(nonNumericRx); | |
219 | |
220 if (timeUnits == CSVFormat::TimeSeconds) { | |
221 | |
222 double time = numeric.toDouble(&ok); | |
223 calculatedFrame = int(time * sampleRate + 0.5); | |
224 | |
225 } else { | |
226 | |
227 calculatedFrame = numeric.toInt(&ok); | |
228 | |
229 if (timeUnits == CSVFormat::TimeWindows) { | |
230 calculatedFrame *= windowSize; | |
231 } | |
232 } | |
233 | |
234 if (!ok) { | |
235 if (warnings < warnLimit) { | |
236 std::cerr << "WARNING: CSVFileReader::load: " | |
237 << "Bad time format (\"" << s.toStdString() | |
238 << "\") in data line " | |
239 << lineno+1 << ":" << std::endl; | |
240 std::cerr << line.toStdString() << std::endl; | |
241 } else if (warnings == warnLimit) { | |
242 std::cerr << "WARNING: Too many warnings" << std::endl; | |
243 } | |
244 ++warnings; | |
245 } | |
246 | |
247 if (i == 0) frameNo = calculatedFrame; | |
248 else { | |
249 if (durationType == CSVFormat::EndTimes) { | |
250 duration = calculatedFrame - frameNo; | |
251 } else { | |
252 duration = calculatedFrame; | |
253 } | |
254 } | |
255 | |
256 continue; | |
257 } | |
258 } | 280 } |
259 | |
260 if ((i == 1 && | |
261 modelType == CSVFormat::TwoDimensionalModel) || | |
262 (i == 2 && | |
263 modelType == CSVFormat::TwoDimensionalModelWithDuration)) { | |
264 bool ok = false; | |
265 value = s.toFloat(&ok); | |
266 if (!ok) { | |
267 // cf. RDFImporter::fillModel | |
268 if (labelValueMap.find(s) == labelValueMap.end()) { | |
269 syntheticMax = syntheticMax + 1.f; | |
270 labelValueMap[s] = syntheticMax; | |
271 } | |
272 value = labelValueMap[s]; | |
273 } else { | |
274 if (value > syntheticMax) syntheticMax = value; | |
275 } | |
276 if (i + 1 == list.size()) { | |
277 // keep text around for use as label (none other given) | |
278 tidyList.push_back(s); | |
279 } | |
280 continue; | |
281 } | |
282 | |
283 tidyList.push_back(s); | |
284 } | 281 } |
285 | 282 |
286 if (modelType == CSVFormat::OneDimensionalModel) { | 283 if (modelType == CSVFormat::OneDimensionalModel) { |
287 | 284 |
288 SparseOneDimensionalModel::Point point | 285 SparseOneDimensionalModel::Point point(frameNo, label); |
289 (frameNo, | |
290 tidyList.size() > 0 ? tidyList[tidyList.size()-1] : | |
291 QString("%1").arg(lineno+1)); | |
292 | |
293 model1->addPoint(point); | 286 model1->addPoint(point); |
294 | 287 |
295 } else if (modelType == CSVFormat::TwoDimensionalModel) { | 288 } else if (modelType == CSVFormat::TwoDimensionalModel) { |
296 | 289 |
297 SparseTimeValueModel::Point point | 290 SparseTimeValueModel::Point point(frameNo, value, label); |
298 (frameNo, | |
299 value, | |
300 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); | |
301 | |
302 model2->addPoint(point); | 291 model2->addPoint(point); |
303 | 292 |
304 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) { | 293 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) { |
305 | 294 |
306 RegionModel::Point point | 295 RegionModel::Point point(frameNo, value, duration, label); |
307 (frameNo, | |
308 value, | |
309 duration, | |
310 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1)); | |
311 | |
312 model2a->addPoint(point); | 296 model2a->addPoint(point); |
313 | 297 |
314 } else if (modelType == CSVFormat::ThreeDimensionalModel) { | 298 } else if (modelType == CSVFormat::ThreeDimensionalModel) { |
315 | 299 |
316 DenseThreeDimensionalModel::Column values; | 300 DenseThreeDimensionalModel::Column values; |
317 | 301 |
318 for (int i = 0; i < tidyList.size(); ++i) { | 302 for (int i = 0; i < list.size(); ++i) { |
319 | 303 |
320 bool ok = false; | 304 bool ok = false; |
321 float value = list[i].toFloat(&ok); | 305 float value = list[i].toFloat(&ok); |
322 | 306 |
323 if (i > 0 || timingType != CSVFormat::ExplicitTiming) { | 307 if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) { |
324 values.push_back(value); | 308 values.push_back(value); |
325 } | 309 } |
326 | 310 |
327 bool firstEver = (lineno == 0 && i == 0); | 311 if (firstEverValue || value < min) min = value; |
328 | 312 if (firstEverValue || value > max) max = value; |
329 if (firstEver || value < min) min = value; | 313 |
330 if (firstEver || value > max) max = value; | 314 if (firstEverValue) { |
331 | |
332 if (firstEver) { | |
333 startFrame = frameNo; | 315 startFrame = frameNo; |
334 model3->setStartFrame(startFrame); | 316 model3->setStartFrame(startFrame); |
335 } else if (lineno == 1 && | 317 } else if (lineno == 1 && |
336 timingType == CSVFormat::ExplicitTiming) { | 318 timingType == CSVFormat::ExplicitTiming) { |
337 model3->setResolution(frameNo - startFrame); | 319 model3->setResolution(frameNo - startFrame); |
338 } | 320 } |
321 | |
322 firstEverValue = false; | |
339 | 323 |
340 if (!ok) { | 324 if (!ok) { |
341 if (warnings < warnLimit) { | 325 if (warnings < warnLimit) { |
342 std::cerr << "WARNING: CSVFileReader::load: " | 326 std::cerr << "WARNING: CSVFileReader::load: " |
343 << "Non-numeric value \"" | 327 << "Non-numeric value \"" |
364 frameNo += windowSize; | 348 frameNo += windowSize; |
365 } | 349 } |
366 } | 350 } |
367 } | 351 } |
368 | 352 |
353 if (!haveAnyValue) { | |
354 if (model2a) { | |
355 // assign values for regions based on label frequency; we | |
356 // have this in our labelCountMap, sort of | |
357 | |
358 std::map<int, std::map<QString, float> > countLabelValueMap; | |
359 for (std::map<QString, int>::iterator i = labelCountMap.begin(); | |
360 i != labelCountMap.end(); ++i) { | |
361 countLabelValueMap[i->second][i->first] = 0.f; | |
362 } | |
363 | |
364 float v = 0.f; | |
365 for (std::map<int, std::map<QString, float> >::iterator i = | |
366 countLabelValueMap.end(); i != countLabelValueMap.begin(); ) { | |
367 --i; | |
368 for (std::map<QString, float>::iterator j = i->second.begin(); | |
369 j != i->second.end(); ++j) { | |
370 j->second = v; | |
371 v = v + 1.f; | |
372 } | |
373 } | |
374 | |
375 std::map<RegionModel::Point, RegionModel::Point, | |
376 RegionModel::Point::Comparator> pointMap; | |
377 for (RegionModel::PointList::const_iterator i = | |
378 model2a->getPoints().begin(); | |
379 i != model2a->getPoints().end(); ++i) { | |
380 RegionModel::Point p(*i); | |
381 v = countLabelValueMap[labelCountMap[p.label]][p.label]; | |
382 RegionModel::Point pp(p.frame, v, p.duration, p.label); | |
383 pointMap[p] = pp; | |
384 } | |
385 | |
386 for (std::map<RegionModel::Point, RegionModel::Point>::iterator i = | |
387 pointMap.begin(); i != pointMap.end(); ++i) { | |
388 model2a->deletePoint(i->first); | |
389 model2a->addPoint(i->second); | |
390 } | |
391 } | |
392 } | |
393 | |
369 if (modelType == CSVFormat::ThreeDimensionalModel) { | 394 if (modelType == CSVFormat::ThreeDimensionalModel) { |
370 model3->setMinimumLevel(min); | 395 model3->setMinimumLevel(min); |
371 model3->setMaximumLevel(max); | 396 model3->setMaximumLevel(max); |
372 } | 397 } |
373 | 398 |