comparison data/fileio/CSVFileReader.cpp @ 631:3a5ee4b6c9ad

* Complete the overhaul of CSV file import; now you can pick the purpose for each column in the file, and SV should do the rest. The most significant practical improvement here is that we can now handle files in which time and duration do not necessarily appear in known columns.
author Chris Cannam
date Mon, 19 Jul 2010 17:08:56 +0000
parents 001db550bd48
children 611a4fa14dde
comparison
equal deleted inserted replaced
630:11a664058dd8 631:3a5ee4b6c9ad
15 15
16 #include "CSVFileReader.h" 16 #include "CSVFileReader.h"
17 17
18 #include "model/Model.h" 18 #include "model/Model.h"
19 #include "base/RealTime.h" 19 #include "base/RealTime.h"
20 #include "base/StringBits.h"
20 #include "model/SparseOneDimensionalModel.h" 21 #include "model/SparseOneDimensionalModel.h"
21 #include "model/SparseTimeValueModel.h" 22 #include "model/SparseTimeValueModel.h"
22 #include "model/EditableDenseThreeDimensionalModel.h" 23 #include "model/EditableDenseThreeDimensionalModel.h"
23 #include "model/RegionModel.h" 24 #include "model/RegionModel.h"
24 #include "DataFileReaderFactory.h" 25 #include "DataFileReaderFactory.h"
34 35
35 CSVFileReader::CSVFileReader(QString path, CSVFormat format, 36 CSVFileReader::CSVFileReader(QString path, CSVFormat format,
36 size_t mainModelSampleRate) : 37 size_t mainModelSampleRate) :
37 m_format(format), 38 m_format(format),
38 m_file(0), 39 m_file(0),
40 m_warnings(0),
39 m_mainModelSampleRate(mainModelSampleRate) 41 m_mainModelSampleRate(mainModelSampleRate)
40 { 42 {
41 m_file = new QFile(path); 43 m_file = new QFile(path);
42 bool good = false; 44 bool good = false;
43 45
76 CSVFileReader::getError() const 78 CSVFileReader::getError() const
77 { 79 {
78 return m_error; 80 return m_error;
79 } 81 }
80 82
83 size_t
84 CSVFileReader::convertTimeValue(QString s, int lineno, size_t sampleRate,
85 size_t windowSize) const
86 {
87 QRegExp nonNumericRx("[^0-9eE.,+-]");
88 unsigned int warnLimit = 10;
89
90 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
91
92 size_t calculatedFrame = 0;
93
94 bool ok = false;
95 QString numeric = s;
96 numeric.remove(nonNumericRx);
97
98 if (timeUnits == CSVFormat::TimeSeconds) {
99
100 double time = numeric.toDouble(&ok);
101 if (!ok) time = StringBits::stringToDoubleLocaleFree(numeric, &ok);
102 calculatedFrame = int(time * sampleRate + 0.5);
103
104 } else {
105
106 long n = numeric.toLong(&ok);
107 if (n >= 0) calculatedFrame = n;
108
109 if (timeUnits == CSVFormat::TimeWindows) {
110 calculatedFrame *= windowSize;
111 }
112 }
113
114 if (!ok) {
115 if (m_warnings < warnLimit) {
116 std::cerr << "WARNING: CSVFileReader::load: "
117 << "Bad time format (\"" << s.toStdString()
118 << "\") in data line "
119 << lineno+1 << std::endl;
120 } else if (m_warnings == warnLimit) {
121 std::cerr << "WARNING: Too many warnings" << std::endl;
122 }
123 ++m_warnings;
124 }
125
126 return calculatedFrame;
127 }
128
81 Model * 129 Model *
82 CSVFileReader::load() const 130 CSVFileReader::load() const
83 { 131 {
84 if (!m_file) return 0; 132 if (!m_file) return 0;
85 /*!!!
86 CSVFormatDialog *dialog = new CSVFormatDialog
87 (0, m_file, m_mainModelSampleRate);
88
89 if (dialog->exec() == QDialog::Rejected) {
90 delete dialog;
91 throw DataFileReaderFactory::ImportCancelled;
92 }
93 */
94 133
95 CSVFormat::ModelType modelType = m_format.getModelType(); 134 CSVFormat::ModelType modelType = m_format.getModelType();
96 CSVFormat::TimingType timingType = m_format.getTimingType(); 135 CSVFormat::TimingType timingType = m_format.getTimingType();
97 CSVFormat::DurationType durationType = m_format.getDurationType();
98 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits(); 136 CSVFormat::TimeUnits timeUnits = m_format.getTimeUnits();
99 QString separator = m_format.getSeparator();
100 QString::SplitBehavior behaviour = m_format.getSplitBehaviour();
101 size_t sampleRate = m_format.getSampleRate(); 137 size_t sampleRate = m_format.getSampleRate();
102 size_t windowSize = m_format.getWindowSize(); 138 size_t windowSize = m_format.getWindowSize();
139 QChar separator = m_format.getSeparator();
140 bool allowQuoting = m_format.getAllowQuoting();
103 141
104 if (timingType == CSVFormat::ExplicitTiming) { 142 if (timingType == CSVFormat::ExplicitTiming) {
105 if (modelType == CSVFormat::ThreeDimensionalModel) { 143 if (modelType == CSVFormat::ThreeDimensionalModel) {
106 // This will be overridden later if more than one line 144 // This will be overridden later if more than one line
107 // appears in our file, but we want to choose a default 145 // appears in our file, but we want to choose a default
129 167
130 float min = 0.0, max = 0.0; 168 float min = 0.0, max = 0.0;
131 169
132 size_t frameNo = 0; 170 size_t frameNo = 0;
133 size_t duration = 0; 171 size_t duration = 0;
172 size_t endFrame = 0;
173
174 bool haveAnyValue = false;
175 bool haveEndTime = false;
176
134 size_t startFrame = 0; // for calculation of dense model resolution 177 size_t startFrame = 0; // for calculation of dense model resolution
135 178 bool firstEverValue = true;
136 std::map<QString, float> labelValueMap; 179
137 float syntheticMax = 0.f; 180 std::map<QString, int> labelCountMap;
138 181
139 while (!in.atEnd()) { 182 while (!in.atEnd()) {
140 183
141 // QTextStream's readLine doesn't cope with old-style Mac 184 // QTextStream's readLine doesn't cope with old-style Mac
142 // CR-only line endings. Why did they bother making the class 185 // CR-only line endings. Why did they bother making the class
143 // cope with more than one sort of line ending, if it still 186 // cope with more than one sort of line ending, if it still
156 199
157 QString line = lines[li]; 200 QString line = lines[li];
158 201
159 if (line.startsWith("#")) continue; 202 if (line.startsWith("#")) continue;
160 203
161 QStringList list = line.split(separator, behaviour); 204 QStringList list = StringBits::split(line, separator, allowQuoting);
162
163 if (!model) { 205 if (!model) {
164 206
165 switch (modelType) { 207 switch (modelType) {
166 208
167 case CSVFormat::OneDimensionalModel: 209 case CSVFormat::OneDimensionalModel:
188 model = model3; 230 model = model3;
189 break; 231 break;
190 } 232 }
191 } 233 }
192 234
193 QStringList tidyList;
194 QRegExp nonNumericRx("[^0-9eE.,+-]");
195
196 float value = 0.f; 235 float value = 0.f;
236 QString label = "";
237
238 duration = 0.f;
239 haveEndTime = false;
197 240
198 for (int i = 0; i < list.size(); ++i) { 241 for (int i = 0; i < list.size(); ++i) {
199 242
200 QString s(list[i].trimmed()); 243 QString s = list[i];
201 244
202 if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) { 245 CSVFormat::ColumnPurpose purpose = m_format.getColumnPurpose(i);
203 s = s.mid(1, s.length() - 2); 246
204 } else if (s.length() >= 2 && s.startsWith("'") && s.endsWith("'")) { 247 switch (purpose) {
205 s = s.mid(1, s.length() - 2); 248
249 case CSVFormat::ColumnUnknown:
250 break;
251
252 case CSVFormat::ColumnStartTime:
253 frameNo = convertTimeValue(s, lineno, sampleRate, windowSize);
254 break;
255
256 case CSVFormat::ColumnEndTime:
257 endFrame = convertTimeValue(s, lineno, sampleRate, windowSize);
258 haveEndTime = true;
259 break;
260
261 case CSVFormat::ColumnDuration:
262 duration = convertTimeValue(s, lineno, sampleRate, windowSize);
263 break;
264
265 case CSVFormat::ColumnValue:
266 value = s.toFloat();
267 haveAnyValue = true;
268 break;
269
270 case CSVFormat::ColumnLabel:
271 label = s;
272 ++labelCountMap[label];
273 break;
206 } 274 }
207 275 }
208 if (timingType == CSVFormat::ExplicitTiming) { 276
209 277 if (haveEndTime) { // ... calculate duration now all cols read
210 size_t calculatedFrame = 0; 278 if (endFrame > frameNo) {
211 279 duration = endFrame - frameNo;
212 if (i == 0 ||
213 (i == 1 &&
214 modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
215
216 bool ok = false;
217 QString numeric = s;
218 numeric.remove(nonNumericRx);
219
220 if (timeUnits == CSVFormat::TimeSeconds) {
221
222 double time = numeric.toDouble(&ok);
223 calculatedFrame = int(time * sampleRate + 0.5);
224
225 } else {
226
227 calculatedFrame = numeric.toInt(&ok);
228
229 if (timeUnits == CSVFormat::TimeWindows) {
230 calculatedFrame *= windowSize;
231 }
232 }
233
234 if (!ok) {
235 if (warnings < warnLimit) {
236 std::cerr << "WARNING: CSVFileReader::load: "
237 << "Bad time format (\"" << s.toStdString()
238 << "\") in data line "
239 << lineno+1 << ":" << std::endl;
240 std::cerr << line.toStdString() << std::endl;
241 } else if (warnings == warnLimit) {
242 std::cerr << "WARNING: Too many warnings" << std::endl;
243 }
244 ++warnings;
245 }
246
247 if (i == 0) frameNo = calculatedFrame;
248 else {
249 if (durationType == CSVFormat::EndTimes) {
250 duration = calculatedFrame - frameNo;
251 } else {
252 duration = calculatedFrame;
253 }
254 }
255
256 continue;
257 }
258 } 280 }
259
260 if ((i == 1 &&
261 modelType == CSVFormat::TwoDimensionalModel) ||
262 (i == 2 &&
263 modelType == CSVFormat::TwoDimensionalModelWithDuration)) {
264 bool ok = false;
265 value = s.toFloat(&ok);
266 if (!ok) {
267 // cf. RDFImporter::fillModel
268 if (labelValueMap.find(s) == labelValueMap.end()) {
269 syntheticMax = syntheticMax + 1.f;
270 labelValueMap[s] = syntheticMax;
271 }
272 value = labelValueMap[s];
273 } else {
274 if (value > syntheticMax) syntheticMax = value;
275 }
276 if (i + 1 == list.size()) {
277 // keep text around for use as label (none other given)
278 tidyList.push_back(s);
279 }
280 continue;
281 }
282
283 tidyList.push_back(s);
284 } 281 }
285 282
286 if (modelType == CSVFormat::OneDimensionalModel) { 283 if (modelType == CSVFormat::OneDimensionalModel) {
287 284
288 SparseOneDimensionalModel::Point point 285 SparseOneDimensionalModel::Point point(frameNo, label);
289 (frameNo,
290 tidyList.size() > 0 ? tidyList[tidyList.size()-1] :
291 QString("%1").arg(lineno+1));
292
293 model1->addPoint(point); 286 model1->addPoint(point);
294 287
295 } else if (modelType == CSVFormat::TwoDimensionalModel) { 288 } else if (modelType == CSVFormat::TwoDimensionalModel) {
296 289
297 SparseTimeValueModel::Point point 290 SparseTimeValueModel::Point point(frameNo, value, label);
298 (frameNo,
299 value,
300 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
301
302 model2->addPoint(point); 291 model2->addPoint(point);
303 292
304 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) { 293 } else if (modelType == CSVFormat::TwoDimensionalModelWithDuration) {
305 294
306 RegionModel::Point point 295 RegionModel::Point point(frameNo, value, duration, label);
307 (frameNo,
308 value,
309 duration,
310 tidyList.size() > 0 ? tidyList[0] : QString("%1").arg(lineno+1));
311
312 model2a->addPoint(point); 296 model2a->addPoint(point);
313 297
314 } else if (modelType == CSVFormat::ThreeDimensionalModel) { 298 } else if (modelType == CSVFormat::ThreeDimensionalModel) {
315 299
316 DenseThreeDimensionalModel::Column values; 300 DenseThreeDimensionalModel::Column values;
317 301
318 for (int i = 0; i < tidyList.size(); ++i) { 302 for (int i = 0; i < list.size(); ++i) {
319 303
320 bool ok = false; 304 bool ok = false;
321 float value = list[i].toFloat(&ok); 305 float value = list[i].toFloat(&ok);
322 306
323 if (i > 0 || timingType != CSVFormat::ExplicitTiming) { 307 if (m_format.getColumnPurpose(i) == CSVFormat::ColumnValue) {
324 values.push_back(value); 308 values.push_back(value);
325 } 309 }
326 310
327 bool firstEver = (lineno == 0 && i == 0); 311 if (firstEverValue || value < min) min = value;
328 312 if (firstEverValue || value > max) max = value;
329 if (firstEver || value < min) min = value; 313
330 if (firstEver || value > max) max = value; 314 if (firstEverValue) {
331
332 if (firstEver) {
333 startFrame = frameNo; 315 startFrame = frameNo;
334 model3->setStartFrame(startFrame); 316 model3->setStartFrame(startFrame);
335 } else if (lineno == 1 && 317 } else if (lineno == 1 &&
336 timingType == CSVFormat::ExplicitTiming) { 318 timingType == CSVFormat::ExplicitTiming) {
337 model3->setResolution(frameNo - startFrame); 319 model3->setResolution(frameNo - startFrame);
338 } 320 }
321
322 firstEverValue = false;
339 323
340 if (!ok) { 324 if (!ok) {
341 if (warnings < warnLimit) { 325 if (warnings < warnLimit) {
342 std::cerr << "WARNING: CSVFileReader::load: " 326 std::cerr << "WARNING: CSVFileReader::load: "
343 << "Non-numeric value \"" 327 << "Non-numeric value \""
364 frameNo += windowSize; 348 frameNo += windowSize;
365 } 349 }
366 } 350 }
367 } 351 }
368 352
353 if (!haveAnyValue) {
354 if (model2a) {
355 // assign values for regions based on label frequency; we
356 // have this in our labelCountMap, sort of
357
358 std::map<int, std::map<QString, float> > countLabelValueMap;
359 for (std::map<QString, int>::iterator i = labelCountMap.begin();
360 i != labelCountMap.end(); ++i) {
361 countLabelValueMap[i->second][i->first] = 0.f;
362 }
363
364 float v = 0.f;
365 for (std::map<int, std::map<QString, float> >::iterator i =
366 countLabelValueMap.end(); i != countLabelValueMap.begin(); ) {
367 --i;
368 for (std::map<QString, float>::iterator j = i->second.begin();
369 j != i->second.end(); ++j) {
370 j->second = v;
371 v = v + 1.f;
372 }
373 }
374
375 std::map<RegionModel::Point, RegionModel::Point,
376 RegionModel::Point::Comparator> pointMap;
377 for (RegionModel::PointList::const_iterator i =
378 model2a->getPoints().begin();
379 i != model2a->getPoints().end(); ++i) {
380 RegionModel::Point p(*i);
381 v = countLabelValueMap[labelCountMap[p.label]][p.label];
382 RegionModel::Point pp(p.frame, v, p.duration, p.label);
383 pointMap[p] = pp;
384 }
385
386 for (std::map<RegionModel::Point, RegionModel::Point>::iterator i =
387 pointMap.begin(); i != pointMap.end(); ++i) {
388 model2a->deletePoint(i->first);
389 model2a->addPoint(i->second);
390 }
391 }
392 }
393
369 if (modelType == CSVFormat::ThreeDimensionalModel) { 394 if (modelType == CSVFormat::ThreeDimensionalModel) {
370 model3->setMinimumLevel(min); 395 model3->setMinimumLevel(min);
371 model3->setMaximumLevel(max); 396 model3->setMaximumLevel(max);
372 } 397 }
373 398