Mercurial > hg > camir-aes2014
comparison core/tools/csv2cell.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
function data = csv2cell(varargin)
% CSV2CELL - parses a Windows CSV file into an NxM cell array, where N is
% the number of lines in the CSV text and M is the number of fields in the
% longest line of the CSV file. Lines are delimited by carriage returns
% and/or newlines.
%
% Usage:
%   data = csv2cell(text)                 % parse CSV text held in a string
%   data = csv2cell(text, 'text')         % same as above, explicit
%   data = csv2cell(filename, 'fromfile') % read the file, then parse it
%
% A Windows CSV file format allows for commas (,) and double quotes (") to
% be contained within fields of the CSV file. Regular fields are just text
% separated by commas (e.g. foo,bar,hello world). Fields containing commas
% or double quotes are surrounded by double quotes (e.g.
% foo,bar,"item1,item2,item3",hello world). In the previous example,
% "item1,item2,item3" is one field in the CSV text. For double quotes to be
% represented, they are written in pairs in the file, and contained within
% a quoted field, (e.g. foo,"this field contains ""quotes""",bar). Spaces
% within fields (even leading and trailing) are preserved.
%
% All fields from the CSV file are returned as strings. If the CSV text
% contains lines with different numbers of fields, then the "missing"
% fields will appear as empty arrays, [], in the returned data. You can
% easily convert the data you expect to be numeric using str2num() and
% num2cell().
%
% Examples:
%   >> csv2cell('foo.csv','fromfile') % loads and parses entire file
%   >> csv2cell(',,,')                % returns cell array {'','','',''}
%   >> csv2cell(',,,','text')         % same as above, declaring text input
%   >> csv2cell(sprintf('%s\r\n%s',...
%        '"Ten Thousand",10000,,"10,000","""It''s ""10 Grand"", baby",10k',...
%        ',foo,bar,soo'))
%   ans =
%       'Ten Thousand'    '10000'    ''       '10,000'    [1x22 char]    '10k'
%       ''                'foo'      'bar'    'soo'                []       []
%   >> % note the two empty [] cells, because the second line has two fewer
%   >> % fields than the first. The empty field '' at the beginning of the
%   >> % second line is due to the leading comma on that line, which is
%   >> % correct behavior. A trailing comma will do the same to the end of a
%   >> % line.
%
% Errors:
%   * Raised when called with anything other than 1 or 2 arguments.
%   * Raised when the 2nd argument is neither 'fromfile' nor 'text'.
%   * Raised when 'fromfile' is requested and the file cannot be opened.
%
% Limitations/Exceptions:
%   * This code is untested on large files. It may take a long time due to
%     variables growing inside loops (yes, poor practice, but easy coding).
%   * This code has been minimally tested to work with a variety of weird
%     Excel files that I have.
%   * Behavior with improperly formatted CSV files is best-effort: parser
%     state is reset at every line, and any text still pending at the end
%     of a line (e.g. an unterminated quoted field) is kept as that line's
%     last field.
%   * Technically, CSV files from Excel always separate lines with the pair
%     of characters \r\n. This parser will also separate lines that have only
%     \r or \n as line terminators.
%   * Line separation is the first operation, so newlines or carriage
%     returns inside quoted fields are NOT supported: such a field is split
%     across rows of the output.
%
% Copyright 2008 Arthur Hebert

% Process arguments
if nargin == 1
    csvText = varargin{1};
elseif nargin == 2
    switch varargin{2}
        case 'fromfile'
            filename = varargin{1};
            [fid, openMsg] = fopen(filename);
            if fid == -1
                % fopen signals failure with -1; report it here instead of
                % letting fread fail with an invalid-identifier error.
                error('CSV2CELL could not open file %s: %s', filename, openMsg)
            end
            csvText = char(fread(fid))';
            fclose(fid);
        case 'text'
            csvText = varargin{1};
        otherwise
            error('Invalid 2nd argument %s. Valid options are ''fromfile'' and ''text''',varargin{2})
    end
else
    error('CSV2CELL requires 1 or 2 arguments.')
end

% First split it into lines (\r\n is tried first so that it counts as a
% single terminator rather than two).
rows = regexp(csvText,'(\r\n|[\r\n])','split');

data = {};
for lineNumber = 1:length(rows)
    thisLine = rows{lineNumber};
    nChars = length(thisLine); % number of characters in this line

    % Parser state is reset for every line. Each line starts "in" its first
    % (raw) field, so a leading comma always yields an empty first field,
    % and an unterminated quote on one line cannot affect the next line.
    inField = true;         % false only between a closing quote and the next comma
    inQuoteField = false;   % true while inside a "..." field
    skipNext = false;       % set when the next char is the 2nd half of a "" pair
    field = '';
    fieldNumber = 1;

    for charNumber = 1:nChars
        if skipNext
            skipNext = false;
            continue
        end
        thisChar = thisLine(charNumber);

        if thisChar == ','
            if inQuoteField
                % this comma is part of the quoted field
                field(end+1) = thisChar; %#ok<AGROW>
            elseif inField
                % this comma is the delimiter marking the end of the field
                data{lineNumber,fieldNumber} = field; %#ok<AGROW>
                field = '';
                fieldNumber = fieldNumber + 1;
            else
                % comma following a closing quote: the quoted field was
                % already stored, so this only opens the next field
                inField = true;
            end
            if charNumber == nChars && ~inQuoteField
                % hanging comma: the last field of the line is blank
                data{lineNumber,fieldNumber} = ''; %#ok<AGROW>
                fieldNumber = fieldNumber + 1;
            end

        elseif thisChar == '"'
            if ~inQuoteField
                % opening quote of a quoted field
                inField = true;
                inQuoteField = true;
            elseif charNumber < nChars && thisLine(charNumber+1) == '"'
                % a pair of quotes inside a quoted field is one literal quote
                field(end+1) = '"'; %#ok<AGROW>
                skipNext = true;
            else
                % closing quote (either last character of the line or
                % followed by something other than another quote)
                data{lineNumber,fieldNumber} = field; %#ok<AGROW>
                field = '';
                inField = false;
                inQuoteField = false;
                fieldNumber = fieldNumber + 1;
            end

        else
            % any other character is added to the current field
            field(end+1) = thisChar; %#ok<AGROW>
            if charNumber == nChars
                data{lineNumber,fieldNumber} = field; %#ok<AGROW>
                field = '';
                fieldNumber = fieldNumber + 1;
            end
        end
    end

    % Only malformed lines leave text pending here (e.g. a quoted field that
    % is never closed). Keep it rather than leaking it into the next line.
    if ~isempty(field)
        data{lineNumber,fieldNumber} = field; %#ok<AGROW>
    end
end
152 |