wolffd@0
|
1 function data = csv2cell(varargin)
|
wolffd@0
|
2 % CSV2CELL - parses a Windows CSV file into an NxM cell array, where N is
|
wolffd@0
|
3 % the number of lines in the CSV text and M is the number of fields in the
|
wolffd@0
|
4 % longest line of the CSV file. Lines are delimited by carriage returns
|
wolffd@0
|
5 % and/or newlines.
|
wolffd@0
|
6 %
|
wolffd@0
|
7 % A Windows CSV file format allows for commas (,) and double quotes (") to
|
wolffd@0
|
8 % be contained within fields of the CSV file. Regular fields are just text
|
wolffd@0
|
9 % separated by commas (e.g. foo,bar,hello world). Fields containing commas
|
wolffd@0
|
10 % or double quotes are surrounded by double quotes (e.g.
|
wolffd@0
|
11 % foo,bar,"item1,item2,item3",hello world). In the previous example,
|
wolffd@0
|
12 % "item1,item2,item3" is one field in the CSV text. For double quotes to be
|
wolffd@0
|
13 % represented, they are written in pairs in the file, and contained within
|
wolffd@0
|
14 % a quoted field, (e.g. foo,"this field contains ""quotes""",bar). Spaces
|
wolffd@0
|
15 % within fields (even leading and trailing) are preserved.
|
wolffd@0
|
16 %
|
wolffd@0
|
17 % All fields from the CSV file are returned as strings. If the CSV text
|
wolffd@0
|
18 % contains lines with different numbers of fields, then the "missing"
|
wolffd@0
|
19 % fields with appear as empty arrays, [], in the returned data. You can
|
wolffd@0
|
20 % easily convert the data you expect to be numeric using str2num() and
|
wolffd@0
|
21 % num2cell().
|
wolffd@0
|
22 %
|
wolffd@0
|
23 % Examples:
|
wolffd@0
|
24 % >> csv2cell('foo.csv','fromfile') % loads and parses entire file
|
wolffd@0
|
25 % >> csv2cell(',,,') % returns cell array {'','','',''}
|
wolffd@0
|
26 % >> csv2cell(',,,','text') % same as above, declaring text input
|
wolffd@0
|
27 % >> csv2cell(sprintf('%s\r\n%s',...
|
wolffd@0
|
28 % '"Ten Thousand",10000,,"10,000","""It''s ""10 Grand"", baby",10k',...
|
wolffd@0
|
29 % ',foo,bar,soo'))
|
wolffd@0
|
30 % ans =
|
wolffd@0
|
31 % 'Ten Thousand' '10000' '' '10,000' [1x22 char] '10k'
|
wolffd@0
|
32 % '' 'foo' 'bar' 'soo' [] []
|
wolffd@0
|
33 % >> % note the two empty [] cells, because the second line has two fewer
|
wolffd@0
|
34 % >> % fields than the first. The empty field '' at the beginning of the
|
wolffd@0
|
35 % >> % second line is due to the leading comma on that line, which is
|
wolffd@0
|
36 % >> % correct behavior. A trailing comma will do the same to the end of a
|
wolffd@0
|
37 % >> % line.
|
wolffd@0
|
38 %
|
wolffd@0
|
39 % Limitations/Exceptions:
|
wolffd@0
|
40 % * This code is untested on large files. It may take a long time due to
|
wolffd@0
|
41 % variables growing inside loops (yes, poor practice, but easy coding).
|
wolffd@0
|
42 % * This code has been minimally tested to work with a variety of weird
|
wolffd@0
|
43 % Excel files that I have.
|
wolffd@0
|
44 % * Behavior with improperly formatted CSV files is untested.
|
wolffd@0
|
45 % * Technically, CSV files from Excel always separate lines with the pair
|
wolffd@0
|
46 % of characters \r\n. This parser will also separate lines that have only
|
wolffd@0
|
47 % \r or \n as line terminators.
|
wolffd@0
|
48 % * Line separation is the first operation. I don't think the Excel CSV
|
wolffd@0
|
49 % format has any allowance for newlines or carriage returns within
|
wolffd@0
|
50 % fields. If it does, then this parser does not support it and would not
|
wolffd@0
|
51 % return bad output.
|
wolffd@0
|
52 %
|
wolffd@0
|
53 % Copyright 2008 Arthur Hebert
|
wolffd@0
|
54
|
wolffd@0
|
55 % Process arguments
|
wolffd@0
|
56 if nargin == 1
|
wolffd@0
|
57 text = varargin{1};
|
wolffd@0
|
58 elseif nargin == 2
|
wolffd@0
|
59 switch varargin{2}
|
wolffd@0
|
60 case 'fromfile'
|
wolffd@0
|
61 filename = varargin{1};
|
wolffd@0
|
62 fid = fopen(filename);
|
wolffd@0
|
63 text = char(fread(fid))';
|
wolffd@0
|
64 fclose(fid);
|
wolffd@0
|
65 case 'text'
|
wolffd@0
|
66 text = varargin{1};
|
wolffd@0
|
67 otherwise
|
wolffd@0
|
68 error('Invalid 2nd argument %s. Valid options are ''fromfile'' and ''text''',varargin{2})
|
wolffd@0
|
69 end
|
wolffd@0
|
70 else
|
wolffd@0
|
71 error('CSV2CELL requires 1 or 2 arguments.')
|
wolffd@0
|
72 end
|
wolffd@0
|
73
|
wolffd@0
|
74
|
wolffd@0
|
75 % First split it into lines
|
wolffd@0
|
76 lines = regexp(text,'(\r\n|[\r\n])','split'); % lines should now be a cell array of text split by newlines
|
wolffd@0
|
77
|
wolffd@0
|
78 % a character is either a delimiter or a field
|
wolffd@0
|
79 inField = true;
|
wolffd@0
|
80 inQuoteField = false;
|
wolffd@0
|
81 % if inField && ~inQuoteField --> then we're in a raw field
|
wolffd@0
|
82
|
wolffd@0
|
83 skipNext = false;
|
wolffd@0
|
84 data = {};
|
wolffd@0
|
85 field = '';
|
wolffd@0
|
86 for lineNumber = 1:length(lines)
|
wolffd@0
|
87 nChars = length(lines{lineNumber}); % number of characters in this line
|
wolffd@0
|
88 fieldNumber = 1;
|
wolffd@0
|
89 for charNumber = 1:nChars
|
wolffd@0
|
90 if skipNext
|
wolffd@0
|
91 skipNext = false;
|
wolffd@0
|
92 continue
|
wolffd@0
|
93 end
|
wolffd@0
|
94 thisChar = lines{lineNumber}(charNumber);
|
wolffd@0
|
95 if thisChar == ','
|
wolffd@0
|
96 if inField
|
wolffd@0
|
97 if inQuoteField % this comma is part of the field
|
wolffd@0
|
98 field(end+1) = thisChar;
|
wolffd@0
|
99 else % this comma is the delimiter marking the end of the field
|
wolffd@0
|
100 data{lineNumber,fieldNumber} = field;
|
wolffd@0
|
101 field = '';
|
wolffd@0
|
102 fieldNumber = fieldNumber + 1;
|
wolffd@0
|
103 end
|
wolffd@0
|
104 else % we are not currently in a field -- this is the start of a new delimiter
|
wolffd@0
|
105 inField = true;
|
wolffd@0
|
106 end
|
wolffd@0
|
107 if charNumber == nChars % this is a hanging comma, indicating the last field is blank
|
wolffd@0
|
108 data{lineNumber,fieldNumber} = '';
|
wolffd@0
|
109 field = '';
|
wolffd@0
|
110 fieldNumber = fieldNumber + 1;
|
wolffd@0
|
111 end
|
wolffd@0
|
112 elseif thisChar == '"'
|
wolffd@0
|
113 if inField
|
wolffd@0
|
114 if inQuoteField
|
wolffd@0
|
115 if charNumber == nChars % it's the last character, so this must be the closing delimiter?
|
wolffd@0
|
116 inField = false;
|
wolffd@0
|
117 inQuoteField = false;
|
wolffd@0
|
118 data{lineNumber,fieldNumber} = field;
|
wolffd@0
|
119 field = '';
|
wolffd@0
|
120 fieldNumber = fieldNumber + 1;
|
wolffd@0
|
121 else
|
wolffd@0
|
122 if lines{lineNumber}(charNumber+1) == '"' % this is translated to be a double quote in the field
|
wolffd@0
|
123 field(end+1) = '"';
|
wolffd@0
|
124 skipNext = true;
|
wolffd@0
|
125 else % this " is the delimiter ending this field
|
wolffd@0
|
126 data{lineNumber,fieldNumber} = field;
|
wolffd@0
|
127 field = '';
|
wolffd@0
|
128 inField = false;
|
wolffd@0
|
129 inQuoteField = false;
|
wolffd@0
|
130 fieldNumber = fieldNumber + 1;
|
wolffd@0
|
131 end
|
wolffd@0
|
132 end
|
wolffd@0
|
133 else % this is a delimiter and we are in a new quote field
|
wolffd@0
|
134 inQuoteField = true;
|
wolffd@0
|
135 end
|
wolffd@0
|
136 else % we are not in a field. This must be an opening quote for the first field?
|
wolffd@0
|
137 inField = true;
|
wolffd@0
|
138 inQuoteField = true;
|
wolffd@0
|
139 end
|
wolffd@0
|
140 else % any other character ought to be added to field
|
wolffd@0
|
141 field(end+1) = thisChar;
|
wolffd@0
|
142 if charNumber == nChars
|
wolffd@0
|
143 data{lineNumber,fieldNumber} = field;
|
wolffd@0
|
144 field = '';
|
wolffd@0
|
145 fieldNumber = fieldNumber + 1;
|
wolffd@0
|
146 elseif charNumber == 1 % we are starting a new raw field
|
wolffd@0
|
147 inField = true;
|
wolffd@0
|
148 end
|
wolffd@0
|
149 end
|
wolffd@0
|
150 end
|
wolffd@0
|
151 end
|
wolffd@0
|
152 |