annotate core/tools/csv2cell.m @ 0:cc4b1211e677 tip

initial commit to HG from Changeset: 646 (e263d8a21543) added further path and more save "camirversion.m"
author Daniel Wolff
date Fri, 19 Aug 2016 13:07:06 +0200
parents
children
rev   line source
function data = csv2cell(varargin)
% CSV2CELL - parses a Windows CSV file into an NxM cell array, where N is
% the number of lines in the CSV text and M is the number of fields in the
% longest line of the CSV file. Lines are delimited by carriage returns
% and/or newlines.
%
% Usage:
%   data = csv2cell(text)                 % parse CSV text
%   data = csv2cell(text,'text')          % same, declaring text input
%   data = csv2cell(filename,'fromfile')  % read the file, then parse it
%
% A Windows CSV file format allows for commas (,) and double quotes (") to
% be contained within fields of the CSV file. Regular fields are just text
% separated by commas (e.g. foo,bar,hello world). Fields containing commas
% or double quotes are surrounded by double quotes (e.g.
% foo,bar,"item1,item2,item3",hello world). In the previous example,
% "item1,item2,item3" is one field in the CSV text. For double quotes to be
% represented, they are written in pairs in the file, and contained within
% a quoted field, (e.g. foo,"this field contains ""quotes""",bar). Spaces
% within fields (even leading and trailing) are preserved.
%
% All fields from the CSV file are returned as strings. If the CSV text
% contains lines with different numbers of fields, then the "missing"
% fields will appear as empty arrays, [], in the returned data. You can
% easily convert the data you expect to be numeric using str2num() and
% num2cell().
%
% Examples:
% >> csv2cell('foo.csv','fromfile') % loads and parses entire file
% >> csv2cell(',,,') % returns cell array {'','','',''}
% >> csv2cell(',,,','text') % same as above, declaring text input
% >> csv2cell(sprintf('%s\r\n%s',...
% '"Ten Thousand",10000,,"10,000","""It''s ""10 Grand"", baby",10k',...
% ',foo,bar,soo'))
% ans =
% 'Ten Thousand' '10000' '' '10,000' [1x22 char] '10k'
% '' 'foo' 'bar' 'soo' [] []
% >> % note the two empty [] cells, because the second line has two fewer
% >> % fields than the first. The empty field '' at the beginning of the
% >> % second line is due to the leading comma on that line, which is
% >> % correct behavior. A trailing comma will do the same to the end of a
% >> % line.
%
% Limitations/Exceptions:
% * This code is untested on large files. It may take a long time due to
% variables growing inside loops (yes, poor practice, but easy coding).
% * This code has been minimally tested to work with a variety of weird
% Excel files that I have.
% * Behavior with improperly formatted CSV files is untested.
% * Technically, CSV files from Excel always separate lines with the pair
% of characters \r\n. This parser will also separate lines that have only
% \r or \n as line terminators.
% * Line separation is the first operation, so newlines or carriage
% returns inside quoted fields are not supported: such a field is split
% across rows and the parser returns incorrect output for it.
%
% Copyright 2008 Arthur Hebert

% Process arguments
if nargin == 1
    text = varargin{1};
elseif nargin == 2
    switch varargin{2}
        case 'fromfile'
            filename = varargin{1};
            fid = fopen(filename);
            if fid < 0
                % fopen signals failure with -1; report it here instead of
                % letting fread fail on an invalid file identifier
                error('csv2cell:fileOpenFailed', ...
                    'Could not open file %s for reading.',filename)
            end
            text = char(fread(fid))';
            fclose(fid);
        case 'text'
            text = varargin{1};
        otherwise
            error('Invalid 2nd argument %s. Valid options are ''fromfile'' and ''text''',varargin{2})
    end
else
    error('CSV2CELL requires 1 or 2 arguments.')
end


% First split it into lines
lines = regexp(text,'(\r\n|[\r\n])','split'); % lines should now be a cell array of text split by newlines

data = {};
for lineNumber = 1:length(lines)
    thisLine = lines{lineNumber};
    nChars = length(thisLine); % number of characters in this line

    % Every line starts from a clean parser state. Without this reset, a
    % line ending in a quoted field leaves inField == false, and a leading
    % comma on the following line is mistaken for the delimiter that
    % follows that quoted field, which drops the leading empty field.
    inField = true;        % a character is either a delimiter or a field
    inQuoteField = false;  % inField && ~inQuoteField --> we're in a raw field
    skipNext = false;      % set when the second quote of a "" pair must be skipped
    field = '';
    fieldNumber = 1;

    for charNumber = 1:nChars
        if skipNext
            skipNext = false;
            continue
        end
        thisChar = thisLine(charNumber);
        if thisChar == ','
            if inField
                if inQuoteField % this comma is part of the field
                    field(end+1) = thisChar;
                else % this comma is the delimiter marking the end of the field
                    data{lineNumber,fieldNumber} = field;
                    field = '';
                    fieldNumber = fieldNumber + 1;
                end
            else % a quoted field was just closed -- this comma is its delimiter
                inField = true;
            end
            if charNumber == nChars % this is a hanging comma, indicating the last field is blank
                data{lineNumber,fieldNumber} = '';
                field = '';
                fieldNumber = fieldNumber + 1;
            end
        elseif thisChar == '"'
            if inField
                if inQuoteField
                    if charNumber == nChars % last character, so this is the closing quote
                        inField = false;
                        inQuoteField = false;
                        data{lineNumber,fieldNumber} = field;
                        field = '';
                        fieldNumber = fieldNumber + 1;
                    else
                        if thisLine(charNumber+1) == '"' % a "" pair is a literal double quote in the field
                            field(end+1) = '"';
                            skipNext = true;
                        else % this " is the delimiter ending this field
                            data{lineNumber,fieldNumber} = field;
                            field = '';
                            inField = false;
                            inQuoteField = false;
                            fieldNumber = fieldNumber + 1;
                        end
                    end
                else % this is a delimiter and we are in a new quote field
                    inQuoteField = true;
                end
            else % not in a field: treat this as the opening quote of a new quoted field
                inField = true;
                inQuoteField = true;
            end
        else % any other character ought to be added to field
            field(end+1) = thisChar;
            if charNumber == nChars
                data{lineNumber,fieldNumber} = field;
                field = '';
                fieldNumber = fieldNumber + 1;
            end
        end
    end

    % Only reachable for malformed input (e.g. a raw field followed by a
    % stray quote at the end of the line): keep the pending text in this
    % row rather than letting it leak into the next line.
    if ~isempty(field)
        data{lineNumber,fieldNumber} = field;
    end
end
Daniel@0 152