diff core/tools/csv2cell.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/tools/csv2cell.m	Tue Feb 10 15:05:51 2015 +0000
@@ -0,0 +1,152 @@
+function data = csv2cell(varargin)
+% CSV2CELL - parses a Windows CSV file into an NxM cell array, where N is
+% the number of lines in the CSV text and M is the number of fields in the
+% longest line of the CSV file. Lines are delimited by carriage returns
+% and/or newlines.
+%
+% A Windows CSV file format allows for commas (,) and double quotes (") to
+% be contained within fields of the CSV file. Regular fields are just text
+% separated by commas (e.g. foo,bar,hello world). Fields containing commas
+% or double quotes are surrounded by double quotes (e.g.
+% foo,bar,"item1,item2,item3",hello world). In the previous example,
+% "item1,item2,item3" is one field in the CSV text. For double quotes to be
+% represented, they are written in pairs in the file, and contained within
+% a quoted field, (e.g. foo,"this field contains ""quotes""",bar). Spaces
+% within fields (even leading and trailing) are preserved.
+%
+% All fields from the CSV file are returned as strings. If the CSV text
+% contains lines with different numbers of fields, then the "missing"
+% fields with appear as empty arrays, [], in the returned data. You can
+% easily convert the data you expect to be numeric using str2num() and
+% num2cell(). 
+%
+% Examples:
+%  >> csv2cell('foo.csv','fromfile') % loads and parses entire file
+%  >> csv2cell(',,,') % returns cell array {'','','',''}
+%  >> csv2cell(',,,','text') % same as above, declaring text input
+%  >> csv2cell(sprintf('%s\r\n%s',...
+%     '"Ten Thousand",10000,,"10,000","""It''s ""10 Grand"", baby",10k',...
+%     ',foo,bar,soo'))
+%  ans = 
+%    'Ten Thousand'    '10000'       ''    '10,000'    [1x22 char]    '10k'
+%                ''    'foo'      'bar'    'soo'                []       []
+%  >> % note the two empty [] cells, because the second line has two fewer
+%  >> % fields than the first. The empty field '' at the beginning of the
+%  >> % second line is due to the leading comma on that line, which is
+%  >> % correct behavior. A trailing comma will do the same to the end of a
+%  >> % line.
+% 
+% Limitations/Exceptions:
+%   * This code is untested on large files. It may take a long time due to
+%   variables growing inside loops (yes, poor practice, but easy coding).
+%   * This code has been minimally tested to work with a variety of weird
+%   Excel files that I have.
+%   * Behavior with improperly formatted CSV files is untested.
+%   * Technically, CSV files from Excel always separate lines with the pair
+%   of characters \r\n. This parser will also separate lines that have only
+%   \r or \n as line terminators. 
+%   * Line separation is the first operation. I don't think the Excel CSV
+%   format has any allowance for newlines or carriage returns within
+%   fields. If it does, then this parser does not support it and would not
+%   return bad output.
+%
+% Copyright 2008 Arthur Hebert
+
+% Process arguments
+if nargin == 1
+    text = varargin{1};
+elseif nargin == 2
+    switch varargin{2}
+        case 'fromfile'
+            filename = varargin{1};
+            fid = fopen(filename);
+            text = char(fread(fid))';
+            fclose(fid);
+        case 'text'
+            text = varargin{1};
+        otherwise
+            error('Invalid 2nd argument %s. Valid options are ''fromfile'' and ''text''',varargin{2})
+    end
+else
+    error('CSV2CELL requires 1 or 2 arguments.')
+end
+
+
+% First split it into lines
+lines = regexp(text,'(\r\n|[\r\n])','split'); % lines should now be a cell array of text split by newlines
+
+% a character is either a delimiter or a field
+inField = true;
+inQuoteField = false;
+% if inField && ~inQuoteField --> then we're in a raw field
+
+skipNext = false;
+data = {};
+field = '';
+for lineNumber = 1:length(lines)
+    nChars = length(lines{lineNumber}); % number of characters in this line
+    fieldNumber = 1;
+    for charNumber = 1:nChars
+        if skipNext
+            skipNext = false;
+            continue
+        end
+        thisChar = lines{lineNumber}(charNumber);
+        if thisChar == ','
+            if inField 
+                if inQuoteField % this comma is part of the field
+                    field(end+1) = thisChar;
+                else % this comma is the delimiter marking the end of the field
+                    data{lineNumber,fieldNumber} = field;
+                    field = '';
+                    fieldNumber = fieldNumber + 1;
+                end
+            else % we are not currently in a field -- this is the start of a new delimiter
+                inField = true;
+            end
+            if charNumber == nChars % this is a hanging comma, indicating the last field is blank
+                data{lineNumber,fieldNumber} = '';
+                field = '';
+                fieldNumber = fieldNumber + 1;
+            end
+        elseif thisChar == '"' 
+            if inField
+                if inQuoteField
+                    if charNumber == nChars % it's the last character, so this must be the closing delimiter?
+                        inField = false;
+                        inQuoteField = false;
+                        data{lineNumber,fieldNumber} = field;
+                        field = '';
+                        fieldNumber = fieldNumber + 1;
+                    else 
+                        if lines{lineNumber}(charNumber+1) == '"' % this is translated to be a double quote in the field
+                            field(end+1) = '"';
+                            skipNext = true;
+                        else % this " is the delimiter ending this field
+                            data{lineNumber,fieldNumber} = field;
+                            field = '';
+                            inField = false;
+                            inQuoteField = false;
+                            fieldNumber = fieldNumber + 1;
+                        end
+                    end
+                else % this is a delimiter and we are in a new quote field
+                    inQuoteField = true;
+                end
+            else % we are not in a field. This must be an opening quote for the first field?
+                inField = true;
+                inQuoteField = true;
+            end
+        else % any other character ought to be added to field
+            field(end+1) = thisChar;
+            if charNumber == nChars
+                data{lineNumber,fieldNumber} = field;
+                field = '';
+                fieldNumber = fieldNumber + 1;
+            elseif charNumber == 1 % we are starting a new raw field
+                inField = true;
+            end
+        end
+    end
+end
+            
\ No newline at end of file