annotate lib/faster_csv.rb @ 8:0c83d98252d9 yuya

* Add custom repo prefix and proper auth realm, remove auth cache (seems like an unwise feature), pass DB handle around, various other bits of tidying
author Chris Cannam
date Thu, 12 Aug 2010 15:31:37 +0100
parents 513646585e45
children
rev   line source
Chris@0 1 #!/usr/local/bin/ruby -w
Chris@0 2
Chris@0 3 # = faster_csv.rb -- Faster CSV Reading and Writing
Chris@0 4 #
Chris@0 5 # Created by James Edward Gray II on 2005-10-31.
Chris@0 6 # Copyright 2005 Gray Productions. All rights reserved.
Chris@0 7 #
Chris@0 8 # See FasterCSV for documentation.
Chris@0 9
# FasterCSV only targets Ruby 1.8.  Compare the interpreter version
# numerically, component by component: a plain String comparison is
# lexicographic and would sort a version like "10.0.0" before "1.9".
if (RUBY_VERSION.split(".").map { |part| part.to_i } <=> [1, 9]) >= 0
  # the gsub() strips the heredoc's leading indentation before printing
  abort <<-VERSION_WARNING.gsub(/^\s+/, "")
  Please switch to Ruby 1.9's standard CSV library. It's FasterCSV plus
  support for Ruby 1.9's m17n encoding engine.
  VERSION_WARNING
end
Chris@0 16
Chris@0 17 require "forwardable"
Chris@0 18 require "English"
Chris@0 19 require "enumerator"
Chris@0 20 require "date"
Chris@0 21 require "stringio"
Chris@0 22
Chris@0 23 #
Chris@0 24 # This class provides a complete interface to CSV files and data. It offers
Chris@0 25 # tools to enable you to read and write to and from Strings or IO objects, as
Chris@0 26 # needed.
Chris@0 27 #
Chris@0 28 # == Reading
Chris@0 29 #
Chris@0 30 # === From a File
Chris@0 31 #
Chris@0 32 # ==== A Line at a Time
Chris@0 33 #
Chris@0 34 # FasterCSV.foreach("path/to/file.csv") do |row|
Chris@0 35 # # use row here...
Chris@0 36 # end
Chris@0 37 #
Chris@0 38 # ==== All at Once
Chris@0 39 #
Chris@0 40 # arr_of_arrs = FasterCSV.read("path/to/file.csv")
Chris@0 41 #
Chris@0 42 # === From a String
Chris@0 43 #
Chris@0 44 # ==== A Line at a Time
Chris@0 45 #
Chris@0 46 # FasterCSV.parse("CSV,data,String") do |row|
Chris@0 47 # # use row here...
Chris@0 48 # end
Chris@0 49 #
Chris@0 50 # ==== All at Once
Chris@0 51 #
Chris@0 52 # arr_of_arrs = FasterCSV.parse("CSV,data,String")
Chris@0 53 #
Chris@0 54 # == Writing
Chris@0 55 #
Chris@0 56 # === To a File
Chris@0 57 #
Chris@0 58 # FasterCSV.open("path/to/file.csv", "w") do |csv|
Chris@0 59 # csv << ["row", "of", "CSV", "data"]
Chris@0 60 # csv << ["another", "row"]
Chris@0 61 # # ...
Chris@0 62 # end
Chris@0 63 #
Chris@0 64 # === To a String
Chris@0 65 #
Chris@0 66 # csv_string = FasterCSV.generate do |csv|
Chris@0 67 # csv << ["row", "of", "CSV", "data"]
Chris@0 68 # csv << ["another", "row"]
Chris@0 69 # # ...
Chris@0 70 # end
Chris@0 71 #
Chris@0 72 # == Convert a Single Line
Chris@0 73 #
Chris@0 74 # csv_string = ["CSV", "data"].to_csv # to CSV
Chris@0 75 # csv_array = "CSV,String".parse_csv # from CSV
Chris@0 76 #
Chris@0 77 # == Shortcut Interface
Chris@0 78 #
Chris@0 79 # FCSV { |csv_out| csv_out << %w{my data here} } # to $stdout
Chris@0 80 # FCSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String
Chris@0 81 # FCSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr
Chris@0 82 #
Chris@0 83 class FasterCSV
Chris@0 84 # The version of the installed library.
Chris@0 85 VERSION = "1.5.0".freeze
Chris@0 86
Chris@0 87 #
Chris@0 88 # A FasterCSV::Row is part Array and part Hash. It retains an order for the
Chris@0 89 # fields and allows duplicates just as an Array would, but also allows you to
Chris@0 90 # access fields by name just as you could if they were in a Hash.
Chris@0 91 #
Chris@0 92 # All rows returned by FasterCSV will be constructed from this class, if
Chris@0 93 # header row processing is activated.
Chris@0 94 #
Chris@0 95 class Row
Chris@0 96 #
Chris@0 97 # Construct a new FasterCSV::Row from +headers+ and +fields+, which are
Chris@0 98 # expected to be Arrays. If one Array is shorter than the other, it will be
Chris@0 99 # padded with +nil+ objects.
Chris@0 100 #
Chris@0 101 # The optional +header_row+ parameter can be set to +true+ to indicate, via
Chris@0 102 # FasterCSV::Row.header_row?() and FasterCSV::Row.field_row?(), that this is
Chris@0 103 # a header row. Otherwise, the row is assumes to be a field row.
Chris@0 104 #
Chris@0 105 # A FasterCSV::Row object supports the following Array methods through
Chris@0 106 # delegation:
Chris@0 107 #
Chris@0 108 # * empty?()
Chris@0 109 # * length()
Chris@0 110 # * size()
Chris@0 111 #
Chris@0 112 def initialize(headers, fields, header_row = false)
Chris@0 113 @header_row = header_row
Chris@0 114
Chris@0 115 # handle extra headers or fields
Chris@0 116 @row = if headers.size > fields.size
Chris@0 117 headers.zip(fields)
Chris@0 118 else
Chris@0 119 fields.zip(headers).map { |pair| pair.reverse }
Chris@0 120 end
Chris@0 121 end
Chris@0 122
Chris@0 123 # Internal data format used to compare equality.
Chris@0 124 attr_reader :row
Chris@0 125 protected :row
Chris@0 126
Chris@0 127 ### Array Delegation ###
Chris@0 128
Chris@0 129 extend Forwardable
Chris@0 130 def_delegators :@row, :empty?, :length, :size
Chris@0 131
Chris@0 132 # Returns +true+ if this is a header row.
Chris@0 133 def header_row?
Chris@0 134 @header_row
Chris@0 135 end
Chris@0 136
Chris@0 137 # Returns +true+ if this is a field row.
Chris@0 138 def field_row?
Chris@0 139 not header_row?
Chris@0 140 end
Chris@0 141
Chris@0 142 # Returns the headers of this row.
Chris@0 143 def headers
Chris@0 144 @row.map { |pair| pair.first }
Chris@0 145 end
Chris@0 146
Chris@0 147 #
Chris@0 148 # :call-seq:
Chris@0 149 # field( header )
Chris@0 150 # field( header, offset )
Chris@0 151 # field( index )
Chris@0 152 #
Chris@0 153 # This method will fetch the field value by +header+ or +index+. If a field
Chris@0 154 # is not found, +nil+ is returned.
Chris@0 155 #
Chris@0 156 # When provided, +offset+ ensures that a header match occurrs on or later
Chris@0 157 # than the +offset+ index. You can use this to find duplicate headers,
Chris@0 158 # without resorting to hard-coding exact indices.
Chris@0 159 #
Chris@0 160 def field(header_or_index, minimum_index = 0)
Chris@0 161 # locate the pair
Chris@0 162 finder = header_or_index.is_a?(Integer) ? :[] : :assoc
Chris@0 163 pair = @row[minimum_index..-1].send(finder, header_or_index)
Chris@0 164
Chris@0 165 # return the field if we have a pair
Chris@0 166 pair.nil? ? nil : pair.last
Chris@0 167 end
Chris@0 168 alias_method :[], :field
Chris@0 169
Chris@0 170 #
Chris@0 171 # :call-seq:
Chris@0 172 # []=( header, value )
Chris@0 173 # []=( header, offset, value )
Chris@0 174 # []=( index, value )
Chris@0 175 #
Chris@0 176 # Looks up the field by the semantics described in FasterCSV::Row.field()
Chris@0 177 # and assigns the +value+.
Chris@0 178 #
Chris@0 179 # Assigning past the end of the row with an index will set all pairs between
Chris@0 180 # to <tt>[nil, nil]</tt>. Assigning to an unused header appends the new
Chris@0 181 # pair.
Chris@0 182 #
Chris@0 183 def []=(*args)
Chris@0 184 value = args.pop
Chris@0 185
Chris@0 186 if args.first.is_a? Integer
Chris@0 187 if @row[args.first].nil? # extending past the end with index
Chris@0 188 @row[args.first] = [nil, value]
Chris@0 189 @row.map! { |pair| pair.nil? ? [nil, nil] : pair }
Chris@0 190 else # normal index assignment
Chris@0 191 @row[args.first][1] = value
Chris@0 192 end
Chris@0 193 else
Chris@0 194 index = index(*args)
Chris@0 195 if index.nil? # appending a field
Chris@0 196 self << [args.first, value]
Chris@0 197 else # normal header assignment
Chris@0 198 @row[index][1] = value
Chris@0 199 end
Chris@0 200 end
Chris@0 201 end
Chris@0 202
Chris@0 203 #
Chris@0 204 # :call-seq:
Chris@0 205 # <<( field )
Chris@0 206 # <<( header_and_field_array )
Chris@0 207 # <<( header_and_field_hash )
Chris@0 208 #
Chris@0 209 # If a two-element Array is provided, it is assumed to be a header and field
Chris@0 210 # and the pair is appended. A Hash works the same way with the key being
Chris@0 211 # the header and the value being the field. Anything else is assumed to be
Chris@0 212 # a lone field which is appended with a +nil+ header.
Chris@0 213 #
Chris@0 214 # This method returns the row for chaining.
Chris@0 215 #
Chris@0 216 def <<(arg)
Chris@0 217 if arg.is_a?(Array) and arg.size == 2 # appending a header and name
Chris@0 218 @row << arg
Chris@0 219 elsif arg.is_a?(Hash) # append header and name pairs
Chris@0 220 arg.each { |pair| @row << pair }
Chris@0 221 else # append field value
Chris@0 222 @row << [nil, arg]
Chris@0 223 end
Chris@0 224
Chris@0 225 self # for chaining
Chris@0 226 end
Chris@0 227
Chris@0 228 #
Chris@0 229 # A shortcut for appending multiple fields. Equivalent to:
Chris@0 230 #
Chris@0 231 # args.each { |arg| faster_csv_row << arg }
Chris@0 232 #
Chris@0 233 # This method returns the row for chaining.
Chris@0 234 #
Chris@0 235 def push(*args)
Chris@0 236 args.each { |arg| self << arg }
Chris@0 237
Chris@0 238 self # for chaining
Chris@0 239 end
Chris@0 240
Chris@0 241 #
Chris@0 242 # :call-seq:
Chris@0 243 # delete( header )
Chris@0 244 # delete( header, offset )
Chris@0 245 # delete( index )
Chris@0 246 #
Chris@0 247 # Used to remove a pair from the row by +header+ or +index+. The pair is
Chris@0 248 # located as described in FasterCSV::Row.field(). The deleted pair is
Chris@0 249 # returned, or +nil+ if a pair could not be found.
Chris@0 250 #
Chris@0 251 def delete(header_or_index, minimum_index = 0)
Chris@0 252 if header_or_index.is_a? Integer # by index
Chris@0 253 @row.delete_at(header_or_index)
Chris@0 254 else # by header
Chris@0 255 @row.delete_at(index(header_or_index, minimum_index))
Chris@0 256 end
Chris@0 257 end
Chris@0 258
Chris@0 259 #
Chris@0 260 # The provided +block+ is passed a header and field for each pair in the row
Chris@0 261 # and expected to return +true+ or +false+, depending on whether the pair
Chris@0 262 # should be deleted.
Chris@0 263 #
Chris@0 264 # This method returns the row for chaining.
Chris@0 265 #
Chris@0 266 def delete_if(&block)
Chris@0 267 @row.delete_if(&block)
Chris@0 268
Chris@0 269 self # for chaining
Chris@0 270 end
Chris@0 271
Chris@0 272 #
Chris@0 273 # This method accepts any number of arguments which can be headers, indices,
Chris@0 274 # Ranges of either, or two-element Arrays containing a header and offset.
Chris@0 275 # Each argument will be replaced with a field lookup as described in
Chris@0 276 # FasterCSV::Row.field().
Chris@0 277 #
Chris@0 278 # If called with no arguments, all fields are returned.
Chris@0 279 #
Chris@0 280 def fields(*headers_and_or_indices)
Chris@0 281 if headers_and_or_indices.empty? # return all fields--no arguments
Chris@0 282 @row.map { |pair| pair.last }
Chris@0 283 else # or work like values_at()
Chris@0 284 headers_and_or_indices.inject(Array.new) do |all, h_or_i|
Chris@0 285 all + if h_or_i.is_a? Range
Chris@0 286 index_begin = h_or_i.begin.is_a?(Integer) ? h_or_i.begin :
Chris@0 287 index(h_or_i.begin)
Chris@0 288 index_end = h_or_i.end.is_a?(Integer) ? h_or_i.end :
Chris@0 289 index(h_or_i.end)
Chris@0 290 new_range = h_or_i.exclude_end? ? (index_begin...index_end) :
Chris@0 291 (index_begin..index_end)
Chris@0 292 fields.values_at(new_range)
Chris@0 293 else
Chris@0 294 [field(*Array(h_or_i))]
Chris@0 295 end
Chris@0 296 end
Chris@0 297 end
Chris@0 298 end
Chris@0 299 alias_method :values_at, :fields
Chris@0 300
Chris@0 301 #
Chris@0 302 # :call-seq:
Chris@0 303 # index( header )
Chris@0 304 # index( header, offset )
Chris@0 305 #
Chris@0 306 # This method will return the index of a field with the provided +header+.
Chris@0 307 # The +offset+ can be used to locate duplicate header names, as described in
Chris@0 308 # FasterCSV::Row.field().
Chris@0 309 #
Chris@0 310 def index(header, minimum_index = 0)
Chris@0 311 # find the pair
Chris@0 312 index = headers[minimum_index..-1].index(header)
Chris@0 313 # return the index at the right offset, if we found one
Chris@0 314 index.nil? ? nil : index + minimum_index
Chris@0 315 end
Chris@0 316
Chris@0 317 # Returns +true+ if +name+ is a header for this row, and +false+ otherwise.
Chris@0 318 def header?(name)
Chris@0 319 headers.include? name
Chris@0 320 end
Chris@0 321 alias_method :include?, :header?
Chris@0 322
Chris@0 323 #
Chris@0 324 # Returns +true+ if +data+ matches a field in this row, and +false+
Chris@0 325 # otherwise.
Chris@0 326 #
Chris@0 327 def field?(data)
Chris@0 328 fields.include? data
Chris@0 329 end
Chris@0 330
Chris@0 331 include Enumerable
Chris@0 332
Chris@0 333 #
Chris@0 334 # Yields each pair of the row as header and field tuples (much like
Chris@0 335 # iterating over a Hash).
Chris@0 336 #
Chris@0 337 # Support for Enumerable.
Chris@0 338 #
Chris@0 339 # This method returns the row for chaining.
Chris@0 340 #
Chris@0 341 def each(&block)
Chris@0 342 @row.each(&block)
Chris@0 343
Chris@0 344 self # for chaining
Chris@0 345 end
Chris@0 346
Chris@0 347 #
Chris@0 348 # Returns +true+ if this row contains the same headers and fields in the
Chris@0 349 # same order as +other+.
Chris@0 350 #
Chris@0 351 def ==(other)
Chris@0 352 @row == other.row
Chris@0 353 end
Chris@0 354
Chris@0 355 #
Chris@0 356 # Collapses the row into a simple Hash. Be warning that this discards field
Chris@0 357 # order and clobbers duplicate fields.
Chris@0 358 #
Chris@0 359 def to_hash
Chris@0 360 # flatten just one level of the internal Array
Chris@0 361 Hash[*@row.inject(Array.new) { |ary, pair| ary.push(*pair) }]
Chris@0 362 end
Chris@0 363
Chris@0 364 #
Chris@0 365 # Returns the row as a CSV String. Headers are not used. Equivalent to:
Chris@0 366 #
Chris@0 367 # faster_csv_row.fields.to_csv( options )
Chris@0 368 #
Chris@0 369 def to_csv(options = Hash.new)
Chris@0 370 fields.to_csv(options)
Chris@0 371 end
Chris@0 372 alias_method :to_s, :to_csv
Chris@0 373
Chris@0 374 # A summary of fields, by header.
Chris@0 375 def inspect
Chris@0 376 str = "#<#{self.class}"
Chris@0 377 each do |header, field|
Chris@0 378 str << " #{header.is_a?(Symbol) ? header.to_s : header.inspect}:" <<
Chris@0 379 field.inspect
Chris@0 380 end
Chris@0 381 str << ">"
Chris@0 382 end
Chris@0 383 end
Chris@0 384
Chris@0 385 #
Chris@0 386 # A FasterCSV::Table is a two-dimensional data structure for representing CSV
Chris@0 387 # documents. Tables allow you to work with the data by row or column,
Chris@0 388 # manipulate the data, and even convert the results back to CSV, if needed.
Chris@0 389 #
Chris@0 390 # All tables returned by FasterCSV will be constructed from this class, if
Chris@0 391 # header row processing is activated.
Chris@0 392 #
Chris@0 393 class Table
Chris@0 394 #
Chris@0 395 # Construct a new FasterCSV::Table from +array_of_rows+, which are expected
Chris@0 396 # to be FasterCSV::Row objects. All rows are assumed to have the same
Chris@0 397 # headers.
Chris@0 398 #
Chris@0 399 # A FasterCSV::Table object supports the following Array methods through
Chris@0 400 # delegation:
Chris@0 401 #
Chris@0 402 # * empty?()
Chris@0 403 # * length()
Chris@0 404 # * size()
Chris@0 405 #
Chris@0 406 def initialize(array_of_rows)
Chris@0 407 @table = array_of_rows
Chris@0 408 @mode = :col_or_row
Chris@0 409 end
Chris@0 410
Chris@0 411 # The current access mode for indexing and iteration.
Chris@0 412 attr_reader :mode
Chris@0 413
Chris@0 414 # Internal data format used to compare equality.
Chris@0 415 attr_reader :table
Chris@0 416 protected :table
Chris@0 417
Chris@0 418 ### Array Delegation ###
Chris@0 419
Chris@0 420 extend Forwardable
Chris@0 421 def_delegators :@table, :empty?, :length, :size
Chris@0 422
Chris@0 423 #
Chris@0 424 # Returns a duplicate table object, in column mode. This is handy for
Chris@0 425 # chaining in a single call without changing the table mode, but be aware
Chris@0 426 # that this method can consume a fair amount of memory for bigger data sets.
Chris@0 427 #
Chris@0 428 # This method returns the duplicate table for chaining. Don't chain
Chris@0 429 # destructive methods (like []=()) this way though, since you are working
Chris@0 430 # with a duplicate.
Chris@0 431 #
Chris@0 432 def by_col
Chris@0 433 self.class.new(@table.dup).by_col!
Chris@0 434 end
Chris@0 435
Chris@0 436 #
Chris@0 437 # Switches the mode of this table to column mode. All calls to indexing and
Chris@0 438 # iteration methods will work with columns until the mode is changed again.
Chris@0 439 #
Chris@0 440 # This method returns the table and is safe to chain.
Chris@0 441 #
Chris@0 442 def by_col!
Chris@0 443 @mode = :col
Chris@0 444
Chris@0 445 self
Chris@0 446 end
Chris@0 447
Chris@0 448 #
Chris@0 449 # Returns a duplicate table object, in mixed mode. This is handy for
Chris@0 450 # chaining in a single call without changing the table mode, but be aware
Chris@0 451 # that this method can consume a fair amount of memory for bigger data sets.
Chris@0 452 #
Chris@0 453 # This method returns the duplicate table for chaining. Don't chain
Chris@0 454 # destructive methods (like []=()) this way though, since you are working
Chris@0 455 # with a duplicate.
Chris@0 456 #
Chris@0 457 def by_col_or_row
Chris@0 458 self.class.new(@table.dup).by_col_or_row!
Chris@0 459 end
Chris@0 460
Chris@0 461 #
Chris@0 462 # Switches the mode of this table to mixed mode. All calls to indexing and
Chris@0 463 # iteration methods will use the default intelligent indexing system until
Chris@0 464 # the mode is changed again. In mixed mode an index is assumed to be a row
Chris@0 465 # reference while anything else is assumed to be column access by headers.
Chris@0 466 #
Chris@0 467 # This method returns the table and is safe to chain.
Chris@0 468 #
Chris@0 469 def by_col_or_row!
Chris@0 470 @mode = :col_or_row
Chris@0 471
Chris@0 472 self
Chris@0 473 end
Chris@0 474
Chris@0 475 #
Chris@0 476 # Returns a duplicate table object, in row mode. This is handy for chaining
Chris@0 477 # in a single call without changing the table mode, but be aware that this
Chris@0 478 # method can consume a fair amount of memory for bigger data sets.
Chris@0 479 #
Chris@0 480 # This method returns the duplicate table for chaining. Don't chain
Chris@0 481 # destructive methods (like []=()) this way though, since you are working
Chris@0 482 # with a duplicate.
Chris@0 483 #
Chris@0 484 def by_row
Chris@0 485 self.class.new(@table.dup).by_row!
Chris@0 486 end
Chris@0 487
Chris@0 488 #
Chris@0 489 # Switches the mode of this table to row mode. All calls to indexing and
Chris@0 490 # iteration methods will work with rows until the mode is changed again.
Chris@0 491 #
Chris@0 492 # This method returns the table and is safe to chain.
Chris@0 493 #
Chris@0 494 def by_row!
Chris@0 495 @mode = :row
Chris@0 496
Chris@0 497 self
Chris@0 498 end
Chris@0 499
Chris@0 500 #
Chris@0 501 # Returns the headers for the first row of this table (assumed to match all
Chris@0 502 # other rows). An empty Array is returned for empty tables.
Chris@0 503 #
Chris@0 504 def headers
Chris@0 505 if @table.empty?
Chris@0 506 Array.new
Chris@0 507 else
Chris@0 508 @table.first.headers
Chris@0 509 end
Chris@0 510 end
Chris@0 511
Chris@0 512 #
Chris@0 513 # In the default mixed mode, this method returns rows for index access and
Chris@0 514 # columns for header access. You can force the index association by first
Chris@0 515 # calling by_col!() or by_row!().
Chris@0 516 #
Chris@0 517 # Columns are returned as an Array of values. Altering that Array has no
Chris@0 518 # effect on the table.
Chris@0 519 #
Chris@0 520 def [](index_or_header)
Chris@0 521 if @mode == :row or # by index
Chris@0 522 (@mode == :col_or_row and index_or_header.is_a? Integer)
Chris@0 523 @table[index_or_header]
Chris@0 524 else # by header
Chris@0 525 @table.map { |row| row[index_or_header] }
Chris@0 526 end
Chris@0 527 end
Chris@0 528
Chris@0 529 #
Chris@0 530 # In the default mixed mode, this method assigns rows for index access and
Chris@0 531 # columns for header access. You can force the index association by first
Chris@0 532 # calling by_col!() or by_row!().
Chris@0 533 #
Chris@0 534 # Rows may be set to an Array of values (which will inherit the table's
Chris@0 535 # headers()) or a FasterCSV::Row.
Chris@0 536 #
Chris@0 537 # Columns may be set to a single value, which is copied to each row of the
Chris@0 538 # column, or an Array of values. Arrays of values are assigned to rows top
Chris@0 539 # to bottom in row major order. Excess values are ignored and if the Array
Chris@0 540 # does not have a value for each row the extra rows will receive a +nil+.
Chris@0 541 #
Chris@0 542 # Assigning to an existing column or row clobbers the data. Assigning to
Chris@0 543 # new columns creates them at the right end of the table.
Chris@0 544 #
Chris@0 545 def []=(index_or_header, value)
Chris@0 546 if @mode == :row or # by index
Chris@0 547 (@mode == :col_or_row and index_or_header.is_a? Integer)
Chris@0 548 if value.is_a? Array
Chris@0 549 @table[index_or_header] = Row.new(headers, value)
Chris@0 550 else
Chris@0 551 @table[index_or_header] = value
Chris@0 552 end
Chris@0 553 else # set column
Chris@0 554 if value.is_a? Array # multiple values
Chris@0 555 @table.each_with_index do |row, i|
Chris@0 556 if row.header_row?
Chris@0 557 row[index_or_header] = index_or_header
Chris@0 558 else
Chris@0 559 row[index_or_header] = value[i]
Chris@0 560 end
Chris@0 561 end
Chris@0 562 else # repeated value
Chris@0 563 @table.each do |row|
Chris@0 564 if row.header_row?
Chris@0 565 row[index_or_header] = index_or_header
Chris@0 566 else
Chris@0 567 row[index_or_header] = value
Chris@0 568 end
Chris@0 569 end
Chris@0 570 end
Chris@0 571 end
Chris@0 572 end
Chris@0 573
Chris@0 574 #
Chris@0 575 # The mixed mode default is to treat a list of indices as row access,
Chris@0 576 # returning the rows indicated. Anything else is considered columnar
Chris@0 577 # access. For columnar access, the return set has an Array for each row
Chris@0 578 # with the values indicated by the headers in each Array. You can force
Chris@0 579 # column or row mode using by_col!() or by_row!().
Chris@0 580 #
Chris@0 581 # You cannot mix column and row access.
Chris@0 582 #
Chris@0 583 def values_at(*indices_or_headers)
Chris@0 584 if @mode == :row or # by indices
Chris@0 585 ( @mode == :col_or_row and indices_or_headers.all? do |index|
Chris@0 586 index.is_a?(Integer) or
Chris@0 587 ( index.is_a?(Range) and
Chris@0 588 index.first.is_a?(Integer) and
Chris@0 589 index.last.is_a?(Integer) )
Chris@0 590 end )
Chris@0 591 @table.values_at(*indices_or_headers)
Chris@0 592 else # by headers
Chris@0 593 @table.map { |row| row.values_at(*indices_or_headers) }
Chris@0 594 end
Chris@0 595 end
Chris@0 596
Chris@0 597 #
Chris@0 598 # Adds a new row to the bottom end of this table. You can provide an Array,
Chris@0 599 # which will be converted to a FasterCSV::Row (inheriting the table's
Chris@0 600 # headers()), or a FasterCSV::Row.
Chris@0 601 #
Chris@0 602 # This method returns the table for chaining.
Chris@0 603 #
Chris@0 604 def <<(row_or_array)
Chris@0 605 if row_or_array.is_a? Array # append Array
Chris@0 606 @table << Row.new(headers, row_or_array)
Chris@0 607 else # append Row
Chris@0 608 @table << row_or_array
Chris@0 609 end
Chris@0 610
Chris@0 611 self # for chaining
Chris@0 612 end
Chris@0 613
Chris@0 614 #
Chris@0 615 # A shortcut for appending multiple rows. Equivalent to:
Chris@0 616 #
Chris@0 617 # rows.each { |row| self << row }
Chris@0 618 #
Chris@0 619 # This method returns the table for chaining.
Chris@0 620 #
Chris@0 621 def push(*rows)
Chris@0 622 rows.each { |row| self << row }
Chris@0 623
Chris@0 624 self # for chaining
Chris@0 625 end
Chris@0 626
Chris@0 627 #
Chris@0 628 # Removes and returns the indicated column or row. In the default mixed
Chris@0 629 # mode indices refer to rows and everything else is assumed to be a column
Chris@0 630 # header. Use by_col!() or by_row!() to force the lookup.
Chris@0 631 #
Chris@0 632 def delete(index_or_header)
Chris@0 633 if @mode == :row or # by index
Chris@0 634 (@mode == :col_or_row and index_or_header.is_a? Integer)
Chris@0 635 @table.delete_at(index_or_header)
Chris@0 636 else # by header
Chris@0 637 @table.map { |row| row.delete(index_or_header).last }
Chris@0 638 end
Chris@0 639 end
Chris@0 640
Chris@0 641 #
Chris@0 642 # Removes any column or row for which the block returns +true+. In the
Chris@0 643 # default mixed mode or row mode, iteration is the standard row major
Chris@0 644 # walking of rows. In column mode, interation will +yield+ two element
Chris@0 645 # tuples containing the column name and an Array of values for that column.
Chris@0 646 #
Chris@0 647 # This method returns the table for chaining.
Chris@0 648 #
Chris@0 649 def delete_if(&block)
Chris@0 650 if @mode == :row or @mode == :col_or_row # by index
Chris@0 651 @table.delete_if(&block)
Chris@0 652 else # by header
Chris@0 653 to_delete = Array.new
Chris@0 654 headers.each_with_index do |header, i|
Chris@0 655 to_delete << header if block[[header, self[header]]]
Chris@0 656 end
Chris@0 657 to_delete.map { |header| delete(header) }
Chris@0 658 end
Chris@0 659
Chris@0 660 self # for chaining
Chris@0 661 end
Chris@0 662
Chris@0 663 include Enumerable
Chris@0 664
Chris@0 665 #
Chris@0 666 # In the default mixed mode or row mode, iteration is the standard row major
Chris@0 667 # walking of rows. In column mode, interation will +yield+ two element
Chris@0 668 # tuples containing the column name and an Array of values for that column.
Chris@0 669 #
Chris@0 670 # This method returns the table for chaining.
Chris@0 671 #
Chris@0 672 def each(&block)
Chris@0 673 if @mode == :col
Chris@0 674 headers.each { |header| block[[header, self[header]]] }
Chris@0 675 else
Chris@0 676 @table.each(&block)
Chris@0 677 end
Chris@0 678
Chris@0 679 self # for chaining
Chris@0 680 end
Chris@0 681
Chris@0 682 # Returns +true+ if all rows of this table ==() +other+'s rows.
Chris@0 683 def ==(other)
Chris@0 684 @table == other.table
Chris@0 685 end
Chris@0 686
Chris@0 687 #
Chris@0 688 # Returns the table as an Array of Arrays. Headers will be the first row,
Chris@0 689 # then all of the field rows will follow.
Chris@0 690 #
Chris@0 691 def to_a
Chris@0 692 @table.inject([headers]) do |array, row|
Chris@0 693 if row.header_row?
Chris@0 694 array
Chris@0 695 else
Chris@0 696 array + [row.fields]
Chris@0 697 end
Chris@0 698 end
Chris@0 699 end
Chris@0 700
Chris@0 701 #
Chris@0 702 # Returns the table as a complete CSV String. Headers will be listed first,
Chris@0 703 # then all of the field rows.
Chris@0 704 #
Chris@0 705 def to_csv(options = Hash.new)
Chris@0 706 @table.inject([headers.to_csv(options)]) do |rows, row|
Chris@0 707 if row.header_row?
Chris@0 708 rows
Chris@0 709 else
Chris@0 710 rows + [row.fields.to_csv(options)]
Chris@0 711 end
Chris@0 712 end.join
Chris@0 713 end
Chris@0 714 alias_method :to_s, :to_csv
Chris@0 715
Chris@0 716 def inspect
Chris@0 717 "#<#{self.class} mode:#{@mode} row_count:#{to_a.size}>"
Chris@0 718 end
Chris@0 719 end
Chris@0 720
Chris@0 721 # The error thrown when the parser encounters illegal CSV formatting.
Chris@0 722 class MalformedCSVError < RuntimeError; end
Chris@0 723
Chris@0 724 #
Chris@0 725 # A FieldInfo Struct contains details about a field's position in the data
Chris@0 726 # source it was read from. FasterCSV will pass this Struct to some blocks
Chris@0 727 # that make decisions based on field structure. See
Chris@0 728 # FasterCSV.convert_fields() for an example.
Chris@0 729 #
Chris@0 730 # <b><tt>index</tt></b>:: The zero-based index of the field in its row.
Chris@0 731 # <b><tt>line</tt></b>:: The line of the data source this row is from.
Chris@0 732 # <b><tt>header</tt></b>:: The header for the column, when available.
Chris@0 733 #
Chris@0 734 FieldInfo = Struct.new(:index, :line, :header)
Chris@0 735
Chris@0 736 # A Regexp used to find and convert some common Date formats.
Chris@0 737 DateMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} |
Chris@0 738 \d{4}-\d{2}-\d{2} )\z /x
Chris@0 739 # A Regexp used to find and convert some common DateTime formats.
Chris@0 740 DateTimeMatcher =
Chris@0 741 / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} |
Chris@0 742 \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} )\z /x
Chris@0 743 #
Chris@0 744 # This Hash holds the built-in converters of FasterCSV that can be accessed by
Chris@0 745 # name. You can select Converters with FasterCSV.convert() or through the
Chris@0 746 # +options+ Hash passed to FasterCSV::new().
Chris@0 747 #
Chris@0 748 # <b><tt>:integer</tt></b>:: Converts any field Integer() accepts.
Chris@0 749 # <b><tt>:float</tt></b>:: Converts any field Float() accepts.
Chris@0 750 # <b><tt>:numeric</tt></b>:: A combination of <tt>:integer</tt>
Chris@0 751 # and <tt>:float</tt>.
Chris@0 752 # <b><tt>:date</tt></b>:: Converts any field Date::parse() accepts.
Chris@0 753 # <b><tt>:date_time</tt></b>:: Converts any field DateTime::parse() accepts.
Chris@0 754 # <b><tt>:all</tt></b>:: All built-in converters. A combination of
Chris@0 755 # <tt>:date_time</tt> and <tt>:numeric</tt>.
Chris@0 756 #
Chris@0 757 # This Hash is intetionally left unfrozen and users should feel free to add
Chris@0 758 # values to it that can be accessed by all FasterCSV objects.
Chris@0 759 #
Chris@0 760 # To add a combo field, the value should be an Array of names. Combo fields
Chris@0 761 # can be nested with other combo fields.
Chris@0 762 #
Chris@0 763 Converters = { :integer => lambda { |f| Integer(f) rescue f },
Chris@0 764 :float => lambda { |f| Float(f) rescue f },
Chris@0 765 :numeric => [:integer, :float],
Chris@0 766 :date => lambda { |f|
Chris@0 767 f =~ DateMatcher ? (Date.parse(f) rescue f) : f
Chris@0 768 },
Chris@0 769 :date_time => lambda { |f|
Chris@0 770 f =~ DateTimeMatcher ? (DateTime.parse(f) rescue f) : f
Chris@0 771 },
Chris@0 772 :all => [:date_time, :numeric] }
Chris@0 773
Chris@0 774 #
Chris@0 775 # This Hash holds the built-in header converters of FasterCSV that can be
Chris@0 776 # accessed by name. You can select HeaderConverters with
Chris@0 777 # FasterCSV.header_convert() or through the +options+ Hash passed to
Chris@0 778 # FasterCSV::new().
Chris@0 779 #
Chris@0 780 # <b><tt>:downcase</tt></b>:: Calls downcase() on the header String.
Chris@0 781 # <b><tt>:symbol</tt></b>:: The header String is downcased, spaces are
Chris@0 782 # replaced with underscores, non-word characters
Chris@0 783 # are dropped, and finally to_sym() is called.
Chris@0 784 #
Chris@0 785 # This Hash is intetionally left unfrozen and users should feel free to add
Chris@0 786 # values to it that can be accessed by all FasterCSV objects.
Chris@0 787 #
Chris@0 788 # To add a combo field, the value should be an Array of names. Combo fields
Chris@0 789 # can be nested with other combo fields.
Chris@0 790 #
Chris@0 791 HeaderConverters = {
Chris@0 792 :downcase => lambda { |h| h.downcase },
Chris@0 793 :symbol => lambda { |h|
Chris@0 794 h.downcase.tr(" ", "_").delete("^a-z0-9_").to_sym
Chris@0 795 }
Chris@0 796 }
Chris@0 797
Chris@0 798 #
Chris@0 799 # The options used when no overrides are given by calling code. They are:
Chris@0 800 #
Chris@0 801 # <b><tt>:col_sep</tt></b>:: <tt>","</tt>
Chris@0 802 # <b><tt>:row_sep</tt></b>:: <tt>:auto</tt>
Chris@0 803 # <b><tt>:quote_char</tt></b>:: <tt>'"'</tt>
Chris@0 804 # <b><tt>:converters</tt></b>:: +nil+
Chris@0 805 # <b><tt>:unconverted_fields</tt></b>:: +nil+
Chris@0 806 # <b><tt>:headers</tt></b>:: +false+
Chris@0 807 # <b><tt>:return_headers</tt></b>:: +false+
Chris@0 808 # <b><tt>:header_converters</tt></b>:: +nil+
Chris@0 809 # <b><tt>:skip_blanks</tt></b>:: +false+
Chris@0 810 # <b><tt>:force_quotes</tt></b>:: +false+
Chris@0 811 #
Chris@0 812 DEFAULT_OPTIONS = { :col_sep => ",",
Chris@0 813 :row_sep => :auto,
Chris@0 814 :quote_char => '"',
Chris@0 815 :converters => nil,
Chris@0 816 :unconverted_fields => nil,
Chris@0 817 :headers => false,
Chris@0 818 :return_headers => false,
Chris@0 819 :header_converters => nil,
Chris@0 820 :skip_blanks => false,
Chris@0 821 :force_quotes => false }.freeze
Chris@0 822
Chris@0 823 #
Chris@0 824 # This method will build a drop-in replacement for many of the standard CSV
Chris@0 825 # methods. It allows you to write code like:
Chris@0 826 #
Chris@0 827 # begin
Chris@0 828 # require "faster_csv"
Chris@0 829 # FasterCSV.build_csv_interface
Chris@0 830 # rescue LoadError
Chris@0 831 # require "csv"
Chris@0 832 # end
Chris@0 833 # # ... use CSV here ...
Chris@0 834 #
Chris@0 835 # This is not a complete interface with completely identical behavior.
Chris@0 836 # However, it is intended to be close enough that you won't notice the
Chris@0 837 # difference in most cases. CSV methods supported are:
Chris@0 838 #
Chris@0 839 # * foreach()
Chris@0 840 # * generate_line()
Chris@0 841 # * open()
Chris@0 842 # * parse()
Chris@0 843 # * parse_line()
Chris@0 844 # * readlines()
Chris@0 845 #
Chris@0 846 # Be warned that this interface is slower than vanilla FasterCSV due to the
Chris@0 847 # extra layer of method calls. Depending on usage, this can slow it down to
Chris@0 848 # near CSV speeds.
Chris@0 849 #
def self.build_csv_interface
  # create the stand-in class and publish it under the top-level name CSV
  shim = Class.new
  Object.const_set(:CSV, shim)

  # every class method below maps the positional separator arguments
  # (fs = field separator, rs = row separator) onto FasterCSV's option Hash
  shim.class_eval do
    def self.foreach(path, rs = :auto, &block)  # :nodoc:
      FasterCSV.foreach(path, :row_sep => rs, &block)
    end

    def self.generate_line(row, fs = ",", rs = "")  # :nodoc:
      FasterCSV.generate_line(row, :col_sep => fs, :row_sep => rs)
    end

    def self.open(path, mode, fs = ",", rs = :auto, &block)  # :nodoc:
      # without a block, or when not reading, simply hand over to FasterCSV
      unless block and mode.include? "r"
        return FasterCSV.open(path, mode, :col_sep => fs, :row_sep => rs, &block)
      end

      # a block given with a read mode receives rows, not the csv object
      FasterCSV.open(path, mode, :col_sep => fs, :row_sep => rs) do |csv|
        csv.each(&block)
      end
    end

    def self.parse(str_or_readable, fs = ",", rs = :auto, &block)  # :nodoc:
      FasterCSV.parse(str_or_readable, :col_sep => fs, :row_sep => rs, &block)
    end

    def self.parse_line(src, fs = ",", rs = :auto)  # :nodoc:
      FasterCSV.parse_line(src, :col_sep => fs, :row_sep => rs)
    end

    def self.readlines(path, rs = :auto)  # :nodoc:
      FasterCSV.readlines(path, :row_sep => rs)
    end
  end
end
Chris@0 883
Chris@0 884 #
Chris@0 885 # This method allows you to serialize an Array of Ruby objects to a String or
Chris@0 886 # File of CSV data. This is not as powerful as Marshal or YAML, but perhaps
Chris@0 887 # useful for spreadsheet and database interaction.
Chris@0 888 #
Chris@0 889 # Out of the box, this method is intended to work with simple data objects or
Chris@0 890 # Structs. It will serialize a list of instance variables and/or
Chris@0 891 # Struct.members().
Chris@0 892 #
Chris@0 893 # If you need more complicated serialization, you can control the process
Chris@0 894 # by adding methods to the class to be serialized.
Chris@0 895 #
Chris@0 896 # A class method csv_meta() is responsible for returning the first row of the
Chris@0 897 # document (as an Array). This row is considered to be a Hash of the form
Chris@0 898 # key_1,value_1,key_2,value_2,... FasterCSV::load() expects to find a class
Chris@0 899 # key with a value of the stringified class name and FasterCSV::dump() will
Chris@0 900 # create this, if you do not define this method. This method is only called
Chris@0 901 # on the first object of the Array.
Chris@0 902 #
Chris@0 903 # The next method you can provide is an instance method called csv_headers().
Chris@0 904 # This method is expected to return the second line of the document (again as
Chris@0 905 # an Array), which is to be used to give each column a header. By default,
Chris@0 906 # FasterCSV::load() will set an instance variable if the field header starts
Chris@0 907 # with an @ character or call send() passing the header as the method name and
Chris@0 908 # the field value as an argument. This method is only called on the first
Chris@0 909 # object of the Array.
Chris@0 910 #
Chris@0 911 # Finally, you can provide an instance method called csv_dump(), which will
Chris@0 912 # be passed the headers. This should return an Array of fields that can be
Chris@0 913 # serialized for this object. This method is called once for every object in
Chris@0 914 # the Array.
Chris@0 915 #
Chris@0 916 # The +io+ parameter can be used to serialize to a File, and +options+ can be
Chris@0 917 # anything FasterCSV::new() accepts.
Chris@0 918 #
def self.dump(ary_of_objs, io = "", options = Hash.new)
  # the first object decides the meta row and the headers for all objects
  template = ary_of_objs.first
  writer   = FasterCSV.new(io, options)

  # meta row: the class's own csv_meta(), or a default class,ClassName pair
  begin
    writer << template.class.csv_meta
  rescue NoMethodError
    writer << [:class, template.class]
  end

  # header row: the object's own csv_headers(), or its instance variables
  # followed by Struct setters ("member=") when the object is a Struct
  headers = begin
    template.csv_headers
  rescue NoMethodError
    names = template.instance_variables.sort
    if template.class.ancestors.any? { |cls| cls.to_s =~ /\AStruct\b/ }
      names += template.members.map { |mem| "#{mem}=" }.sort
    end
    names
  end
  writer << headers

  # one row per object: the object's own csv_dump(), or a field looked up
  # for each header
  ary_of_objs.each do |obj|
    begin
      writer << obj.csv_dump(headers)
    rescue NoMethodError
      writer << headers.map do |var|
        # "@name" headers are instance variables
        next obj.instance_variable_get(var) if var[0] == ?@
        # anything else is a "member=" setter name; strip the "=" and index
        obj[var[0..-2]]
      end
    end
  end

  # a String target yields the generated text, anything else gets closed
  io.is_a?(String) ? writer.string : writer.close
end
Chris@0 963
Chris@0 964 #
Chris@0 965 # :call-seq:
Chris@0 966 # filter( options = Hash.new ) { |row| ... }
Chris@0 967 # filter( input, options = Hash.new ) { |row| ... }
Chris@0 968 # filter( input, output, options = Hash.new ) { |row| ... }
Chris@0 969 #
Chris@0 970 # This method is a convenience for building Unix-like filters for CSV data.
Chris@0 971 # Each row is yielded to the provided block which can alter it as needed.
Chris@0 972 # After the block returns, the row is appended to +output+ altered or not.
Chris@0 973 #
Chris@0 974 # The +input+ and +output+ arguments can be anything FasterCSV::new() accepts
Chris@0 975 # (generally String or IO objects). If not given, they default to
Chris@0 976 # <tt>ARGF</tt> and <tt>$stdout</tt>.
Chris@0 977 #
Chris@0 978 # The +options+ parameter is also filtered down to FasterCSV::new() after some
Chris@0 979 # clever key parsing. Any key beginning with <tt>:in_</tt> or
Chris@0 980 # <tt>:input_</tt> will have that leading identifier stripped and will only
Chris@0 981 # be used in the +options+ Hash for the +input+ object. Keys starting with
Chris@0 982 # <tt>:out_</tt> or <tt>:output_</tt> affect only +output+. All other keys
Chris@0 983 # are assigned to both objects.
Chris@0 984 #
Chris@0 985 # The <tt>:output_row_sep</tt> +option+ defaults to
Chris@0 986 # <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
Chris@0 987 #
def self.filter(*args)
  # output rows end in $INPUT_RECORD_SEPARATOR unless told otherwise
  in_options  = Hash.new
  out_options = {:row_sep => $INPUT_RECORD_SEPARATOR}

  # a trailing Hash holds the options; route each key by its prefix
  given = args.last.is_a?(Hash) ? args.pop : Hash.new
  given.each do |key, value|
    name = key.to_s
    if name =~ /\Ain(?:put)?_(.+)\Z/        # :in_* / :input_*   -> input only
      in_options[$1.to_sym] = value
    elsif name =~ /\Aout(?:put)?_(.+)\Z/    # :out_* / :output_* -> output only
      out_options[$1.to_sym] = value
    else                                    # everything else    -> both sides
      in_options[key]  = value
      out_options[key] = value
    end
  end

  # wrap the remaining positional arguments, falling back to ARGF and $stdout
  input  = FasterCSV.new(args.shift || ARGF, in_options)
  output = FasterCSV.new(args.shift || $stdout, out_options)

  # let the block see (and possibly alter) each row before it is written
  input.each do |row|
    yield row
    output << row
  end
end
Chris@0 1014
Chris@0 1015 #
Chris@0 1016 # This method is intended as the primary interface for reading CSV files. You
Chris@0 1017 # pass a +path+ and any +options+ you wish to set for the read. Each row of
Chris@0 1018 # file will be passed to the provided +block+ in turn.
Chris@0 1019 #
Chris@0 1020 # The +options+ parameter can be anything FasterCSV::new() understands.
Chris@0 1021 #
def self.foreach(path, options = Hash.new, &block)
  # open() closes the file when its block exits; rows go straight to +block+
  open(path, "rb", options) { |csv| csv.each(&block) }
end
Chris@0 1027
Chris@0 1028 #
Chris@0 1029 # :call-seq:
Chris@0 1030 # generate( str, options = Hash.new ) { |faster_csv| ... }
Chris@0 1031 # generate( options = Hash.new ) { |faster_csv| ... }
Chris@0 1032 #
Chris@0 1033 # This method wraps a String you provide, or an empty default String, in a
Chris@0 1034 # FasterCSV object which is passed to the provided block. You can use the
Chris@0 1035 # block to append CSV rows to the String and when the block exits, the
Chris@0 1036 # final String will be returned.
Chris@0 1037 #
Chris@0 1038 # Note that a passed String *is* modified by this method. Call dup() before
Chris@0 1039 # passing if you need a new String.
Chris@0 1040 #
Chris@0 1041 # The +options+ parameter can be anything FasterCSV::new() understands.
Chris@0 1042 #
def self.generate(*args)
  # decide what to write into: the caller's String, wrapped and positioned
  # at its end so rows are appended, or a brand new empty String
  target = if args.first.is_a? String
    appender = StringIO.new(args.shift)
    appender.seek(0, IO::SEEK_END)
    appender
  else
    ""
  end

  faster_csv = new(target, *args)  # whatever is left in args is the options
  yield faster_csv                 # the block appends rows
  faster_csv.string                # the finished String is the result
end
Chris@0 1056
Chris@0 1057 #
Chris@0 1058 # This method is a shortcut for converting a single row (Array) into a CSV
Chris@0 1059 # String.
Chris@0 1060 #
Chris@0 1061 # The +options+ parameter can be anything FasterCSV::new() understands.
Chris@0 1062 #
Chris@0 1063 # The <tt>:row_sep</tt> +option+ defaults to <tt>$INPUT_RECORD_SEPARATOR</tt>
Chris@0 1064 # (<tt>$/</tt>) when calling this method.
Chris@0 1065 #
def self.generate_line(row, options = Hash.new)
  # the caller's options win over the $INPUT_RECORD_SEPARATOR default
  line_options = {:row_sep => $INPUT_RECORD_SEPARATOR}.merge(options)
  csv = new("", line_options)
  # << returns the object it was sent to, whose String holds the one line
  (csv << row).string
end
Chris@0 1070
Chris@0 1071 #
Chris@0 1072 # This method will return a FasterCSV instance, just like FasterCSV::new(),
Chris@0 1073 # but the instance will be cached and returned for all future calls to this
Chris@0 1074 # method for the same +data+ object (tested by Object#object_id()) with the
Chris@0 1075 # same +options+.
Chris@0 1076 #
Chris@0 1077 # If a block is given, the instance is passed to the block and the return
Chris@0 1078 # value becomes the return value of the block.
Chris@0 1079 #
def self.instance(data = $stdout, options = Hash.new)
  # Build a _signature_ for this call: the identity of the data object plus
  # the option values.  Values for the DEFAULT_OPTIONS keys come first, in a
  # stable (name sorted) order, exactly as before.
  default_keys = DEFAULT_OPTIONS.keys.sort_by { |sym| sym.to_s }

  #
  # Options that are not listed in DEFAULT_OPTIONS used to be left out of the
  # signature, so two calls differing only in such an option were handed the
  # same cached instance.  Append them as sorted [key, value] pairs.  When no
  # such option is given the signature is unchanged.
  #
  extras = options.reject { |key, _| DEFAULT_OPTIONS.has_key?(key) }.
                   sort_by { |key, _| key.to_s }

  sig = [data.object_id] + options.values_at(*default_keys) + extras

  # fetch or create the instance for this signature
  @@instances ||= Hash.new
  instance = (@@instances[sig] ||= new(data, options))

  if block_given?
    yield instance  # run block, if given, returning result
  else
    instance        # or return the instance
  end
end
Chris@0 1095
Chris@0 1096 #
Chris@0 1097 # This method is the reading counterpart to FasterCSV::dump(). See that
Chris@0 1098 # method for a detailed description of the process.
Chris@0 1099 #
Chris@0 1100 # You can customize loading by adding a class method called csv_load() which
Chris@0 1101 # will be passed a Hash of meta information, an Array of headers, and an Array
Chris@0 1102 # of fields for the object the method is expected to return.
Chris@0 1103 #
Chris@0 1104 # Remember that all fields will be Strings after this load. If you need
Chris@0 1105 # something else, use +options+ to setup converters or provide a custom
Chris@0 1106 # csv_load() implementation.
Chris@0 1107 #
def self.load(io_or_str, options = Hash.new)
  csv = FasterCSV.new(io_or_str, options)

  # row one is a flat key,value,key,value,... list of meta information
  meta = Hash[*csv.shift]

  #
  # Resolve the (possibly namespaced) class name one constant at a time.
  #
  # NOTE(review): the class name, and the header names used with send()
  # below, are read from the data itself, so only load trusted documents.
  #
  cls = Object
  meta["class"].split("::").each { |const| cls = cls.const_get(const) }

  # row two names the columns
  headers = csv.shift

  # every remaining row becomes one object
  results = Array.new
  csv.each do |row|
    begin
      # a class level csv_load() is used when the class offers one
      obj = cls.csv_load(meta, headers, row)
    rescue NoMethodError
      # otherwise build a bare object and fill it in header by header
      obj = cls.allocate
      headers.zip(row) do |name, value|
        if name[0] == ?@
          obj.instance_variable_set(name, value)  # "@name" header
        else
          obj.send(name, value)                   # method (setter) header
        end
      end
    end
    results << obj
  end

  # only objects we did not wrap from a String are closed
  csv.close unless io_or_str.is_a? String

  results
end
Chris@0 1141
Chris@0 1142 #
Chris@0 1143 # :call-seq:
Chris@0 1144 # open( filename, mode="rb", options = Hash.new ) { |faster_csv| ... }
Chris@0 1145 # open( filename, mode="rb", options = Hash.new )
Chris@0 1146 #
Chris@0 1147 # This method opens an IO object, and wraps that with FasterCSV. This is
Chris@0 1148 # intended as the primary interface for writing a CSV file.
Chris@0 1149 #
Chris@0 1150 # You may pass any +args+ Ruby's open() understands followed by an optional
Chris@0 1151 # Hash containing any +options+ FasterCSV::new() understands.
Chris@0 1152 #
Chris@0 1153 # This method works like Ruby's open() call, in that it will pass a FasterCSV
Chris@0 1154 # object to a provided block and close it when the block terminates, or it
Chris@0 1155 # will return the FasterCSV object when no block is provided. (*Note*: This
Chris@0 1156 # is different from the standard CSV library which passes rows to the block.
Chris@0 1157 # Use FasterCSV::foreach() for that behavior.)
Chris@0 1158 #
Chris@0 1159 # An opened FasterCSV object will delegate to many IO methods, for
Chris@0 1160 # convenience. You may call:
Chris@0 1161 #
Chris@0 1162 # * binmode()
Chris@0 1163 # * close()
Chris@0 1164 # * close_read()
Chris@0 1165 # * close_write()
Chris@0 1166 # * closed?()
Chris@0 1167 # * eof()
Chris@0 1168 # * eof?()
Chris@0 1169 # * fcntl()
Chris@0 1170 # * fileno()
Chris@0 1171 # * flush()
Chris@0 1172 # * fsync()
Chris@0 1173 # * ioctl()
Chris@0 1174 # * isatty()
Chris@0 1175 # * pid()
Chris@0 1176 # * pos()
Chris@0 1177 # * reopen()
Chris@0 1178 # * seek()
Chris@0 1179 # * stat()
Chris@0 1180 # * sync()
Chris@0 1181 # * sync=()
Chris@0 1182 # * tell()
Chris@0 1183 # * to_i()
Chris@0 1184 # * to_io()
Chris@0 1185 # * tty?()
Chris@0 1186 #
def self.open(*args)
  # a trailing Hash holds the FasterCSV options; the rest goes to File.open()
  options = if args.last.is_a? Hash then args.pop else Hash.new end
  # default to a binary open mode
  args << "rb" if args.size == 1

  # open the File with the remaining +args+ ...
  file = File.open(*args)

  #
  # ... and wrap it.  The constructor can raise (unknown options, for
  # example), in which case nothing else holds the File, so close it here
  # instead of leaking the handle, then let the error continue on its way.
  #
  begin
    csv = new(file, options)
  rescue Exception
    file.close
    raise
  end

  # handle blocks like Ruby's open(), not like the CSV library
  if block_given?
    begin
      yield csv
    ensure
      csv.close
    end
  else
    csv
  end
end
Chris@0 1206
Chris@0 1207 #
Chris@0 1208 # :call-seq:
Chris@0 1209 # parse( str, options = Hash.new ) { |row| ... }
Chris@0 1210 # parse( str, options = Hash.new )
Chris@0 1211 #
Chris@0 1212 # This method can be used to easily parse CSV out of a String. You may either
Chris@0 1213 # provide a +block+ which will be called with each row of the String in turn,
Chris@0 1214 # or just use the returned Array of Arrays (when no +block+ is given).
Chris@0 1215 #
Chris@0 1216 # You pass your +str+ to read from, and an optional +options+ Hash containing
Chris@0 1217 # anything FasterCSV::new() understands.
Chris@0 1218 #
def self.parse(*args, &block)
  csv = new(*args)

  # with a block, every row is handed to it in turn
  return csv.each(&block) unless block.nil?

  # without one, slurp all the rows and close the wrapper afterwards
  begin
    csv.read
  ensure
    csv.close
  end
end
Chris@0 1231
Chris@0 1232 #
Chris@0 1233 # This method is a shortcut for converting a single line of a CSV String into
Chris@0 1234 # an Array. Note that if +line+ contains multiple rows, anything
Chris@0 1235 # beyond the first row is ignored.
Chris@0 1236 #
Chris@0 1237 # The +options+ parameter can be anything FasterCSV::new() understands.
Chris@0 1238 #
def self.parse_line(line, options = Hash.new)
  # wrap the text and pull just the first row out of it
  csv = new(line, options)
  csv.shift
end
Chris@0 1242
Chris@0 1243 #
Chris@0 1244 # Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
Chris@0 1245 # file and any +options+ FasterCSV::new() understands.
Chris@0 1246 #
def self.read(path, options = Hash.new)
  # open() closes the file once the block has slurped every row
  open(path, "rb", options) do |csv|
    csv.read
  end
end
Chris@0 1250
Chris@0 1251 # Alias for FasterCSV::read().
def self.readlines(*args)
  # a pure pass-through: every argument goes on to read() untouched
  self.read(*args)
end
Chris@0 1255
Chris@0 1256 #
Chris@0 1257 # A shortcut for:
Chris@0 1258 #
Chris@0 1259 # FasterCSV.read( path, { :headers => true,
Chris@0 1260 # :converters => :numeric,
Chris@0 1261 # :header_converters => :symbol }.merge(options) )
Chris@0 1262 #
def self.table(path, options = Hash.new)
  # settings for a typical table read; the caller's +options+ override them
  table_defaults = { :headers           => true,
                     :converters        => :numeric,
                     :header_converters => :symbol }
  read(path, table_defaults.merge(options))
end
Chris@0 1268
Chris@0 1269 #
Chris@0 1270 # This constructor will wrap either a String or IO object passed in +data+ for
Chris@0 1271 # reading and/or writing. In addition to the FasterCSV instance methods,
Chris@0 1272 # several IO methods are delegated. (See FasterCSV::open() for a complete
Chris@0 1273 # list.) If you pass a String for +data+, you can later retrieve it (after
Chris@0 1274 # writing to it, for example) with FasterCSV.string().
Chris@0 1275 #
Chris@0 1276 # Note that a wrapped String will be positioned at the beginning (for
Chris@0 1277 # reading). If you want it at the end (for writing), use
Chris@0 1278 # FasterCSV::generate(). If you want any other positioning, pass a preset
Chris@0 1279 # StringIO object instead.
Chris@0 1280 #
Chris@0 1281 # You may set any reading and/or writing preferences in the +options+ Hash.
Chris@0 1282 # Available options are:
Chris@0 1283 #
Chris@0 1284 # <b><tt>:col_sep</tt></b>:: The String placed between each field.
Chris@0 1285 # <b><tt>:row_sep</tt></b>:: The String appended to the end of each
Chris@0 1286 # row. This can be set to the special
Chris@0 1287 # <tt>:auto</tt> setting, which requests
Chris@0 1288 # that FasterCSV automatically discover
Chris@0 1289 # this from the data. Auto-discovery
Chris@0 1290 # reads ahead in the data looking for
Chris@0 1291 # the next <tt>"\r\n"</tt>,
Chris@0 1292 # <tt>"\n"</tt>, or <tt>"\r"</tt>
Chris@0 1293 # sequence. A sequence will be selected
Chris@0 1294 # even if it occurs in a quoted field,
Chris@0 1295 # assuming that you would have the same
Chris@0 1296 # line endings there. If none of those
Chris@0 1297 # sequences is found, +data+ is
Chris@0 1298 # <tt>ARGF</tt>, <tt>STDIN</tt>,
Chris@0 1299 # <tt>STDOUT</tt>, or <tt>STDERR</tt>,
Chris@0 1300 # or the stream is only available for
Chris@0 1301 # output, the default
Chris@0 1302 # <tt>$INPUT_RECORD_SEPARATOR</tt>
Chris@0 1303 # (<tt>$/</tt>) is used. Obviously,
Chris@0 1304 # discovery takes a little time. Set
Chris@0 1305 # manually if speed is important. Also
Chris@0 1306 # note that IO objects should be opened
Chris@0 1307 # in binary mode on Windows if this
Chris@0 1308 # feature will be used as the
Chris@0 1309 # line-ending translation can cause
Chris@0 1310 # problems with resetting the document
Chris@0 1311 # position to where it was before the
Chris@0 1312 # read ahead.
Chris@0 1313 # <b><tt>:quote_char</tt></b>:: The character used to quote fields.
Chris@0 1314 # This has to be a single character
Chris@0 1315 # String. This is useful for
Chris@0 1316 # applications that incorrectly use
Chris@0 1317 # <tt>'</tt> as the quote character
Chris@0 1318 # instead of the correct <tt>"</tt>.
Chris@0 1319 # FasterCSV will always consider a
Chris@0 1320 # double sequence of this character to be
Chris@0 1321 # an escaped quote.
Chris@0 1322 # <b><tt>:encoding</tt></b>:: The encoding to use when parsing the
Chris@0 1323 # file. Defaults to your <tt>$KCODE</tt>
Chris@0 1324 # setting. Valid values: <tt>`n’</tt> or
Chris@0 1325 # <tt>`N’</tt> for none, <tt>`e’</tt> or
Chris@0 1326 # <tt>`E’</tt> for EUC, <tt>`s’</tt> or
Chris@0 1327 # <tt>`S’</tt> for SJIS, and
Chris@0 1328 # <tt>`u’</tt> or <tt>`U’</tt> for UTF-8
Chris@0 1329 # (see Regexp.new()).
Chris@0 1330 # <b><tt>:field_size_limit</tt></b>:: This is a maximum size FasterCSV will
Chris@0 1331 # read ahead looking for the closing
Chris@0 1332 # quote for a field. (In truth, it
Chris@0 1333 # reads to the first line ending beyond
Chris@0 1334 # this size.) If a quote cannot be
Chris@0 1335 # found within the limit FasterCSV will
Chris@0 1336 # raise a MalformedCSVError, assuming
Chris@0 1337 # the data is faulty. You can use this
Chris@0 1338 # limit to prevent what are effectively
Chris@0 1339 # DoS attacks on the parser. However,
Chris@0 1340 # this limit can cause a legitimate
Chris@0 1341 # parse to fail and thus is set to
Chris@0 1342 # +nil+, or off, by default.
Chris@0 1343 # <b><tt>:converters</tt></b>:: An Array of names from the Converters
Chris@0 1344 # Hash and/or lambdas that handle custom
Chris@0 1345 # conversion. A single converter
Chris@0 1346 # doesn't have to be in an Array.
Chris@0 1347 # <b><tt>:unconverted_fields</tt></b>:: If set to +true+, an
Chris@0 1348 # unconverted_fields() method will be
Chris@0 1349 # added to all returned rows (Array or
Chris@0 1350 # FasterCSV::Row) that will return the
Chris@0 1351 # fields as they were before conversion.
Chris@0 1352 # Note that <tt>:headers</tt> supplied
Chris@0 1353 # by Array or String were not fields of
Chris@0 1354 # the document and thus will have an
Chris@0 1355 # empty Array attached.
Chris@0 1356 # <b><tt>:headers</tt></b>:: If set to <tt>:first_row</tt> or
Chris@0 1357 # +true+, the initial row of the CSV
Chris@0 1358 # file will be treated as a row of
Chris@0 1359 # headers. If set to an Array, the
Chris@0 1360 # contents will be used as the headers.
Chris@0 1361 # If set to a String, the String is run
Chris@0 1362 # through a call of
Chris@0 1363 # FasterCSV::parse_line() with the same
Chris@0 1364 # <tt>:col_sep</tt>, <tt>:row_sep</tt>,
Chris@0 1365 # and <tt>:quote_char</tt> as this
Chris@0 1366 # instance to produce an Array of
Chris@0 1367 # headers. This setting causes
Chris@0 1368 # FasterCSV.shift() to return rows as
Chris@0 1369 # FasterCSV::Row objects instead of
Chris@0 1370 # Arrays and FasterCSV.read() to return
Chris@0 1371 # FasterCSV::Table objects instead of
Chris@0 1372 # an Array of Arrays.
Chris@0 1373 # <b><tt>:return_headers</tt></b>:: When +false+, header rows are silently
Chris@0 1374 # swallowed. If set to +true+, header
Chris@0 1375 # rows are returned in a FasterCSV::Row
Chris@0 1376 # object with identical headers and
Chris@0 1377 # fields (save that the fields do not go
Chris@0 1378 # through the converters).
Chris@0 1379 # <b><tt>:write_headers</tt></b>:: When +true+ and <tt>:headers</tt> is
Chris@0 1380 # set, a header row will be added to the
Chris@0 1381 # output.
Chris@0 1382 # <b><tt>:header_converters</tt></b>:: Identical in functionality to
Chris@0 1383 # <tt>:converters</tt> save that the
Chris@0 1384 # conversions are only made to header
Chris@0 1385 # rows.
Chris@0 1386 # <b><tt>:skip_blanks</tt></b>:: When set to a +true+ value, FasterCSV
Chris@0 1387 # will skip over any rows with no
Chris@0 1388 # content.
Chris@0 1389 # <b><tt>:force_quotes</tt></b>:: When set to a +true+ value, FasterCSV
Chris@0 1390 # will quote all CSV fields it creates.
Chris@0 1391 #
Chris@0 1392 # See FasterCSV::DEFAULT_OPTIONS for the default settings.
Chris@0 1393 #
Chris@0 1394 # Options cannot be overridden in the instance methods for performance reasons,
Chris@0 1395 # so be sure to set what you want here.
Chris@0 1396 #
def initialize(data, options = Hash.new)
  # layer the caller's settings over the defaults; merge() builds a new Hash,
  # so the Hash passed in by the caller is not modified here
  options = DEFAULT_OPTIONS.merge(options)

  # a String is wrapped in a StringIO, anything else is used as the IO as is
  @io = data.is_a?(String) ? StringIO.new(data) : data

  # each setup step is handed the options Hash
  # (presumably removing the entries it handles -- confirm in the init_*()
  # methods, which are defined elsewhere in this file)
  init_separators(options)
  init_parsers(options)
  init_converters(options)
  init_headers(options)

  # whatever is still in the Hash at this point is reported as unknown
  leftover = options.keys
  unless leftover.empty?
    raise ArgumentError, "Unknown options: #{leftover.join(', ')}."
  end

  # track our own lineno since IO gets confused about line-ends in CSV fields
  @lineno = 0
end
Chris@0 1416
Chris@0 1417 #
Chris@0 1418 # The line number of the last row read from this file. Fields with nested
Chris@0 1419 # line-end characters will not affect this count.
Chris@0 1420 #
Chris@0 1421 attr_reader :lineno
Chris@0 1422
Chris@0 1423 ### IO and StringIO Delegation ###
Chris@0 1424
extend Forwardable

# each of these methods is answered by the wrapped IO (or StringIO) object
[ :binmode, :close,  :close_read, :close_write, :closed?,
  :eof,     :eof?,   :fcntl,      :fileno,      :flush,
  :fsync,   :ioctl,  :isatty,     :pid,         :pos,
  :reopen,  :seek,   :stat,       :string,      :sync,
  :sync=,   :tell,   :to_i,       :to_io,       :tty? ].each do |io_method|
  def_delegator :@io, io_method
end
Chris@0 1430
Chris@0 1431 # Rewinds the underlying IO object and resets FasterCSV's lineno() counter.
def rewind
  # forget the row count and any headers seen so far ...
  @lineno  = 0
  @headers = nil

  # ... then send the wrapped IO back to its start
  @io.rewind
end
Chris@0 1438
Chris@0 1439 ### End Delegation ###
Chris@0 1440
Chris@0 1441 #
Chris@0 1442 # The primary write method for wrapped Strings and IOs, +row+ (an Array or
Chris@0 1443 # FasterCSV::Row) is converted to CSV and appended to the data source. When a
Chris@0 1444 # FasterCSV::Row is passed, only the row's fields() are appended to the
Chris@0 1445 # output.
Chris@0 1446 #
Chris@0 1447 # The data source must be open for writing.
Chris@0 1448 #
def <<(row)
  # headers given up front (as an Array or String) must be in place before
  # the first row goes out; parsing them does not read from the data source
  if header_row? and [Array, String].include? @use_headers.class
    parse_headers
    self << @headers if @write_headers  # emit the header row first
  end

  # reduce the row to a plain list of fields
  fields = if self.class::Row === row
    row.fields                                 # a FasterCSV::Row
  elsif Hash === row
    @headers.map { |header| row[header] }      # a Hash, read in header order
  else
    row                                        # already a list of fields
  end

  # when headers are still expected, this row supplies them
  @headers = fields if header_row?
  @lineno += 1

  # quote each field, join with the column separator, end the row
  @io << fields.map(&@quote).join(@col_sep) + @row_sep

  self  # so calls can be chained
end
Chris@0 1470 alias_method :add_row, :<<
Chris@0 1471 alias_method :puts, :<<
Chris@0 1472
Chris@0 1473 #
Chris@0 1474 # :call-seq:
Chris@0 1475 # convert( name )
Chris@0 1476 # convert { |field| ... }
Chris@0 1477 # convert { |field, field_info| ... }
Chris@0 1478 #
Chris@0 1479 # You can use this method to install a FasterCSV::Converters built-in, or
Chris@0 1480 # provide a block that handles a custom conversion.
Chris@0 1481 #
Chris@0 1482 # If you provide a block that takes one argument, it will be passed the field
Chris@0 1483 # and is expected to return the converted value or the field itself. If your
Chris@0 1484 # block takes two arguments, it will also be passed a FieldInfo Struct,
Chris@0 1485 # containing details about the field. Again, the block should return a
Chris@0 1486 # converted field or the field itself.
Chris@0 1487 #
def convert(name = nil, &converter)
  # built-in field converters are looked up by +name+ in this class's table
  builtins = self.class::Converters
  add_converter(:converters, builtins, name, &converter)
end
Chris@0 1491
Chris@0 1492 #
Chris@0 1493 # :call-seq:
Chris@0 1494 # header_convert( name )
Chris@0 1495 # header_convert { |field| ... }
Chris@0 1496 # header_convert { |field, field_info| ... }
Chris@0 1497 #
Chris@0 1498 # Identical to FasterCSV.convert(), but for header rows.
Chris@0 1499 #
Chris@0 1500 # Note that this method must be called before header rows are read to have any
Chris@0 1501 # effect.
Chris@0 1502 #
def header_convert(name = nil, &converter)
  # built-in header converters are looked up by +name+ in this class's table
  builtins = self.class::HeaderConverters
  add_converter(:header_converters, builtins, name, &converter)
end
Chris@0 1509
Chris@0 1510 include Enumerable
Chris@0 1511
Chris@0 1512 #
Chris@0 1513 # Yields each row of the data source in turn.
Chris@0 1514 #
Chris@0 1515 # Support for Enumerable.
Chris@0 1516 #
Chris@0 1517 # The data source must be open for reading.
Chris@0 1518 #
def each
  #
  # Without a block there is nothing to yield to.  Previously one row was
  # pulled from the data source (and lost) before the yield failed with a
  # LocalJumpError.  Hand back an enumerator over the rows instead, which
  # reads nothing until it is used.
  #
  return enum_for(:each) unless block_given?

  # shift() returns +nil+ once the data source is exhausted
  while row = shift
    yield row
  end
end
Chris@0 1524
Chris@0 1525 #
Chris@0 1526 # Slurps the remaining rows and returns an Array of Arrays.
Chris@0 1527 #
Chris@0 1528 # The data source must be open for reading.
Chris@0 1529 #
def read
  # to_a() (from Enumerable) pulls every remaining row through each()
  rows = to_a
  # with headers in use the rows are wrapped in a Table, else returned as is
  @use_headers ? Table.new(rows) : rows
end
Chris@0 1538 alias_method :readlines, :read
Chris@0 1539
Chris@0 1540 # Returns +true+ if the next row read will be a header row.
def header_row?
  # no header handling requested: answer with that (false or nil) setting
  return @use_headers unless @use_headers
  # otherwise a header row is still due until headers have been stored
  @headers.nil?
end
Chris@0 1544
#
# The primary read method for wrapped Strings and IOs. A single row is pulled
# from the data source, parsed and returned as an Array of fields (when header
# rows are not used) or a FasterCSV::Row (when header rows are used).
#
# Returns +nil+ once <tt>@io.gets()</tt> yields nothing more (or raises).
# Raises MalformedCSVError for illegal quoting, for line endings inside
# unquoted fields, for a quoted field left open at the end of the data and for
# a field reaching <tt>@field_size_limit</tt>.
#
# The data source must be open for reading.
#
def shift
  #########################################################################
  ### This method is purposefully kept a bit long as simple conditional ###
  ### checks are faster than numerous (expensive) method calls.         ###
  #########################################################################

  # headers given as an Array or String do not come from the document, so
  # they can be handed back before anything is read
  if header_row? && @return_headers &&
     [Array, String].include?(@use_headers.class)
    return parse_headers unless @unconverted_fields
    return add_unconverted_fields(parse_headers, [])
  end

  # raw text read so far for the row being assembled
  line = ""

  #
  # one row may need several <tt>@io.gets()</tt> calls, because quoted fields
  # can contain \r and/or \n characters
  #
  loop do
    # append the next chunk; gets() returns nil at the end of the data, which
    # makes += raise, so running out of data lands in the rescue as well
    begin
      line += @io.gets(@row_sep)
    rescue
      return nil
    end

    # work on a copy without the trailing row separator
    parse = line.sub(@parsers[:line_end], "")

    #
    # a blank line is reported as an empty row, not as CSV's <tt>[nil]</tt>
    #
    if parse.empty?
      @lineno += 1
      if @skip_blanks
        line = ""
        next
      end
      return add_unconverted_fields([], []) if @unconverted_fields
      return FasterCSV::Row.new([], [])     if @use_headers
      return []
    end

    # split on the column separator, then glue quoted fields back together
    csv           = []
    current_field = ""
    field_quotes  = 0
    parse.split(@col_sep, -1).each do |match|
      if current_field.empty? && match.count(@quote_and_newlines).zero?
        # plain unquoted field; an empty one is reported as nil
        csv << (match.empty? ? nil : match)
      elsif (current_field.empty? ? match[0] : current_field[0]) == @quote_char[0]
        # (part of) a quoted field
        current_field << match
        field_quotes += match.count(@quote_char)
        if field_quotes % 2 == 0
          # quotes are balanced: strip the outer quotes and unescape the rest
          unquoted = current_field[@parsers[:quoted_field], 1]
          raise MalformedCSVError unless unquoted
          unquoted.gsub!(@quote_char * 2, @quote_char)
          csv << unquoted
          current_field = ""
          field_quotes  = 0
        else
          # the separator we split on was inside the quotes, put it back
          current_field << @col_sep
        end
      elsif match.count("\r\n").zero?
        raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
      else
        raise MalformedCSVError, "Unquoted fields do not allow " +
                                 "\\r or \\n (line #{lineno + 1})."
      end
    end

    # balanced quotes mean every field of the row has been collected
    if field_quotes % 2 == 0
      @lineno += 1

      # keep the raw fields around, if they were asked for
      unconverted = csv.dup if @unconverted_fields

      # run field converters here only when headers are off; with headers on
      # the row is handed to parse_headers() instead
      csv = convert_fields(csv) unless @use_headers || @converters.empty?
      csv = parse_headers(csv) if @use_headers

      # attach the raw fields unless the row already carries them
      if @unconverted_fields && !csv.respond_to?(:unconverted_fields)
        add_unconverted_fields(csv, unconverted)
      end

      return csv
    end

    # still inside a quoted field: fail if there is nothing left to read or
    # the field has grown too large, otherwise loop and read more data
    if @io.eof?
      raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
    elsif @field_size_limit && current_field.size >= @field_size_limit
      raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
    end
  end
end
Chris@0 1662 alias_method :gets, :shift
Chris@0 1663 alias_method :readline, :shift
Chris@0 1664
Chris@0 1665 # Returns a simplified description of the key FasterCSV attributes.
Chris@0 1666 def inspect
Chris@0 1667 str = "<##{self.class} io_type:"
Chris@0 1668 # show type of wrapped IO
Chris@0 1669 if @io == $stdout then str << "$stdout"
Chris@0 1670 elsif @io == $stdin then str << "$stdin"
Chris@0 1671 elsif @io == $stderr then str << "$stderr"
Chris@0 1672 else str << @io.class.to_s
Chris@0 1673 end
Chris@0 1674 # show IO.path(), if available
Chris@0 1675 if @io.respond_to?(:path) and (p = @io.path)
Chris@0 1676 str << " io_path:#{p.inspect}"
Chris@0 1677 end
Chris@0 1678 # show other attributes
Chris@0 1679 %w[ lineno col_sep row_sep
Chris@0 1680 quote_char skip_blanks encoding ].each do |attr_name|
Chris@0 1681 if a = instance_variable_get("@#{attr_name}")
Chris@0 1682 str << " #{attr_name}:#{a.inspect}"
Chris@0 1683 end
Chris@0 1684 end
Chris@0 1685 if @use_headers
Chris@0 1686 str << " headers:#{(@headers || true).inspect}"
Chris@0 1687 end
Chris@0 1688 str << ">"
Chris@0 1689 end
Chris@0 1690
Chris@0 1691 private
Chris@0 1692
#
# Stores the indicated separators for later use.
#
# If auto-discovery was requested for <tt>@row_sep</tt> (<tt>:auto</tt>), this
# method reads ahead in <tt>@io</tt> looking for the first "\r\n", "\r" or
# "\n" and then puts the stream back where it was. The default
# <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>) is used instead for +ARGF+,
# +STDIN+, +STDOUT+, +STDERR+ and Zlib::GzipWriter objects, for streams that
# cannot be read or repositioned (not open for reading, pipes, ...) and for
# data that ends before any line ending shows up.
#
# This method also establishes the quoting rules used for CSV output and
# stores them as a callable in <tt>@quote</tt>.
#
# Consumes <tt>:col_sep</tt>, <tt>:row_sep</tt>, <tt>:quote_char</tt> and
# <tt>:force_quotes</tt> from +options+. Raises ArgumentError unless
# <tt>:quote_char</tt> is exactly one character long.
#
def init_separators(options)
  # store the selected separators
  @col_sep    = options.delete(:col_sep)
  @row_sep    = options.delete(:row_sep)
  @quote_char = options.delete(:quote_char)

  # validate before deriving anything from the quote character
  if @quote_char.length != 1
    raise ArgumentError, ":quote_char has to be a single character String"
  end

  #
  # Character set handed to String#count() while parsing. The quote character
  # goes last: a leading "^" would turn the whole set into a negation.
  #
  @quote_and_newlines = "\r\n#{@quote_char}"

  # automatically discover row separator when requested
  if @row_sep == :auto
    if [ARGF, STDIN, STDOUT, STDERR].include?(@io) ||
       (defined?(Zlib) && @io.class == Zlib::GzipWriter)
      @row_sep = $INPUT_RECORD_SEPARATOR
    else
      begin
        saved_pos = @io.pos  # remember where we were
        while @row_sep == :auto
          #
          # if we run out of data, it's probably a single line
          # (use a sensible default)
          #
          if @io.eof?
            @row_sep = $INPUT_RECORD_SEPARATOR
            break
          end

          # read ahead a bit, taking one more character when the sample ends
          # in "\r" so a "\r\n" pair is not cut in half
          sample  = @io.read(1024)
          sample += @io.read(1) if sample[-1..-1] == "\r" && !@io.eof?

          # try to find a standard separator
          if sample =~ /\r\n?|\n/
            @row_sep = $&
            break
          end
        end
        # tricky seek() clone to work around GzipReader's lack of seek()
        @io.rewind
        # reset back to the remembered position
        while saved_pos > 1024  # avoid loading a lot of data into memory
          @io.read(1024)
          saved_pos -= 1024
        end
        @io.read(saved_pos) if saved_pos.nonzero?
      rescue IOError, SystemCallError
        # IOError:         stream not opened for reading
        # SystemCallError: stream cannot report or reset its position
        #                  (e.g. Errno::ESPIPE from a pipe)
        @row_sep = $INPUT_RECORD_SEPARATOR
      end
    end
  end

  # establish quoting rules: wrap in quotes, doubling any embedded quote
  do_quote = lambda do |field|
    @quote_char +
    String(field).gsub(@quote_char, @quote_char * 2) +
    @quote_char
  end
  @quote = if options.delete(:force_quotes)
    do_quote
  else
    lambda do |field|
      if field.nil?  # represent +nil+ fields as empty unquoted fields
        ""
      else
        field = String(field)  # Stringify fields
        # empty fields and fields holding a line ending, separator or quote
        # character must be quoted
        if field.empty? ||
           field.count("\r\n#{@col_sep}#{@quote_char}").nonzero?
          do_quote.call(field)
        else
          field  # unquoted field
        end
      end
    end
  end
end
Chris@0 1781
# Pre-compiles the parsing Regexps and stores them by name in
# <tt>@parsers</tt> for access during reads. Also consumes the
# <tt>:skip_blanks</tt>, <tt>:encoding</tt> and <tt>:field_size_limit</tt>
# settings from +options+.
def init_parsers(options)
  # store the parser behaviors
  @skip_blanks      = options.delete(:skip_blanks)
  @encoding         = options.delete(:encoding)  # nil will use $KCODE
  @field_size_limit = options.delete(:field_size_limit)

  # separators may contain Regexp metacharacters, so escape them first
  col_sep_re = Regexp.escape(@col_sep)
  row_sep_re = Regexp.escape(@row_sep)
  quote_re   = Regexp.escape(@quote_char)

  # prebuild Regexps for faster parsing
  any_field    = Regexp.new("[^#{col_sep_re}]+", Regexp::MULTILINE, @encoding)
  quoted_field = Regexp.new("^#{quote_re}(.*)#{quote_re}$",
                            Regexp::MULTILINE,
                            @encoding)
  # safer than chomp!()
  line_end     = Regexp.new("#{row_sep_re}\\z", nil, @encoding)

  @parsers = {
    :any_field    => any_field,
    :quoted_field => quoted_field,
    :line_end     => line_end
  }
end
Chris@0 1804
#
# Loads any converters requested during construction.
#
# If +field_name+ is set to <tt>:converters</tt> (the default) field
# converters are set. When +field_name+ is <tt>:header_converters</tt> header
# converters are added instead.
#
# The <tt>:unconverted_fields</tt> option is also activated for
# <tt>:converters</tt> calls, if requested.
#
def init_converters(options, field_name = :converters)
  @unconverted_fields = options.delete(:unconverted_fields) if field_name == :converters

  # start from an empty converter list (@converters or @header_converters)
  instance_variable_set("@#{field_name}", [])

  # the method that adds converters is named after the option minus its
  # "ers" ending (:converters => convert())
  adder = method(field_name.to_s.sub(/ers\Z/, ""))

  # load converters
  requested = options[field_name]
  unless requested.nil?
    # allow a single converter not wrapped in an Array
    requested = options[field_name] = [requested] unless requested.is_a?(Array)
    requested.each do |converter|
      if converter.is_a?(Proc)  # custom code block
        adder.call(&converter)
      else                      # by name
        adder.call(converter)
      end
    end
  end

  options.delete(field_name)
end
Chris@0 1843
# Stores header row settings (<tt>:headers</tt>, <tt>:return_headers</tt> and
# <tt>:write_headers</tt>) and loads header converters, if needed.
def init_headers(options)
  @use_headers, @return_headers, @write_headers =
    [:headers, :return_headers, :write_headers].map { |key| options.delete(key) }

  # headers must be delayed until shift(), in case they need a row of content
  @headers = nil

  init_converters(options, :header_converters)
end
Chris@0 1855
#
# The actual work method for adding converters, used by both
# FasterCSV.convert() and FasterCSV.header_convert().
#
# +var_name+ names the instance variable holding the converter list and
# +const+ is the Hash that named converters are looked up in. With no +name+
# the given block itself is stored. A +name+ that maps to an Array is treated
# as a list of further names, each of which is added in turn.
#
def add_converter(var_name, const, name = nil, &converter)
  list_name = "@#{var_name}"

  # custom converter: keep the block as it is
  return instance_variable_get(list_name) << converter if name.nil?

  named = const[name]
  if named.is_a?(Array)  # combo converter
    named.each { |member| add_converter(var_name, const, member) }
  else                   # individual named converter
    instance_variable_get(list_name) << named
  end
end
Chris@0 1880
#
# Processes +fields+ with <tt>@converters</tt>, or <tt>@header_converters</tt>
# if +headers+ is passed as +true+, returning the converted field set. Any
# converter that changes the field into something other than a String halts
# the pipeline of conversion for that field. This is primarily an efficiency
# shortcut.
#
def convert_fields(fields, headers = false)
  # see if we are converting headers or fields
  pipeline = headers ? @header_converters : @converters

  converted = []
  fields.each_with_index do |field, index|
    pipeline.each do |converter|
      if converter.arity == 1  # straight field converter
        field = converter[field]
      else                     # FieldInfo converter
        # a header name is only supplied for data fields of header-using CSV
        header = @use_headers && !headers ? @headers[index] : nil
        field  = converter[field, FieldInfo.new(index, lineno, header)]
      end
      break unless field.is_a?(String)  # short-circuit pipeline for speed
    end
    converted << field  # final state of the field, converted or original
  end
  converted
end
Chris@0 1905
#
# This method is used to turn a finished +row+ into a FasterCSV::Row. Header
# rows are also dealt with here, either by returning a FasterCSV::Row with
# identical headers and fields (save that the fields do not go through the
# converters) or by reading past them to return a field row. Headers are also
# saved in <tt>@headers</tt> for use in future rows.
#
# When +nil+, +row+ is assumed to be a header row not based on an actual row
# of the stream.
#
def parse_headers(row = nil)
  if @headers.nil?  # header row
    # save headers
    @headers =
      if @use_headers.is_a?(Array)      # Array of headers
        @use_headers
      elsif @use_headers.is_a?(String)  # CSV header String
        self.class.parse_line(@use_headers,
                              :col_sep    => @col_sep,
                              :row_sep    => @row_sep,
                              :quote_char => @quote_char)
      else                              # first row is headers
        row
      end

    # prepare converted and unconverted copies
    row      = @headers if row.nil?
    @headers = convert_fields(@headers, true)

    # hand the header row back, if requested
    return FasterCSV::Row.new(@headers, row, true) if @return_headers

    # headers taken from the document used up this row, so skip to the next
    return shift unless [Array, String].include?(@use_headers.class)
  end

  FasterCSV::Row.new(@headers, convert_fields(row))  # field row
end
Chris@0 1944
#
# This method injects an instance variable <tt>unconverted_fields</tt> into
# +row+ and an accessor method for it called unconverted_fields(). The
# variable is set to the contents of +fields+. Returns +row+.
#
def add_unconverted_fields(row, fields)
  # define the reader on this one object only, not on its class
  singleton = class << row; self; end
  singleton.send(:attr_reader, :unconverted_fields)

  row.instance_variable_set("@unconverted_fields", fields)
  row
end
Chris@0 1957 end
Chris@0 1958
Chris@0 1959 # Another name for FasterCSV.
Chris@0 1960 FCSV = FasterCSV
Chris@0 1961
# Shortcut for FasterCSV::instance(): every argument and the optional block
# are passed straight through and its result is returned.
def FasterCSV(*args, &block)
  ::FasterCSV.instance(*args, &block)
end
Chris@0 1966
# Shortcut for FCSV::instance(): every argument and the optional block are
# passed straight through and its result is returned.
def FCSV(*args, &block)
  ::FCSV.instance(*args, &block)
end
Chris@0 1971
class Array
  # Hands this Array and +options+ to FasterCSV::generate_line() and returns
  # the result.
  def to_csv(options = {})
    FasterCSV.generate_line(self, options)
  end
end
Chris@0 1978
class String
  # Hands this String and +options+ to FasterCSV::parse_line() and returns
  # the result.
  def parse_csv(options = {})
    FasterCSV.parse_line(self, options)
  end
end