module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  # * the _token_ _text_ (the original source of the token in a String)
  #
  # A token looks like this:
  #
  #   [:comment, '# It looks like this']
  #   [:float, '3.1415926']
  #   [:error, '$^']
  #
  # Some scanners also yield some kind of sub-tokens, represented by special
  # token texts, namely :open and :close .
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #   [
  #     [:open, :string],
  #     [:delimiter, '"'],
  #     [:content, 'a string'],
  #     [:delimiter, '"'],
  #     [:close, :string]
  #   ]
  #
  # Tokens is also the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Resolve the encoder class: an Encoder subclass is used as-is;
        # anything else (typically a Symbol) is looked up in Encoders.
        # (Previously a Class argument left encoder_class nil and crashed.)
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    def optimize
      last_kind = last_text = nil
      new = self.class.new
      each do |text, kind|
        if text.is_a? String
          if kind == last_kind
            # Same kind as the previous text token: append to it.
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text
            last_kind = kind
          end
        else
          # Non-text (range) token: flush the pending text token first.
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a correspondent :close one.
    #
    # TODO: Test this!
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      each do |type, kind|
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected :close; decide what to do based on the kind:
            # - token was never opened: delete the :close (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens
      tokens << token while token = opened.pop
      tokens
    end

    # Fix the object itself; see fix.
    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other token are single-line)
    # - there are no open tokens at the end the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    def split_into_lines
      raise NotImplementedError
    end

    # Split the object itself into lines; see split_into_lines.
    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the tokens.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        # Accumulate (the original `size + t.size` discarded the sum
        # and always returned 0).
        size += t.size
      end
      size
    end

    # The concatenated text of all text tokens.
    # Should be equal to the input before scanning.
    def text
      # Array#join turns the nil entries (range tokens) into ''.
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object using Marshal.load, then
    # unzip it using GZip.gunzip.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end


  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved, the input is highlighted the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |kind, text|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << [:regexp, '/\d+/']
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal tokens.size, 2
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end