module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (:open, :close, :begin_line, :end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely :open and :close.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:open, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:close, :string]
  #  ]
  #
  # Tokens is the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded
    # (as +text+, +kind+).
    def each kind_filter = nil, &block
      if kind_filter
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      else
        super(&block)
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # An Encoder class is instantiated directly; anything else
        # (a symbol, for example) is looked up in Encoders first.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end


    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    #
    # The receiver is left untouched: joined texts are built in
    # fresh String objects.
    def optimize
      last_kind = last_text = nil
      optimized = self.class.new
      each do |text, kind|
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            optimized << [last_text, last_kind] if last_kind
            # Copy the text so that appending the following tokens
            # does not modify the strings stored in the receiver.
            last_text = text.dup
            last_kind = kind
          end
        else
          # Token actions (:open, :close, ...) end the current run.
          optimized << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          optimized << [text, kind]
        end
      end
      optimized << [last_text, last_kind] if last_kind
      optimized
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a correspondent :close one
    # (and all :begin_line tokens a correspondent :end_line).
    #
    # Returns a new object of the receiver's class:
    # * closing tokens that were never opened are dropped,
    # * a closing token for an outer range also closes the ranges
    #   opened inside of it,
    # * ranges still open at the end are closed, innermost first.
    #
    # TODO: Test this!
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of the closing tokens
      # that are still expected.
      opened = []
      each do |type, kind|
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          closer = [type, kind]
          # Token was never opened: delete the closer (just skip it).
          next unless opened.include? closer
          # Token was opened earlier: also close tokens in between.
          until (pending = opened.pop) == closer
            tokens << pending
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens.
      tokens << opened.pop until opened.empty?
      tokens
    end

    # Fix the object itself; see fix.
    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other token are single-line)
    # - there are no open tokens at the end the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    #
    # Not implemented yet; always raises NotImplementedError.
    def split_into_lines
      raise NotImplementedError
    end

    # In-place variant of split_into_lines.
    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      # Loaded lazily so that the helper is only required when used.
      require 'coderay/helpers/gzip_simple'
      data = Marshal.dump self
      data = data.gzip gzip_level
      data.extend Undumping
    end

    # The total size of the tokens.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |text, _kind|
        size += text.size
      end
      size
    end

    # Return all text tokens joined into a single string.
    def text
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Unzip the dump using GZip.gunzip, then undump the
    # object using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects; only
    # load dumps that come from a trusted source.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      Marshal.load dump.gunzip
    end

  end


  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved, the input is highlighted the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Raises ArgumentError if no block is given.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

# When run directly, evaluate the test suite stored after __END__.
# The line offset makes backtraces point at the right source lines.
if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal tokens.size, 2
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end