module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  # * the _token_ _text_ (the original source of the token in a String)
  #
  # A token looks like this:
  #
  #   [:comment, '# It looks like this']
  #   [:float, '3.1415926']
  #   [:error, '$^']
  #
  # Some scanners also yield some kind of sub-tokens, represented by special
  # token texts, namely :open and :close .
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #   [
  #     [:open, :string],
  #     [:delimiter, '"'],
  #     [:content, 'a string'],
  #     [:delimiter, '"'],
  #     [:close, :string]
  #   ]
  #
  # Tokens is also the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Resolve the encoder class: an Encoder subclass is used as-is;
        # anything else (typically a Symbol) is looked up in Encoders.
        # (Previously a Class argument left encoder_class nil and crashed.)
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    def optimize
      last_kind = last_text = nil
      new = self.class.new
      each do |text, kind|
        if text.is_a? String
          if kind == last_kind
            # Same kind as the previous text token: append to it.
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text
            last_kind = kind
          end
        else
          # Non-text (range) token: flush the pending text token first.
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a correspondent :close one.
    #
    # TODO: Test this!
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      each do |type, kind|
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected :close; decide what to do based on the kind:
            # - token was never opened: delete the :close (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens
      tokens << token while token = opened.pop
      tokens
    end

    # Fix the object itself; see fix.
    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other token are single-line)
    # - there are no open tokens at the end the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    def split_into_lines
      raise NotImplementedError
    end

    # Split the object itself into lines; see split_into_lines.
    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the tokens.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        # Accumulate (the original `size + t.size` discarded the sum
        # and always returned 0).
        size += t.size
      end
      size
    end

    # The concatenated text of all text tokens.
    # Should be equal to the input before scanning.
    def text
      # Array#join turns the nil entries (range tokens) into ''.
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object using Marshal.load, then
    # unzip it using GZip.gunzip.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end


  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved, the input is highlighted the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |kind, text|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << [:regexp, '/\d+/']
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal tokens.size, 2
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end