module CodeRay

  require 'coderay/helpers/plugin'

  # = Scanners
  #
  # This module holds the Scanner class and its subclasses.
  # For example, the Ruby scanner is named CodeRay::Scanners::Ruby and
  # can be found in coderay/scanners/ruby.
  #
  # Scanner also provides methods and constants for the register
  # mechanism and the [] method that returns the Scanner class
  # belonging to the given lang.
  #
  # See PluginHost.
  module Scanners
    extend PluginHost
    plugin_path File.dirname(__FILE__), 'scanners'

    require 'strscan'

    # = Scanner
    #
    # The base class for all Scanners.
    #
    # It is a subclass of Ruby's great +StringScanner+, which
    # makes it easy to access the scanning methods inside.
    #
    # It is also +Enumerable+, so you can use it like an Array of
    # Tokens:
    #
    #   require 'coderay'
    #
    #   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
    #
    #   for text, kind in c_scanner
    #     puts text if kind == :operator
    #   end
    #
    #   # prints: (*==)++;
    #
    # OK, this is a very simple example :)
    # You can also use +map+, +any?+, +find+ and even +sort_by+,
    # if you want.
    class Scanner < StringScanner

      extend Plugin
      plugin_host Scanners

      # Raised if a Scanner fails while scanning.
      #
      # Derives from StandardError (not Exception) so that a plain
      # +rescue+ is able to handle it; <tt>rescue ScanError</tt> and
      # <tt>rescue Exception</tt> keep working as before.
      ScanError = Class.new(StandardError)

      require 'coderay/helpers/word_list'

      # The default options for all scanner classes.
      #
      # Subclasses can define their own DEFAULT_OPTIONS constant; it is
      # looked up through <tt>self.class::DEFAULT_OPTIONS</tt> in
      # #initialize and merged with the options given there.
      DEFAULT_OPTIONS = { :stream => false }

      KINDS_NOT_LOC = [:comment, :doctype]

      class << self

        # Returns if the Scanner can be used in streaming mode, that is,
        # whether this scanner class includes the Streamable module.
        def streamable?
          # +self+ is the scanner class here, so ask the class whether it
          # mixes in Streamable (is_a? would test the class object itself).
          include? Streamable
        end

        # Converts +code+ to a String, repairs its encoding where
        # possible and normalizes the line endings (see String#to_unix).
        #
        # On Rubies whose strings know about encodings, a string that
        # is not tagged UTF-8, or that is not valid in its encoding, is
        # copied and reinterpreted as Windows-1252. If even that is not
        # valid, the copy gets its original encoding back and is
        # transcoded to UTF-8, with invalid and undefined characters
        # replaced by '?'.
        def normify code
          code = code.to_s
          if code.respond_to?(:encoding) && (code.encoding.name != 'UTF-8' || !code.valid_encoding?)
            # Work on a copy; the caller's string must not be modified.
            code = code.dup
            original_encoding = code.encoding
            code.force_encoding 'Windows-1252'
            unless code.valid_encoding?
              code.force_encoding original_encoding
              if code.encoding.name == 'UTF-8'
                # Transcode via UTF-16BE first, replacing the invalid
                # bytes, before converting (back) to UTF-8 below.
                code.encode! 'UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '?'
              end
              code.encode! 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '?'
            end
          end
          code.to_unix
        end

        # Sets the file extension of this scanner class if +extension+
        # is given. Otherwise, returns the extension; it defaults to the
        # plugin ID (as a String).
        def file_extension extension = nil
          if extension
            @file_extension = extension.to_s
          else
            @file_extension ||= plugin_id.to_s
          end
        end

      end

=begin
## Excluded for speed reasons; protected seems to make methods slow.

      # Save the StringScanner methods from being called.
      # This would not be useful for highlighting.
      strscan_public_methods =
        StringScanner.instance_methods -
        StringScanner.ancestors[1].instance_methods
      protected(*strscan_public_methods)
=end

      # Create a new Scanner.
      #
      # * +code+ is the input String and is handled by the superclass
      #   StringScanner.
      # * +options+ is a Hash with Symbols as keys.
      #   It is merged with the default options of the class (you can
      #   overwrite default options here.)
      # * +block+ is the callback for streamed highlighting.
      #
      # If you set :stream to +true+ in the options, the Scanner uses a
      # TokenStream with the +block+ as callback to handle the tokens.
      #
      # Else, a Tokens object is used.
      #
      # A token container passed as <tt>options[:tokens]</tt> is used
      # instead of creating a new one.
      #
      # Raises a RuntimeError if called on Scanner itself, and
      # NotStreamableError if :stream is set for a scanner that is not
      # Streamable.
      def initialize code='', options = {}, &block
        raise "I am only the basic Scanner class. I can't scan "\
          "anything. :( Use my subclasses." if self.class == Scanner

        @options = self.class::DEFAULT_OPTIONS.merge options

        super Scanner.normify(code)

        @tokens = options[:tokens]
        if @options[:stream]
          warn "warning in CodeRay::Scanner.new: :stream is set, "\
            "but no block was given" unless block_given?
          raise NotStreamableError, self unless kind_of? Streamable
          @tokens ||= TokenStream.new(&block)
        else
          warn "warning in CodeRay::Scanner.new: Block given, "\
            "but :stream is #{@options[:stream]}" if block_given?
          @tokens ||= Tokens.new
        end
        @tokens.scanner = self

        setup
      end

      # Resets the StringScanner state and the per-scan state of this
      # instance (see #reset_instance).
      def reset
        super
        reset_instance
      end

      # Sets a new input string. The code is normalized with
      # Scanner.normify first, and the per-scan state is reset.
      def string= code
        code = Scanner.normify(code)
        # Special case for Rubinius 1.0.1: set the string directly
        # instead of calling StringScanner#string=.
        if defined?(RUBY_DESCRIPTION) && RUBY_DESCRIPTION['rubinius 1.0.1']
          reset_state
          @string = code
        else
          super code
        end
        reset_instance
      end

      # More mnemonic accessor name for the input string.
      alias code string
      alias code= string=

      # Returns the Plugin ID for this scanner.
      def lang
        self.class.plugin_id
      end

      # Scans the code and returns all tokens in a Tokens object.
      #
      # * +new_string+ replaces the current input, if given.
      # * +options+ is merged into the options of this instance for
      #   this scan.
      #
      # The result is cached and returned by #tokens.
      def tokenize new_string=nil, options = {}
        options = @options.merge(options)
        self.string = new_string if new_string
        @cached_tokens =
          if @options[:stream]  # :stream must have been set already
            # Assigning a new string already resets the scanner.
            reset unless new_string
            scan_tokens @tokens, options
            @tokens
          else
            scan_tokens @tokens, options
          end
      end

      # Returns the tokens of the last scan; scans the code first if
      # there is no cached result.
      def tokens
        @cached_tokens ||= tokenize
      end

      # Whether the scanner is in streaming mode.
      def streaming?
        !!@options[:stream]
      end

      # Traverses the tokens.
      #
      # Raises an ArgumentError in streaming mode.
      def each &block
        raise ArgumentError,
          'Cannot traverse TokenStream.' if @options[:stream]
        tokens.each(&block)
      end
      include Enumerable

      # The current line position of the scanner.
      #
      # Beware, this is implemented inefficiently. It should be used
      # for debugging only.
      def line
        # Count only the newlines *before* pos (exclusive range), so a
        # newline character the scanner is standing on still belongs to
        # the line it terminates.
        string[0...pos].count("\n") + 1
      end

      # The column of +pos+ (defaults to the current position), computed
      # as the distance to the last newline found at or before +pos+.
      # Returns 0 if +pos+ is not positive.
      def column pos = self.pos
        return 0 if pos <= 0
        text = string
        # If the string contains multibyte characters, search in a
        # cached binary copy of it instead.
        if text.respond_to?(:bytesize) && (defined?(@bin_string) || text.bytesize != text.size)
          @bin_string ||= text.dup.force_encoding('binary')
          text = @bin_string
        end
        pos - (text.rindex(?\n, pos) || 0)
      end

      # Marshal support: only the options are dumped.
      def marshal_dump
        @options
      end

      # Marshal support: restores the options saved by #marshal_dump.
      def marshal_load options
        @options = options
      end

    protected

      # Can be implemented by subclasses to do some initialization
      # that has to be done once per instance.
      #
      # Use reset for initialization that has to be done once per
      # scan.
      def setup
      end

      # This is the central method, and commonly the only one a
      # subclass implements.
      #
      # Subclasses must implement this method; it must return +tokens+
      # and must only use Tokens#<< for storing scanned tokens!
      def scan_tokens tokens, options
        raise NotImplementedError,
          "#{self.class}#scan_tokens not implemented."
      end

      # Resets the per-scan state: clears the tokens (unless the
      # :keep_tokens option is set) and drops the cached results.
      def reset_instance
        @tokens.clear unless @options[:keep_tokens]
        @cached_tokens = nil
        @bin_string = nil if defined? @bin_string
      end

      # Scanner error with additional status information
      #
      # * +msg+ is the error message.
      # * +tokens+ are the tokens scanned so far; the last 10 are shown.
      # * +state+ is the scanner state to be reported.
      # * +ambit+ is the maximum number of characters shown before and
      #   after the current position.
      #
      # Always raises a ScanError.
      def raise_inspect msg, tokens, state = 'No state given!', ambit = 30
        # Clamp the start of the excerpt to the beginning of the input:
        # a negative start index would slice from the end of the string.
        excerpt_start = [pos - ambit, 0].max
        raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

        EOE
          File.basename(caller[0]),
          msg,
          tokens.size,
          tokens.last(10).map { |t| t.inspect }.join("\n"),
          line, column, pos,
          matched, state, bol?, eos?,
          string[excerpt_start, pos - excerpt_start],
          string[pos, ambit],
        ]
      end

    end

  end
end

class String
  # I love this hack. It seems to silence all dos/unix/mac newline problems.
  #
  # Returns a copy with every "\r\n" and every lone "\r" replaced by
  # "\n", or the receiver itself if it contains no "\r".
  def to_unix
    if index ?\r
      gsub(/\r\n?/, "\n")
    else
      self
    end
  end
end