# encoding: utf-8
require 'strscan'

module CodeRay

  autoload :WordList, 'coderay/helpers/word_list'

  # = Scanners
  #
  # This module holds the Scanner class and its subclasses.
  # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
  # and can be found in coderay/scanners/ruby.
  #
  # Scanner also provides methods and constants for the register
  # mechanism and the [] method that returns the Scanner class
  # belonging to the given lang.
  #
  # See PluginHost.
  module Scanners
    extend PluginHost
    plugin_path File.dirname(__FILE__), 'scanners'


    # = Scanner
    #
    # The base class for all Scanners.
    #
    # It is a subclass of Ruby's great +StringScanner+, which
    # makes it easy to access the scanning methods inside.
    #
    # It is also +Enumerable+, so you can use it like an Array of
    # Tokens:
    #
    #   require 'coderay'
    #
    #   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
    #
    #   for text, kind in c_scanner
    #     puts text if kind == :operator
    #   end
    #
    #   # prints: (*==)++;
    #
    # OK, this is a very simple example :)
    # You can also use +map+, +any?+, +find+ and even +sort_by+,
    # if you want.
    class Scanner < StringScanner

      extend Plugin
      plugin_host Scanners

      # Raised if a Scanner fails while scanning
      ScanError = Class.new StandardError

      # The default options for all scanner classes.
      #
      # Subclasses can define their own DEFAULT_OPTIONS constant; it is
      # looked up through the instance's class in #initialize.
      DEFAULT_OPTIONS = { }

      KINDS_NOT_LOC = [:comment, :doctype, :docstring]

      attr_accessor :state

      class << self

        # Normalizes the given code into a string with UNIX newlines, in the
        # scanner's internal encoding, with invalid and undefined characters
        # replaced by placeholders.
        #
        # Non-String input is converted with +to_s+ first. The result is not
        # guaranteed to be a new object: an empty string, or a string that
        # needs no conversion, is returned as is.
        def normalize code
          code = code.to_s unless code.is_a? ::String
          return code if code.empty?

          if code.respond_to? :encoding
            encode_with_encoding code, self.encoding
          else
            # Strings without encoding support only need their newlines fixed.
            to_unix code
          end
        end

        # The typical filename suffix for this scanner's language.
        #
        # The value is memoized, so only the first call's +extension+
        # argument has an effect.
        def file_extension extension = lang
          @file_extension ||= extension.to_s
        end

        # The encoding used internally by this scanner.
        #
        # Memoized on first call; evaluates to a false value when
        # Encoding.find is not defined.
        def encoding name = 'UTF-8'
          @encoding ||= defined?(Encoding.find) && Encoding.find(name)
        end

        # The lang of this Scanner class, which is equal to its Plugin ID.
        def lang
          @plugin_id
        end

      protected

        # Returns +code+ converted to +target_encoding+ with UNIX newlines.
        #
        # A string that already is valid in the target encoding only gets
        # its newlines converted. If it claims the target encoding but is
        # invalid, the real source encoding is guessed.
        def encode_with_encoding code, target_encoding
          if code.encoding == target_encoding
            if code.valid_encoding?
              return to_unix(code)
            else
              source_encoding = guess_encoding code
            end
          else
            source_encoding = code.encoding
          end
          # print "encode_with_encoding from #{source_encoding} to #{target_encoding}"
          code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace
        end

        # Replaces "\r\n" and lone "\r" by "\n". Returns +code+ itself when
        # it contains no "\r".
        def to_unix code
          code.index(?\r) ? code.gsub(/\r\n?/, "\n") : code
        end

        # Guesses the encoding of +s+ by piping its first 1024 characters
        # to the external +file+ command.
        #
        # Returns Encoding::BINARY when the command prints nothing, reports
        # no charset, or reports a charset unknown to Ruby.
        def guess_encoding s
          #:nocov:
          IO.popen("file -b --mime -", "w+") do |file|
            file.write s[0, 1024]
            file.close_write
            # gets returns nil at EOF; to_s keeps the charset lookup safe.
            charset = file.gets.to_s[/charset=([-\w]+)/, 1]
            begin
              charset ? Encoding.find(charset) : Encoding::BINARY
            rescue ArgumentError
              Encoding::BINARY
            end
          end
          #:nocov:
        end

      end

      # Create a new Scanner.
      #
      # * +code+ is the input String and is handled by the superclass
      #   StringScanner.
      # * +options+ is a Hash with Symbols as keys.
      #   It is merged with the default options of the class (you can
      #   overwrite default options here.)
      #
      # If options[:tokens] is given, it is used to store the scanned
      # tokens. Else, a Tokens object is used.
      #
      # Raises NotImplementedError when called on Scanner itself.
      def initialize code = '', options = {}
        if self.class == Scanner
          raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
        end

        @options = self.class::DEFAULT_OPTIONS.merge options

        super self.class.normalize(code)

        @tokens = options[:tokens] || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=

        setup
      end

      # Sets back the scanner. Subclasses should redefine the reset_instance
      # method instead of this one.
      def reset
        super
        reset_instance
      end

      # Set a new string to be scanned. The code is normalized first, and
      # the per-scan state is reset afterwards.
      def string= code
        code = self.class.normalize(code)
        super code
        reset_instance
      end

      # the Plugin ID for this scanner
      def lang
        self.class.lang
      end

      # the default file extension for this scanner
      def file_extension
        self.class.file_extension
      end

      # Scan the code and returns all tokens in a Tokens object.
      #
      # * +source+ may be nil (rescan the current string), an Array of
      #   parts (joined for scanning; the result is split into parts of
      #   the same sizes again) or anything else, which becomes the new
      #   string to scan.
      # * +options+ is merged over the options given to #initialize.
      #
      # Raises ScanError if scan_tokens fails with a StandardError.
      def tokenize source = nil, options = {}
        options = @options.merge(options)
        @tokens = options[:tokens] || @tokens || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=
        # string= normalizes its argument itself, so the source is passed
        # on without normalizing it a second time here.
        case source
        when Array
          self.string = source.join
        when nil
          reset
        else
          self.string = source
        end

        begin
          scan_tokens @tokens, options
        rescue => e
          # The summary is handed to raise_inspect in its +state+ position.
          message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
          raise_inspect e.message, @tokens, message, 30, e.backtrace
        end

        @cached_tokens = @tokens
        if source.is_a? Array
          @tokens.split_into_parts(*source.map { |part| part.size })
        else
          @tokens
        end
      end

      # Cache the result of tokenize.
      def tokens
        @cached_tokens ||= tokenize
      end

      # Traverse the tokens.
      def each &block
        tokens.each(&block)
      end
      include Enumerable

      # The current line position of the scanner, starting with 1.
      # See also: #column.
      #
      # Beware, this is implemented inefficiently. It should be used
      # for debugging only.
      def line pos = self.pos
        return 1 if pos <= 0
        binary_string[0...pos].count("\n") + 1
      end

      # The current column position of the scanner, starting with 1.
      # See also: #line.
      def column pos = self.pos
        return 1 if pos <= 0
        pos - (binary_string.rindex(?\n, pos - 1) || -1)
      end

      # The string in binary encoding.
      #
      # To be used with #pos, which is the index of the byte the scanner
      # will scan next.
      def binary_string
        @binary_string ||=
          if string.respond_to?(:bytesize) && string.bytesize != string.size
            #:nocov:
            string.dup.force_encoding('binary')
            #:nocov:
          else
            string
          end
      end

    protected

      # Can be implemented by subclasses to do some initialization
      # that has to be done once per instance.
      #
      # Use reset for initialization that has to be done once per
      # scan.
      def setup  # :doc:
      end

      # This is the central method, and commonly the only one a
      # subclass implements.
      #
      # Subclasses must implement this method; it must return +tokens+
      # and must only use Tokens#<< for storing scanned tokens!
      def scan_tokens tokens, options  # :doc:
        raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
      end

      # Resets the scanner: clears the token store (unless the
      # :keep_tokens option is set) and drops the cached tokens and the
      # cached binary string.
      def reset_instance
        @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]
        @cached_tokens = nil
        @binary_string = nil if defined? @binary_string
      end

      # Scanner error with additional status information.
      #
      # Raises a ScanError whose message reports +msg+, the last tokens,
      # the scanner position and state, and up to +ambit+ bytes of code
      # before and after the current position. +backtrace+ becomes the
      # backtrace of the raised error.
      def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
        # Clamp the start of the preceding context to the beginning of the
        # string: a negative start index would count from the end of the
        # string (or yield nil) and report unrelated code.
        context_start = [pos - ambit, 0].max
        raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

        EOE
          File.basename(caller[0]),
          msg,
          tokens.respond_to?(:size) ? tokens.size : 0,
          tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
          line, column, pos,
          matched, state, bol?, eos?,
          binary_string[context_start, pos - context_start],
          binary_string[pos, ambit],
        ], backtrace
      end

      # Shorthand for scan_until(/\z/).
      # This method also avoids a JRuby 1.9 mode bug.
      def scan_rest
        rest = self.rest
        terminate
        rest
      end

    end

  end
end