annotate vendor/gems/coderay-1.0.0/lib/coderay/scanner.rb @ 1171:b4558bc5837f bug_505

Close obsolete branch bug_505
author Chris Cannam
date Fri, 03 Aug 2012 19:40:23 +0100
parents cbb26bc654de
children
rev   line source
Chris@909 1 # encoding: utf-8
Chris@909 2 require 'strscan'
Chris@909 3
Chris@909 4 module CodeRay
Chris@909 5
Chris@909 6 autoload :WordList, 'coderay/helpers/word_list'
Chris@909 7
Chris@909 8 # = Scanners
Chris@909 9 #
Chris@909 10 # This module holds the Scanner class and its subclasses.
Chris@909 11 # For example, the Ruby scanner is named CodeRay::Scanners::Ruby and
Chris@909 12 # can be found in coderay/scanners/ruby.
Chris@909 13 #
Chris@909 14 # Scanner also provides methods and constants for the register
Chris@909 15 # mechanism and the [] method that returns the Scanner class
Chris@909 16 # belonging to the given lang.
Chris@909 17 #
Chris@909 18 # See PluginHost.
Chris@909 19 module Scanners
Chris@909 20 extend PluginHost
Chris@909 21 plugin_path File.dirname(__FILE__), 'scanners'
Chris@909 22
Chris@909 23
Chris@909 24 # = Scanner
Chris@909 25 #
Chris@909 26 # The base class for all Scanners.
Chris@909 27 #
Chris@909 28 # It is a subclass of Ruby's great +StringScanner+, which
Chris@909 29 # makes it easy to access the scanning methods inside.
Chris@909 30 #
Chris@909 31 # It is also +Enumerable+, so you can use it like an Array of
Chris@909 32 # Tokens:
Chris@909 33 #
Chris@909 34 # require 'coderay'
Chris@909 35 #
Chris@909 36 # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
Chris@909 37 #
Chris@909 38 # for text, kind in c_scanner
Chris@909 39 # puts text if kind == :operator
Chris@909 40 # end
Chris@909 41 #
Chris@909 42 # # prints: (*==)++;
Chris@909 43 #
Chris@909 44 # OK, this is a very simple example :)
Chris@909 45 # You can also use +map+, +any?+, +find+ and even +sort_by+,
Chris@909 46 # if you want.
Chris@909 47 class Scanner < StringScanner
Chris@909 48
Chris@909 49 extend Plugin
Chris@909 50 plugin_host Scanners
Chris@909 51
Chris@909 52 # Raised if a Scanner fails while scanning
Chris@909 53 ScanError = Class.new StandardError
Chris@909 54
Chris@909 55 # The default options for all scanner classes.
Chris@909 56 #
Chris@909 57 # Define @default_options for subclasses.
Chris@909 58 DEFAULT_OPTIONS = { }
Chris@909 59
Chris@909 60 KINDS_NOT_LOC = [:comment, :doctype, :docstring]
Chris@909 61
Chris@909 62 attr_accessor :state
Chris@909 63
Chris@909 64 class << self
Chris@909 65
# Converts +code+ into a String that is ready for scanning: UNIX newlines
# and, where Strings carry an encoding, the scanner's internal encoding
# with invalid and undefined characters replaced by placeholders.
#
# Non-String input is converted with +to_s+. An empty String is returned
# untouched, and input that needs no conversion may come back as the very
# same object rather than a copy.
def normalize code
  text = code.is_a?(::String) ? code : code.to_s
  return text if text.empty?

  if text.respond_to? :encoding
    encode_with_encoding text, self.encoding
  else
    # Strings without encoding support only need their newlines fixed.
    to_unix text
  end
end
Chris@909 82
# The typical filename suffix for this scanner's language.
#
# The value is taken from +extension+ (which defaults to the scanner's
# lang) on the first call and memoized; later calls return the stored
# String and ignore their argument.
def file_extension extension = lang
  return @file_extension if @file_extension
  @file_extension = extension.to_s
end
Chris@909 87
# The encoding used internally by this scanner.
#
# Looked up by +name+ on the first call and memoized afterwards. Yields
# nil when Encoding.find is not available.
def encoding name = 'UTF-8'
  return @encoding if @encoding
  @encoding = defined?(Encoding.find) ? Encoding.find(name) : nil
end
Chris@909 92
# The lang of this Scanner class; this is its Plugin ID as stored in
# @plugin_id.
def lang
  @plugin_id
end
Chris@909 97
Chris@909 98 protected
Chris@909 99
# Transcodes +code+ to +target_encoding+, converting newlines to "\n" and
# replacing invalid or undefined characters.
#
# When +code+ already claims the target encoding and is valid, only the
# newlines are normalized. When it claims the target encoding but is not
# valid, the real source encoding is guessed before transcoding.
def encode_with_encoding code, target_encoding
  source_encoding = code.encoding
  if source_encoding == target_encoding
    return to_unix(code) if code.valid_encoding?
    source_encoding = guess_encoding code
  end
  code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace
end
Chris@909 113
# Returns +code+ with "\r\n" and lone "\r" line endings replaced by "\n".
# A String containing no carriage return is handed back as-is (the same
# object, not a copy).
def to_unix code
  return code unless code.index(?\r)
  code.gsub(/\r\n?/, "\n")
end
Chris@909 117
# Guesses the encoding of +s+ by piping its first 1024 characters to the
# external +file+ utility and reading the reported charset.
#
# Returns an Encoding. Falls back to Encoding::BINARY whenever no usable
# answer is available: the utility is missing, it produces no output, its
# output names no charset, or the charset is unknown to Ruby.
def guess_encoding s
  #:nocov:
  IO.popen("file -b --mime -", "w+") do |file|
    file.write s[0, 1024]
    file.close_write
    # gets returns nil when the utility printed nothing; to_s keeps the
    # charset lookup from blowing up on nil.
    charset = file.gets.to_s[/charset=([-\w]+)/, 1]
    begin
      charset ? Encoding.find(charset) : Encoding::BINARY
    rescue ArgumentError
      # The utility reported a charset name Ruby does not know.
      Encoding::BINARY
    end
  end
rescue Errno::ENOENT, Errno::EPIPE
  # The utility is not installed, or it exited before reading its input.
  Encoding::BINARY
  #:nocov:
end
Chris@909 131
Chris@909 132 end
Chris@909 133
# Create a new Scanner.
#
# * +code+ is the input; it is normalized by the class and then handed
#   to the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys.
#   It is merged over the default options of the class, so defaults can
#   be overridden here. If options[:tokens] is given it is used as the
#   token store; otherwise a new Tokens object is created.
#
# Raises NotImplementedError when called on the abstract Scanner class
# itself.
def initialize code = '', options = {}
  if instance_of? Scanner
    complaint = "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
    raise NotImplementedError, complaint
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  normalized = self.class.normalize(code)
  super normalized

  @tokens = options[:tokens] || Tokens.new
  # Let the token store know which scanner feeds it, if it cares.
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  setup
end
Chris@909 157
# Rewinds the scanner through the superclass and then clears the
# per-scan state. Subclasses should override reset_instance rather than
# this method.
def reset
  super
  reset_instance
end
Chris@909 164
# Replace the string to be scanned. The new code is normalized by the
# class before it is passed to the superclass, and the per-scan state is
# cleared afterwards.
def string= code
  normalized = self.class.normalize(code)
  super normalized
  reset_instance
end
Chris@909 171
# The Plugin ID of this scanner, as reported by its class.
def lang
  self.class.lang
end
Chris@909 176
# The default file extension of this scanner, as reported by its class.
def file_extension
  self.class.file_extension
end
Chris@909 181
# Scans the code and returns all tokens.
#
# * +source+ selects the input: nil rescans the current string, an Array
#   is joined and scanned as one string, anything else replaces the
#   current string.
# * +options+ is merged over the instance options for this call;
#   options[:tokens], if present, becomes the token store.
#
# Returns the token store, or for an Array +source+ the result of
# splitting it into parts sized like the given pieces. Errors raised
# while scanning are re-raised through raise_inspect with the current
# scanner status attached.
def tokenize source = nil, options = {}
  options = @options.merge(options)

  @tokens = options[:tokens] || @tokens || Tokens.new
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  case source
  when nil   then reset
  when Array then self.string = self.class.normalize(source.join)
  else            self.string = self.class.normalize(source)
  end

  begin
    scan_tokens @tokens, options
  rescue => e
    message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
    raise_inspect e.message, @tokens, message, 30, e.backtrace
  end

  @cached_tokens = @tokens
  return @tokens unless source.is_a? Array

  # NOTE(review): part sizes are measured on the raw pieces, before
  # normalization -- confirm they still line up when normalization
  # changes a piece's length (e.g. "\r\n" becoming "\n").
  @tokens.split_into_parts(*source.map { |part| part.size })
end
Chris@909 210
# The tokens of the current string. The result of tokenize is cached, so
# the string is only scanned when no cached result exists.
def tokens
  @cached_tokens || (@cached_tokens = tokenize)
end
Chris@909 215
# Traverse the tokens by forwarding the given block to the (cached)
# token store's own +each+.
def each &block
  tokens.each(&block)
end
Chris@909 220 include Enumerable
Chris@909 221
# The line number of +pos+ (by default the scanner's current position),
# starting with 1.
# See also: #column.
#
# Beware, this is implemented inefficiently: it counts the newlines in
# everything before +pos+. It should be used for debugging only.
def line pos = self.pos
  return 1 if pos <= 0
  newlines_before = binary_string[0, pos].count("\n")
  newlines_before + 1
end
Chris@909 231
# The column of +pos+ (by default the scanner's current position),
# starting with 1. It is the distance from the last newline found before
# +pos+, or from the start of the string if there is none.
# See also: #line.
def column pos = self.pos
  return 1 if pos <= 0
  last_newline = binary_string.rindex(?\n, pos - 1)
  last_newline ? pos - last_newline : pos + 1
end
Chris@909 238
# The scanned string in binary encoding (memoized).
#
# To be used with #pos, which is the index of the byte the scanner
# will scan next. When the string's byte size equals its character size
# the string itself is returned; otherwise a binary-encoded copy is made.
def binary_string
  return @binary_string if @binary_string

  source = string
  multibyte = source.respond_to?(:bytesize) && source.bytesize != source.size
  @binary_string =
    if multibyte
      #:nocov:
      source.dup.force_encoding('binary')
      #:nocov:
    else
      source
    end
end
Chris@909 253
Chris@909 254 protected
Chris@909 255
# Hook for subclasses: initialization that has to be done once per
# instance. It is called at the end of #initialize and does nothing by
# default.
#
# Initialization that has to be done once per scan belongs in
# reset_instance.
def setup # :doc:
end
Chris@909 263
# The central method, and commonly the only one a subclass implements.
#
# Subclasses must override it; the override must return +tokens+ and
# must only use Tokens#<< for storing scanned tokens! This base version
# just raises NotImplementedError.
def scan_tokens tokens, options # :doc:
  complaint = "#{self.class}#scan_tokens not implemented."
  raise NotImplementedError, complaint
end
Chris@909 272
# Clears the per-scan state: empties the token store (unless the
# :keep_tokens option is set or the store cannot be cleared) and drops
# the cached tokens and the cached binary string.
def reset_instance
  clearable = @tokens.respond_to?(:clear)
  @tokens.clear if clearable && !@options[:keep_tokens]
  @cached_tokens = nil
  @binary_string = nil if defined? @binary_string
end
Chris@909 279
# Raises a ScanError that carries additional status information.
#
# * +msg+ is the error description.
# * +tokens+ is the token store; its size and last ten entries are shown
#   when it responds to +size+ / +last+.
# * +state+ is reported as the scanner state (defaults to self.state or
#   a placeholder text).
# * +ambit+ is how many bytes of code to show on each side of the
#   current position.
# * +backtrace+ becomes the backtrace of the raised error.
def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
  # Clamp the start of the leading excerpt: a negative start would index
  # from the end of the string and show unrelated code (or nothing).
  excerpt_start = [pos - ambit, 0].max
  code_before = binary_string[excerpt_start, pos - excerpt_start]
  code_after = binary_string[pos, ambit]

  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.respond_to?(:size) ? tokens.size : 0,
    tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
    line, column, pos,
    matched, state, bol?, eos?,
    code_before,
    code_after,
  ], backtrace
end
Chris@909 311
# Shorthand for scan_until(/\z/): returns everything that has not been
# scanned yet and moves the scanner to the end of the string.
# This method also avoids a JRuby 1.9 mode bug.
def scan_rest
  remainder = rest
  terminate
  remainder
end
Chris@909 319
Chris@909 320 end
Chris@909 321
Chris@909 322 end
Chris@909 323 end