annotate vendor/gems/coderay-0.9.7/lib/coderay/scanner.rb @ 855:7294e8db2515 bug_162

Close obsolete branch bug_162
author Chris Cannam
date Thu, 14 Jul 2011 11:59:19 +0100
parents 0579821a129a
children
rev   line source
Chris@210 1 module CodeRay
Chris@210 2
Chris@210 3 require 'coderay/helpers/plugin'
Chris@210 4
Chris@210 5 # = Scanners
Chris@210 6 #
Chris@210 7 # This module holds the Scanner class and its subclasses.
Chris@210 8 # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
Chris@210 9 # and can be found in coderay/scanners/ruby.
Chris@210 10 #
Chris@210 11 # Scanner also provides methods and constants for the register
Chris@210 12 # mechanism and the [] method that returns the Scanner class
Chris@210 13 # belonging to the given lang.
Chris@210 14 #
Chris@210 15 # See PluginHost.
Chris@210 16 module Scanners
Chris@210 17 extend PluginHost
Chris@210 18 plugin_path File.dirname(__FILE__), 'scanners'
Chris@210 19
Chris@210 20 require 'strscan'
Chris@210 21
Chris@210 22 # = Scanner
Chris@210 23 #
Chris@210 24 # The base class for all Scanners.
Chris@210 25 #
Chris@210 26 # It is a subclass of Ruby's great +StringScanner+, which
Chris@210 27 # makes it easy to access the scanning methods inside.
Chris@210 28 #
Chris@210 29 # It is also +Enumerable+, so you can use it like an Array of
Chris@210 30 # Tokens:
Chris@210 31 #
Chris@210 32 # require 'coderay'
Chris@210 33 #
Chris@210 34 # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
Chris@210 35 #
Chris@210 36 # for text, kind in c_scanner
Chris@210 37 # puts text if kind == :operator
Chris@210 38 # end
Chris@210 39 #
Chris@210 40 # # prints: (*==)++;
Chris@210 41 #
Chris@210 42 # OK, this is a very simple example :)
Chris@210 43 # You can also use +map+, +any?+, +find+ and even +sort_by+,
Chris@210 44 # if you want.
Chris@210 45 class Scanner < StringScanner
Chris@210 46
Chris@210 47 extend Plugin
Chris@210 48 plugin_host Scanners
Chris@210 49
# Raised if a Scanner fails while scanning.
#
# Derives from StandardError (not directly from Exception) so that a plain
# +rescue+ clause catches it. Code that rescues ScanError or Exception
# explicitly keeps working unchanged.
ScanError = Class.new(StandardError)
Chris@210 52
Chris@210 53 require 'coderay/helpers/word_list'
Chris@210 54
# The default options for all scanner classes.
#
# Scanner#initialize reads this constant through self.class::DEFAULT_OPTIONS
# and merges the caller's options over it, so a subclass can supply its own
# defaults by defining its own DEFAULT_OPTIONS constant.
#
# Frozen: the hash is shared by every scanner instance and is only ever
# used as the receiver of #merge, which returns a fresh hash.
DEFAULT_OPTIONS = { :stream => false }.freeze

# NOTE(review): presumably the token kinds that do not count as lines of
# code; no user of this constant is visible here - confirm against callers.
KINDS_NOT_LOC = [:comment, :doctype].freeze
Chris@210 61
Chris@210 62 class << self
Chris@210 63
# Returns whether the Scanner class can be used in streaming mode.
#
# Scanner#initialize accepts an instance for streaming when it is
# +kind_of? Streamable+, i.e. when its class includes the module. The
# class-level query therefore has to test for inclusion; asking whether the
# class object itself is a Streamable is only true for classes that
# *extend* the module. Both forms are reported as streamable.
def streamable?
  include?(Streamable) || is_a?(Streamable)
end
Chris@210 68
# Converts +code+ to a String with Unix line endings that the scanner can
# work on.
#
# On Rubies with String#encoding, any string that is not valid UTF-8 is
# copied and re-tagged as Windows-1252. Only if that tag is not valid
# either is the original encoding restored and the copy transcoded to
# UTF-8, replacing invalid/undefined characters with '?'. The argument
# itself is never modified.
def normify code
  text = code.to_s
  already_fine = !text.respond_to?(:encoding) ||
    (text.encoding.name == 'UTF-8' && text.valid_encoding?)
  unless already_fine
    text = text.dup
    source_encoding = text.encoding
    text.force_encoding 'Windows-1252'
    if !text.valid_encoding?
      text.force_encoding source_encoding
      # Broken UTF-8 takes a detour through UTF-16BE so that the final
      # encode! to UTF-8 is a real conversion and replaces the bad bytes.
      if text.encoding.name == 'UTF-8'
        text.encode! 'UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '?'
      end
      text.encode! 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '?'
    end
  end
  text.to_unix
end
Chris@210 85
# Reads or sets the file extension associated with this scanner class.
#
# With an argument, stores its String form and returns it. Without one,
# returns the stored extension, defaulting to (and remembering) the
# String form of the class's plugin_id.
def file_extension extension = nil
  return @file_extension ||= plugin_id.to_s unless extension
  @file_extension = extension.to_s
end
Chris@210 93
Chris@210 94 end
Chris@210 95
Chris@210 96 =begin
Chris@210 97 ## Excluded for speed reasons; protected seems to make methods slow.
Chris@210 98
Chris@210 99 # Save the StringScanner methods from being called.
Chris@210 100 # This would not be useful for highlighting.
Chris@210 101 strscan_public_methods =
Chris@210 102 StringScanner.instance_methods -
Chris@210 103 StringScanner.ancestors[1].instance_methods
Chris@210 104 protected(*strscan_public_methods)
Chris@210 105 =end
Chris@210 106
# Create a new Scanner.
#
# * +code+ is the input String; it is normalized with Scanner.normify and
#   handed to the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys. It is merged over the
#   DEFAULT_OPTIONS of the class, so defaults can be overridden here.
#   options[:tokens], if given, is used as the token container.
# * +block+ is the callback for streamed highlighting.
#
# If :stream is set in the merged options, the scanner must be
# Streamable (NotStreamableError otherwise) and uses a TokenStream with
# +block+ as callback; else a Tokens object is used. A mismatch between
# :stream and the presence of a block only produces a warning.
#
# Raises a RuntimeError when called on the abstract Scanner class itself.
def initialize code='', options = {}, &block
  if self.class == Scanner
    raise "I am only the basic Scanner class. I can't scan "\
      "anything. :( Use my subclasses."
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  super Scanner.normify(code)

  # A caller-supplied container wins; otherwise one is created below.
  @tokens = options[:tokens]
  if @options[:stream]
    unless block_given?
      warn "warning in CodeRay::Scanner.new: :stream is set, "\
        "but no block was given"
    end
    raise NotStreamableError, self unless kind_of? Streamable
    @tokens ||= TokenStream.new(&block)
  else
    if block_given?
      warn "warning in CodeRay::Scanner.new: Block given, "\
        "but :stream is #{@options[:stream]}"
    end
    @tokens ||= Tokens.new
  end
  @tokens.scanner = self

  setup
end
Chris@210 143
# Resets the underlying scanner through the superclass implementation, then
# clears the per-scan state of this instance via reset_instance.
def reset
  super()
  reset_instance
end
Chris@210 148
# Replaces the input string with the normalized form of +code+ (see
# Scanner.normify) and clears the per-scan state of this instance.
def string= code
  normalized = Scanner.normify(code)
  on_rubinius_1_0_1 =
    defined?(RUBY_DESCRIPTION) && RUBY_DESCRIPTION['rubinius 1.0.1']
  if on_rubinius_1_0_1
    # NOTE(review): presumably works around a problem with
    # StringScanner#string= on that Rubinius release - confirm.
    reset_state
    @string = normalized
  else
    super normalized
  end
  reset_instance
end
Chris@210 159
Chris@210 160 # More mnemonic accessor name for the input string.
Chris@210 161 alias code string
Chris@210 162 alias code= string=
Chris@210 163
# Returns the Plugin ID of this scanner's class.
def lang
  scanner_class = self.class
  scanner_class.plugin_id
end
Chris@210 168
# Scans the code and returns the resulting tokens, remembering them as the
# cached result for #tokens.
#
# * +new_string+, if given, replaces the input string first.
# * +options+ is merged over the instance options and passed on to
#   scan_tokens.
#
# Whether streaming is used is decided by the instance options alone
# (:stream must have been set at construction time). In streaming mode the
# scanner is reset first unless a new string was just assigned, and the
# token container itself is returned; otherwise the return value of
# scan_tokens is.
def tokenize new_string=nil, options = {}
  scan_options = @options.merge(options)
  self.string = new_string if new_string
  unless @options[:stream]
    return @cached_tokens = scan_tokens(@tokens, scan_options)
  end
  reset unless new_string
  scan_tokens @tokens, scan_options
  @cached_tokens = @tokens
end
Chris@210 182
# Returns the cached tokens, running #tokenize first if nothing has been
# cached yet.
def tokens
  @cached_tokens = tokenize unless @cached_tokens
  @cached_tokens
end
Chris@210 186
# Whether the scanner is in streaming mode, i.e. whether the :stream
# option is set. Always returns +true+ or +false+.
def streaming?
  @options[:stream] ? true : false
end
Chris@210 191
# Traverses the tokens, passing +block+ on to the each method of #tokens.
#
# Raises ArgumentError in streaming mode, where the tokens cannot be
# traversed.
def each &block
  if @options[:stream]
    raise ArgumentError, 'Cannot traverse TokenStream.'
  end
  tokens.each(&block)
end
Chris@210 198 include Enumerable
Chris@210 199
# The line number (starting at 1) of position +pos+, by default the
# current scanner position.
#
# Only the newlines strictly before +pos+ are counted, so a scanner that is
# positioned on a newline character is still on the line that the newline
# terminates. +pos+ is treated as a byte offset, like in #column.
#
# Beware, this is implemented inefficiently. It should be used
# for debugging only.
def line pos = self.pos
  return 1 if pos <= 0
  binary_string[0...pos].count("\n") + 1
end

# The column (starting at 1, counted in bytes) of position +pos+ within
# its line, by default the current scanner position.
#
# The first byte of every line - including the first line - is column 1,
# and a newline character occupies the column right after the last
# byte of the line it terminates.
def column pos = self.pos
  return 1 if pos <= 0
  # Search strictly before pos, so that a newline sitting at pos itself
  # does not start a new line yet (consistent with #line).
  last_newline = binary_string.rindex(?\n, pos - 1)
  pos - (last_newline || -1)
end

# The input string in a form that can be indexed by byte offset.
#
# Where strings carry an encoding and the input contains multi-byte
# characters, a binary-tagged copy is built once and kept in @bin_string;
# otherwise the input string itself is returned.
def binary_string
  text = string
  return text unless text.respond_to?(:bytesize)
  return text unless defined?(@bin_string) || text.bytesize != text.size
  @bin_string ||= text.dup.force_encoding('binary')
end
private :binary_string
Chris@210 217
# Marshal support: only the options hash is serialized.
def marshal_dump
  @options
end

# Marshal support: restores the options hash written by marshal_dump.
# No other instance state is set up here.
def marshal_load options
  @options = options
end
Chris@210 225
Chris@210 226 protected
Chris@210 227
# Hook for subclasses: initialization that has to be done once per
# instance. It is called at the end of #initialize and does nothing by
# default.
#
# Use reset for initialization that has to be done once per scan.
def setup
  nil
end
Chris@210 235
# This is the central method, and commonly the only one a subclass
# implements.
#
# Subclasses must implement this method; it must return +tokens+ and must
# only use Tokens#<< for storing scanned tokens! This base implementation
# raises NotImplementedError, naming the class that lacks an
# implementation.
def scan_tokens tokens, options
  message = "#{self.class}#scan_tokens not implemented."
  raise NotImplementedError, message
end
Chris@210 245
# Clears the per-scan state of this instance: empties the token container
# (unless the :keep_tokens option is set), drops the cached tokens and
# invalidates the binary copy of the input if one has been created.
def reset_instance
  keep_tokens = @options[:keep_tokens]
  @tokens.clear unless keep_tokens
  @cached_tokens = nil
  if instance_variable_defined?(:@bin_string)
    @bin_string = nil
  end
end
Chris@210 251
# Raises a ScanError with additional status information.
#
# * +msg+ is the error description.
# * +tokens+ are the tokens scanned so far; their number and the last ten
#   of them are reported.
# * +state+ is the scanner state to report.
# * +ambit+ is the maximum amount of input shown before and after the
#   current position.
#
# The report also names the file of the caller and the current line,
# column, position, last match and bol?/eos? flags.
def raise_inspect msg, tokens, state = 'No state given!', ambit = 30
  # Clamp the start of the preceding context to the beginning of the
  # input: a negative start would index from the END of the string and
  # yield nil or unrelated text whenever pos < ambit.
  context_start = [pos - ambit, 0].max
  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.size,
    tokens.last(10).map { |t| t.inspect }.join("\n"),
    line, column, pos,
    matched, state, bol?, eos?,
    string[context_start, pos - context_start],
    string[pos, ambit],
  ]
end
Chris@210 283
Chris@210 284 end
Chris@210 285
Chris@210 286 end
Chris@210 287 end
Chris@210 288
class String
  # Returns the string with DOS ("\r\n") and old Mac ("\r") line endings
  # converted to Unix newlines. A string without any carriage return is
  # returned as is (the receiver itself, not a copy).
  def to_unix
    return self unless index ?\r
    gsub(/\r\n?/, "\n")
  end
end