annotate vendor/gems/coderay-0.9.7/lib/coderay/scanners/ruby.rb @ 855:7294e8db2515 bug_162

Close obsolete branch bug_162
author Chris Cannam
date Thu, 14 Jul 2011 11:59:19 +0100
parents 0579821a129a
children
rev   line source
Chris@210 1 # encoding: utf-8
module CodeRay
module Scanners

  # This scanner is really complex, since Ruby _is_ a complex language!
  #
  # It tries to highlight 100% of all common code,
  # and 90% of strange codes.
  #
  # It is optimized for HTML highlighting, and is not very useful for
  # parsing or pretty printing.
  #
  # For now, I think it's better than the scanners in VIM or Syntax, or
  # any highlighter I was able to find, except Caleb's RubyLexer.
  #
  # I hope it's also better than the rdoc/irb lexer.
  #
  # The scanner is a hand-written state machine. +state+ is either a symbol
  # (:initial, :def_expected, :module_expected, :undef_expected,
  # :undef_comma_expected, :alias_expected) or a Patterns::StringState while
  # the scanner is inside a string-like literal.
  class Ruby < Scanner

    include Streamable

    register_for :ruby
    file_extension 'rb'

    helper :patterns

    # Define a stand-in only when no EncodingError constant is visible.
    if not defined? EncodingError
      EncodingError = Class.new Exception
    end

  private
    # Tokenizes the whole input and appends the result to +tokens+.
    #
    # Tokens are pushed as two-element arrays: [text, kind] for ordinary
    # tokens, and [:open, kind] / [:close, kind] to delimit nested regions
    # (strings, regexps, symbols, shell commands, heredocs, inline blocks).
    #
    # tokens::  the collection that receives the tokens (only << is used).
    # options:: accepted for interface compatibility; not read in this method.
    #
    # Returns +tokens+.
    def scan_tokens tokens, options
      # With encoding-aware strings, normalize to UTF-8 once and skip the
      # regexp /u switching below; otherwise enable the /u patterns only if
      # the input contains non-ASCII bytes.
      if string.respond_to?(:encoding)
        unless string.encoding == Encoding::UTF_8
          self.string = string.encode Encoding::UTF_8,
            :invalid => :replace, :undef => :replace, :replace => '?'
        end
        unicode = false
      else
        unicode = exist?(/[^\x00-\x7f]/)
      end

      # last_token_dot:     truthy while the previous token was '.' or '::'.
      #                     Set to :set by the operator branch and collapsed
      #                     to true/false after each token.
      # value_expected:     truthy when a value (not an operator) may follow;
      #                     :set marks "set by the current token",
      #                     :expect_colon is used after '?' and 'when'.
      # heredocs:           queued heredoc string states, started at the
      #                     next newline; nil when none are pending.
      # last_state:         string state to return to after scanning the one
      #                     token following '#' in "#$x" / "#@x".
      # depth:              curly brace depth inside the current inline block.
      # inline_block_stack: saved [state, depth, heredocs] of enclosing
      #                     strings while scanning inline blocks.
      last_token_dot = false
      value_expected = true
      heredocs = nil
      last_state = nil
      state = :initial
      depth = nil
      inline_block_stack = []


      patterns = Patterns  # avoid constant lookup

      until eos?
        match = nil
        kind = nil

        if state.instance_of? patterns::StringState
# {{{
          # Inside a string-like literal: consume plain content up to the
          # next interesting character (or to the end of input).
          match = scan_until(state.pattern) || scan_until(/\z/)
          tokens << [match, :content] unless match.empty?
          break if eos?

          if state.heredoc and self[1]  # end of heredoc
            match = getch.to_s
            match << scan_until(/$/) unless eos?
            tokens << [match, :delimiter]
            tokens << [:close, state.type]
            state = state.next_state
            next
          end

          case match = getch

          when state.delim
            # A closing delimiter only ends the literal once all nested
            # parens have been closed.
            if state.paren
              state.paren_depth -= 1
              if state.paren_depth > 0
                tokens << [match, :nesting_delimiter]
                next
              end
            end
            tokens << [match, :delimiter]
            if state.type == :regexp and not eos?
              modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox)
              tokens << [modifiers, :modifier] unless modifiers.empty?
            end
            tokens << [:close, state.type]
            value_expected = false
            state = state.next_state

          when '\\'
            if state.interpreted
              if esc = scan(/ #{patterns::ESCAPE} /ox)
                tokens << [match + esc, :char]
              else
                tokens << [match, :error]
              end
            else
              # Non-interpreted literal: only the delimiter and the
              # backslash itself are treated as escapes.
              case m = getch
              when state.delim, '\\'
                tokens << [match + m, :char]
              when nil
                tokens << [match, :error]
              else
                tokens << [match + m, :content]
              end
            end

          when '#'
            case peek(1)
            when '{'
              # Start of an inline block: remember the string we are in and
              # scan the block contents as normal code.
              inline_block_stack << [state, depth, heredocs]
              value_expected = true
              state = :initial
              depth = 1
              tokens << [:open, :inline]
              tokens << [match + getch, :inline_delimiter]
            when '$', '@'
              tokens << [match, :escape]
              last_state = state  # scan one token as normal code, then return here
              state = :initial
            else
              raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens
            end

          when state.paren
            state.paren_depth += 1
            tokens << [match, :nesting_delimiter]

          when /#{patterns::REGEXP_SYMBOLS}/ox
            tokens << [match, :function]

          else
            raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens

          end
          next
# }}}
        else
# {{{
          if match = scan(/[ \t\f]+/)
            kind = :space
            # Do not swallow the newline while heredocs are pending: the
            # newline branch below has to see it to start the heredoc body.
            match << scan(/\s*/) unless eos? || heredocs
            value_expected = true if match.index(?\n)
            tokens << [match, kind]
            next

          elsif match = scan(/\\?\n/)
            kind = :space
            if match == "\n"
              value_expected = true
              state = :initial if state == :undef_comma_expected
            end
            if heredocs
              unscan  # heredoc scanning needs \n at start
              state = heredocs.shift
              tokens << [:open, state.type]
              heredocs = nil if heredocs.empty?
              next
            else
              match << scan(/\s*/) unless eos?
            end
            tokens << [match, kind]
            next

          elsif bol? && match = scan(/\#!.*/)
            tokens << [match, :doctype]
            next

          elsif match = scan(/\#.*/) or
            ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) )
              kind = :comment
              tokens << [match, kind]
              next

          elsif state == :initial

            # IDENTS #
            if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
                                      /#{patterns::METHOD_NAME}/o)
              if last_token_dot
                kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end
              else
                if value_expected != :expect_colon && scan(/:(?= )/)
                  # "name: " is emitted as a key followed by a ':' operator.
                  tokens << [match, :key]
                  match = ':'
                  kind = :operator
                else
                  kind = patterns::IDENT_KIND[match]
                  if kind == :ident
                    if match[/\A[A-Z]/] and not match[/[!?]$/] and not match?(/\(/)
                      kind = :constant
                    end
                  elsif kind == :reserved
                    state = patterns::DEF_NEW_STATE[match]
                    value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match]
                  end
                end
              end
              value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o)

            elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o)
              kind = :ident
              value_expected = :set if check(unicode ? /#{patterns::VALUE_FOLLOWS}/uo :
                                                       /#{patterns::VALUE_FOLLOWS}/o)

            # OPERATORS #
            elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x)
              if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/
                value_expected = :set
              end
              # The empty group () only participates for '.' and '::'.
              last_token_dot = :set if self[1]
              kind = :operator
              unless inline_block_stack.empty?
                case match
                when '{'
                  depth += 1
                when '}'
                  depth -= 1
                  if depth == 0  # closing brace of inline block reached
                    state, depth, heredocs = inline_block_stack.pop
                    heredocs = nil if heredocs && heredocs.empty?
                    tokens << [match, :inline_delimiter]
                    kind = :inline
                    match = :close
                  end
                end
              end

            elsif match = scan(/ ['"] /mx)
              tokens << [:open, :string]
              kind = :delimiter
              state = patterns::StringState.new :string, match == '"', match  # important for streaming

            elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo :
                                         /#{patterns::INSTANCE_VARIABLE}/o)
              kind = :instance_variable

            elsif value_expected and match = scan(/\//)
              tokens << [:open, :regexp]
              kind = :delimiter
              interpreted = true
              state = patterns::StringState.new :regexp, interpreted, match

            # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o)
            elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o)
              kind = self[1] ? :float : :integer

            elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                         /#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                # Quoted symbol: emit the colon, then scan the rest like a string.
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
              else
                kind = :symbol
              end

            elsif match = scan(/ -[>=]? | [+!~^]=? | [*|&]{1,2}=? | >>? /x)
              value_expected = :set
              kind = :operator

            elsif value_expected and match = scan(unicode ? /#{patterns::HEREDOC_OPEN}/uo :
                                                            /#{patterns::HEREDOC_OPEN}/o)
              indented = self[1] == '-'
              quote = self[3]
              delim = self[quote ? 4 : 2]
              kind = patterns::QUOTE_TO_TYPE[quote]
              tokens << [:open, kind]
              tokens << [match, :delimiter]
              match = :close
              # The heredoc body is queued and scanned after the next newline.
              heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart )
              heredocs ||= []  # create heredocs if empty
              heredocs << heredoc

            elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o)
              # Hash#fetch yields the missing key to its block; the parameter
              # must be declared so the diagnostic can report it (it used to
              # reference an undefined local and raise NameError instead).
              kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do |unknown|
                raise_inspect 'Unknown fancy string: %%%p' % unknown, tokens
              end
              tokens << [:open, kind]
              state = patterns::StringState.new kind, interpreted, self[2]
              kind = :delimiter

            elsif value_expected and match = scan(unicode ? /#{patterns::CHARACTER}/uo :
                                                            /#{patterns::CHARACTER}/o)
              kind = :integer

            elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x)
              value_expected = :set
              kind = :operator

            elsif match = scan(/`/)
              if last_token_dot
                kind = :operator
              else
                tokens << [:open, :shell]
                kind = :delimiter
                state = patterns::StringState.new :shell, true, match
              end

            elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo :
                                         /#{patterns::GLOBAL_VARIABLE}/o)
              kind = :global_variable

            elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo :
                                         /#{patterns::CLASS_VARIABLE}/o)
              kind = :class_variable

            else
              # Nothing matched. Without encoding-aware strings, first check
              # whether the next character is multi-byte and, if so, retry
              # the whole iteration with the /u patterns.
              if !unicode && !string.respond_to?(:encoding)
                # check for unicode
                debug, $DEBUG = $DEBUG, false
                begin
                  if check(/./mu).size > 1
                    # seems like we should try again with unicode
                    unicode = true
                  end
                rescue
                  # bad unicode char; use getch
                ensure
                  $DEBUG = debug
                end
                next if unicode
              end
              kind = :error
              match = scan(unicode ? /./mu : /./m)

            end

          elsif state == :def_expected
            state = :initial
            if scan(/self\./)
              tokens << ['self', :pre_constant]
              tokens << ['.', :operator]
            end
            if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                      /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
              kind = :method
            else
              next
            end

          elsif state == :module_expected
            if match = scan(/<</)
              kind = :operator
            else
              state = :initial
              if match = scan(unicode ? /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/uo :
                                        /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/o)
                kind = :class
              else
                next
              end
            end

          elsif state == :undef_expected
            state = :undef_comma_expected
            if match = scan(unicode ? /#{patterns::METHOD_NAME_EX}/uo :
                                      /#{patterns::METHOD_NAME_EX}/o)
              kind = :method
            elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                         /#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
                # After the quoted symbol, continue the undef argument list.
                state.next_state = :undef_comma_expected
              else
                kind = :symbol
              end
            else
              state = :initial
              next
            end

          elsif state == :alias_expected
            match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
                                   /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)

            if match
              tokens << [self[1], (self[1][0] == ?: ? :symbol : :method)]
              tokens << [self[2], :space]
              tokens << [self[3], (self[3][0] == ?: ? :symbol : :method)]
            end
            state = :initial
            next

          elsif state == :undef_comma_expected
            if match = scan(/,/)
              kind = :operator
              state = :undef_expected
            else
              state = :initial
              next
            end

          end
# }}}

          # Collapse the :set markers: a flag set by this token stays truthy
          # for the next token only; anything else is reset to false.
          unless kind == :error
            if value_expected = value_expected == :set
              value_expected = :expect_colon if match == '?' || match == 'when'
            end
            last_token_dot = last_token_dot == :set
          end

          if $CODERAY_DEBUG and not kind
            raise_inspect 'Error token %p in line %d' %
              [[match, kind], line], tokens, state
          end
          raise_inspect 'Empty token', tokens unless match

          tokens << [match, kind]

          if last_state
            state = last_state
            last_state = nil
          end
        end
      end

      # End of input: close every region that is still open. Stack entries
      # with more than one element come from inline blocks and need an extra
      # [:close, :inline] before the enclosing string is closed.
      inline_block_stack << [state] if state.is_a? patterns::StringState
      until inline_block_stack.empty?
        this_block = inline_block_stack.pop
        tokens << [:close, :inline] if this_block.size > 1
        state = this_block.first
        tokens << [:close, state.type]
      end

      tokens
    end

  end

end
end
Chris@210 443
Chris@210 444 # vim:fdm=marker