# encoding: utf-8
module CodeRay
module Scanners

  # This scanner is really complex, since Ruby _is_ a complex language!
  #
  # It tries to highlight 100% of all common code,
  # and 90% of strange code.
  #
  # It is optimized for HTML highlighting, and is not very useful for
  # parsing or pretty printing.
  #
  # For now, I think it's better than the scanners in VIM or Syntax, or
  # any highlighter I was able to find, except Caleb's RubyLexer.
  #
  # I hope it's also better than the rdoc/irb lexer.
  class Ruby < Scanner

    include Streamable

    register_for :ruby
    file_extension 'rb'

    helper :patterns

    # Fallback definition for interpreters that do not provide EncodingError.
    if not defined? EncodingError
      EncodingError = Class.new Exception
    end

  private
    # Tokenizes the scanner's input and appends the result to +tokens+.
    #
    # Two kinds of entries are appended:
    # * <tt>[text, kind]</tt> pairs for ordinary tokens, and
    # * <tt>[:open, type]</tt> / <tt>[:close, type]</tt> markers that bracket
    #   strings, regexps, symbols, shell commands, heredocs and inline
    #   (<tt>#{...}</tt>) blocks.
    #
    # tokens::  the token collection to append to (anything responding to <tt><<</tt>).
    # options:: not read anywhere in this method.
    #
    # Returns +tokens+.
    #
    # The scanner is a state machine. +state+ is either one of the symbols
    # :initial, :def_expected, :module_expected, :undef_expected,
    # :undef_comma_expected, :alias_expected, or a Patterns::StringState
    # instance while the inside of a string-like literal is being scanned.
    def scan_tokens tokens, options
      if string.respond_to?(:encoding)
        # Encoding-aware strings: normalize to UTF-8, replacing anything that
        # cannot be converted with '?'. The /u regexp variants are not used.
        unless string.encoding == Encoding::UTF_8
          self.string = string.encode Encoding::UTF_8,
            :invalid => :replace, :undef => :replace, :replace => '?'
        end
        unicode = false
      else
        # No encoding support: use the /u regexp variants as soon as the input
        # contains a non-ASCII byte.
        unicode = exist?(/[^\x00-\x7f]/)
      end

      last_token_dot = false   # true right after a '.' or '::' operator
      value_expected = true    # true / false / :set / :expect_colon
      heredocs = nil           # heredoc states waiting for the next newline
      last_state = nil         # string state to return to after #$var / #@var
      state = :initial
      depth = nil              # brace nesting depth inside the current #{...}
      inline_block_stack = []  # saved [state, depth, heredocs] per open #{...}


      patterns = Patterns  # avoid constant lookup

      until eos?
        match = nil
        kind = nil

        if state.instance_of? patterns::StringState
# {{{
          # Inside a string-like literal: consume plain content up to the next
          # interesting character (or to the end of input).
          match = scan_until(state.pattern) || scan_until(/\z/)
          tokens << [match, :content] unless match.empty?
          break if eos?

          if state.heredoc and self[1]  # end of heredoc
            match = getch.to_s
            match << scan_until(/$/) unless eos?
            tokens << [match, :delimiter]
            tokens << [:close, state.type]
            state = state.next_state
            next
          end

          case match = getch

          when state.delim
            if state.paren
              # Paired delimiters nest; only the outermost one closes the literal.
              state.paren_depth -= 1
              if state.paren_depth > 0
                tokens << [match, :nesting_delimiter]
                next
              end
            end
            tokens << [match, :delimiter]
            if state.type == :regexp and not eos?
              modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox)
              tokens << [modifiers, :modifier] unless modifiers.empty?
            end
            tokens << [:close, state.type]
            value_expected = false
            state = state.next_state

          when '\\'
            if state.interpreted
              if esc = scan(/ #{patterns::ESCAPE} /ox)
                tokens << [match + esc, :char]
              else
                tokens << [match, :error]
              end
            else
              # Non-interpreted literal: only the delimiter and the backslash
              # itself are treated as escapes.
              case m = getch
              when state.delim, '\\'
                tokens << [match + m, :char]
              when nil
                tokens << [match, :error]
              else
                tokens << [match + m, :content]
              end
            end

          when '#'
            case peek(1)
            when '{'
              # Start of an inline block: remember where we are and scan the
              # embedded code as normal Ruby until the matching '}'.
              inline_block_stack << [state, depth, heredocs]
              value_expected = true
              state = :initial
              depth = 1
              tokens << [:open, :inline]
              tokens << [match + getch, :inline_delimiter]
            when '$', '@'
              tokens << [match, :escape]
              last_state = state  # scan one token as normal code, then return here
              state = :initial
            else
              raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens
            end

          when state.paren
            state.paren_depth += 1
            tokens << [match, :nesting_delimiter]

          when /#{patterns::REGEXP_SYMBOLS}/ox
            tokens << [match, :function]

          else
            raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens

          end
          next
# }}}
        else
# {{{
          if match = scan(/[ \t\f]+/)
            kind = :space
            # Do not swallow the newline while heredocs are pending: the
            # newline branch below has to see it in order to open them.
            match << scan(/\s*/) unless eos? || heredocs
            value_expected = true if match.index(?\n)
            tokens << [match, kind]
            next

          elsif match = scan(/\\?\n/)
            kind = :space
            if match == "\n"
              value_expected = true
              state = :initial if state == :undef_comma_expected
            end
            if heredocs
              unscan  # heredoc scanning needs \n at start
              state = heredocs.shift
              tokens << [:open, state.type]
              heredocs = nil if heredocs.empty?
              next
            else
              match << scan(/\s*/) unless eos?
            end
            tokens << [match, kind]
            next

          elsif bol? && match = scan(/\#!.*/)
            tokens << [match, :doctype]
            next

          elsif match = scan(/\#.*/) or
            ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) )
              kind = :comment
              tokens << [match, kind]
              next

          elsif state == :initial

            # IDENTS #
            if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
                                      /#{patterns::METHOD_NAME}/o)
              if last_token_dot
                # After '.' or '::': a capitalized name is a constant unless it
                # is directly followed by '('.
                kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end
              else
                if value_expected != :expect_colon && scan(/:(?= )/)
                  # "name: " is emitted as a key token followed by the ':' operator.
                  tokens << [match, :key]
                  match = ':'
                  kind = :operator
                else
                  kind = patterns::IDENT_KIND[match]
                  if kind == :ident
                    if match[/\A[A-Z]/] and not match[/[!?]$/] and not match?(/\(/)
                      kind = :constant
                    end
                  elsif kind == :reserved
                    state = patterns::DEF_NEW_STATE[match]
                    value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match]
                  end
                end
              end
              value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o)

            elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o)
              kind = :ident
              value_expected = :set if check(unicode ? /#{patterns::VALUE_FOLLOWS}/uo :
                                                       /#{patterns::VALUE_FOLLOWS}/o)

            # OPERATORS #
            elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x)
              if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/
                value_expected = :set
              end
              # Group 1 is the empty capture that only participates for '.' and '::'.
              last_token_dot = :set if self[1]
              kind = :operator
              unless inline_block_stack.empty?
                case match
                when '{'
                  depth += 1
                when '}'
                  depth -= 1
                  if depth == 0  # closing brace of inline block reached
                    state, depth, heredocs = inline_block_stack.pop
                    heredocs = nil if heredocs && heredocs.empty?
                    tokens << [match, :inline_delimiter]
                    kind = :inline
                    match = :close
                  end
                end
              end

            elsif match = scan(/ ['"] /mx)
              tokens << [:open, :string]
              kind = :delimiter
              state = patterns::StringState.new :string, match == '"', match  # important for streaming

            elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo :
                                         /#{patterns::INSTANCE_VARIABLE}/o)
              kind = :instance_variable

            elsif value_expected and match = scan(/\//)
              tokens << [:open, :regexp]
              kind = :delimiter
              interpreted = true
              state = patterns::StringState.new :regexp, interpreted, match

            # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o)
            elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o)
              # NOTE(review): group 1 of NUMERIC presumably marks a float - confirm in Patterns.
              kind = self[1] ? :float : :integer

            elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                         /#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                # Quoted symbol: emit the ':' and continue in string state.
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
              else
                kind = :symbol
              end

            elsif match = scan(/ -[>=]? | [+!~^]=? | [*|&]{1,2}=? | >>? /x)
              value_expected = :set
              kind = :operator

            elsif value_expected and match = scan(unicode ? /#{patterns::HEREDOC_OPEN}/uo :
                                                            /#{patterns::HEREDOC_OPEN}/o)
              # NOTE(review): capture group layout is defined by HEREDOC_OPEN in Patterns - confirm.
              indented = self[1] == '-'
              quote = self[3]
              delim = self[quote ? 4 : 2]
              kind = patterns::QUOTE_TO_TYPE[quote]
              tokens << [:open, kind]
              tokens << [match, :delimiter]
              match = :close
              # The heredoc body only starts at the next newline; queue its state.
              heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart )
              heredocs ||= []  # create heredocs if empty
              heredocs << heredoc

            elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o)
              # Keep the type key in a local so the error message below can
              # name it (the previous code referenced an undefined variable).
              fancy_key = self[1]
              kind, interpreted = *patterns::FancyStringType.fetch(fancy_key) do
                raise_inspect 'Unknown fancy string: %%%p' % [fancy_key], tokens
              end
              tokens << [:open, kind]
              state = patterns::StringState.new kind, interpreted, self[2]
              kind = :delimiter

            elsif value_expected and match = scan(unicode ? /#{patterns::CHARACTER}/uo :
                                                            /#{patterns::CHARACTER}/o)
              kind = :integer

            elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x)
              value_expected = :set
              kind = :operator

            elsif match = scan(/`/)
              if last_token_dot
                kind = :operator
              else
                tokens << [:open, :shell]
                kind = :delimiter
                state = patterns::StringState.new :shell, true, match
              end

            elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo :
                                         /#{patterns::GLOBAL_VARIABLE}/o)
              kind = :global_variable

            elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo :
                                         /#{patterns::CLASS_VARIABLE}/o)
              kind = :class_variable

            else
              if !unicode && !string.respond_to?(:encoding)
                # check for unicode
                # $DEBUG is switched off while probing and restored in ensure.
                debug, $DEBUG = $DEBUG, false
                begin
                  if check(/./mu).size > 1
                    # seems like we should try again with unicode
                    unicode = true
                  end
                rescue
                  # bad unicode char; use getch
                ensure
                  $DEBUG = debug
                end
                next if unicode
              end
              # Nothing matched: emit a single character as an error token.
              kind = :error
              match = scan(unicode ? /./mu : /./m)

            end

          elsif state == :def_expected
            state = :initial
            if scan(/self\./)
              tokens << ['self', :pre_constant]
              tokens << ['.', :operator]
            end
            if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                      /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
              kind = :method
            else
              next
            end

          elsif state == :module_expected
            if match = scan(/<</)
              kind = :operator
            else
              state = :initial
              if match = scan(unicode ? /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/uo :
                                        /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/o)
                kind = :class
              else
                next
              end
            end

          elsif state == :undef_expected
            state = :undef_comma_expected
            if match = scan(unicode ? /#{patterns::METHOD_NAME_EX}/uo :
                                      /#{patterns::METHOD_NAME_EX}/o)
              kind = :method
            elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                         /#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
                # After the quoted symbol closes, continue the undef list.
                state.next_state = :undef_comma_expected
              else
                kind = :symbol
              end
            else
              state = :initial
              next
            end

          elsif state == :alias_expected
            match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
                                   /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)

            if match
              tokens << [self[1], (self[1][0] == ?: ? :symbol : :method)]
              tokens << [self[2], :space]
              tokens << [self[3], (self[3][0] == ?: ? :symbol : :method)]
            end
            state = :initial
            next

          elsif state == :undef_comma_expected
            if match = scan(/,/)
              kind = :operator
              state = :undef_expected
            else
              state = :initial
              next
            end

          end
# }}}

          unless kind == :error
            # Collapse the one-shot :set markers into plain booleans for the
            # next token; '?' and 'when' additionally switch to :expect_colon.
            if value_expected = value_expected == :set
              value_expected = :expect_colon if match == '?' || match == 'when'
            end
            last_token_dot = last_token_dot == :set
          end

          if $CODERAY_DEBUG and not kind
            raise_inspect 'Error token %p in line %d' %
              [[match, kind], line], tokens, state
          end
          raise_inspect 'Empty token', tokens unless match

          tokens << [match, kind]

          if last_state
            # Return to the string that contained the #$var / #@var escape.
            state = last_state
            last_state = nil
          end
        end
      end

      # End of input: close any string literal and inline blocks still open.
      inline_block_stack << [state] if state.is_a? patterns::StringState
      until inline_block_stack.empty?
        this_block = inline_block_stack.pop
        # Entries pushed for #{...} have three elements; the one added just
        # above for a dangling string has only one.
        tokens << [:close, :inline] if this_block.size > 1
        state = this_block.first
        tokens << [:close, state.type]
      end

      tokens
    end

  end

end
end

# vim:fdm=marker