module CodeRay
module Scanners

  # This scanner is really complex, since Ruby _is_ a complex language!
  #
  # It tries to highlight 100% of all common code,
  # and 90% of strange code.
  #
  # It is optimized for HTML highlighting, and is not very useful for
  # parsing or pretty printing.
  #
  # For now, I think it's better than the scanners in VIM or Syntax, or
  # any highlighter I was able to find, except Caleb's RubyLexer.
  #
  # I hope it's also better than the rdoc/irb lexer.
  class Ruby < Scanner

    include Streamable

    register_for :ruby
    file_extension 'rb'

    helper :patterns

    # Define EncodingError only when the running Ruby does not provide it.
    # NOTE(review): Ruby's built-in EncodingError descends from StandardError,
    # while this fallback descends from Exception - confirm before relying on
    # a bare +rescue+ to catch it.
    if not defined? EncodingError
      EncodingError = Class.new Exception
    end

  private
    # Tokenizes the scanner's input and appends the result to +tokens+.
    #
    # Every token is pushed as a two-element array: either [text, kind], or
    # an [:open, kind] / [:close, kind] marker pair bracketing a string-like
    # region (string, regexp, symbol, shell command, heredoc, or an inline
    # "#{...}" block).
    #
    # The loop has two modes, selected by +state+:
    # * +state+ is a Patterns::StringState: we are inside a string-like
    #   literal and scan content, escapes, delimiters and interpolation.
    # * +state+ is a Symbol (:initial, :def_expected, :module_expected,
    #   :undef_expected, :undef_comma_expected, :alias_expected): we are
    #   scanning ordinary code.
    #
    # tokens::  object responding to <<; receives the tokens.
    # options:: not used by this method.
    #
    # Returns +tokens+. Regions still open at end of input are closed with
    # [:close, ...] markers before returning.
    def scan_tokens tokens, options
      # true/:set right after a "." or "::" - the next token is a method name.
      last_token_dot = false
      # true/:set where a value may start (decides between e.g. regexp and
      # division, heredoc and <<, signed number and binary operator).
      value_expected = true
      # Heredoc states announced on the current line; they are entered when
      # the next line break is reached.
      heredocs = nil
      # String state to return to after one token of normal code ("#$x", "#@x").
      last_state = nil
      state = :initial
      # Brace nesting depth inside the current inline "#{...}" block.
      depth = nil
      # Saved [state, depth, heredocs] of the strings enclosing inline blocks.
      inline_block_stack = []
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'

      patterns = Patterns  # avoid constant lookup

      until eos?
        match = nil
        kind = nil

        if state.instance_of? patterns::StringState
# {{{
          # Consume plain content up to the next interesting position; if
          # there is none, take everything up to the end of the input.
          match = scan_until(state.pattern) || scan_until(/\z/)
          tokens << [match, :content] unless match.empty?
          break if eos?

          if state.heredoc and self[1]  # end of heredoc
            match = getch.to_s
            match << scan_until(/$/) unless eos?
            tokens << [match, :delimiter]
            tokens << [:close, state.type]
            state = state.next_state
            next
          end

          case match = getch

          when state.delim
            if state.paren
              # Closing half of a nesting delimiter pair: only the outermost
              # one terminates the literal.
              state.paren_depth -= 1
              if state.paren_depth > 0
                tokens << [match, :nesting_delimiter]
                next
              end
            end
            tokens << [match, :delimiter]
            if state.type == :regexp and not eos?
              modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox)
              # scan returns nil when nothing matches; guard against that
              # instead of calling empty? on nil.
              tokens << [modifiers, :modifier] if modifiers && !modifiers.empty?
            end
            tokens << [:close, state.type]
            value_expected = false
            state = state.next_state

          when '\\'
            if state.interpreted
              if esc = scan(/ #{patterns::ESCAPE} /ox)
                tokens << [match + esc, :char]
              else
                tokens << [match, :error]
              end
            else
              # Non-interpreted literal: only the delimiter and the backslash
              # itself are reported as escapes (:char).
              case m = getch
              when state.delim, '\\'
                tokens << [match + m, :char]
              when nil
                tokens << [match, :error]
              else
                tokens << [match + m, :content]
              end
            end

          when '#'
            case peek(1)
            when '{'
              # Start of an inline block: remember where we are and scan the
              # block contents as normal code until the matching "}".
              inline_block_stack << [state, depth, heredocs]
              value_expected = true
              state = :initial
              depth = 1
              tokens << [:open, :inline]
              tokens << [match + getch, :inline_delimiter]
            when '$', '@'
              tokens << [match, :escape]
              last_state = state  # scan one token as normal code, then return here
              state = :initial
            else
              raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens
            end

          when state.paren
            # Opening half of a nesting delimiter pair inside the literal.
            state.paren_depth += 1
            tokens << [match, :nesting_delimiter]

          when /#{patterns::REGEXP_SYMBOLS}/ox
            tokens << [match, :function]

          else
            raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens

          end
          next
# }}}
        else
# {{{
          if match = scan(/[ \t\f]+/)
            kind = :space
            # Do not swallow the line break while heredocs are pending: the
            # newline branch below has to see it.
            match << scan(/\s*/) unless eos? || heredocs
            value_expected = true if match.index(?\n)
            tokens << [match, kind]
            next

          elsif match = scan(/\\?\n/)
            kind = :space
            if match == "\n"
              value_expected = true
              state = :initial if state == :undef_comma_expected
            end
            if heredocs
              unscan  # heredoc scanning needs \n at start
              state = heredocs.shift
              tokens << [:open, state.type]
              heredocs = nil if heredocs.empty?
              next
            else
              match << scan(/\s*/) unless eos?
            end
            tokens << [match, kind]
            next

          elsif bol? && match = scan(/\#!.*/)
            tokens << [match, :doctype]
            next

          elsif match = scan(/\#.*/) or
            ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) )
              kind = :comment
              tokens << [match, kind]
              next

          elsif state == :initial

            # IDENTS #
            if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
                                      /#{patterns::METHOD_NAME}/o)
              if last_token_dot
                kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end
              else
                kind = patterns::IDENT_KIND[match]
                if kind == :ident and match[/^[A-Z]/] and not match[/[!?]$/] and not match?(/\(/)
                  kind = :constant
                elsif kind == :reserved
                  state = patterns::DEF_NEW_STATE[match]
                  value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match]
                end
              end
              value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o)

            elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o)
              kind = :ident
              value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o)

            # OPERATORS #
            elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x)
              if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/
                value_expected = :set
              end
              # The empty capture group only participates for "." and "::".
              last_token_dot = :set if self[1]
              kind = :operator
              unless inline_block_stack.empty?
                case match
                when '{'
                  depth += 1
                when '}'
                  depth -= 1
                  if depth == 0  # closing brace of inline block reached
                    state, depth, heredocs = inline_block_stack.pop
                    heredocs = nil if heredocs && heredocs.empty?
                    tokens << [match, :inline_delimiter]
                    kind = :inline
                    match = :close
                  end
                end
              end

            elsif match = scan(/ ['"] /mx)
              tokens << [:open, :string]
              kind = :delimiter
              state = patterns::StringState.new :string, match == '"', match  # important for streaming

            elsif match = scan(/#{patterns::INSTANCE_VARIABLE}/o)
              kind = :instance_variable

            elsif value_expected and match = scan(/\//)
              tokens << [:open, :regexp]
              kind = :delimiter
              interpreted = true
              state = patterns::StringState.new :regexp, interpreted, match

            # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o)
            elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o)
              kind = self[1] ? :float : :integer

            elsif match = scan(/#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
              else
                kind = :symbol
              end

            elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /x)
              value_expected = :set
              kind = :operator

            elsif value_expected and match = scan(/#{patterns::HEREDOC_OPEN}/o)
              indented = self[1] == '-'
              quote = self[3]
              delim = self[quote ? 4 : 2]
              kind = patterns::QUOTE_TO_TYPE[quote]
              tokens << [:open, kind]
              tokens << [match, :delimiter]
              match = :close
              heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart )
              heredocs ||= []  # create heredocs if empty
              heredocs << heredoc

            elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o)
              # fetch yields the missing key to the block. The original code
              # referenced an undefined local +k+ here, so an unknown type
              # raised NameError instead of the intended diagnostic.
              kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do |k|
                raise_inspect 'Unknown fancy string: %%%p' % [k], tokens
              end
              tokens << [:open, kind]
              state = patterns::StringState.new kind, interpreted, self[2]
              kind = :delimiter

            elsif value_expected and match = scan(/#{patterns::CHARACTER}/o)
              kind = :integer

            elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x)
              value_expected = :set
              kind = :operator

            elsif match = scan(/`/)
              if last_token_dot
                kind = :operator
              else
                tokens << [:open, :shell]
                kind = :delimiter
                state = patterns::StringState.new :shell, true, match
              end

            elsif match = scan(/#{patterns::GLOBAL_VARIABLE}/o)
              kind = :global_variable

            elsif match = scan(/#{patterns::CLASS_VARIABLE}/o)
              kind = :class_variable

            else
              if !unicode
                # check for unicode
                # $DEBUG is switched off around the probe and restored in
                # +ensure+ (presumably to keep the rescued exception from
                # being reported in debug mode - TODO confirm).
                debug, $DEBUG = $DEBUG, false
                begin
                  if check(/./mu).size > 1
                    # seems like we should try again with unicode
                    unicode = true
                  end
                rescue
                  # bad unicode char; use getch
                ensure
                  $DEBUG = debug
                end
                # Retry the same position with the unicode-aware patterns.
                next if unicode
              end
              kind = :error
              match = getch

            end

          elsif state == :def_expected
            state = :initial
            if scan(/self\./)
              tokens << ['self', :pre_constant]
              tokens << ['.', :operator]
            end
            if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                      /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
              kind = :method
            else
              next
            end

          elsif state == :module_expected
            if match = scan(/<</)
              kind = :operator
            else
              state = :initial
              if match = scan(/ (?:#{patterns::IDENT}::)* #{patterns::IDENT} /ox)
                kind = :class
              else
                next
              end
            end

          elsif state == :undef_expected
            state = :undef_comma_expected
            if match = scan(/#{patterns::METHOD_NAME_EX}/o)
              kind = :method
            elsif match = scan(/#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                tokens << [:open, :symbol]
                tokens << [':', :symbol]
                match = delim.chr
                kind = :delimiter
                state = patterns::StringState.new :symbol, delim == ?", match
                # After the quoted symbol, continue the undef argument list.
                state.next_state = :undef_comma_expected
              else
                kind = :symbol
              end
            else
              state = :initial
              next
            end

          elsif state == :alias_expected
            match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
                                   /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)

            if match
              tokens << [self[1], (self[1][0] == ?: ? :symbol : :method)]
              tokens << [self[2], :space]
              tokens << [self[3], (self[3][0] == ?: ? :symbol : :method)]
            end
            state = :initial
            next

          elsif state == :undef_comma_expected
            if match = scan(/,/)
              kind = :operator
              state = :undef_expected
            else
              state = :initial
              next
            end

          end
# }}}

          # Collapse the one-shot :set markers: a flag set while scanning this
          # token stays true for the next token only. Error tokens leave the
          # flags untouched.
          unless kind == :error
            value_expected = value_expected == :set
            last_token_dot = last_token_dot == :set
          end

          if $CODERAY_DEBUG and not kind
            raise_inspect 'Error token %p in line %d' %
              [[match, kind], line], tokens, state
          end
          raise_inspect 'Empty token', tokens unless match

          tokens << [match, kind]

          # Return to the enclosing string after the single code token that
          # followed "#$" / "#@".
          if last_state
            state = last_state
            last_state = nil
          end
        end
      end

      # End of input: close whatever is still open, innermost first. Stack
      # entries with more than one element were pushed by an inline block, so
      # they also need a closing :inline marker.
      inline_block_stack << [state] if state.is_a? patterns::StringState
      until inline_block_stack.empty?
        this_block = inline_block_stack.pop
        tokens << [:close, :inline] if this_block.size > 1
        state = this_block.first
        tokens << [:close, state.type]
      end

      tokens
    end

  end

end
end

# vim:fdm=marker