module CodeRay
module Scanners

  # Scanner for Python. Supports Python 3.
  #
  # Based on pygments' PythonLexer, see
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
  class Python < Scanner

    register_for :python
    file_extension 'py'

    # Reserved words; registered in IDENT_KIND as :keyword.
    KEYWORDS = [
      'and', 'as', 'assert', 'break', 'class', 'continue', 'def',
      'del', 'elif', 'else', 'except', 'finally', 'for',
      'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
      'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
      'nonlocal',  # new in Python 3
    ]  # :nodoc:

    # Words that are only keywords in older Pythons. scan_tokens treats them
    # as plain identifiers when they are directly followed by "(", and as
    # keywords otherwise.
    OLD_KEYWORDS = [
      'exec', 'print',  # gone in Python 3
    ]  # :nodoc:

    # Built-in functions and types; registered in IDENT_KIND as :predefined.
    PREDEFINED_METHODS_AND_TYPES = %w[
      __import__ abs all any apply basestring bin bool buffer
      bytearray bytes callable chr classmethod cmp coerce compile
      complex delattr dict dir divmod enumerate eval execfile exit
      file filter float frozenset getattr globals hasattr hash hex id
      input int intern isinstance issubclass iter len list locals
      long map max min next object oct open ord pow property range
      raw_input reduce reload repr reversed round set setattr slice
      sorted staticmethod str sum super tuple type unichr unicode
      vars xrange zip
    ]  # :nodoc:

    # Built-in exception and warning names; registered in IDENT_KIND as
    # :exception. This list is added last, so its kind wins for names that
    # also appear in another list (presumably later WordList#add calls
    # override earlier ones -- confirm against WordList).
    PREDEFINED_EXCEPTIONS = %w[
      ArithmeticError AssertionError AttributeError
      BaseException DeprecationWarning EOFError EnvironmentError
      Exception FloatingPointError FutureWarning GeneratorExit IOError
      ImportError ImportWarning IndentationError IndexError KeyError
      KeyboardInterrupt LookupError MemoryError NameError
      NotImplemented NotImplementedError OSError OverflowError
      OverflowWarning PendingDeprecationWarning ReferenceError
      RuntimeError RuntimeWarning StandardError StopIteration
      SyntaxError SyntaxWarning SystemError SystemExit TabError
      TypeError UnboundLocalError UnicodeDecodeError
      UnicodeEncodeError UnicodeError UnicodeTranslateError
      UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
    ]  # :nodoc:

    # Well-known names; registered in IDENT_KIND as :predefined_constant.
    PREDEFINED_VARIABLES_AND_CONSTANTS = [
      'False', 'True', 'None',  # "keywords" since Python 3
      'self', 'Ellipsis', 'NotImplemented',
    ]  # :nodoc:

    # Lookup table from an identifier to its token kind. It is created with
    # :ident, which is presumably the kind returned for unlisted words.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(OLD_KEYWORDS, :old_keyword).
      add(PREDEFINED_METHODS_AND_TYPES, :predefined).
      add(PREDEFINED_VARIABLES_AND_CONSTANTS, :predefined_constant).
      add(PREDEFINED_EXCEPTIONS, :exception)  # :nodoc:

    # An identifier: a word character that is not a digit, then word characters.
    NAME = / [^\W\d] \w* /x  # :nodoc:
    # What may follow a backslash in a non-raw string to form an escape.
    ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x  # :nodoc:
    # What may follow a backslash to form a \u, \U or \N{...} escape.
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x  # :nodoc:

    OPERATOR = /
      \.\.\. |          # ellipsis
      \.(?!\d) |        # dot but not decimal point
      [,;:()\[\]{}] |   # simple delimiters
      \/\/=? | \*\*=? | # special math
      [-+*\/%&|^]=? |   # ordinary math and binary logic
      [~`] |            # binary complement and inspection
      <<=? | >>=? | [<>=]=? | !=  # comparison and assignment
    /x  # :nodoc:

    # Memoizing table: string delimiter => regexp matching exactly that
    # delimiter.
    STRING_DELIMITER_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = Regexp.union delimiter  # :nodoc:
    }

    # Memoizing table: string delimiter => regexp matching the shortest
    # non-empty run of characters (no backslash, no newline) that is followed
    # by a backslash, an end of line, or the delimiter.
    STRING_CONTENT_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x  # :nodoc:
    }

    # Scanner state to switch to after a keyword. Created with :initial,
    # which is presumably what every keyword not listed here maps to.
    DEF_NEW_STATE = WordList.new(:initial).
      add(%w(def), :def_expected).
      add(%w(import from), :include_expected).
      add(%w(class), :class_expected)  # :nodoc:

    # A dotted name, or "*", as used in import statements.
    DESCRIPTOR = /
      #{NAME}
      (?: \. #{NAME} )*
      | \*
    /x  # :nodoc:

    # Optional indentation followed by the start of a triple-quoted string.
    DOCSTRING_COMING = /
      [ \t]* u?r? ("""|''')
    /x  # :nodoc:

  protected

    # Tokenizes the scanner's input and sends the tokens to +encoder+.
    #
    # The loop is a small state machine:
    # :initial::          ordinary code
    # :string::           inside a string literal (a token group is open)
    # :def_expected::     directly after +def+; the next name is a :method
    # :class_expected::   directly after +class+; the next name is a :class
    # :include_expected:: inside an +import+ / +from+ statement
    #
    # Whitespace, newlines and comments are handled the same way in every
    # state except :string.
    #
    # +options+ is accepted for interface compatibility; it is not read here.
    #
    # Returns +encoder+.
    def scan_tokens encoder, options

      state = :initial
      string_delimiter = nil
      string_raw = false
      string_type = nil
      docstring_coming = match?(/#{DOCSTRING_COMING}/o)
      last_token_dot = false
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      # Keywords (:from, :import, :as) seen so far in the current import
      # statement.
      from_import_state = []

      until eos?

        if state == :string
          if match = scan(STRING_DELIMITER_REGEXP[string_delimiter])
            encoder.text_token match, :delimiter
            encoder.end_group string_type
            string_type = nil
            state = :initial
            next
          elsif string_delimiter.size == 3 && match = scan(/\n/)
            # Only triple-quoted strings may span lines.
            encoder.text_token match, :content
          elsif match = scan(STRING_CONTENT_REGEXP[string_delimiter])
            encoder.text_token match, :content
          elsif !string_raw && match = scan(/ \\ #{ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ #{UNICODE_ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ . /x)
            encoder.text_token match, :content
          elsif match = scan(/ \\ | $ /x)
            # Unterminated string: close the group and go back to code.
            encoder.end_group string_type
            string_type = nil
            # "$" matches without consuming anything; do not emit an empty
            # error token in that case.
            encoder.text_token match, :error unless match.empty?
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), encoder, state
          end

        elsif match = scan(/ [ \t]+ | \\?\n /x)
          encoder.text_token match, :space
          if match == "\n"
            if state == :include_expected
              # The import statement ends with its line (a backslash
              # continuation does not get here); forget its keywords so they
              # neither accumulate nor leak into the next import statement.
              state = :initial
              from_import_state = []
            end
            docstring_coming = true if match?(/#{DOCSTRING_COMING}/o)
          end
          next

        elsif match = scan(/ \# [^\n]* /mx)
          encoder.text_token match, :comment
          next

        elsif state == :initial

          if match = scan(/#{OPERATOR}/o)
            encoder.text_token match, :operator

          elsif match = scan(/(u?r?|b)?("""|"|'''|')/i)
            string_delimiter = self[2]
            string_type = docstring_coming ? :docstring : :string
            docstring_coming = false if docstring_coming
            encoder.begin_group string_type
            string_raw = false
            modifiers = self[1]
            unless modifiers.empty?
              # The prefix is matched case-insensitively, so the raw flag
              # must be detected case-insensitively as well (r"..." / R"...").
              string_raw = modifiers.downcase.include?('r')
              encoder.text_token modifiers, :modifier
              match = string_delimiter
            end
            state = :string
            encoder.text_token match, :delimiter

          # TODO: backticks

          elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = IDENT_KIND[match]
            # TODO: keyword arguments
            kind = :ident if last_token_dot  # attribute access: never special
            if kind == :old_keyword
              # print(...) / exec(...) are function calls in Python 3.
              kind = check(/\(/) ? :ident : :keyword
            elsif kind == :predefined && check(/ *=/)
              # A builtin name followed by "=" is being used as a variable or
              # keyword argument name.
              kind = :ident
            elsif kind == :keyword
              state = DEF_NEW_STATE[match]
              from_import_state << match.to_sym if state == :include_expected
            end
            encoder.text_token match, kind

          elsif match = scan(/@[a-zA-Z0-9_.]+[lL]?/)
            encoder.text_token match, :decorator

          elsif match = scan(/0[xX][0-9A-Fa-f]+[lL]?/)
            encoder.text_token match, :hex

          elsif match = scan(/0[bB][01]+[lL]?/)
            encoder.text_token match, :binary

          elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
            if scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :float
            end

          elsif match = scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
            encoder.text_token match, :octal

          elsif match = scan(/\d+([lL])?/)
            # A "long" suffix and an imaginary suffix exclude each other.
            if self[1] == nil && scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :integer
            end

          else
            encoder.text_token getch, :error

          end

        elsif state == :def_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :method
          else
            next
          end

        elsif state == :class_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :class
          else
            next
          end

        elsif state == :include_expected
          if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
            if match == 'as'
              encoder.text_token match, :keyword
              from_import_state << :as
            elsif from_import_state.first == :from && match == 'import'
              encoder.text_token match, :keyword
              from_import_state << :import
            elsif from_import_state.last == :as
              # encoder.text_token match, match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
              encoder.text_token match, :ident
              from_import_state.pop
            elsif IDENT_KIND[match] == :keyword
              # Some other keyword ends the import statement: put it back and
              # let the :initial state tokenize it.
              unscan
              match = nil
              from_import_state = []
              state = :initial
              next
            else
              encoder.text_token match, :include
            end
          elsif match = scan(/,/)
            from_import_state.pop if from_import_state.last == :as
            encoder.text_token match, :operator
          else
            from_import_state = []
            state = :initial
            next
          end

        else
          raise_inspect 'Unknown state', encoder, state

        end

        last_token_dot = match == '.'

      end

      # Input ended inside a string: close the group that is still open.
      if state == :string
        encoder.end_group string_type
      end

      encoder
    end

  end

end
end