annotate vendor/gems/coderay-1.0.0/lib/coderay/scanners/python.rb @ 1171:b4558bc5837f bug_505

Close obsolete branch bug_505
author Chris Cannam
date Fri, 03 Aug 2012 19:40:23 +0100
parents cbb26bc654de
children
rev   line source
module CodeRay
module Scanners

  # Scanner for Python. Supports Python 3.
  #
  # Based on pygments' PythonLexer, see
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
  #
  # The scanner is a small state machine driven by #scan_tokens. Its states are:
  #
  # :initial::          ordinary code
  # :string::           inside a string or docstring literal
  # :def_expected::     directly after +def+; the next name is a method name
  # :class_expected::   directly after +class+; the next name is a class name
  # :include_expected:: inside an +import+ / +from+ statement
  class Python < Scanner

    register_for :python
    file_extension 'py'

    KEYWORDS = [
      'and', 'as', 'assert', 'break', 'class', 'continue', 'def',
      'del', 'elif', 'else', 'except', 'finally', 'for',
      'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
      'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
      'nonlocal',  # new in Python 3
    ]  # :nodoc:

    OLD_KEYWORDS = [
      'exec', 'print',  # gone in Python 3
    ]  # :nodoc:

    PREDEFINED_METHODS_AND_TYPES = %w[
      __import__ abs all any apply basestring bin bool buffer
      bytearray bytes callable chr classmethod cmp coerce compile
      complex delattr dict dir divmod enumerate eval execfile exit
      file filter float frozenset getattr globals hasattr hash hex id
      input int intern isinstance issubclass iter len list locals
      long map max min next object oct open ord pow property range
      raw_input reduce reload repr reversed round set setattr slice
      sorted staticmethod str sum super tuple type unichr unicode
      vars xrange zip
    ]  # :nodoc:

    # NotImplemented is deliberately absent here: it is a constant, not an
    # exception, and is listed in PREDEFINED_VARIABLES_AND_CONSTANTS. Because
    # the exceptions are added to IDENT_KIND last, listing it here as well
    # would override its :predefined_constant kind.
    PREDEFINED_EXCEPTIONS = %w[
      ArithmeticError AssertionError AttributeError
      BaseException DeprecationWarning EOFError EnvironmentError
      Exception FloatingPointError FutureWarning GeneratorExit IOError
      ImportError ImportWarning IndentationError IndexError KeyError
      KeyboardInterrupt LookupError MemoryError NameError
      NotImplementedError OSError OverflowError
      OverflowWarning PendingDeprecationWarning ReferenceError
      RuntimeError RuntimeWarning StandardError StopIteration
      SyntaxError SyntaxWarning SystemError SystemExit TabError
      TypeError UnboundLocalError UnicodeDecodeError
      UnicodeEncodeError UnicodeError UnicodeTranslateError
      UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
    ]  # :nodoc:

    PREDEFINED_VARIABLES_AND_CONSTANTS = [
      'False', 'True', 'None',  # "keywords" since Python 3
      'self', 'Ellipsis', 'NotImplemented',
    ]  # :nodoc:

    # Maps a word to its token kind; unknown words are plain :ident.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(OLD_KEYWORDS, :old_keyword).
      add(PREDEFINED_METHODS_AND_TYPES, :predefined).
      add(PREDEFINED_VARIABLES_AND_CONSTANTS, :predefined_constant).
      add(PREDEFINED_EXCEPTIONS, :exception)  # :nodoc:

    NAME = / [^\W\d] \w* /x  # :nodoc:
    ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x  # :nodoc:
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x  # :nodoc:

    OPERATOR = /
      \.\.\. |          # ellipsis
      \.(?!\d) |        # dot but not decimal point
      [,;:()\[\]{}] |   # simple delimiters
      \/\/=? | \*\*=? | # special math
      [-+*\/%&|^]=? |   # ordinary math and binary logic
      [~`] |            # binary complement and inspection
      <<=? | >>=? | [<>=]=? | !=  # comparison and assignment
    /x  # :nodoc:

    # Lazily built cache: string delimiter => regexp matching that delimiter.
    STRING_DELIMITER_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = Regexp.union delimiter  # :nodoc:
    }

    # Lazily built cache: string delimiter => regexp matching a run of plain
    # content up to the next backslash, end of line, or delimiter.
    STRING_CONTENT_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x  # :nodoc:
    }

    # Maps a keyword to the state the scanner enters after it.
    DEF_NEW_STATE = WordList.new(:initial).
      add(%w(def), :def_expected).
      add(%w(import from), :include_expected).
      add(%w(class), :class_expected)  # :nodoc:

    DESCRIPTOR = /
      #{NAME}
      (?: \. #{NAME} )*
      | \*
    /x  # :nodoc:

    DOCSTRING_COMING = /
      [ \t]* u?r? ("""|''')
    /x  # :nodoc:

  protected

    # Tokenizes the scanner's input and sends the tokens to +encoder+.
    #
    # +options+ is accepted for interface compatibility; it is not read here.
    #
    # Returns +encoder+.
    def scan_tokens encoder, options

      state = :initial
      string_delimiter = nil
      string_raw = false
      string_type = nil
      docstring_coming = match?(/#{DOCSTRING_COMING}/o)
      last_token_dot = false
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      # Stack of :from / :import / :as markers for the current import statement.
      from_import_state = []

      until eos?

        if state == :string
          if match = scan(STRING_DELIMITER_REGEXP[string_delimiter])
            encoder.text_token match, :delimiter
            encoder.end_group string_type
            string_type = nil
            state = :initial
            next
          elsif string_delimiter.size == 3 && match = scan(/\n/)
            # Only triple-quoted strings may span lines.
            encoder.text_token match, :content
          elsif match = scan(STRING_CONTENT_REGEXP[string_delimiter])
            encoder.text_token match, :content
          elsif !string_raw && match = scan(/ \\ #{ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ #{UNICODE_ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ . /x)
            encoder.text_token match, :content
          elsif match = scan(/ \\ | $ /x)
            # Stray backslash or unterminated single-line string.
            encoder.end_group string_type
            string_type = nil
            encoder.text_token match, :error
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), encoder, state
          end

        elsif match = scan(/ [ \t]+ | \\?\n /x)
          encoder.text_token match, :space
          if match == "\n"
            if state == :include_expected
              # The import statement ends here; drop its bookkeeping so it
              # cannot leak into the next statement.
              state = :initial
              from_import_state = []
            end
            docstring_coming = true if match?(/#{DOCSTRING_COMING}/o)
          end
          next

        elsif match = scan(/ \# [^\n]* /mx)
          encoder.text_token match, :comment
          next

        elsif state == :initial

          if match = scan(/#{OPERATOR}/o)
            encoder.text_token match, :operator

          elsif match = scan(/(u?r?|br?|rb)?("""|"|'''|')/i)
            # Group 1 is the (possibly empty) prefix, group 2 the delimiter.
            string_delimiter = self[2]
            string_type = docstring_coming ? :docstring : :string
            docstring_coming = false
            encoder.begin_group string_type
            string_raw = false
            modifiers = self[1]
            unless modifiers.empty?
              # The prefix is matched case-insensitively, so "R" is raw too.
              string_raw = !!(modifiers =~ /r/i)
              encoder.text_token modifiers, :modifier
              match = string_delimiter
            end
            state = :string
            encoder.text_token match, :delimiter

          # TODO: backticks

          elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = IDENT_KIND[match]
            # TODO: keyword arguments
            # Anything after a dot is an attribute name, never a keyword.
            kind = :ident if last_token_dot
            if kind == :old_keyword
              # print(...) / exec(...) look like Python 3 function calls.
              kind = check(/\(/) ? :ident : :keyword
            elsif kind == :predefined && check(/ *=(?!=)/)
              # A builtin name being assigned to is an ordinary identifier;
              # a comparison with "==" does not count as assignment.
              kind = :ident
            elsif kind == :keyword
              state = DEF_NEW_STATE[match]
              from_import_state << match.to_sym if state == :include_expected
            end
            encoder.text_token match, kind

          elsif match = scan(/@[a-zA-Z0-9_.]+[lL]?/)
            encoder.text_token match, :decorator

          elsif match = scan(/0[xX][0-9A-Fa-f]+[lL]?/)
            encoder.text_token match, :hex

          elsif match = scan(/0[bB][01]+[lL]?/)
            encoder.text_token match, :binary

          elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
            if scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :float
            end

          elsif match = scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
            encoder.text_token match, :octal

          elsif match = scan(/\d+([lL])?/)
            # A long suffix and an imaginary suffix are mutually exclusive.
            if self[1].nil? && scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :integer
            end

          else
            encoder.text_token getch, :error

          end

        elsif state == :def_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :method
          else
            next
          end

        elsif state == :class_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :class
          else
            next
          end

        elsif state == :include_expected
          if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
            if match == 'as'
              encoder.text_token match, :keyword
              from_import_state << :as
            elsif from_import_state.first == :from && match == 'import'
              encoder.text_token match, :keyword
              from_import_state << :import
            elsif from_import_state.last == :as
              # encoder.text_token match, match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
              encoder.text_token match, :ident
              from_import_state.pop
            elsif IDENT_KIND[match] == :keyword
              # Any other keyword ends the import statement; hand it back so
              # it is re-scanned as ordinary code.
              unscan
              from_import_state = []
              state = :initial
              next
            else
              encoder.text_token match, :include
            end
          elsif match = scan(/,/)
            from_import_state.pop if from_import_state.last == :as
            encoder.text_token match, :operator
          else
            from_import_state = []
            state = :initial
            next
          end

        else
          raise_inspect 'Unknown state', encoder, state

        end

        # Remember whether the token just emitted was the "." operator.
        # Pieces of a string literal never count, even if the piece is ".".
        last_token_dot = match == '.' && state != :string

      end

      if state == :string
        encoder.end_group string_type
      end

      encoder
    end

  end

end
end