annotate vendor/plugins/coderay-0.9.2/lib/coderay/scanners/python.rb @ 36:de76cd3e8c8e cc-branches

* Probably abortive experiments in extracting the branch from Hg
author Chris Cannam <chris.cannam@soundsoftware.ac.uk>
date Wed, 20 Oct 2010 10:07:29 +0100
parents 513646585e45
children
rev   line source
module CodeRay
module Scanners

  # Scanner for Python source code.
  #
  # Based on pygments' PythonLexer, see
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
  class Python < Scanner

    include Streamable

    register_for :python
    file_extension 'py'

    # Words classified as :keyword.
    KEYWORDS = [
      'and', 'as', 'assert', 'break', 'class', 'continue', 'def',
      'del', 'elif', 'else', 'except', 'finally', 'for',
      'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
      'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
      'nonlocal',  # new in Python 3
    ]

    # Words that are keywords only when not used like a function call
    # (see the :old_keyword handling in #scan_tokens).
    OLD_KEYWORDS = [
      'exec', 'print',  # gone in Python 3
    ]

    # Names classified as :predefined.
    PREDEFINED_METHODS_AND_TYPES = %w[
      __import__ abs all any apply basestring bin bool buffer
      bytearray bytes callable chr classmethod cmp coerce compile
      complex delattr dict dir divmod enumerate eval execfile exit
      file filter float frozenset getattr globals hasattr hash hex id
      input int intern isinstance issubclass iter len list locals
      long map max min next object oct open ord pow property range
      raw_input reduce reload repr reversed round set setattr slice
      sorted staticmethod str sum super tuple type unichr unicode
      vars xrange zip
    ]

    # Names classified as :exception.
    PREDEFINED_EXCEPTIONS = %w[
      ArithmeticError AssertionError AttributeError
      BaseException DeprecationWarning EOFError EnvironmentError
      Exception FloatingPointError FutureWarning GeneratorExit IOError
      ImportError ImportWarning IndentationError IndexError KeyError
      KeyboardInterrupt LookupError MemoryError NameError
      NotImplemented NotImplementedError OSError OverflowError
      OverflowWarning PendingDeprecationWarning ReferenceError
      RuntimeError RuntimeWarning StandardError StopIteration
      SyntaxError SyntaxWarning SystemError SystemExit TabError
      TypeError UnboundLocalError UnicodeDecodeError
      UnicodeEncodeError UnicodeError UnicodeTranslateError
      UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
    ]

    # Names classified as :pre_constant.
    PREDEFINED_VARIABLES_AND_CONSTANTS = [
      'False', 'True', 'None',  # "keywords" since Python 3
      'self', 'Ellipsis', 'NotImplemented',
    ]

    # Maps an identifier to its token kind; unknown words map to :ident.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(OLD_KEYWORDS, :old_keyword).
      add(PREDEFINED_METHODS_AND_TYPES, :predefined).
      add(PREDEFINED_VARIABLES_AND_CONSTANTS, :pre_constant).
      add(PREDEFINED_EXCEPTIONS, :exception)

    # An identifier: a word character that is not a digit, then word characters.
    NAME = / [^\W\d] \w* /x
    # What may follow a backslash in a non-raw string to form an escape.
    ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x
    # What may follow a backslash to form a unicode escape.
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x

    OPERATOR = /
      \.\.\. |  # ellipsis
      \.(?!\d) |  # dot but not decimal point
      [,;:()\[\]{}] |  # simple delimiters
      \/\/=? | \*\*=? |  # special math
      [-+*\/%&|^]=? |  # ordinary math and binary logic
      [~`] |  # binary complement and inspection
      <<=? | >>=? | [<>=]=? | !=  # comparison and assignment
    /x

    # Lazily built cache: delimiter string => regexp matching that delimiter.
    STRING_DELIMITER_REGEXP = Hash.new do |h, delimiter|
      h[delimiter] = Regexp.union delimiter
    end

    # Lazily built cache: delimiter string => regexp matching plain string
    # content up to (not including) a backslash, line end or the delimiter.
    STRING_CONTENT_REGEXP = Hash.new do |h, delimiter|
      h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x
    end

    # Scanner state to enter after a given keyword; :initial for all others.
    DEF_NEW_STATE = WordList.new(:initial).
      add(%w(def), :def_expected).
      add(%w(import from), :include_expected).
      add(%w(class), :class_expected)

    # A (possibly dotted) name or a lone "*", as seen in import statements.
    DESCRIPTOR = /
      #{NAME}
      (?: \. #{NAME} )*
      | \*
    /x

    # Tokenizes the scanner's input and appends the tokens to +tokens+.
    #
    # tokens::  collection receiving <tt>[text, kind]</tt> pairs (and
    #           <tt>[:open, :string]</tt> / <tt>[:close, :string]</tt>
    #           markers around string literals) via <tt><<</tt>.
    # options:: not read by this method.
    #
    # Returns +tokens+.
    def scan_tokens tokens, options

      state = :initial
      string_delimiter = nil
      string_raw = false
      import_clause = class_name_follows = last_token_dot = false
      # Use the /u regexp variants when the input reports UTF-8 encoding.
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      # Tracks the keywords seen so far in the current import statement
      # (:from, :import, :as).
      from_import_state = []

      until eos?

        kind = nil
        match = nil

        if state == :string
          if scan(STRING_DELIMITER_REGEXP[string_delimiter])
            tokens << [matched, :delimiter]
            tokens << [:close, :string]
            state = :initial
            next
          elsif string_delimiter.size == 3 && scan(/\n/)
            # Only triple-quoted strings may span lines.
            kind = :content
          elsif scan(STRING_CONTENT_REGEXP[string_delimiter])
            kind = :content
          elsif !string_raw && scan(/ \\ #{ESCAPE} /ox)
            kind = :char
          elsif scan(/ \\ #{UNICODE_ESCAPE} /ox)
            kind = :char
          elsif scan(/ \\ . /x)
            kind = :content
          elsif scan(/ \\ | $ /x)
            # Dangling backslash or line end inside the string: close it
            # and flag the error.
            tokens << [:close, :string]
            kind = :error
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), tokens, state
          end

        elsif match = scan(/ [ \t]+ | \\\n /x)
          tokens << [match, :space]
          next

        elsif match = scan(/\n/)
          tokens << [match, :space]
          if state == :include_expected
            # A newline ends the import statement; also drop the per-statement
            # bookkeeping so it cannot leak into (or pile up across) later
            # import statements.
            state = :initial
            from_import_state = []
          end
          next

        elsif match = scan(/ \# [^\n]* /mx)
          tokens << [match, :comment]
          next

        elsif state == :initial

          if scan(/#{OPERATOR}/o)
            kind = :operator

          elsif match = scan(/(u?r?|b)?("""|"|'''|')/i)
            tokens << [:open, :string]
            string_delimiter = self[2]
            string_raw = false
            modifiers = self[1]
            unless modifiers.empty?
              # The prefix is matched case-insensitively, so the raw flag
              # must be detected for "R" as well as "r".
              string_raw = !!(modifiers =~ /r/i)
              tokens << [modifiers, :modifier]
              match = string_delimiter
            end
            state = :string
            kind = :delimiter

          # TODO: backticks

          elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = IDENT_KIND[match]
            # TODO: keyword arguments
            # Anything directly after a "." is a plain attribute name.
            kind = :ident if last_token_dot
            if kind == :old_keyword
              # Directly followed by "(" it is treated as an ordinary name.
              kind = check(/\(/) ? :ident : :keyword
            elsif kind == :predefined && check(/ *=/)
              # A predefined name followed by "=" is treated as a plain name.
              kind = :ident
            elsif kind == :keyword
              state = DEF_NEW_STATE[match]
              from_import_state << match.to_sym if state == :include_expected
            end

          elsif scan(/@[a-zA-Z0-9_.]+[lL]?/)
            kind = :decorator

          elsif scan(/0[xX][0-9A-Fa-f]+[lL]?/)
            kind = :hex

          elsif scan(/0[bB][01]+[lL]?/)
            kind = :bin

          elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
            kind = :float
            if scan(/[jJ]/)
              match << matched
              kind = :imaginary
            end

          elsif scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
            kind = :oct

          elsif match = scan(/\d+([lL])?/)
            kind = :integer
            # A "j" suffix is only accepted when there was no "l"/"L" suffix.
            if self[1] == nil && scan(/[jJ]/)
              match << matched
              kind = :imaginary
            end

          else
            getch
            kind = :error

          end

        elsif state == :def_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = :method
          else
            next
          end

        elsif state == :class_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = :class
          else
            next
          end

        elsif state == :include_expected
          if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
            kind = :include
            if match == 'as'
              kind = :keyword
              from_import_state << :as
            elsif from_import_state.first == :from && match == 'import'
              kind = :keyword
              from_import_state << :import
            elsif from_import_state.last == :as
              # kind = match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
              kind = :ident
              from_import_state.pop
            elsif IDENT_KIND[match] == :keyword
              # Some other keyword: put it back and let the :initial
              # state handle it.
              unscan
              match = nil
              state = :initial
              next
            end
          elsif match = scan(/,/)
            from_import_state.pop if from_import_state.last == :as
            kind = :operator
          else
            from_import_state = []
            state = :initial
            next
          end

        else
          raise_inspect 'Unknown state', tokens, state

        end

        match ||= matched
        if $CODERAY_DEBUG and not kind
          raise_inspect 'Error token %p in line %d' %
            [[match, kind], line], tokens, state
        end
        raise_inspect 'Empty token', tokens, state unless match

        last_token_dot = match == '.'

        tokens << [match, kind]

      end

      # Close a string left open at the end of the input.
      if state == :string
        tokens << [:close, :string]
      end

      tokens
    end

  end

end
end