module CodeRay
module Scanners

  # Scanner for Python. Supports Python 3.
  #
  # Based on pygments' PythonLexer, see
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
  #
  # The scanner is a small state machine driven by #scan_tokens. Its states are
  # :initial, :string, :def_expected, :class_expected and :include_expected.
  class Python < Scanner

    register_for :python
    file_extension 'py'

    KEYWORDS = [
      'and', 'as', 'assert', 'break', 'class', 'continue', 'def',
      'del', 'elif', 'else', 'except', 'finally', 'for',
      'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
      'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
      'nonlocal',  # new in Python 3
    ]  # :nodoc:

    OLD_KEYWORDS = [
      'exec', 'print',  # gone in Python 3
    ]  # :nodoc:

    PREDEFINED_METHODS_AND_TYPES = %w[
      __import__ abs all any apply basestring bin bool buffer
      bytearray bytes callable chr classmethod cmp coerce compile
      complex delattr dict dir divmod enumerate eval execfile exit
      file filter float frozenset getattr globals hasattr hash hex id
      input int intern isinstance issubclass iter len list locals
      long map max min next object oct open ord pow property range
      raw_input reduce reload repr reversed round set setattr slice
      sorted staticmethod str sum super tuple type unichr unicode
      vars xrange zip
    ]  # :nodoc:

    PREDEFINED_EXCEPTIONS = %w[
      ArithmeticError AssertionError AttributeError
      BaseException DeprecationWarning EOFError EnvironmentError
      Exception FloatingPointError FutureWarning GeneratorExit IOError
      ImportError ImportWarning IndentationError IndexError KeyError
      KeyboardInterrupt LookupError MemoryError NameError
      NotImplemented NotImplementedError OSError OverflowError
      OverflowWarning PendingDeprecationWarning ReferenceError
      RuntimeError RuntimeWarning StandardError StopIteration
      SyntaxError SyntaxWarning SystemError SystemExit TabError
      TypeError UnboundLocalError UnicodeDecodeError
      UnicodeEncodeError UnicodeError UnicodeTranslateError
      UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
    ]  # :nodoc:

    PREDEFINED_VARIABLES_AND_CONSTANTS = [
      'False', 'True', 'None',  # "keywords" since Python 3
      'self', 'Ellipsis', 'NotImplemented',
    ]  # :nodoc:

    # Maps an identifier to its token kind; the WordList is created with
    # :ident as its argument for words that are not listed.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(OLD_KEYWORDS, :old_keyword).
      add(PREDEFINED_METHODS_AND_TYPES, :predefined).
      add(PREDEFINED_VARIABLES_AND_CONSTANTS, :predefined_constant).
      add(PREDEFINED_EXCEPTIONS, :exception)  # :nodoc:

    # An identifier: a word character that is not a digit, then word characters.
    NAME = / [^\W\d] \w* /x  # :nodoc:
    # What may follow a backslash in a non-raw string (backslash not included).
    ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x  # :nodoc:
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x  # :nodoc:

    OPERATOR = /
      \.\.\. |          # ellipsis
      \.(?!\d) |        # dot but not decimal point
      [,;:()\[\]{}] |   # simple delimiters
      \/\/=? | \*\*=? | # special math
      [-+*\/%&|^]=? |   # ordinary math and binary logic
      [~`] |            # binary complement and inspection
      <<=? | >>=? | [<>=]=? | !=  # comparison and assignment
    /x  # :nodoc:

    # Lazily built, memoized regexp matching the closing delimiter of a string.
    STRING_DELIMITER_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = Regexp.union delimiter  # :nodoc:
    }

    # Lazily built, memoized regexp matching plain string content up to the
    # next backslash, end of line, or occurrence of the delimiter.
    STRING_CONTENT_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x  # :nodoc:
    }

    # State to enter after a keyword; :initial for keywords not listed here.
    DEF_NEW_STATE = WordList.new(:initial).
      add(%w(def), :def_expected).
      add(%w(import from), :include_expected).
      add(%w(class), :class_expected)  # :nodoc:

    # A dotted module path, or the "*" wildcard, as used in import statements.
    DESCRIPTOR = /
      #{NAME}
      (?: \. #{NAME} )*
      | \*
    /x  # :nodoc:

    # Optional indentation and string prefix followed by a triple quote.
    DOCSTRING_COMING = /
      [ \t]* u?r? ("""|''')
    /x  # :nodoc:

  protected

    # Scans the whole input and reports every token to +encoder+.
    #
    # encoder:: receives +text_token+, +begin_group+ and +end_group+ calls.
    # options:: accepted for interface compatibility; not read by this method.
    #
    # Returns +encoder+.
    def scan_tokens encoder, options

      state = :initial
      string_delimiter = nil
      string_raw = false
      string_type = nil
      docstring_coming = match?(/#{DOCSTRING_COMING}/o)
      last_token_dot = false
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      # Keywords seen so far in the current import statement (:from, :import, :as).
      from_import_state = []

      until eos?

        if state == :string
          if match = scan(STRING_DELIMITER_REGEXP[string_delimiter])
            encoder.text_token match, :delimiter
            encoder.end_group string_type
            string_type = nil
            state = :initial
            next
          elsif string_delimiter.size == 3 && match = scan(/\n/)
            # Only triple-quoted strings may span lines.
            encoder.text_token match, :content
          elsif match = scan(STRING_CONTENT_REGEXP[string_delimiter])
            encoder.text_token match, :content
          elsif !string_raw && match = scan(/ \\ #{ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ #{UNICODE_ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ . /x)
            encoder.text_token match, :content
          elsif match = scan(/ \\ | $ /x)
            # Unterminated string: close the group and resume normal scanning.
            encoder.end_group string_type
            string_type = nil
            # `$` matches the empty string at end of line; do not emit a
            # zero-length error token in that case.
            encoder.text_token match, :error unless match.empty?
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), encoder, state
          end

        elsif match = scan(/ [ \t]+ | \\?\n /x)
          encoder.text_token match, :space
          # A backslash-newline is a continuation and does not end the statement.
          if match == "\n"
            if state == :include_expected
              state = :initial
              # The import statement is over; drop its bookkeeping so it
              # neither accumulates nor leaks into the next statement.
              from_import_state = []
            end
            docstring_coming = true if match?(/#{DOCSTRING_COMING}/o)
          end
          next

        elsif match = scan(/ \# [^\n]* /mx)
          encoder.text_token match, :comment
          next

        elsif state == :initial

          if match = scan(/#{OPERATOR}/o)
            encoder.text_token match, :operator

          elsif match = scan(/(u?r?|b)?("""|"|'''|')/i)
            string_delimiter = self[2]
            string_type = docstring_coming ? :docstring : :string
            docstring_coming = false if docstring_coming
            encoder.begin_group string_type
            string_raw = false
            modifiers = self[1]
            unless modifiers.empty?
              # The prefix is matched case-insensitively, so the raw check
              # must accept both "r" and "R".
              string_raw = modifiers.downcase.include?('r')
              encoder.text_token modifiers, :modifier
              match = string_delimiter
            end
            state = :string
            encoder.text_token match, :delimiter

          # TODO: backticks

          elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = IDENT_KIND[match]
            # TODO: keyword arguments
            # Anything right after a dot is an attribute name, never a keyword.
            kind = :ident if last_token_dot
            if kind == :old_keyword
              # print(...) / exec(...) look like ordinary calls.
              kind = check(/\(/) ? :ident : :keyword
            elsif kind == :predefined && check(/ *=/)
              kind = :ident
            elsif kind == :keyword
              state = DEF_NEW_STATE[match]
              from_import_state << match.to_sym if state == :include_expected
            end
            encoder.text_token match, kind

          elsif match = scan(/@[a-zA-Z0-9_.]+[lL]?/)
            encoder.text_token match, :decorator

          elsif match = scan(/0[xX][0-9A-Fa-f]+[lL]?/)
            encoder.text_token match, :hex

          elsif match = scan(/0[bB][01]+[lL]?/)
            encoder.text_token match, :binary

          elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
            if scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :float
            end

          elsif match = scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
            encoder.text_token match, :octal

          elsif match = scan(/\d+([lL])?/)
            # A "j" suffix is only taken when no "l"/"L" suffix was matched.
            if self[1] == nil && scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :integer
            end

          else
            encoder.text_token getch, :error

          end

        elsif state == :def_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :method
          else
            next
          end

        elsif state == :class_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :class
          else
            next
          end

        elsif state == :include_expected
          if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
            if match == 'as'
              encoder.text_token match, :keyword
              from_import_state << :as
            elsif from_import_state.first == :from && match == 'import'
              encoder.text_token match, :keyword
              from_import_state << :import
            elsif from_import_state.last == :as
              # encoder.text_token match, match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
              encoder.text_token match, :ident
              from_import_state.pop
            elsif IDENT_KIND[match] == :keyword
              # Not part of the import: hand the keyword back to :initial.
              unscan
              match = nil
              state = :initial
              next
            else
              encoder.text_token match, :include
            end
          elsif match = scan(/,/)
            from_import_state.pop if from_import_state.last == :as
            encoder.text_token match, :operator
          else
            from_import_state = []
            state = :initial
            next
          end

        else
          raise_inspect 'Unknown state', encoder, state

        end

        # Not reached by the branches above that end in `next`.
        last_token_dot = match == '.'

      end

      # Close a string group left open by end of input.
      if state == :string
        encoder.end_group string_type
      end

      encoder
    end

  end

end
end