module CodeRay
module Scanners

  # Scanner that splits Java source text into [text, kind] tokens.
  #
  # It is registered under the :java language id and works as a two-state
  # machine:
  #
  # * :initial - ordinary code (whitespace, comments, identifiers,
  #   operators, numbers, annotations);
  # * :string  - inside a string or character literal, until the closing
  #   delimiter (or an unterminated escape) is found.
  class Java < Scanner

    include Streamable
    register_for :java
    helper :builtin_types

    # http://java.sun.com/docs/books/tutorial/java/nutsandbolts/_keywords.html
    # NOTE(review): typeof, debugger and export are not Java keywords; they
    # are kept here to preserve existing highlighting - confirm whether they
    # are intentional.
    KEYWORDS = %w[
      assert break case catch continue default do else
      finally for if instanceof import new package
      return switch throw try typeof while
      debugger export
    ]
    RESERVED = %w[ const goto ]
    CONSTANTS = %w[ false null true ]
    MAGIC_VARIABLES = %w[ this super ]
    TYPES = %w[
      boolean byte char class double enum float int interface long
      short void
    ] << '[]'  # because int[] should be highlighted as a type
    DIRECTIVES = %w[
      abstract extends final implements native private protected public
      static strictfp synchronized throws transient volatile
    ]

    # Maps a word to its token kind; words not listed fall back to :ident.
    # The Error/Exception subset of the builtin types is added after the
    # full builtin list (presumably so :exception replaces :pre_type for
    # those names - verify against WordList#add).
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(RESERVED, :reserved).
      add(CONSTANTS, :pre_constant).
      add(MAGIC_VARIABLES, :local_variable).
      add(TYPES, :type).
      add(BuiltinTypes::List, :pre_type).
      add(BuiltinTypes::List.select { |builtin| builtin[/(Error|Exception)$/] }, :exception).
      add(DIRECTIVES, :directive)

    # What may follow a backslash inside a literal to form a recognised escape.
    ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x
    # Plain literal content (no backslash, no closing delimiter), keyed by
    # the character that opened the literal.
    STRING_CONTENT_PATTERN = {
      "'" => /[^\\']+/,
      '"' => /[^\\"]+/,
      '/' => /[^\\\/]+/,
    }
    IDENT = /[a-zA-Z_][A-Za-z_0-9]*/

    # Scans the whole input and appends the resulting tokens to +tokens+.
    #
    # tokens::  collection receiving [text, kind] pairs (and [:open, kind] /
    #           [:close, kind] group markers) through <<.
    # options:: accepted for interface compatibility; not read here.
    #
    # Returns +tokens+.
    def scan_tokens tokens, options

      state = :initial
      string_delimiter = nil
      # import_clause:      we are between an 'import' keyword and the next ';'
      # class_name_follows: the previous word was 'class' or 'interface'
      # last_token_dot:     the previous token was '.', so the next word is a
      #                     member name and must not be treated as a keyword
      import_clause = class_name_follows = last_token_dot = false

      until eos?

        kind = nil
        match = nil

        case state

        when :initial

          if match = scan(/ \s+ | \\\n /x)
            tokens << [match, :space]
            next

          elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx)
            tokens << [match, :comment]
            next

          elsif import_clause && scan(/ #{IDENT} (?: \. #{IDENT} )* /ox)
            # the whole dotted name of an import is one :include token
            kind = :include

          elsif match = scan(/ #{IDENT} | \[\] /ox)
            kind = IDENT_KIND[match]
            if last_token_dot
              kind = :ident
            elsif class_name_follows
              kind = :class
              class_name_follows = false
            else
              import_clause = true if match == 'import'
              class_name_follows = true if match == 'class' || match == 'interface'
            end

          # The shift operators are listed before the single-character class:
          # alternation takes the first alternative that matches, so they
          # would otherwise never be reached and '>>=' would be split into
          # '>' and '>='.
          elsif scan(/ \.(?!\d) | [,?:()\[\]}] | -- | \+\+ | && | \|\| | \*\*=? | <<=? | >>>?=? | [-+*\/%^~&|<>=!]=? /x)
            kind = :operator

          elsif scan(/;/)
            import_clause = false
            kind = :operator

          elsif scan(/\{/)
            class_name_follows = false
            kind = :operator

          elsif check(/[\d.]/)
            if scan(/0[xX][0-9A-Fa-f]+/)
              kind = :hex
            elsif scan(/(?>0[0-7]+)(?![89.eEfF])/)
              kind = :oct
            elsif scan(/\d+[fFdD]|\d*\.\d+(?:[eE][+-]?\d+)?[fFdD]?|\d+[eE][+-]?\d+[fFdD]?/)
              kind = :float
            elsif scan(/\d+[lL]?/)
              kind = :integer
            end

          elsif match = scan(/["']/)
            tokens << [:open, :string]
            state = :string
            string_delimiter = match
            kind = :delimiter

          elsif scan(/ @ #{IDENT} /ox)
            kind = :annotation

          else
            # unknown character: consume it so the loop always advances
            getch
            kind = :error

          end

        when :string
          if scan(STRING_CONTENT_PATTERN[string_delimiter])
            kind = :content
          elsif match = scan(/["'\/]/)
            tokens << [match, :delimiter]
            tokens << [:close, state]
            string_delimiter = nil
            state = :initial
            next
          elsif state == :string && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox))
            if string_delimiter == "'" && !(match == "\\\\" || match == "\\'")
              kind = :content
            else
              kind = :char
            end
          elsif scan(/\\./m)
            kind = :content
          elsif scan(/ \\ | $ /x)
            # Unterminated literal: close the group that was opened with
            # [:open, :string] using the same kind, so open/close stay paired.
            tokens << [:close, state]
            kind = :error
            string_delimiter = nil
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), tokens
          end

        else
          raise_inspect 'Unknown state', tokens

        end

        # branches that did not capture the text themselves rely on the
        # scanner's last match
        match ||= matched
        if $CODERAY_DEBUG and not kind
          raise_inspect 'Error token %p in line %d' %
            [[match, kind], line], tokens
        end
        raise_inspect 'Empty token', tokens unless match

        last_token_dot = match == '.'

        tokens << [match, kind]

      end

      # input ended inside a literal: close the still-open group
      if state == :string
        tokens << [:close, state]
      end

      tokens
    end

  end

end
end