Chris@210: # encoding: utf-8 Chris@210: module CodeRay Chris@210: module Scanners Chris@210: Chris@210: # This scanner is really complex, since Ruby _is_ a complex language! Chris@210: # Chris@210: # It tries to highlight 100% of all common code, Chris@210: # and 90% of strange codes. Chris@210: # Chris@210: # It is optimized for HTML highlighting, and is not very useful for Chris@210: # parsing or pretty printing. Chris@210: # Chris@210: # For now, I think it's better than the scanners in VIM or Syntax, or Chris@210: # any highlighter I was able to find, except Caleb's RubyLexer. Chris@210: # Chris@210: # I hope it's also better than the rdoc/irb lexer. Chris@210: class Ruby < Scanner Chris@210: Chris@210: include Streamable Chris@210: Chris@210: register_for :ruby Chris@210: file_extension 'rb' Chris@210: Chris@210: helper :patterns Chris@210: Chris@210: if not defined? EncodingError Chris@210: EncodingError = Class.new Exception Chris@210: end Chris@210: Chris@210: private Chris@210: def scan_tokens tokens, options Chris@210: if string.respond_to?(:encoding) Chris@210: unless string.encoding == Encoding::UTF_8 Chris@210: self.string = string.encode Encoding::UTF_8, Chris@210: :invalid => :replace, :undef => :replace, :replace => '?' Chris@210: end Chris@210: unicode = false Chris@210: else Chris@210: unicode = exist?(/[^\x00-\x7f]/) Chris@210: end Chris@210: Chris@210: last_token_dot = false Chris@210: value_expected = true Chris@210: heredocs = nil Chris@210: last_state = nil Chris@210: state = :initial Chris@210: depth = nil Chris@210: inline_block_stack = [] Chris@210: Chris@210: Chris@210: patterns = Patterns # avoid constant lookup Chris@210: Chris@210: until eos? Chris@210: match = nil Chris@210: kind = nil Chris@210: Chris@210: if state.instance_of? patterns::StringState Chris@210: # {{{ Chris@210: match = scan_until(state.pattern) || scan_until(/\z/) Chris@210: tokens << [match, :content] unless match.empty? Chris@210: break if eos? Chris@210: Chris@210: if state.heredoc and self[1] # end of heredoc Chris@210: match = getch.to_s Chris@210: match << scan_until(/$/) unless eos? Chris@210: tokens << [match, :delimiter] Chris@210: tokens << [:close, state.type] Chris@210: state = state.next_state Chris@210: next Chris@210: end Chris@210: Chris@210: case match = getch Chris@210: Chris@210: when state.delim Chris@210: if state.paren Chris@210: state.paren_depth -= 1 Chris@210: if state.paren_depth > 0 Chris@210: tokens << [match, :nesting_delimiter] Chris@210: next Chris@210: end Chris@210: end Chris@210: tokens << [match, :delimiter] Chris@210: if state.type == :regexp and not eos? Chris@210: modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox) Chris@210: tokens << [modifiers, :modifier] unless modifiers.empty? Chris@210: end Chris@210: tokens << [:close, state.type] Chris@210: value_expected = false Chris@210: state = state.next_state Chris@210: Chris@210: when '\\' Chris@210: if state.interpreted Chris@210: if esc = scan(/ #{patterns::ESCAPE} /ox) Chris@210: tokens << [match + esc, :char] Chris@210: else Chris@210: tokens << [match, :error] Chris@210: end Chris@210: else Chris@210: case m = getch Chris@210: when state.delim, '\\' Chris@210: tokens << [match + m, :char] Chris@210: when nil Chris@210: tokens << [match, :error] Chris@210: else Chris@210: tokens << [match + m, :content] Chris@210: end Chris@210: end Chris@210: Chris@210: when '#' Chris@210: case peek(1) Chris@210: when '{' Chris@210: inline_block_stack << [state, depth, heredocs] Chris@210: value_expected = true Chris@210: state = :initial Chris@210: depth = 1 Chris@210: tokens << [:open, :inline] Chris@210: tokens << [match + getch, :inline_delimiter] Chris@210: when '$', '@' Chris@210: tokens << [match, :escape] Chris@210: last_state = state # scan one token as normal code, then return here Chris@210: state = :initial Chris@210: else Chris@210: raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens Chris@210: end Chris@210: Chris@210: when state.paren Chris@210: state.paren_depth += 1 Chris@210: tokens << [match, :nesting_delimiter] Chris@210: Chris@210: when /#{patterns::REGEXP_SYMBOLS}/ox Chris@210: tokens << [match, :function] Chris@210: Chris@210: else Chris@210: raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens Chris@210: Chris@210: end Chris@210: next Chris@210: # }}} Chris@210: else Chris@210: # {{{ Chris@210: if match = scan(/[ \t\f]+/) Chris@210: kind = :space Chris@210: match << scan(/\s*/) unless eos? || heredocs Chris@210: value_expected = true if match.index(?\n) Chris@210: tokens << [match, kind] Chris@210: next Chris@210: Chris@210: elsif match = scan(/\\?\n/) Chris@210: kind = :space Chris@210: if match == "\n" Chris@210: value_expected = true Chris@210: state = :initial if state == :undef_comma_expected Chris@210: end Chris@210: if heredocs Chris@210: unscan # heredoc scanning needs \n at start Chris@210: state = heredocs.shift Chris@210: tokens << [:open, state.type] Chris@210: heredocs = nil if heredocs.empty? Chris@210: next Chris@210: else Chris@210: match << scan(/\s*/) unless eos? Chris@210: end Chris@210: tokens << [match, kind] Chris@210: next Chris@210: Chris@210: elsif bol? && match = scan(/\#!.*/) Chris@210: tokens << [match, :doctype] Chris@210: next Chris@210: Chris@210: elsif match = scan(/\#.*/) or Chris@210: ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) ) Chris@210: kind = :comment Chris@210: tokens << [match, kind] Chris@210: next Chris@210: Chris@210: elsif state == :initial Chris@210: Chris@210: # IDENTS # Chris@210: if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo : Chris@210: /#{patterns::METHOD_NAME}/o) Chris@210: if last_token_dot Chris@210: kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end Chris@210: else Chris@210: if value_expected != :expect_colon && scan(/:(?= )/) Chris@210: tokens << [match, :key] Chris@210: match = ':' Chris@210: kind = :operator Chris@210: else Chris@210: kind = patterns::IDENT_KIND[match] Chris@210: if kind == :ident Chris@210: if match[/\A[A-Z]/] and not match[/[!?]$/] and not match?(/\(/) Chris@210: kind = :constant Chris@210: end Chris@210: elsif kind == :reserved Chris@210: state = patterns::DEF_NEW_STATE[match] Chris@210: value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match] Chris@210: end Chris@210: end Chris@210: end Chris@210: value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o) Chris@210: Chris@210: elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o) Chris@210: kind = :ident Chris@210: value_expected = :set if check(unicode ? /#{patterns::VALUE_FOLLOWS}/uo : Chris@210: /#{patterns::VALUE_FOLLOWS}/o) Chris@210: Chris@210: # OPERATORS # Chris@210: elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x) Chris@210: if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/ Chris@210: value_expected = :set Chris@210: end Chris@210: last_token_dot = :set if self[1] Chris@210: kind = :operator Chris@210: unless inline_block_stack.empty? Chris@210: case match Chris@210: when '{' Chris@210: depth += 1 Chris@210: when '}' Chris@210: depth -= 1 Chris@210: if depth == 0 # closing brace of inline block reached Chris@210: state, depth, heredocs = inline_block_stack.pop Chris@210: heredocs = nil if heredocs && heredocs.empty? Chris@210: tokens << [match, :inline_delimiter] Chris@210: kind = :inline Chris@210: match = :close Chris@210: end Chris@210: end Chris@210: end Chris@210: Chris@210: elsif match = scan(/ ['"] /mx) Chris@210: tokens << [:open, :string] Chris@210: kind = :delimiter Chris@210: state = patterns::StringState.new :string, match == '"', match # important for streaming Chris@210: Chris@210: elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo : Chris@210: /#{patterns::INSTANCE_VARIABLE}/o) Chris@210: kind = :instance_variable Chris@210: Chris@210: elsif value_expected and match = scan(/\//) Chris@210: tokens << [:open, :regexp] Chris@210: kind = :delimiter Chris@210: interpreted = true Chris@210: state = patterns::StringState.new :regexp, interpreted, match Chris@210: Chris@210: # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o) Chris@210: elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o) Chris@210: kind = self[1] ? :float : :integer Chris@210: Chris@210: elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo : Chris@210: /#{patterns::SYMBOL}/o) Chris@210: case delim = match[1] Chris@210: when ?', ?" Chris@210: tokens << [:open, :symbol] Chris@210: tokens << [':', :symbol] Chris@210: match = delim.chr Chris@210: kind = :delimiter Chris@210: state = patterns::StringState.new :symbol, delim == ?", match Chris@210: else Chris@210: kind = :symbol Chris@210: end Chris@210: Chris@210: elsif match = scan(/ -[>=]? | [+!~^]=? | [*|&]{1,2}=? | >>? /x) Chris@210: value_expected = :set Chris@210: kind = :operator Chris@210: Chris@210: elsif value_expected and match = scan(unicode ? /#{patterns::HEREDOC_OPEN}/uo : Chris@210: /#{patterns::HEREDOC_OPEN}/o) Chris@210: indented = self[1] == '-' Chris@210: quote = self[3] Chris@210: delim = self[quote ? 4 : 2] Chris@210: kind = patterns::QUOTE_TO_TYPE[quote] Chris@210: tokens << [:open, kind] Chris@210: tokens << [match, :delimiter] Chris@210: match = :close Chris@210: heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart ) Chris@210: heredocs ||= [] # create heredocs if empty Chris@210: heredocs << heredoc Chris@210: Chris@210: elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o) Chris@210: kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do Chris@210: raise_inspect 'Unknown fancy string: %%%p' % k, tokens Chris@210: end Chris@210: tokens << [:open, kind] Chris@210: state = patterns::StringState.new kind, interpreted, self[2] Chris@210: kind = :delimiter Chris@210: Chris@210: elsif value_expected and match = scan(unicode ? /#{patterns::CHARACTER}/uo : Chris@210: /#{patterns::CHARACTER}/o) Chris@210: kind = :integer Chris@210: Chris@210: elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x) Chris@210: value_expected = :set Chris@210: kind = :operator Chris@210: Chris@210: elsif match = scan(/`/) Chris@210: if last_token_dot Chris@210: kind = :operator Chris@210: else Chris@210: tokens << [:open, :shell] Chris@210: kind = :delimiter Chris@210: state = patterns::StringState.new :shell, true, match Chris@210: end Chris@210: Chris@210: elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo : Chris@210: /#{patterns::GLOBAL_VARIABLE}/o) Chris@210: kind = :global_variable Chris@210: Chris@210: elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo : Chris@210: /#{patterns::CLASS_VARIABLE}/o) Chris@210: kind = :class_variable Chris@210: Chris@210: else Chris@210: if !unicode && !string.respond_to?(:encoding) Chris@210: # check for unicode Chris@210: debug, $DEBUG = $DEBUG, false Chris@210: begin Chris@210: if check(/./mu).size > 1 Chris@210: # seems like we should try again with unicode Chris@210: unicode = true Chris@210: end Chris@210: rescue Chris@210: # bad unicode char; use getch Chris@210: ensure Chris@210: $DEBUG = debug Chris@210: end Chris@210: next if unicode Chris@210: end Chris@210: kind = :error Chris@210: match = scan(unicode ? /./mu : /./m) Chris@210: Chris@210: end Chris@210: Chris@210: elsif state == :def_expected Chris@210: state = :initial Chris@210: if scan(/self\./) Chris@210: tokens << ['self', :pre_constant] Chris@210: tokens << ['.', :operator] Chris@210: end Chris@210: if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo : Chris@210: /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o) Chris@210: kind = :method Chris@210: else Chris@210: next Chris@210: end Chris@210: Chris@210: elsif state == :module_expected Chris@210: if match = scan(/< 1 Chris@210: state = this_block.first Chris@210: tokens << [:close, state.type] Chris@210: end Chris@210: Chris@210: tokens Chris@210: end Chris@210: Chris@210: end Chris@210: Chris@210: end Chris@210: end Chris@210: Chris@210: # vim:fdm=marker