Chris@0: module CodeRay Chris@0: module Scanners Chris@0: Chris@0: # This scanner is really complex, since Ruby _is_ a complex language! Chris@0: # Chris@0: # It tries to highlight 100% of all common code, Chris@0: # and 90% of strange codes. Chris@0: # Chris@0: # It is optimized for HTML highlighting, and is not very useful for Chris@0: # parsing or pretty printing. Chris@0: # Chris@0: # For now, I think it's better than the scanners in VIM or Syntax, or Chris@0: # any highlighter I was able to find, except Caleb's RubyLexer. Chris@0: # Chris@0: # I hope it's also better than the rdoc/irb lexer. Chris@0: class Ruby < Scanner Chris@0: Chris@0: include Streamable Chris@0: Chris@0: register_for :ruby Chris@0: file_extension 'rb' Chris@0: Chris@0: helper :patterns Chris@0: Chris@0: if not defined? EncodingError Chris@0: EncodingError = Class.new Exception Chris@0: end Chris@0: Chris@0: private Chris@0: def scan_tokens tokens, options Chris@0: last_token_dot = false Chris@0: value_expected = true Chris@0: heredocs = nil Chris@0: last_state = nil Chris@0: state = :initial Chris@0: depth = nil Chris@0: inline_block_stack = [] Chris@0: unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8' Chris@0: Chris@0: patterns = Patterns # avoid constant lookup Chris@0: Chris@0: until eos? Chris@0: match = nil Chris@0: kind = nil Chris@0: Chris@0: if state.instance_of? patterns::StringState Chris@0: # {{{ Chris@0: match = scan_until(state.pattern) || scan_until(/\z/) Chris@0: tokens << [match, :content] unless match.empty? Chris@0: break if eos? Chris@0: Chris@0: if state.heredoc and self[1] # end of heredoc Chris@0: match = getch.to_s Chris@0: match << scan_until(/$/) unless eos? Chris@0: tokens << [match, :delimiter] Chris@0: tokens << [:close, state.type] Chris@0: state = state.next_state Chris@0: next Chris@0: end Chris@0: Chris@0: case match = getch Chris@0: Chris@0: when state.delim Chris@0: if state.paren Chris@0: state.paren_depth -= 1 Chris@0: if state.paren_depth > 0 Chris@0: tokens << [match, :nesting_delimiter] Chris@0: next Chris@0: end Chris@0: end Chris@0: tokens << [match, :delimiter] Chris@0: if state.type == :regexp and not eos? Chris@0: modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox) Chris@0: tokens << [modifiers, :modifier] unless modifiers.empty? Chris@0: end Chris@0: tokens << [:close, state.type] Chris@0: value_expected = false Chris@0: state = state.next_state Chris@0: Chris@0: when '\\' Chris@0: if state.interpreted Chris@0: if esc = scan(/ #{patterns::ESCAPE} /ox) Chris@0: tokens << [match + esc, :char] Chris@0: else Chris@0: tokens << [match, :error] Chris@0: end Chris@0: else Chris@0: case m = getch Chris@0: when state.delim, '\\' Chris@0: tokens << [match + m, :char] Chris@0: when nil Chris@0: tokens << [match, :error] Chris@0: else Chris@0: tokens << [match + m, :content] Chris@0: end Chris@0: end Chris@0: Chris@0: when '#' Chris@0: case peek(1) Chris@0: when '{' Chris@0: inline_block_stack << [state, depth, heredocs] Chris@0: value_expected = true Chris@0: state = :initial Chris@0: depth = 1 Chris@0: tokens << [:open, :inline] Chris@0: tokens << [match + getch, :inline_delimiter] Chris@0: when '$', '@' Chris@0: tokens << [match, :escape] Chris@0: last_state = state # scan one token as normal code, then return here Chris@0: state = :initial Chris@0: else Chris@0: raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens Chris@0: end Chris@0: Chris@0: when state.paren Chris@0: state.paren_depth += 1 Chris@0: tokens << [match, :nesting_delimiter] Chris@0: Chris@0: when /#{patterns::REGEXP_SYMBOLS}/ox Chris@0: tokens << [match, :function] Chris@0: Chris@0: else Chris@0: raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens Chris@0: Chris@0: end Chris@0: next Chris@0: # }}} Chris@0: else Chris@0: # {{{ Chris@0: if match = scan(/[ \t\f]+/) Chris@0: kind = :space Chris@0: match << scan(/\s*/) unless eos? || heredocs Chris@0: value_expected = true if match.index(?\n) Chris@0: tokens << [match, kind] Chris@0: next Chris@0: Chris@0: elsif match = scan(/\\?\n/) Chris@0: kind = :space Chris@0: if match == "\n" Chris@0: value_expected = true Chris@0: state = :initial if state == :undef_comma_expected Chris@0: end Chris@0: if heredocs Chris@0: unscan # heredoc scanning needs \n at start Chris@0: state = heredocs.shift Chris@0: tokens << [:open, state.type] Chris@0: heredocs = nil if heredocs.empty? Chris@0: next Chris@0: else Chris@0: match << scan(/\s*/) unless eos? Chris@0: end Chris@0: tokens << [match, kind] Chris@0: next Chris@0: Chris@0: elsif bol? && match = scan(/\#!.*/) Chris@0: tokens << [match, :doctype] Chris@0: next Chris@0: Chris@0: elsif match = scan(/\#.*/) or Chris@0: ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) ) Chris@0: kind = :comment Chris@0: tokens << [match, kind] Chris@0: next Chris@0: Chris@0: elsif state == :initial Chris@0: Chris@0: # IDENTS # Chris@0: if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo : Chris@0: /#{patterns::METHOD_NAME}/o) Chris@0: if last_token_dot Chris@0: kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end Chris@0: else Chris@0: kind = patterns::IDENT_KIND[match] Chris@0: if kind == :ident and match[/^[A-Z]/] and not match[/[!?]$/] and not match?(/\(/) Chris@0: kind = :constant Chris@0: elsif kind == :reserved Chris@0: state = patterns::DEF_NEW_STATE[match] Chris@0: value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match] Chris@0: end Chris@0: end Chris@0: value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o) Chris@0: Chris@0: elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o) Chris@0: kind = :ident Chris@0: value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o) Chris@0: Chris@0: # OPERATORS # Chris@0: elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x) Chris@0: if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/ Chris@0: value_expected = :set Chris@0: end Chris@0: last_token_dot = :set if self[1] Chris@0: kind = :operator Chris@0: unless inline_block_stack.empty? Chris@0: case match Chris@0: when '{' Chris@0: depth += 1 Chris@0: when '}' Chris@0: depth -= 1 Chris@0: if depth == 0 # closing brace of inline block reached Chris@0: state, depth, heredocs = inline_block_stack.pop Chris@0: heredocs = nil if heredocs && heredocs.empty? Chris@0: tokens << [match, :inline_delimiter] Chris@0: kind = :inline Chris@0: match = :close Chris@0: end Chris@0: end Chris@0: end Chris@0: Chris@0: elsif match = scan(/ ['"] /mx) Chris@0: tokens << [:open, :string] Chris@0: kind = :delimiter Chris@0: state = patterns::StringState.new :string, match == '"', match # important for streaming Chris@0: Chris@0: elsif match = scan(/#{patterns::INSTANCE_VARIABLE}/o) Chris@0: kind = :instance_variable Chris@0: Chris@0: elsif value_expected and match = scan(/\//) Chris@0: tokens << [:open, :regexp] Chris@0: kind = :delimiter Chris@0: interpreted = true Chris@0: state = patterns::StringState.new :regexp, interpreted, match Chris@0: Chris@0: # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o) Chris@0: elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o) Chris@0: kind = self[1] ? :float : :integer Chris@0: Chris@0: elsif match = scan(/#{patterns::SYMBOL}/o) Chris@0: case delim = match[1] Chris@0: when ?', ?" Chris@0: tokens << [:open, :symbol] Chris@0: tokens << [':', :symbol] Chris@0: match = delim.chr Chris@0: kind = :delimiter Chris@0: state = patterns::StringState.new :symbol, delim == ?", match Chris@0: else Chris@0: kind = :symbol Chris@0: end Chris@0: Chris@0: elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /x) Chris@0: value_expected = :set Chris@0: kind = :operator Chris@0: Chris@0: elsif value_expected and match = scan(/#{patterns::HEREDOC_OPEN}/o) Chris@0: indented = self[1] == '-' Chris@0: quote = self[3] Chris@0: delim = self[quote ? 4 : 2] Chris@0: kind = patterns::QUOTE_TO_TYPE[quote] Chris@0: tokens << [:open, kind] Chris@0: tokens << [match, :delimiter] Chris@0: match = :close Chris@0: heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart ) Chris@0: heredocs ||= [] # create heredocs if empty Chris@0: heredocs << heredoc Chris@0: Chris@0: elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o) Chris@0: kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do Chris@0: raise_inspect 'Unknown fancy string: %%%p' % k, tokens Chris@0: end Chris@0: tokens << [:open, kind] Chris@0: state = patterns::StringState.new kind, interpreted, self[2] Chris@0: kind = :delimiter Chris@0: Chris@0: elsif value_expected and match = scan(/#{patterns::CHARACTER}/o) Chris@0: kind = :integer Chris@0: Chris@0: elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x) Chris@0: value_expected = :set Chris@0: kind = :operator Chris@0: Chris@0: elsif match = scan(/`/) Chris@0: if last_token_dot Chris@0: kind = :operator Chris@0: else Chris@0: tokens << [:open, :shell] Chris@0: kind = :delimiter Chris@0: state = patterns::StringState.new :shell, true, match Chris@0: end Chris@0: Chris@0: elsif match = scan(/#{patterns::GLOBAL_VARIABLE}/o) Chris@0: kind = :global_variable Chris@0: Chris@0: elsif match = scan(/#{patterns::CLASS_VARIABLE}/o) Chris@0: kind = :class_variable Chris@0: Chris@0: else Chris@0: if !unicode Chris@0: # check for unicode Chris@0: debug, $DEBUG = $DEBUG, false Chris@0: begin Chris@0: if check(/./mu).size > 1 Chris@0: # seems like we should try again with unicode Chris@0: unicode = true Chris@0: end Chris@0: rescue Chris@0: # bad unicode char; use getch Chris@0: ensure Chris@0: $DEBUG = debug Chris@0: end Chris@0: next if unicode Chris@0: end Chris@0: kind = :error Chris@0: match = getch Chris@0: Chris@0: end Chris@0: Chris@0: elsif state == :def_expected Chris@0: state = :initial Chris@0: if scan(/self\./) Chris@0: tokens << ['self', :pre_constant] Chris@0: tokens << ['.', :operator] Chris@0: end Chris@0: if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo : Chris@0: /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o) Chris@0: kind = :method Chris@0: else Chris@0: next Chris@0: end Chris@0: Chris@0: elsif state == :module_expected Chris@0: if match = scan(/< 1 Chris@0: state = this_block.first Chris@0: tokens << [:close, state.type] Chris@0: end Chris@0: Chris@0: tokens Chris@0: end Chris@0: Chris@0: end Chris@0: Chris@0: end Chris@0: end Chris@0: Chris@0: # vim:fdm=marker