module CodeRay
module Scanners
  
  # This scanner is really complex, since Ruby _is_ a complex language!
  #
  # It tries to highlight 100% of all common code,
  # and 90% of strange code.
  #
  # It is optimized for HTML highlighting, and is not very useful for
  # parsing or pretty printing.
  #
  # The scanner is a hand-written state machine. While scanning code, the
  # state is a Symbol (:initial, :def_expected, :dot_expected,
  # :module_expected, :undef_expected, :undef_comma_expected,
  # :alias_expected); while scanning inside a string-like literal (string,
  # symbol, regexp, shell, heredoc), the state is a StringState instance.
  class Ruby < Scanner
    
    register_for :ruby
    file_extension 'rb'
    
    autoload :Patterns,    'coderay/scanners/ruby/patterns'
    autoload :StringState, 'coderay/scanners/ruby/string_state'
    
    # Returns a fresh StringState for an interpreted, double-quoted string.
    def interpreted_string_state
      StringState.new :string, true, '"'
    end
    
  protected
    
    # Resets the state kept between scans to the initial code state.
    def setup
      @state = :initial
    end
    
    # Scans the input and sends the recognized tokens to +encoder+.
    #
    # encoder:: receives text_token / begin_group / end_group calls.
    # options:: reads options[:state] (a state, or a [state, heredocs] pair,
    #           to resume from; defaults to @state) and options[:keep_state]
    #           (when set, the final state is stored back into @state).
    #
    # Returns +encoder+.
    def scan_tokens encoder, options
      state, heredocs = options[:state] || @state
      # work on a copy, since the list of pending heredocs is shifted below
      heredocs = heredocs.dup if heredocs.is_a?(Array)
      
      # resuming inside a string: reopen its token group
      if state && state.instance_of?(StringState)
        encoder.begin_group state.type
      end
      
      last_state = nil
      
      method_call_expected = false
      value_expected = true
      
      inline_block_stack = nil
      inline_block_curly_depth = 0
      
      # resuming with pending heredocs: continue with the first one
      if heredocs
        state = heredocs.shift
        encoder.begin_group state.type
        heredocs = nil if heredocs.empty?
      end
      
      # def_object_stack = nil
      # def_object_paren_depth = 0
      
      patterns = Patterns  # avoid constant lookup
      
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      
      until eos?
        
        if state.instance_of? ::Symbol
          
          if match = scan(/[ \t\f\v]+/)
            encoder.text_token match, :space
            
          elsif match = scan(/\n/)
            if heredocs
              unscan  # heredoc scanning needs \n at start
              state = heredocs.shift
              encoder.begin_group state.type
              heredocs = nil if heredocs.empty?
            else
              state = :initial if state == :undef_comma_expected
              encoder.text_token match, :space
              value_expected = true
            end
            
          # self[1] is only set when the comment starts with "#!"
          elsif match = scan(bol? ? / \#(!)?.* | #{patterns::RUBYDOC_OR_DATA} /ox : /\#.*/)
            encoder.text_token match, self[1] ? :doctype : :comment
            
          elsif match = scan(/\\\n/)
            if heredocs
              unscan  # heredoc scanning needs \n at start
              encoder.text_token scan(/\\/), :space
              state = heredocs.shift
              encoder.begin_group state.type
              heredocs = nil if heredocs.empty?
            else
              encoder.text_token match, :space
            end
            
          elsif state == :initial
            
            # IDENTS #
            if !method_call_expected &&
               match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
                                      /#{patterns::METHOD_NAME}/o)
              value_expected = false
              kind = patterns::IDENT_KIND[match]
              if kind == :ident
                # capitalized, no trailing !/? and no following "(": a constant
                if match[/\A[A-Z]/] && !(match[/[!?]$/] || match?(/\(/))
                  kind = :constant
                end
              elsif kind == :keyword
                state = patterns::KEYWORD_NEW_STATE[match]
                value_expected = true if patterns::KEYWORDS_EXPECTING_VALUE[match]
              end
              value_expected = true if !value_expected && check(/#{patterns::VALUE_FOLLOWS}/o)
              encoder.text_token match, kind
              
            elsif method_call_expected &&
               match = scan(unicode ? /#{patterns::METHOD_AFTER_DOT}/uo :
                                      /#{patterns::METHOD_AFTER_DOT}/o)
              if method_call_expected == '::' && match[/\A[A-Z]/] && !match?(/\(/)
                encoder.text_token match, :constant
              else
                encoder.text_token match, :ident
              end
              method_call_expected = false
              value_expected = check(/#{patterns::VALUE_FOLLOWS}/o)
              
            # OPERATORS #
            # group 1: "." or "::" (a method call follows);
            # group 2: empty match after operators that expect a value
            elsif !method_call_expected && match = scan(/ (\.(?!\.)|::) | (?: \.\.\.? | ==?=? | [,\(\[\{] )() | [\)\]\}] /x)
              method_call_expected = self[1]
              value_expected = !method_call_expected && self[2]
              if inline_block_stack
                case match
                when '{'
                  inline_block_curly_depth += 1
                when '}'
                  inline_block_curly_depth -= 1
                  if inline_block_curly_depth == 0  # closing brace of inline block reached
                    state, inline_block_curly_depth, heredocs = inline_block_stack.pop
                    inline_block_stack = nil if inline_block_stack.empty?
                    heredocs = nil if heredocs && heredocs.empty?
                    encoder.text_token match, :inline_delimiter
                    encoder.end_group :inline
                    next
                  end
                end
              end
              encoder.text_token match, :operator
              
            elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                         /#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                # quoted symbol: continue in string mode
                encoder.begin_group :symbol
                encoder.text_token ':', :symbol
                match = delim.chr
                encoder.text_token match, :delimiter
                state = self.class::StringState.new :symbol, delim == ?", match
              else
                encoder.text_token match, :symbol
                value_expected = false
              end
              
            elsif match = scan(/ ' (?:(?>[^'\\]*) ')? | " (?:(?>[^"\\\#]*) ")? /mx)
              encoder.begin_group :string
              if match.size == 1
                # only the opening quote matched: the string needs full scanning
                encoder.text_token match, :delimiter
                state = self.class::StringState.new :string, match == '"', match  # important for streaming
              else
                # simple string without escapes or interpolation, matched in one go
                encoder.text_token match[0,1], :delimiter
                encoder.text_token match[1..-2], :content if match.size > 2
                encoder.text_token match[-1,1], :delimiter
                encoder.end_group :string
                value_expected = false
              end
              
            elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo :
                                         /#{patterns::INSTANCE_VARIABLE}/o)
              value_expected = false
              encoder.text_token match, :instance_variable
              
            elsif value_expected && match = scan(/\//)
              encoder.begin_group :regexp
              encoder.text_token match, :delimiter
              state = self.class::StringState.new :regexp, true, '/'
              
            elsif match = scan(value_expected ? /[-+]?#{patterns::NUMERIC}/o : /#{patterns::NUMERIC}/o)
              if method_call_expected
                encoder.text_token match, :error
                method_call_expected = false
              else
                encoder.text_token match, self[1] ? :float : :integer  # TODO: send :hex/:octal/:binary
              end
              value_expected = false
              
            elsif match = scan(/ [-+!~^\/]=? | [:;] | [*|&]{1,2}=? | >>? /x)
              value_expected = true
              encoder.text_token match, :operator
              
            elsif value_expected && match = scan(/#{patterns::HEREDOC_OPEN}/o)
              quote = self[3]
              delim = self[quote ? 4 : 2]
              kind = patterns::QUOTE_TO_TYPE[quote]
              encoder.begin_group kind
              encoder.text_token match, :delimiter
              encoder.end_group kind
              # the heredoc body is scanned once the next newline is reached
              heredocs ||= []  # create heredocs if empty
              heredocs << self.class::StringState.new(kind, quote != "'", delim,
                self[1] == '-' ? :indented : :linestart)
              value_expected = false
              
            elsif value_expected && match = scan(/#{patterns::FANCY_STRING_START}/o)
              kind = patterns::FANCY_STRING_KIND[self[1]]
              encoder.begin_group kind
              state = self.class::StringState.new kind, patterns::FANCY_STRING_INTERPRETED[self[1]], self[2]
              encoder.text_token match, :delimiter
              
            elsif value_expected && match = scan(/#{patterns::CHARACTER}/o)
              value_expected = false
              encoder.text_token match, :integer
              
            elsif match = scan(/ %=? | <(?:<|=>?)? | \? /x)
              value_expected = true
              encoder.text_token match, :operator
              
            elsif match = scan(/`/)
              encoder.begin_group :shell
              encoder.text_token match, :delimiter
              state = self.class::StringState.new :shell, true, match
              
            elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo :
                                         /#{patterns::GLOBAL_VARIABLE}/o)
              encoder.text_token match, :global_variable
              value_expected = false
              
            elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo :
                                         /#{patterns::CLASS_VARIABLE}/o)
              encoder.text_token match, :class_variable
              value_expected = false
              
            elsif match = scan(/\\\z/)
              encoder.text_token match, :space
              
            else
              # nothing matched: retry without expecting a method name first
              if method_call_expected
                method_call_expected = false
                next
              end
              unless unicode
                # check for unicode
                $DEBUG_BEFORE, $DEBUG = $DEBUG, false
                begin
                  if check(/./mu).size > 1
                    # seems like we should try again with unicode
                    unicode = true
                  end
                rescue
                  # bad unicode char; use getch
                ensure
                  $DEBUG = $DEBUG_BEFORE
                end
                next if unicode
              end
              
              encoder.text_token getch, :error
              
            end
            
            # return to the state that was interrupted (set by "#$var" / "#@var"
            # inside strings and by a failed :def_expected match)
            if last_state
              state = last_state
              last_state = nil
            end
            
          elsif state == :def_expected
            if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                      /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
              encoder.text_token match, :method
              state = :initial
            else
              # scan one token in :initial state, then expect "." or "::"
              last_state = :dot_expected
              state = :initial
            end
            
          elsif state == :dot_expected
            if match = scan(/\.|::/)
              # invalid definition
              state = :def_expected
              encoder.text_token match, :operator
            else
              state = :initial
            end
            
          # NOTE(review): the :module_expected body and the head of the
          # :undef_expected branch were restored after a garbled extraction
          # merged the two branches; confirm against the upstream scanner.
          elsif state == :module_expected
            if match = scan(/<</)
              encoder.text_token match, :operator
            else
              state = :initial
              if match = scan(unicode ? / (?:#{patterns::IDENT}::)* #{patterns::IDENT} /oux :
                                        / (?:#{patterns::IDENT}::)* #{patterns::IDENT} /ox)
                encoder.text_token match, :class
              end
            end
            
          elsif state == :undef_expected
            state = :undef_comma_expected
            if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                      /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
              encoder.text_token match, :method
            elsif match = scan(/#{patterns::SYMBOL}/o)
              case delim = match[1]
              when ?', ?"
                encoder.begin_group :symbol
                encoder.text_token ':', :symbol
                match = delim.chr
                encoder.text_token match, :delimiter
                state = self.class::StringState.new :symbol, delim == ?", match
                # after the quoted symbol, go on with the undef list
                state.next_state = :undef_comma_expected
              else
                encoder.text_token match, :symbol
              end
            else
              state = :initial
            end
            
          elsif state == :undef_comma_expected
            if match = scan(/,/)
              encoder.text_token match, :operator
              state = :undef_expected
            else
              state = :initial
            end
            
          elsif state == :alias_expected
            match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
                                   /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)
            
            if match
              encoder.text_token self[1], (self[1][0] == ?: ? :symbol : :method)
              encoder.text_token self[2], :space
              encoder.text_token self[3], (self[3][0] == ?: ? :symbol : :method)
            end
            state = :initial
            
          else
            #:nocov:
            raise_inspect 'Unknown state: %p' % [state], encoder
            #:nocov:
          end
          
        else  # StringState
          
          # plain content up to the next special character (or the end of input)
          match = scan_until(state.pattern) || scan_rest
          unless match.empty?
            encoder.text_token match, :content
            break if eos?
          end
          
          if state.heredoc && self[1]  # end of heredoc
            match = getch
            match << scan_until(/$/) unless eos?
            encoder.text_token match, :delimiter unless match.empty?
            encoder.end_group state.type
            state = state.next_state
            next
          end
          
          case match = getch
          
          when state.delim
            if state.paren_depth
              # nested delimiters: only the outermost one closes the literal
              state.paren_depth -= 1
              if state.paren_depth > 0
                encoder.text_token match, :content
                next
              end
            end
            encoder.text_token match, :delimiter
            if state.type == :regexp && !eos?
              match = scan(/#{patterns::REGEXP_MODIFIERS}/o)
              encoder.text_token match, :modifier unless match.empty?
            end
            encoder.end_group state.type
            value_expected = false
            state = state.next_state
            
          when '\\'
            if state.interpreted
              if esc = scan(/#{patterns::ESCAPE}/o)
                encoder.text_token match + esc, :char
              else
                encoder.text_token match, :error
              end
            else
              case esc = getch
              when nil
                encoder.text_token match, :content
              when state.delim, '\\'
                encoder.text_token match + esc, :char
              else
                encoder.text_token match + esc, :content
              end
            end
            
          when '#'
            case peek(1)
            when '{'
              # "#{": save the string state and scan code until the matching "}"
              inline_block_stack ||= []
              inline_block_stack << [state, inline_block_curly_depth, heredocs]
              value_expected = true
              state = :initial
              inline_block_curly_depth = 1
              encoder.begin_group :inline
              encoder.text_token match + getch, :inline_delimiter
            when '$', '@'
              # "#$var" / "#@var": scan one token as code, then come back
              encoder.text_token match, :escape
              last_state = state
              state = :initial
            else
              #:nocov:
              raise_inspect 'else-case # reached; #%p not handled' % [peek(1)], encoder
              #:nocov:
            end
            
          when state.opening_paren
            state.paren_depth += 1
            encoder.text_token match, :content
            
          else
            #:nocov:
            raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], encoder
            #:nocov:
            
          end
          
        end
        
      end
      
      # cleaning up
      if state.is_a? StringState
        encoder.end_group state.type
      end
      
      if options[:keep_state]
        if state.is_a?(StringState) && state.heredoc
          # an unfinished heredoc is put back to be resumed by the next scan
          (heredocs ||= []).unshift state
          state = :initial
        elsif heredocs && heredocs.empty?
          heredocs = nil
        end
        @state = state, heredocs
      end
      
      # close all inline blocks (and their enclosing strings) left open
      if inline_block_stack
        until inline_block_stack.empty?
          state, = *inline_block_stack.pop
          encoder.end_group :inline
          encoder.end_group state.type
        end
      end
      
      encoder
    end
    
  end
  
end
end