# encoding: utf-8
module CodeRay
module Scanners

  # Word lists, regular expressions and the StringState helper that live in
  # the Ruby scanner's namespace.
  module Ruby::Patterns  # :nodoc:

    RESERVED_WORDS = %w[
      and def end in or unless begin
      defined? ensure module redo super until
      BEGIN break do next rescue then
      when END case else for retry
      while alias class elsif if not return
      undef yield
    ]

    # Keywords that are mapped to a dedicated follow-up state below.
    DEF_KEYWORDS = %w[ def ]
    UNDEF_KEYWORDS = %w[ undef ]
    ALIAS_KEYWORDS = %w[ alias ]
    MODULE_KEYWORDS = %w[ class module ]

    # Maps a keyword to the state symbol registered for it.
    # NOTE(review): :initial is presumably the value returned for words that
    # are not listed; confirm against WordList.
    DEF_NEW_STATE = WordList.new(:initial).
      add(DEF_KEYWORDS, :def_expected).
      add(UNDEF_KEYWORDS, :undef_expected).
      add(ALIAS_KEYWORDS, :alias_expected).
      add(MODULE_KEYWORDS, :module_expected)

    PREDEFINED_CONSTANTS = %w[
      nil true false self
      DATA ARGV ARGF
      __FILE__ __LINE__ __ENCODING__
    ]

    # Classifies a word as :reserved or :pre_constant.
    # NOTE(review): :ident is presumably the fallback kind; confirm against WordList.
    IDENT_KIND = WordList.new(:ident).
      add(RESERVED_WORDS, :reserved).
      add(PREDEFINED_CONSTANTS, :pre_constant)

    # The identifier pattern depends on what the running regexp engine
    # supports, so it is selected once at load time.
    if /\w/u === '∑'
      # MRI 1.8.6, 1.8.7
      IDENT = /[^\W\d]\w*/
    else
      if //.respond_to? :encoding
        # MRI 1.9.1, 1.9.2
        IDENT = Regexp.new '[\p{L}\p{M}\p{Pc}\p{Sm}&&[^\x00-\x40\x5b-\x5e\x60\x7b-\x7f]][\p{L}\p{M}\p{N}\p{Pc}\p{Sm}&&[^\x00-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f]]*'
      else
        # JRuby, Rubinius
        IDENT = /[^\x00-\x40\x5b-\x5e\x60\x7b-\x7f][^\x00-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f]*/
      end
    end

    METHOD_NAME = / #{IDENT} [?!]? /ox
    METHOD_NAME_OPERATOR = /
      \*\*?           # multiplication and power
      | [-+~]@?       # plus, minus, tilde with and without at sign
      | [\/%&|^`]     # division, modulo or format strings, and, or, xor, system
      | \[\]=?        # array getter and setter
      | << | >>       # append or shift left, shift right
      | <=?>? | >=?   # comparison, rocket operator
      | ===? | =~     # simple equality, case equality, match
      | ![~=@]?       # negation with and without at sign, not-equal and not-match
    /ox
    METHOD_NAME_EX = / #{IDENT} (?:[?!]|=(?!>))? | #{METHOD_NAME_OPERATOR} /ox
    INSTANCE_VARIABLE = / @ #{IDENT} /ox
    CLASS_VARIABLE = / @@ #{IDENT} /ox
    OBJECT_VARIABLE = / @@? #{IDENT} /ox
    GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9]\d* | 0\w* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox
    PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} | #{OBJECT_VARIABLE} /ox
    VARIABLE = / @?@? #{IDENT} | #{GLOBAL_VARIABLE} /ox

    # Opening quote character => string type; any other quote yields :string.
    QUOTE_TO_TYPE = {
      '`' => :shell,
      '/'=> :regexp,
    }
    QUOTE_TO_TYPE.default = :string

    REGEXP_MODIFIERS = /[mixounse]*/
    REGEXP_SYMBOLS = /[|?*+(){}\[\].^$]/

    DECIMAL = /\d+(?:_\d+)*/
    OCTAL = /0_?[0-7]+(?:_[0-7]+)*/
    HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/
    BINARY = /0b[01]+(?:_[01]+)*/

    EXPONENT = / [eE] [+-]? #{DECIMAL} /ox
    FLOAT_SUFFIX = / #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? /ox
    # The empty group () only participates in the match when a float suffix
    # was consumed.
    FLOAT_OR_INT = / #{DECIMAL} (?: #{FLOAT_SUFFIX} () )? /ox
    NUMERIC = / (?: (?=0) (?: #{OCTAL} | #{HEXADECIMAL} | #{BINARY} ) | #{FLOAT_OR_INT} ) /ox

    SYMBOL = /
      :
      (?:
        #{METHOD_NAME_EX}
      | #{PREFIX_VARIABLE}
      | ['"]
      )
    /ox
    METHOD_NAME_OR_SYMBOL = / #{METHOD_NAME_EX} | #{SYMBOL} /ox

    SIMPLE_ESCAPE = /
        [abefnrstv]
      | [0-7]{1,3}
      | x[0-9A-Fa-f]{1,2}
      | .?
    /mx

    CONTROL_META_ESCAPE = /
      (?: M-|C-|c )
      (?: \\ (?: M-|C-|c ) )*
      (?: [^\\] | \\ #{SIMPLE_ESCAPE} )?
    /mox

    ESCAPE = /
      #{CONTROL_META_ESCAPE} | #{SIMPLE_ESCAPE}
    /mox

    CHARACTER = /
      \?
      (?:
        [^\s\\]
      | \\ #{ESCAPE}
      )
    /mox

    # NOTE: This is not completely correct, but
    # nobody needs heredoc delimiters ending with \n.
    # Also, delimiters starting with numbers are allowed,
    # but they are more often than not a false positive.
    HEREDOC_OPEN = /
      << (-)?              # $1 = float
      (?:
        ( #{IDENT} )       # $2 = delim
      |
        ( ["'`\/] )        # $3 = quote, type
        ( [^\n]*? ) \3     # $4 = delim
      )
    /mx

    RUBYDOC = /
      =begin (?!\S)
      .*?
      (?: \Z | ^=end (?!\S) [^\n]* )
    /mx

    DATA = /
      __END__$
      .*?
      (?: \Z | (?=^\#CODE) )
    /mx

    # Checks for a valid value to follow. This enables
    # value_expected in method calls without parentheses.
    VALUE_FOLLOWS = /
      (?>[ \t\f\v]+)
      (?:
        [%\/][^\s=]
      | <<-?\S
      | [-+] \d
      | #{CHARACTER}
      )
    /x
    KEYWORDS_EXPECTING_VALUE = WordList.new.add(%w[
      and end in or unless begin
      defined? ensure redo super until
      break do next rescue then
      when case else for retry
      while elsif if not return
      yield
    ])

    RUBYDOC_OR_DATA = / #{RUBYDOC} | #{DATA} /xo

    RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x

    FANCY_START_CORRECT = / % ( [qQwWxsr] | (?![a-zA-Z0-9]) ) ([^a-zA-Z0-9]) /mx

    # %-literal type letter => [string type, interpreted?].
    # 'w' shares the entry of 'q'; 'W' and the empty letter share that of 'Q'.
    FancyStringType = {
      'q' => [:string, false],
      'Q' => [:string, true],
      'r' => [:regexp, true],
      's' => [:symbol, false],
      'x' => [:shell, true]
    }
    FancyStringType['w'] = FancyStringType['q']
    FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q']

    # State record for a string-like literal. Besides the values passed to
    # the constructor it carries the paren/paren_depth pair used for
    # bracket-style delimiters and the lookahead pattern taken from
    # STRING_PATTERN or HEREDOC_PATTERN.
    class StringState < Struct.new :type, :interpreted, :delim, :heredoc,
      :paren, :paren_depth, :pattern, :next_state

      # Opening bracket => matching closing bracket.
      CLOSING_PAREN = Hash[ *%w[
        ( )
        [ ]
        < >
        { }
      ] ]

      CLOSING_PAREN.each { |k,v| k.freeze; v.freeze }  # debug, if I try to change it with <<
      OPENING_PAREN = CLOSING_PAREN.invert

      # Cache of lookahead patterns keyed by [delim, interpreted]; the block
      # builds a pattern on first access and stores it under that key.
      STRING_PATTERN = Hash.new do |h, k|
        delim, interpreted = *k
        delim_pattern = Regexp.escape(delim.dup)  # dup: workaround for old Ruby
        if closing_paren = CLOSING_PAREN[delim]
          delim_pattern = delim_pattern[0..-1] if defined? JRUBY_VERSION  # JRuby fix
          delim_pattern << Regexp.escape(closing_paren)
        end
        # A backslash always belongs to the character class, unless it is
        # already the delimiter itself.
        delim_pattern << '\\\\' unless delim == '\\'

        # Extra alternatives for some modes; nil (interpolated as an empty
        # string) for every other value of interpreted.
        special_escapes =
          case interpreted
          when :regexp_symbols
            '| ' + REGEXP_SYMBOLS.source
          when :words
            '| \s'
          end

        h[k] =
          if interpreted and not delim == '#'
            / (?= [#{delim_pattern}] | \# [{$@] #{special_escapes} ) /mx
          else
            / (?= [#{delim_pattern}] #{special_escapes} ) /mx
          end
      end

      # Cache of lookahead patterns keyed by [delim, interpreted, indented];
      # built on first access like STRING_PATTERN.
      HEREDOC_PATTERN = Hash.new do |h, k|
        delim, interpreted, indented = *k
        delim_pattern = Regexp.escape(delim.dup)  # dup: workaround for old Ruby
        # The terminator sits on a line of its own; leading blanks are only
        # accepted for the indented form.
        delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x
        h[k] =
          if interpreted
            / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx  # $1 set == end of heredoc
          else
            / (?= #{delim_pattern}() | \\ ) /mx
          end
      end

      # Builds the state for a literal of the given kind.
      #
      # kind        - stored in the :type member.
      # interpreted - stored as is and used as part of the pattern cache key.
      # delim       - delimiter string of the literal.
      # heredoc     - false for ordinary literals; a truthy value selects the
      #               heredoc pattern, and :indented selects its indented form.
      #
      # For heredocs the stored delim is nil. For a bracket delimiter the
      # stored delim is the closing bracket, paren is the opening one and
      # paren_depth starts at 1. next_state is always :initial.
      def initialize kind, interpreted, delim, heredoc = false
        if heredoc
          pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ]
          delim = nil
        else
          pattern = STRING_PATTERN[ [delim, interpreted] ]
          if paren = CLOSING_PAREN[delim]
            delim, paren = paren, delim
            paren_depth = 1
          end
        end
        super kind, interpreted, delim, heredoc, paren, paren_depth, pattern, :initial
      end
    end unless defined? StringState

  end

end
end