# encoding: utf-8
module CodeRay
module Scanners

  # Word lists and regular expressions describing Ruby's lexical elements,
  # plus StringState, which holds what is known about a string, regexp,
  # shell command or heredoc literal that is currently open.
  #
  # Most patterns are built by interpolating smaller patterns. They use the
  # extended (+x+) flag, so whitespace inside the literals is layout only,
  # and the +o+ flag, so interpolation happens once.
  module Ruby::Patterns  # :nodoc:

    # Words that IDENT_KIND classifies as :reserved.
    RESERVED_WORDS = %w[
      and def end in or unless begin
      defined? ensure module redo super until
      BEGIN break do next rescue then
      when END case else for retry
      while alias class elsif if not return
      undef yield
    ]

    DEF_KEYWORDS = %w[ def ]
    UNDEF_KEYWORDS = %w[ undef ]
    ALIAS_KEYWORDS = %w[ alias ]
    MODULE_KEYWORDS = %w[class module]

    # Maps the keywords above to :def_expected, :undef_expected,
    # :alias_expected and :module_expected.
    # NOTE(review): :initial is presumably the value WordList returns for
    # any other word - confirm against WordList.
    DEF_NEW_STATE = WordList.new(:initial).
      add(DEF_KEYWORDS, :def_expected).
      add(UNDEF_KEYWORDS, :undef_expected).
      add(ALIAS_KEYWORDS, :alias_expected).
      add(MODULE_KEYWORDS, :module_expected)

    # Words that IDENT_KIND classifies as :pre_constant.
    PREDEFINED_CONSTANTS = %w[
      nil true false self
      DATA ARGV ARGF __FILE__ __LINE__
    ]

    # Classifies a word as :reserved or :pre_constant.
    # NOTE(review): :ident is presumably the fallback for all other words -
    # confirm against WordList.
    IDENT_KIND = WordList.new(:ident).
      add(RESERVED_WORDS, :reserved).
      add(PREDEFINED_CONSTANTS, :pre_constant)

    # Identifier pattern. The POSIX bracket form is chosen only when the
    # running regexp engine matches 'ä' with [[:alpha:]]; otherwise the
    # \w-based form is used.
    IDENT = 'ä'[/[[:alpha:]]/] == 'ä' ? /[[:alpha:]_][[:alnum:]_]*/ : /[^\W\d]\w*/

    # An identifier with an optional trailing ? or !.
    METHOD_NAME = / #{IDENT} [?!]? /ox

    # Operators that can be used as method names.
    METHOD_NAME_OPERATOR = /
      \*\*?           # multiplication and power
      | [-+~]@?       # plus, minus, tilde with and without at sign
      | [\/%&|^`]     # division, modulo or format strings, and, or, xor, system
      | \[\]=?        # array getter and setter
      | << | >>       # append or shift left, shift right
      | <=?>? | >=?   # comparison, rocket operator
      | ===? | =~     # simple equality, case equality, match
      | ![~=@]?       # negation with and without at sign, not-equal and not-match
    /ox

    # Like METHOD_NAME, but also accepts a setter suffix (= not followed
    # by >) and the operator method names.
    METHOD_NAME_EX = / #{IDENT} (?:[?!]|=(?!>))? | #{METHOD_NAME_OPERATOR} /ox
    INSTANCE_VARIABLE = / @ #{IDENT} /ox
    CLASS_VARIABLE = / @@ #{IDENT} /ox
    # Instance or class variable (one or two @ signs).
    OBJECT_VARIABLE = / @@? #{IDENT} /ox
    # $ followed by an identifier, digits, one punctuation character,
    # or a dash plus one word character.
    GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9]\d* | 0\w* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox
    PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} | #{OBJECT_VARIABLE} /ox
    VARIABLE = / @?@? #{IDENT} | #{GLOBAL_VARIABLE} /ox

    # Token type by opening quote character; every quote not listed
    # here yields :string (see the default set below).
    QUOTE_TO_TYPE = {
      '`' => :shell,
      '/'=> :regexp,
    }
    QUOTE_TO_TYPE.default = :string

    REGEXP_MODIFIERS = /[mixounse]*/
    REGEXP_SYMBOLS = /[|?*+(){}\[\].^$]/

    # Integer literals; underscores are accepted only between digits.
    DECIMAL = /\d+(?:_\d+)*/
    OCTAL = /0_?[0-7]+(?:_[0-7]+)*/
    HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/
    BINARY = /0b[01]+(?:_[01]+)*/

    EXPONENT = / [eE] [+-]? #{DECIMAL} /ox
    FLOAT_SUFFIX = / #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? /ox
    # The empty group () participates in the match only when a float suffix
    # was consumed.
    # NOTE(review): presumably callers test it to tell floats from
    # integers - confirm in the scanner.
    FLOAT_OR_INT = / #{DECIMAL} (?: #{FLOAT_SUFFIX} () )? /ox
    # The (?=0) lookahead restricts the prefixed forms to literals that
    # start with 0; everything else falls through to FLOAT_OR_INT.
    NUMERIC = / (?: (?=0) (?: #{OCTAL} | #{HEXADECIMAL} | #{BINARY} ) | #{FLOAT_OR_INT} ) /ox

    # A colon followed by a method name, a prefixed variable, or an
    # opening quote character.
    SYMBOL = /
      :
      (?:
        #{METHOD_NAME_EX}
      | #{PREFIX_VARIABLE}
      | ['"]
      )
    /ox
    METHOD_NAME_OR_SYMBOL = / #{METHOD_NAME_EX} | #{SYMBOL} /ox

    # What may follow a backslash: a letter escape, up to three octal
    # digits, x plus up to two hex digits, or any single character
    # (possibly none).
    SIMPLE_ESCAPE = /
        [abefnrstv]
      | [0-7]{1,3}
      | x[0-9A-Fa-f]{1,2}
      | .?
    /mx

    # Control and meta escapes (M-, C-, c), which may be chained with
    # backslashes and end in a plain character or a simple escape.
    CONTROL_META_ESCAPE = /
      (?: M-|C-|c )
      (?: \\ (?: M-|C-|c ) )*
      (?: [^\\] | \\ #{SIMPLE_ESCAPE} )?
    /mox

    ESCAPE = /
      #{CONTROL_META_ESCAPE} | #{SIMPLE_ESCAPE}
    /mox

    # Character literal: ? followed by one non-space, non-backslash
    # character or by a backslash escape.
    CHARACTER = /
      \?
      (?:
        [^\s\\]
      | \\ #{ESCAPE}
      )
    /mox

    # Start of a heredoc: << with an optional dash, then either a bare
    # delimiter or a quoted one.
    # NOTE(review): "float" for $1 presumably means the terminator may be
    # indented (compare the indented flag of HEREDOC_PATTERN) - confirm
    # in the scanner.
    #
    # NOTE: This is not completely correct, but
    # nobody needs heredoc delimiters ending with \n.
    HEREDOC_OPEN = /
      << (-)?              # $1 = float
      (?:
        ( [A-Za-z_0-9]+ )  # $2 = delim
      |
        ( ["'`\/] )        # $3 = quote, type
        ( [^\n]*? ) \3     # $4 = delim
      )
    /mx

    # An =begin block, up to and including its =end line or the end of
    # the input.
    RUBYDOC = /
      =begin (?!\S)
      .*?
      (?: \Z | ^=end (?!\S) [^\n]* )
    /mx

    # Everything from __END__ up to the end of the input or a line
    # starting with #CODE.
    DATA = /
      __END__$
      .*?
      (?: \Z | (?=^\#CODE) )
    /mx

    # Checks for a valid value to follow. This enables
    # value_expected in method calls without parentheses.
    VALUE_FOLLOWS = /
      (?>[ \t\f\v]+)
      (?:
        [%\/][^\s=]
      | <<-?\S
      | [-+] \d
      | #{CHARACTER}
      )
    /x
    KEYWORDS_EXPECTING_VALUE = WordList.new.add(%w[
      and end in or unless begin
      defined? ensure redo super until
      break do next rescue then
      when case else for retry
      while elsif if not return
      yield
    ])

    RUBYDOC_OR_DATA = / #{RUBYDOC} | #{DATA} /xo

    RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x

    # Start of a %-literal: $1 is the type letter (empty when the
    # delimiter follows the % directly), $2 the non-alphanumeric delimiter.
    FANCY_START_CORRECT = / % ( [qQwWxsr] | (?![a-zA-Z0-9]) ) ([^a-zA-Z0-9]) /mx

    # %-literal type letter => [token kind, second flag].
    # NOTE(review): the second flag presumably says whether the literal is
    # interpolated - confirm in the scanner.
    FancyStringType = {
      'q' => [:string, false],
      'Q' => [:string, true],
      'r' => [:regexp, true],
      's' => [:symbol, false],
      'x' => [:shell, true]
    }
    # %w shares the entry for %q; %W and the bare % form share the entry
    # for %Q.
    FancyStringType['w'] = FancyStringType['q']
    FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q']

    # State of a string-like literal that is currently open. The Struct
    # members are filled in by #initialize; next_state starts as :initial.
    class StringState < Struct.new :type, :interpreted, :delim, :heredoc,
      :paren, :paren_depth, :pattern, :next_state

      # Opening bracket => matching closing bracket.
      CLOSING_PAREN = Hash[ *%w[
        ( )
        [ ]
        < >
        { }
      ] ]

      CLOSING_PAREN.each { |k,v| k.freeze; v.freeze }  # debug, if I try to change it with <<
      # Closing bracket => matching opening bracket.
      OPENING_PAREN = CLOSING_PAREN.invert

      # Lazily built cache of lookahead patterns, keyed by
      # [delim, interpreted]. Each pattern matches (without consuming) at
      # the next delimiter, matching closing bracket or backslash; for
      # interpreted literals also at # followed by {, $ or @.
      # :regexp_symbols and :words add further stop characters.
      STRING_PATTERN = Hash.new do |h, k|
        delim, interpreted = *k
        delim_pattern = Regexp.escape(delim.dup)  # dup: workaround for old Ruby
        if closing_paren = CLOSING_PAREN[delim]
          delim_pattern = delim_pattern[0..-1] if defined? JRUBY_VERSION  # JRuby fix
          delim_pattern << Regexp.escape(closing_paren)
        end
        # Always stop at a backslash; when the delimiter is a backslash,
        # Regexp.escape has already added it.
        delim_pattern << '\\\\' unless delim == '\\'

        # nil (interpolated as nothing) for every other value of interpreted.
        special_escapes =
          case interpreted
          when :regexp_symbols
            '| ' + REGEXP_SYMBOLS.source
          when :words
            '| \s'
          end

        h[k] =
          if interpreted and not delim == '#'
            / (?= [#{delim_pattern}] | \# [{$@] #{special_escapes} ) /mx
          else
            / (?= [#{delim_pattern}] #{special_escapes} ) /mx
          end
      end

      # Lazily built cache of lookahead patterns for heredoc bodies, keyed
      # by [delim, interpreted, indented]. The terminator is a newline,
      # optional leading blanks (only when indented), then the delimiter
      # at the end of a line.
      HEREDOC_PATTERN = Hash.new do |h, k|
        delim, interpreted, indented = *k
        delim_pattern = Regexp.escape(delim.dup)  # dup: workaround for old Ruby
        delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x
        h[k] =
          if interpreted
            / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx  # $1 set == end of heredoc
          else
            / (?= #{delim_pattern}() | \\ ) /mx
          end
      end

      # kind::        stored as the +type+ member.
      # interpreted:: stored as is, and used to select the pattern.
      # delim::       delimiter string; stored as nil for heredocs.
      # heredoc::     false for ordinary literals; the value :indented
      #               selects the indented heredoc pattern.
      def initialize kind, interpreted, delim, heredoc = false
        if heredoc
          pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ]
          delim = nil
        else
          pattern = STRING_PATTERN[ [delim, interpreted] ]
          if paren = CLOSING_PAREN[delim]
            # Bracket delimiters: delim becomes the closing bracket and
            # paren the opening one; depth starts at 1.
            delim, paren = paren, delim
            paren_depth = 1
          end
        end
        super kind, interpreted, delim, heredoc, paren, paren_depth, pattern, :initial
      end
    end unless defined? StringState  # skip the definition when the class already exists

  end

end
end