module CodeRay
module Scanners

  # Scanner for C.
  #
  # Splits C source text into tokens and reports them to an encoder via
  # +text_token+, +begin_group+ and +end_group+. String literals are emitted
  # as a +:string+ group (delimiter, content, escape characters); everything
  # else is emitted as flat tokens.
  class C < Scanner

    register_for :c
    file_extension 'c'

    KEYWORDS = [
      'asm', 'break', 'case', 'continue', 'default', 'do',
      'else', 'enum', 'for', 'goto', 'if', 'return',
      'sizeof', 'struct', 'switch', 'typedef', 'union', 'while',
      'restrict',  # added in C99
    ]  # :nodoc:

    PREDEFINED_TYPES = [
      'int', 'long', 'short', 'char',
      'signed', 'unsigned', 'float', 'double',
      'bool', 'complex',  # added in C99
    ]  # :nodoc:

    PREDEFINED_CONSTANTS = [
      'EOF', 'NULL',
      'true', 'false',  # added in C99
    ]  # :nodoc:

    DIRECTIVES = [
      'auto', 'extern', 'register', 'static', 'void',
      'const', 'volatile',  # added in C89
      'inline',  # added in C99
    ]  # :nodoc:

    # Maps an identifier to its token kind; anything not listed is :ident.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(PREDEFINED_TYPES, :predefined_type).
      add(DIRECTIVES, :directive).
      add(PREDEFINED_CONSTANTS, :predefined_constant)  # :nodoc:

    # What may follow a backslash inside a char or string literal.
    ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x  # :nodoc:
    UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x  # :nodoc:

  protected

    # Tokenizes the scanner's input and feeds the tokens to +encoder+.
    #
    # encoder:: receives +text_token+, +begin_group+ and +end_group+ calls.
    # options:: accepted for interface compatibility; not read here.
    #
    # Returns +encoder+.
    #
    # The method is a small state machine:
    #
    # :initial::          ordinary code.
    # :string::           inside a double-quoted string literal.
    # :include_expected:: directly after <tt>#include</tt>, waiting for the
    #                     file name.
    #
    # Additional flags tracked while scanning:
    #
    # label_expected::  an identifier immediately followed by a single colon
    #                   is reported as :label while this is set.
    # case_expected::   set after the keywords +case+ / +default+, so the
    #                   colon that ends them re-enables +label_expected+.
    # in_preproc_line:: set from a <tt>#directive</tt> up to the next
    #                   whitespace run containing a newline; labels are not
    #                   recognized meanwhile and +label_expected+ is restored
    #                   afterwards.
    def scan_tokens encoder, options

      state = :initial
      label_expected = true
      case_expected = false
      label_expected_before_preproc_line = nil
      in_preproc_line = false

      until eos?

        case state

        when :initial

          if match = scan(/ \s+ | \\\n /x)
            # A line continuation (backslash + newline) does not end a
            # preprocessor line; real whitespace containing a newline does.
            if in_preproc_line && match != "\\\n" && match.index(?\n)
              in_preproc_line = false
              label_expected = label_expected_before_preproc_line
            end
            encoder.text_token match, :space

          elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx)
            encoder.text_token match, :comment

          elsif match = scan(/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/=? | \.(?!\d) /x)
            # A label may only start after ';', '{' or '}' ...
            label_expected = match =~ /[;\{\}]/
            if case_expected
              # ... or after the ':' that terminates a case/default.
              label_expected = true if match == ':'
              case_expected = false
            end
            encoder.text_token match, :operator

          elsif match = scan(/ [A-Za-z_][A-Za-z_0-9]* /x)
            kind = IDENT_KIND[match]
            if kind == :ident && label_expected && !in_preproc_line && scan(/:(?!:)/)
              kind = :label
              match << matched  # the colon is part of the label token
            else
              label_expected = false
              if kind == :keyword
                case match
                when 'case', 'default'
                  case_expected = true
                end
              end
            end
            encoder.text_token match, kind

          elsif match = scan(/L?"/)
            encoder.begin_group :string
            if match[0] == ?L
              encoder.text_token 'L', :modifier
              match = '"'
            end
            encoder.text_token match, :delimiter
            state = :string

          elsif match = scan(/ \# \s* if \s* 0 /x)
            # Everything up to the matching-looking #elif/#else/#endif line
            # (or the end of input) is reported as a comment.
            match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /xm) unless eos?
            encoder.text_token match, :comment

          elsif match = scan(/#[ \t]*(\w*)/)
            encoder.text_token match, :preprocessor
            in_preproc_line = true
            label_expected_before_preproc_line = label_expected
            state = :include_expected if self[1] == 'include'

          elsif match = scan(/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /ox)
            label_expected = false
            encoder.text_token match, :char

          elsif match = scan(/\$/)
            encoder.text_token match, :ident

          elsif match = scan(/0[xX][0-9A-Fa-f]+/)
            label_expected = false
            encoder.text_token match, :hex

          # The negative lookaheads below also reject a following digit.
          # Without that, the digit run could backtrack and the literal
          # would be split into several tokens (e.g. "12.5" into "1", "2"
          # and ".5").
          elsif match = scan(/0[0-7]+(?![0-9.eEfF])/)
            label_expected = false
            encoder.text_token match, :octal

          elsif match = scan(/\d+(?![\d.eEfF])L?L?/)
            label_expected = false
            encoder.text_token match, :integer

          # Reached for a digit run followed by '.', 'e', 'E', 'f' or 'F',
          # or for a '.' followed by a digit. The first alternative always
          # consumes at least the digit run, so digits never fall through
          # to the :error branch.
          elsif match = scan(/ \d+ (?: \.\d* )? (?: [eE][+-]?\d+ )? [fF]? | \.\d+ (?: [eE][+-]?\d+ )? [fF]? /x)
            label_expected = false
            encoder.text_token match, :float

          else
            encoder.text_token getch, :error

          end

        when :string
          if match = scan(/[^\\\n"]+/)
            encoder.text_token match, :content
          elsif match = scan(/"/)
            encoder.text_token match, :delimiter
            encoder.end_group :string
            state = :initial
            label_expected = false
          elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ | $ /x)
            # Stray backslash or unterminated string at end of line:
            # close the group and report an error token.
            encoder.end_group :string
            encoder.text_token match, :error
            state = :initial
            label_expected = false
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), encoder
          end

        when :include_expected
          if match = scan(/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/)
            encoder.text_token match, :include
            state = :initial

          elsif match = scan(/\s+/)
            encoder.text_token match, :space
            if match.index ?\n
              # The directive line ended without a file name. Leave
              # preprocessor mode here, exactly as the :initial state does
              # when it sees the newline.
              state = :initial
              in_preproc_line = false
              label_expected = label_expected_before_preproc_line
            end

          else
            state = :initial

          end

        else
          raise_inspect 'Unknown state', encoder

        end

      end

      # Input ended inside a string literal: close the open group.
      if state == :string
        encoder.end_group :string
      end

      encoder
    end

  end

end
end