module CodeRay
module Scanners

  # HTML Scanner
  #
  # Splits HTML source into [text, kind] tokens using a small state machine.
  # The states used by scan_tokens are:
  #
  #   :initial                 outside of any tag
  #   :attribute               inside a start tag, expecting an attribute name
  #   :attribute_equal         after an attribute name, expecting "="
  #   :attribute_value         after "=", expecting a value
  #   :attribute_value_string  inside a quoted attribute value
  class HTML < Scanner

    include Streamable
    register_for :html

    KINDS_NOT_LOC = [
      :comment, :doctype, :preprocessor,
      :tag, :attribute_name, :operator,
      :attribute_value, :delimiter, :content,
      :plain, :entity, :error
    ]

    ATTR_NAME = /[\w.:-]+/
    ATTR_VALUE_UNQUOTED = ATTR_NAME
    TAG_END = /\/?>/
    HEX = /[0-9a-fA-F]/
    # Named (&name;), decimal (&#123;) or hexadecimal (&#x1F;) entity.
    ENTITY = /
      &
      (?:
        \w+
      |
        \#
        (?:
          \d+
        |
          x#{HEX}+
        )
      )
      ;
    /ox

    # Maps an opening quote character to the pattern that matches plain
    # content inside a string opened with that quote.
    PLAIN_STRING_CONTENT = {
      "'" => /[^&'>\n]+/,
      '"' => /[^&">\n]+/,
    }

    # Calls the inherited reset, then puts the scanner back into the
    # :initial state. The remembered string-content pattern is cleared as
    # well so that the instance ends up in the same state that setup
    # establishes.
    def reset
      super
      @state = :initial
      @plain_string_content = nil
    end

  private

    # Initializes the state that scan_tokens carries between calls.
    def setup
      @state = :initial
      @plain_string_content = nil
    end

    # Scans the input until eos? and appends the resulting tokens to +tokens+.
    #
    # tokens::  collection that receives the tokens via <<. Regular tokens
    #           are [text, kind] pairs; quoted attribute values are wrapped
    #           in [:open, :string] and [:close, :string] markers.
    # options:: only the :keep_state entry is read here. When it is truthy,
    #           the final state and string-content pattern are stored in
    #           instance variables, from which the next call starts.
    #
    # Returns +tokens+. Calls raise_inspect when an unknown state is
    # encountered, when no text was matched for a token, or (only if
    # $CODERAY_DEBUG is set) when a token ends up without a kind.
    def scan_tokens tokens, options

      state = @state
      plain_string_content = @plain_string_content

      until eos?

        kind = nil
        match = nil

        # Whitespace is tokenized as :space regardless of the current state.
        if scan(/\s+/m)
          kind = :space

        else

          case state

          when :initial
            # NOTE(review): the comment and doctype patterns below were
            # empty (//m) in the reviewed copy. An empty pattern matches
            # the empty string without advancing the scan pointer, so the
            # surrounding loop could never reach eos?. They are restored
            # here to what is believed to be the upstream form - confirm.
            if scan(/<!--.*?-->/m)
              kind = :comment
            elsif scan(/<!DOCTYPE.*?>/m)
              kind = :doctype
            elsif scan(/<\?xml.*?\?>/m)
              kind = :preprocessor
            elsif scan(/<\?.*?\?>|<%.*?%>/m)
              kind = :comment
            elsif scan(/<\/[-\w.:]*>/m)
              kind = :tag
            elsif match = scan(/<[-\w.:]+>?/m)
              kind = :tag
              # A start tag that was not closed right away may carry
              # attributes.
              state = :attribute unless match[-1] == ?>
            elsif scan(/[^<>&]+/)
              kind = :plain
            elsif scan(ENTITY)
              kind = :entity
            elsif scan(/[<>&]/)
              kind = :error
            else
              raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
            end

          when :attribute
            if scan(TAG_END)
              kind = :tag
              state = :initial
            elsif scan(ATTR_NAME)
              kind = :attribute_name
              state = :attribute_equal
            else
              # Consume one character so that the loop always advances.
              kind = :error
              getch
            end

          when :attribute_equal
            if scan(/=/)
              kind = :operator
              state = :attribute_value
            elsif scan(ATTR_NAME)
              # Another name without "=" in between: the previous
              # attribute had no value.
              kind = :attribute_name
            elsif scan(TAG_END)
              kind = :tag
              state = :initial
            elsif scan(/./)
              kind = :error
              state = :attribute
            end

          when :attribute_value
            if scan(ATTR_VALUE_UNQUOTED)
              kind = :attribute_value
              state = :attribute
            elsif match = scan(/["']/)
              tokens << [:open, :string]
              state = :attribute_value_string
              # Remember which pattern matches content for this quote.
              plain_string_content = PLAIN_STRING_CONTENT[match]
              kind = :delimiter
            elsif scan(TAG_END)
              kind = :tag
              state = :initial
            else
              # Consume one character so that the loop always advances.
              kind = :error
              getch
            end

          when :attribute_value_string
            if scan(plain_string_content)
              kind = :content
            elsif scan(/['"]/)
              # Both tokens are emitted here, so skip the generic
              # "tokens << [match, kind]" at the end of the loop body.
              tokens << [matched, :delimiter]
              tokens << [:close, :string]
              state = :attribute
              next
            elsif scan(ENTITY)
              kind = :entity
            elsif scan(/&/)
              # A lone "&" that does not start an entity.
              kind = :content
            elsif scan(/[\n>]/)
              # The string was not terminated: close it and flag the
              # offending character.
              tokens << [:close, :string]
              kind = :error
              state = :initial
            end

          else
            raise_inspect 'Unknown state: %p' % [state], tokens

          end

        end

        # Branches that did not capture the text themselves fall back to
        # the text of the most recent match.
        match ||= matched
        if $CODERAY_DEBUG and not kind
          raise_inspect 'Error token %p in line %d' %
            [[match, kind], line], tokens, state
        end
        raise_inspect 'Empty token', tokens unless match

        tokens << [match, kind]
      end

      if options[:keep_state]
        @state = state
        @plain_string_content = plain_string_content
      end

      tokens
    end

  end

end
end