module CodeRay
  module Scanners

    # HTML Scanner
    #
    # Tokenizes HTML with a small state machine. Each loop iteration of
    # #scan_tokens consumes one lexeme and appends a [text, kind] pair to
    # the token list. Quoted attribute values are additionally wrapped in
    # [:open, :string] / [:close, :string] marker tokens.
    #
    # States:
    #   :initial                - outside of any tag
    #   :attribute              - inside a tag, expecting an attribute name or the tag end
    #   :attribute_equal        - after an attribute name, expecting "="
    #   :attribute_value        - after "=", expecting a quoted or unquoted value
    #   :attribute_value_string - inside a quoted attribute value
    class HTML < Scanner

      include Streamable
      register_for :html

      # NOTE(review): presumably the token kinds that do not count as lines
      # of code; the consumer of this list is not visible here - confirm.
      KINDS_NOT_LOC = [
        :comment, :doctype, :preprocessor,
        :tag, :attribute_name, :operator,
        :attribute_value, :delimiter, :content,
        :plain, :entity, :error
      ]

      ATTR_NAME = /[\w.:-]+/
      ATTR_VALUE_UNQUOTED = ATTR_NAME
      TAG_END = /\/?>/
      HEX = /[0-9a-fA-F]/
      # Named (&amp;), decimal (&#38;) or hexadecimal (&#x26;) character reference.
      ENTITY = /
        &
        (?:
          \w+
        |
          \#
          (?:
            \d+
          |
            x#{HEX}+
          )
        )
        ;
      /ox

      # Content pattern per opening quote character: everything up to the
      # next "&", the matching quote, ">" or a newline. The *other* quote
      # character is ordinary content.
      PLAIN_STRING_CONTENT = {
        "'" => /[^&'>\n]+/,
        '"' => /[^&">\n]+/,
      }

      # Resets the scanner (via super) and returns the state machine to
      # :initial.
      def reset
        super
        @state = :initial
      end

    private

      # Initializes the state that is carried between #scan_tokens calls.
      def setup
        @state = :initial
        @plain_string_content = nil
      end

      # Scans the remaining input and appends the resulting tokens to +tokens+.
      #
      # tokens::  token list; receives [text, kind] pairs plus
      #           [:open, :string] / [:close, :string] markers.
      # options:: option hash; when options[:keep_state] is truthy the final
      #           state is stored in instance variables, which are the
      #           starting state of the next call.
      #
      # Returns +tokens+. Calls raise_inspect when no branch produced a
      # match or when the state is unknown.
      def scan_tokens(tokens, options)

        state = @state
        plain_string_content = @plain_string_content

        until eos?

          kind = nil
          match = nil

          # Whitespace is consumed in every state, including inside quoted
          # attribute values.
          if scan(/\s+/m)
            kind = :space

          else

            case state

            when :initial
              # Both patterns below must consume at least one character: an
              # empty pattern matches zero characters at every position and
              # would keep this loop from ever advancing.
              if scan(/<!--.*?-->/m)
                kind = :comment
              elsif scan(/<!DOCTYPE.*?>/m)
                kind = :doctype
              elsif scan(/<\?xml.*?\?>/m)
                kind = :preprocessor
              elsif scan(/<\?.*?\?>|<%.*?%>/m)
                # Other processing instructions and <% ... %> template blocks.
                kind = :comment
              elsif scan(/<\/[-\w.:]*>/m)
                kind = :tag
              elsif match = scan(/<[-\w.:]+>?/m)
                kind = :tag
                # Without the optional ">" the tag is still open, so
                # attributes may follow.
                state = :attribute unless match[-1] == ?>
              elsif scan(/[^<>&]+/)
                kind = :plain
              elsif scan(ENTITY)
                kind = :entity
              elsif scan(/[<>&]/)
                # Stray markup character that started none of the constructs above.
                kind = :error
              else
                raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
              end

            when :attribute
              if scan(TAG_END)
                kind = :tag
                state = :initial
              elsif scan(ATTR_NAME)
                kind = :attribute_name
                state = :attribute_equal
              else
                # Consume one character so the scanner always advances.
                kind = :error
                getch
              end

            when :attribute_equal
              if scan(/=/)
                kind = :operator
                state = :attribute_value
              elsif scan(ATTR_NAME)
                # Previous attribute had no value; this is the next name.
                kind = :attribute_name
              elsif scan(TAG_END)
                kind = :tag
                state = :initial
              elsif scan(/./)
                kind = :error
                state = :attribute
              end

            when :attribute_value
              if scan(ATTR_VALUE_UNQUOTED)
                kind = :attribute_value
                state = :attribute
              elsif match = scan(/["']/)
                tokens << [:open, :string]
                state = :attribute_value_string
                # Remember which quote opened the string so that the other
                # quote character is treated as content.
                plain_string_content = PLAIN_STRING_CONTENT[match]
                kind = :delimiter
              elsif scan(TAG_END)
                kind = :tag
                state = :initial
              else
                kind = :error
                getch
              end

            when :attribute_value_string
              if scan(plain_string_content)
                kind = :content
              elsif scan(/['"]/)
                # The content pattern already consumed the non-matching
                # quote, so any quote seen here closes the string.
                tokens << [matched, :delimiter]
                tokens << [:close, :string]
                state = :attribute
                # Both tokens were pushed here; skip the generic push below.
                next
              elsif scan(ENTITY)
                kind = :entity
              elsif scan(/&/)
                # A bare "&" that is not an entity is plain content.
                kind = :content
              elsif scan(/[\n>]/)
                # Unterminated string: close it and leave the tag.
                tokens << [:close, :string]
                kind = :error
                state = :initial
              end

            else
              raise_inspect 'Unknown state: %p' % [state], tokens

            end

          end

          match ||= matched
          if $CODERAY_DEBUG && !kind
            raise_inspect 'Error token %p in line %d' %
              [[match, kind], line], tokens, state
          end
          raise_inspect 'Empty token', tokens unless match

          tokens << [match, kind]
        end

        if options[:keep_state]
          @state = state
          @plain_string_content = plain_string_content
        end

        tokens
      end

    end

  end
end