module CodeRay
  module Scanners

    # HTML Scanner
    #
    # Alias: +xhtml+
    #
    # See also: Scanners::XML
    #
    # The scanner is a small state machine. The states used by #scan_tokens are:
    #
    # :initial::                outside of any tag (text, comments, doctype, entities)
    # :attribute::              inside a start tag, expecting an attribute name or the tag end
    # :attribute_equal::        after an attribute name, expecting <tt>=</tt>
    # :attribute_value::        after <tt>=</tt>, expecting a quoted or unquoted value
    # :attribute_value_string:: inside a quoted attribute value
    # :in_special_tag::         inside the body of a tag whose content is scanned
    #                           by another scanner (only +script+ is handled)
    class HTML < Scanner

      register_for :html

      KINDS_NOT_LOC = [
        :comment, :doctype, :preprocessor,
        :tag, :attribute_name, :operator,
        :attribute_value, :string,
        :plain, :entity, :error,
      ]  # :nodoc:

      # Attribute names whose quoted values are handed to the JavaScript scanner.
      EVENT_ATTRIBUTES = %w(
        onabort onafterprint onbeforeprint onbeforeunload onblur oncanplay
        oncanplaythrough onchange onclick oncontextmenu oncuechange ondblclick
        ondrag ondragdrop ondragend ondragenter ondragleave ondragover
        ondragstart ondrop ondurationchange onemptied onended onerror onfocus
        onformchange onforminput onhashchange oninput oninvalid onkeydown
        onkeypress onkeyup onload onloadeddata onloadedmetadata onloadstart
        onmessage onmousedown onmousemove onmouseout onmouseover onmouseup
        onmousewheel onmove onoffline ononline onpagehide onpageshow onpause
        onplay onplaying onpopstate onprogress onratechange onreadystatechange
        onredo onreset onresize onscroll onseeked onseeking onselect onshow
        onstalled onstorage onsubmit onsuspend ontimeupdate onundo onunload
        onvolumechange onwaiting
      )

      # Maps an attribute name to :script for event attributes, nil otherwise.
      IN_ATTRIBUTE = WordList::CaseIgnoring.new(nil).
        add(EVENT_ATTRIBUTES, :script)

      ATTR_NAME = /[\w.:-]+/  # :nodoc:
      TAG_END = /\/?>/  # :nodoc:
      HEX = /[0-9a-fA-F]/  # :nodoc:
      ENTITY = /
        &
        (?:
          \w+
        |
          \#
          (?:
            \d+
          |
            x#{HEX}+
          )
        )
        ;
      /ox  # :nodoc:

      # Per-delimiter pattern for the plain part of a quoted attribute value:
      # everything up to an entity start, the closing quote, +>+ or a newline.
      PLAIN_STRING_CONTENT = {
        "'" => /[^&'>\n]+/,
        '"' => /[^&">\n]+/,
      }  # :nodoc:

      # Resets the scanner (via +super+) and returns the HTML state machine
      # to its starting state.
      def reset
        super
        @state = :initial
        @plain_string_content = nil
      end

    protected

      # Initializes the state that #scan_tokens reads and, with
      # <tt>:keep_state</tt>, writes back.
      def setup
        @state = :initial
        @plain_string_content = nil
      end

      # Tokenizes +code+ with a lazily created, reused JavaScript scanner and
      # sends the tokens to +encoder+. Does nothing when +code+ is nil or empty.
      def scan_java_script encoder, code
        if code && !code.empty?
          @java_script_scanner ||= Scanners::JavaScript.new '', :keep_tokens => true
          # encoder.begin_group :inline
          @java_script_scanner.tokenize code, :tokens => encoder
          # encoder.end_group :inline
        end
      end

      # Scans the input and emits tokens to +encoder+.
      #
      # Options read here:
      # <tt>:state</tt>::      state to start in (defaults to the stored <tt>@state</tt>)
      # <tt>:keep_state</tt>:: when truthy, the final state and string-content
      #                        pattern are stored for a later call
      #
      # Returns +encoder+.
      def scan_tokens encoder, options
        state = options[:state] || @state
        plain_string_content = @plain_string_content
        in_tag = in_attribute = nil

        # Resuming inside a quoted attribute value: re-open the string group
        # that is closed at the end of every call (see below).
        encoder.begin_group :string if state == :attribute_value_string

        until eos?

          # Whitespace is emitted as :space in every state except
          # :in_special_tag, whose branch consumes its own whitespace.
          if state != :in_special_tag && match = scan(/\s+/m)
            encoder.text_token match, :space

          else

            case state

            when :initial
              # Each "(?:.*?X|.*)" pattern accepts an unterminated construct by
              # swallowing the rest of the input.
              if match = scan(/<!--(?:.*?-->|.*)/m)
                encoder.text_token match, :comment
              elsif match = scan(/<!DOCTYPE(?:.*?>|.*)/m)
                encoder.text_token match, :doctype
              elsif match = scan(/<\?xml(?:.*?\?>|.*)/m)
                encoder.text_token match, :preprocessor
              elsif match = scan(/<\?(?:.*?\?>|.*)/m)
                encoder.text_token match, :comment
              elsif match = scan(/<\/[-\w.:]*>?/m)
                in_tag = nil
                encoder.text_token match, :tag
              elsif match = scan(/<(?:(script)|[-\w.:]+)(>)?/m)
                encoder.text_token match, :tag
                # self[1] is "script" for a script tag and nil for any other tag.
                in_tag = self[1]
                if self[2]
                  # The tag was closed right away ("<script>").
                  state = :in_special_tag if in_tag
                else
                  state = :attribute
                end
              elsif match = scan(/[^<>&]+/)
                encoder.text_token match, :plain
              elsif match = scan(/#{ENTITY}/ox)
                encoder.text_token match, :entity
              elsif match = scan(/[<>&]/)
                in_tag = nil
                encoder.text_token match, :error
              else
                raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
              end

            when :attribute
              if match = scan(/#{TAG_END}/o)
                encoder.text_token match, :tag
                in_attribute = nil
                if in_tag
                  state = :in_special_tag
                else
                  state = :initial
                end
              elsif match = scan(/#{ATTR_NAME}/o)
                in_attribute = IN_ATTRIBUTE[match]
                encoder.text_token match, :attribute_name
                state = :attribute_equal
              else
                in_tag = nil
                encoder.text_token getch, :error
              end

            when :attribute_equal
              if match = scan(/=/)  #/
                encoder.text_token match, :operator
                state = :attribute_value
              elsif check(/#{ATTR_NAME}/o) || check(/#{TAG_END}/o)
                # Attribute without a value: the next attribute name or the tag
                # end follows. Only look ahead (check does not advance the scan
                # pointer) so the :attribute state emits the token; consuming
                # it here would silently drop that text from the output.
                state = :attribute
                next
              else
                encoder.text_token getch, :error
                state = :attribute
              end

            when :attribute_value
              if match = scan(/#{ATTR_NAME}/o)
                # Unquoted value.
                encoder.text_token match, :attribute_value
                state = :attribute
              elsif match = scan(/["']/)
                if in_attribute == :script
                  # Event attribute: scan the quoted value as JavaScript.
                  encoder.begin_group :inline
                  encoder.text_token match, :inline_delimiter
                  if scan(/javascript:[ \t]*/)
                    encoder.text_token matched, :comment
                  end
                  # Take everything up to the matching quote or the end of input.
                  code = scan_until(match == '"' ? /(?="|\z)/ : /(?='|\z)/)
                  scan_java_script encoder, code
                  match = scan(/["']/)
                  encoder.text_token match, :inline_delimiter if match
                  encoder.end_group :inline
                  state = :attribute
                  in_attribute = nil
                else
                  encoder.begin_group :string
                  state = :attribute_value_string
                  plain_string_content = PLAIN_STRING_CONTENT[match]
                  encoder.text_token match, :delimiter
                end
              elsif match = scan(/#{TAG_END}/o)
                encoder.text_token match, :tag
                state = :initial
              else
                encoder.text_token getch, :error
              end

            when :attribute_value_string
              if match = scan(plain_string_content)
                encoder.text_token match, :content
              elsif match = scan(/['"]/)
                encoder.text_token match, :delimiter
                encoder.end_group :string
                state = :attribute
              elsif match = scan(/#{ENTITY}/ox)
                encoder.text_token match, :entity
              elsif match = scan(/&/)
                # A lone "&" that does not start an entity.
                encoder.text_token match, :content
              elsif match = scan(/[\n>]/)
                # Unterminated string: close the group and fall back to :initial.
                encoder.end_group :string
                state = :initial
                encoder.text_token match, :error
              end

            when :in_special_tag
              case in_tag
              when 'script'
                encoder.text_token match, :space if match = scan(/[ \t]*\n/)
                # Script body wrapped in an HTML comment:
                #   self[1] = opening "<!--" (with leading whitespace)
                #   self[2] = code, self[3] = closing "-->"   (terminated)
                #   self[4] = code up to the end of input     (unterminated)
                if scan(/(\s*<!--)(?:(.*?)(-->)|(.*))/m)
                  code = self[2] || self[4]
                  closing = self[3]
                  encoder.text_token self[1], :comment
                else
                  code = scan_until(/(?=(?:\n\s*)?<\/script>)|\z/)
                  closing = false
                end
                unless code.empty?
                  encoder.begin_group :inline
                  scan_java_script encoder, code
                  encoder.end_group :inline
                end
                encoder.text_token closing, :comment if closing
                state = :initial
              else
                raise 'unknown special tag: %p' % [in_tag]
              end

            else
              raise_inspect 'Unknown state: %p' % [state], encoder

            end

          end

        end

        if options[:keep_state]
          @state = state
          @plain_string_content = plain_string_content
        end

        # Balance the string group if the input ended inside a quoted value.
        encoder.end_group :string if state == :attribute_value_string

        encoder
      end

    end

  end
end