Chris@909
|
1 module CodeRay
|
Chris@909
|
2 module Scanners
|
Chris@909
|
3
|
Chris@909
|
4 # HTML Scanner
|
Chris@909
|
5 #
|
Chris@909
|
6 # Alias: +xhtml+
|
Chris@909
|
7 #
|
Chris@909
|
8 # See also: Scanners::XML
|
Chris@909
|
9 class HTML < Scanner
|
Chris@909
|
10
|
Chris@909
|
register_for :html

# NOTE(review): going by the name, these are presumably the token kinds
# that do not count as lines of code -- confirm against the consumer.
# Frozen so the shared list cannot be modified by accident.
KINDS_NOT_LOC = [
  :comment, :doctype, :preprocessor,
  :tag, :attribute_name, :operator,
  :attribute_value, :string,
  :plain, :entity, :error,
].freeze  # :nodoc:

# Attribute names whose quoted values scan_tokens hands to the JavaScript
# scanner (they are mapped to :script in IN_ATTRIBUTE below).
EVENT_ATTRIBUTES = %w(
  onabort onafterprint onbeforeprint onbeforeunload onblur oncanplay
  oncanplaythrough onchange onclick oncontextmenu oncuechange ondblclick
  ondrag ondragdrop ondragend ondragenter ondragleave ondragover
  ondragstart ondrop ondurationchange onemptied onended onerror onfocus
  onformchange onforminput onhashchange oninput oninvalid onkeydown
  onkeypress onkeyup onload onloadeddata onloadedmetadata onloadstart
  onmessage onmousedown onmousemove onmouseout onmouseover onmouseup
  onmousewheel onmove onoffline ononline onpagehide onpageshow onpause
  onplay onplaying onpopstate onprogress onratechange onreadystatechange
  onredo onreset onresize onscroll onseeked onseeking onselect onshow
  onstalled onstorage onsubmit onsuspend ontimeupdate onundo onunload
  onvolumechange onwaiting
).freeze

# Lookup used by scan_tokens: an attribute name listed in EVENT_ATTRIBUTES
# yields :script. NOTE(review): presumably case-insensitive with a nil
# default (WordList::CaseIgnoring.new(nil)) -- confirm in WordList.
IN_ATTRIBUTE = WordList::CaseIgnoring.new(nil).
  add(EVENT_ATTRIBUTES, :script)
|
Chris@909
|
37
|
Chris@909
|
# Characters allowed in an attribute name (also used for unquoted values).
ATTR_NAME = /[\w.:-]+/  # :nodoc:
# End of a start tag: ">" or the self-closing "/>".
TAG_END = /\/?>/  # :nodoc:
# A single hexadecimal digit.
HEX = /[0-9a-fA-F]/  # :nodoc:
# A character reference: named (&amp;), decimal (&#160;) or
# hexadecimal (&#xA0;), always terminated by ";".
ENTITY = /
  &
  (?:
    \w+
  |
    \#
    (?:
      \d+
    |
      x#{HEX}+
    )
  )
  ;
/ox  # :nodoc:

# Maps the opening quote character of an attribute value to the pattern
# matching a run of plain content inside that string (stops at "&", the
# same quote, ">" and newline). Frozen because it is a shared lookup table.
PLAIN_STRING_CONTENT = {
  "'" => /[^&'>\n]+/,
  '"' => /[^&">\n]+/,
}.freeze  # :nodoc:
|
Chris@909
|
60
|
Chris@909
|
# Resets the scanner via the superclass, then returns the HTML state
# machine to its starting point: state :initial and no active
# string-content pattern. Returns nil.
def reset
  super
  @state, @plain_string_content = :initial, nil
  nil
end
|
Chris@909
|
66
|
Chris@909
|
67 protected
|
Chris@909
|
68
|
Chris@909
|
# Puts the HTML state machine into its starting configuration:
# state :initial and no active string-content pattern. Returns nil.
def setup
  @state, @plain_string_content = :initial, nil
  nil
end
|
Chris@909
|
73
|
Chris@909
|
# Tokenizes +code+ as JavaScript, sending the tokens to +encoder+.
#
# Does nothing and returns nil when +code+ is nil, false or empty.
# Otherwise returns whatever the JavaScript scanner's +tokenize+ returns.
# The JavaScript scanner is created on first use and then reused.
# No group is opened here; the callers in scan_tokens wrap the call in
# an :inline group themselves.
def scan_java_script encoder, code
  return if !code || code.empty?

  @java_script_scanner ||= Scanners::JavaScript.new '', :keep_tokens => true
  @java_script_scanner.tokenize code, :tokens => encoder
end
|
Chris@909
|
82
|
Chris@909
|
# Scans the remaining input as HTML and reports tokens to +encoder+.
#
# encoder:: receives +text_token+, +begin_group+ and +end_group+ calls.
# options:: hash; <tt>:state</tt> overrides the state to start in
#           (default: the stored @state), <tt>:keep_state</tt> stores the
#           final state and string-content pattern back into the instance.
#
# Returns +encoder+.
#
# The scanner is a state machine over these states:
# :initial, :attribute, :attribute_equal, :attribute_value,
# :attribute_value_string and :in_special_tag (inside <script>).
def scan_tokens encoder, options
  state = options[:state] || @state
  plain_string_content = @plain_string_content
  in_tag = in_attribute = nil

  # Re-open the string group when scanning starts in the middle of a
  # quoted attribute value.
  encoder.begin_group :string if state == :attribute_value_string

  until eos?

    # Whitespace is a :space token everywhere except inside a special tag,
    # where it belongs to the embedded code.
    if state != :in_special_tag && match = scan(/\s+/m)
      encoder.text_token match, :space

    else

      case state

      when :initial
        if match = scan(/<!--(?:.*?-->|.*)/m)
          encoder.text_token match, :comment
        elsif match = scan(/<!DOCTYPE(?:.*?>|.*)/m)
          encoder.text_token match, :doctype
        elsif match = scan(/<\?xml(?:.*?\?>|.*)/m)
          encoder.text_token match, :preprocessor
        elsif match = scan(/<\?(?:.*?\?>|.*)/m)
          encoder.text_token match, :comment
        elsif match = scan(/<\/[-\w.:]*>?/m)
          in_tag = nil
          encoder.text_token match, :tag
        elsif match = scan(/<(?:(script)|[-\w.:]+)(>)?/m)
          encoder.text_token match, :tag
          # self[1] is "script" for a script tag, nil for any other tag.
          in_tag = self[1]
          if self[2]
            # Tag closed immediately ("<tag>"): no attributes to scan.
            state = :in_special_tag if in_tag
          else
            state = :attribute
          end
        elsif match = scan(/[^<>&]+/)
          encoder.text_token match, :plain
        elsif match = scan(/#{ENTITY}/ox)
          encoder.text_token match, :entity
        elsif match = scan(/[<>&]/)
          in_tag = nil
          encoder.text_token match, :error
        else
          raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
        end

      when :attribute
        if match = scan(/#{TAG_END}/o)
          encoder.text_token match, :tag
          in_attribute = nil
          if in_tag
            state = :in_special_tag
          else
            state = :initial
          end
        elsif match = scan(/#{ATTR_NAME}/o)
          # :script for event-handler attributes, so the value can be
          # scanned as JavaScript later.
          in_attribute = IN_ATTRIBUTE[match]
          encoder.text_token match, :attribute_name
          state = :attribute_equal
        else
          in_tag = nil
          encoder.text_token getch, :error
        end

      when :attribute_equal
        if match = scan(/=/)
          encoder.text_token match, :operator
          state = :attribute_value
        elsif check(/#{ATTR_NAME}/o) || check(/#{TAG_END}/o)
          # The previous attribute has no value. Only look ahead here:
          # consuming the next attribute name or the tag end would drop
          # it from the token stream. The :attribute state scans and
          # emits it on the next iteration.
          state = :attribute
          next
        else
          encoder.text_token getch, :error
          state = :attribute
        end

      when :attribute_value
        if match = scan(/#{ATTR_NAME}/o)
          # Unquoted value.
          encoder.text_token match, :attribute_value
          state = :attribute
        elsif match = scan(/["']/)
          if in_attribute == :script
            # Event-handler attribute: the quoted value is JavaScript.
            encoder.begin_group :inline
            encoder.text_token match, :inline_delimiter
            if scan(/javascript:[ \t]*/)
              encoder.text_token matched, :comment
            end
            # Everything up to the matching quote (or end of input).
            code = scan_until(match == '"' ? /(?="|\z)/ : /(?='|\z)/)
            scan_java_script encoder, code
            match = scan(/["']/)
            encoder.text_token match, :inline_delimiter if match
            encoder.end_group :inline
            state = :attribute
            in_attribute = nil
          else
            encoder.begin_group :string
            state = :attribute_value_string
            # Content pattern depends on which quote opened the string.
            plain_string_content = PLAIN_STRING_CONTENT[match]
            encoder.text_token match, :delimiter
          end
        elsif match = scan(/#{TAG_END}/o)
          encoder.text_token match, :tag
          state = :initial
        else
          encoder.text_token getch, :error
        end

      when :attribute_value_string
        if match = scan(plain_string_content)
          encoder.text_token match, :content
        elsif match = scan(/['"]/)
          encoder.text_token match, :delimiter
          encoder.end_group :string
          state = :attribute
        elsif match = scan(/#{ENTITY}/ox)
          encoder.text_token match, :entity
        elsif match = scan(/&/)
          # A lone "&" that is not a complete entity.
          encoder.text_token match, :content
        elsif match = scan(/[\n>]/)
          # Unterminated string: close the group and flag the character.
          encoder.end_group :string
          state = :initial
          encoder.text_token match, :error
        end

      when :in_special_tag
        case in_tag
        when 'script'
          encoder.text_token match, :space if match = scan(/[ \t]*\n/)
          if scan(/(\s*<!--)(?:(.*?)(-->)|(.*))/m)
            # Script body wrapped in "<!--" ... "-->": the markers are
            # emitted as :comment, the text between them is the code.
            code = self[2] || self[4]
            closing = self[3]
            encoder.text_token self[1], :comment
          else
            # Everything up to the closing </script> tag or end of input.
            code = scan_until(/(?=(?:\n\s*)?<\/script>)|\z/)
            closing = false
          end
          unless code.empty?
            encoder.begin_group :inline
            scan_java_script encoder, code
            encoder.end_group :inline
          end
          encoder.text_token closing, :comment if closing
          state = :initial
        else
          raise 'unknown special tag: %p' % [in_tag]
        end

      else
        raise_inspect 'Unknown state: %p' % [state], encoder

      end

    end

  end

  if options[:keep_state]
    @state = state
    @plain_string_content = plain_string_content
  end

  # Close a string group left open at end of input.
  encoder.end_group :string if state == :attribute_value_string

  encoder
end
|
Chris@909
|
249
|
Chris@909
|
250 end
|
Chris@909
|
251
|
Chris@909
|
252 end
|
Chris@909
|
253 end
|