module CodeRay
  module Scanners

    # HTML Scanner
    #
    # Splits HTML source into [text, kind] tokens using a small state machine:
    #
    #   :initial                 - outside of any tag (text, comments, entities)
    #   :attribute               - inside a tag, expecting an attribute name or tag end
    #   :attribute_equal         - after an attribute name, expecting "="
    #   :attribute_value         - after "=", expecting a quoted or unquoted value
    #   :attribute_value_string  - inside a quoted attribute value
    #
    # The scanning primitives used below (scan, getch, matched, eos?, line,
    # raise_inspect) are inherited from Scanner, which is defined outside this
    # file (presumably on top of StringScanner - confirm against Scanner).
    class HTML < Scanner

      include Streamable
      register_for :html

      # Every token kind this scanner emits except :space.
      # NOTE(review): the name suggests these kinds are excluded from
      # lines-of-code counting; the consumer is not visible from this file.
      KINDS_NOT_LOC = [
        :comment, :doctype, :preprocessor,
        :tag, :attribute_name, :operator,
        :attribute_value, :delimiter, :content,
        :plain, :entity, :error
      ]

      ATTR_NAME = /[\w.:-]+/
      ATTR_VALUE_UNQUOTED = ATTR_NAME
      TAG_END = /\/?>/
      HEX = /[0-9a-fA-F]/
      # Named (&amp;), decimal (&#38;) and hexadecimal (&#x26;) character
      # references, including the terminating semicolon.
      ENTITY = /
        &
        (?:
          \w+
        |
          \#
          (?:
            \d+
          |
            x#{HEX}+
          )
        )
        ;
      /ox

      # Maps the opening quote character of an attribute value to the pattern
      # matching a run of plain characters inside that string.
      PLAIN_STRING_CONTENT = {
        "'" => /[^&'>\n]+/,
        '"' => /[^&">\n]+/,
      }

      # Resets the scanner and puts the state machine back into its initial
      # state. Both state variables are cleared so that the result matches
      # what #setup establishes.
      def reset
        super
        @state = :initial
        @plain_string_content = nil
      end

      private

      # Initialises the state that is carried between scan_tokens calls.
      def setup
        @state = :initial
        @plain_string_content = nil
      end

      # Scans the remaining input and appends the resulting tokens to +tokens+.
      #
      # tokens  - collection that receives [text, kind] pairs as well as the
      #           [:open, :string] / [:close, :string] markers that wrap quoted
      #           attribute values.
      # options - if options[:keep_state] is truthy, the final state is stored
      #           in instance variables so that a later call continues from it.
      #
      # Returns +tokens+. Calls raise_inspect when an unknown state is reached
      # or when no text was matched for a token.
      def scan_tokens(tokens, options)
        state = @state
        plain_string_content = @plain_string_content

        until eos?
          kind = nil
          match = nil

          # Whitespace is checked first in every state, so it is emitted as
          # :space even inside tags and quoted attribute values.
          if scan(/\s+/m)
            kind = :space

          else

            case state

            when :initial
              if scan(/<!--.*?-->/m)
                kind = :comment
              elsif scan(/<!DOCTYPE.*?>/m)
                kind = :doctype
              elsif scan(/<\?xml.*?\?>/m)
                kind = :preprocessor
              elsif scan(/<\?.*?\?>|<%.*?%>/m)
                kind = :comment
              elsif scan(/<\/[-\w.:]*>/m)
                kind = :tag
              elsif (match = scan(/<[-\w.:]+>?/m))
                kind = :tag
                # An opener that already ends in ">" carries no attributes.
                state = :attribute unless match[-1] == ?>
              elsif scan(/[^<>&]+/)
                kind = :plain
              elsif scan(ENTITY)
                kind = :entity
              elsif scan(/[<>&]/)
                kind = :error
              else
                raise_inspect '[BUG] else-case reached with state %p' % [state], tokens
              end

            when :attribute
              # The precompiled constants are passed to scan directly; wrapping
              # them in an interpolated literal without the /o flag would
              # rebuild the regexp on every pass through the loop.
              if scan(TAG_END)
                kind = :tag
                state = :initial
              elsif scan(ATTR_NAME)
                kind = :attribute_name
                state = :attribute_equal
              else
                kind = :error
                getch
              end

            when :attribute_equal
              if scan(/=/)
                kind = :operator
                state = :attribute_value
              elsif scan(ATTR_NAME)
                # Another name without "=": the previous attribute had no value.
                kind = :attribute_name
              elsif scan(TAG_END)
                kind = :tag
                state = :initial
              elsif scan(/./)
                kind = :error
                state = :attribute
              end

            when :attribute_value
              if scan(ATTR_VALUE_UNQUOTED)
                kind = :attribute_value
                state = :attribute
              elsif (match = scan(/["']/))
                tokens << [:open, :string]
                state = :attribute_value_string
                # Remember which pattern matches plain text for this quote.
                plain_string_content = PLAIN_STRING_CONTENT[match]
                kind = :delimiter
              elsif scan(TAG_END)
                kind = :tag
                state = :initial
              else
                kind = :error
                getch
              end

            when :attribute_value_string
              if scan(plain_string_content)
                kind = :content
              elsif scan(/['"]/)
                # The closing delimiter and the :close marker are pushed here,
                # so the generic token push at the end of the loop is skipped.
                tokens << [matched, :delimiter]
                tokens << [:close, :string]
                state = :attribute
                next
              elsif scan(ENTITY)
                kind = :entity
              elsif scan(/&/)
                # A lone ampersand that does not start an entity.
                kind = :content
              elsif scan(/[\n>]/)
                # Unterminated string: close it and fall back to plain text.
                # NOTE(review): a newline appears to be consumed by the
                # whitespace branch above before it can get here - confirm.
                tokens << [:close, :string]
                kind = :error
                state = :initial
              end

            else
              raise_inspect 'Unknown state: %p' % [state], tokens

            end

          end

          # Branches that did not capture the text themselves use the last match.
          match ||= matched
          if $CODERAY_DEBUG && !kind
            raise_inspect 'Error token %p in line %d' %
              [[match, kind], line], tokens, state
          end
          raise_inspect 'Empty token', tokens unless match

          tokens << [match, kind]
        end

        if options[:keep_state]
          @state = state
          @plain_string_content = plain_string_content
        end

        tokens
      end

    end

  end
end