To check out this repository, please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / .svn / pristine / c4 / c434d791a7d10db078bf19f213b250982f17b1e9.svn-base @ 1297:0a574315af3e

History | View | Annotate | Download (7.92 KB)

1
module CodeRay
module Scanners

  # HTML Scanner
  # 
  # Alias: +xhtml+
  # 
  # See also: Scanners::XML
  # 
  # The scanner is a state machine (see #scan_tokens). Values of event
  # handler attributes (see EVENT_ATTRIBUTES) and the contents of +script+
  # tags are tokenized by Scanners::JavaScript.
  class HTML < Scanner

    register_for :html

    KINDS_NOT_LOC = [
      :comment, :doctype, :preprocessor,
      :tag, :attribute_name, :operator,
      :attribute_value, :string,
      :plain, :entity, :error,
    ]  # :nodoc:

    # Attribute names whose quoted values are scanned as JavaScript.
    EVENT_ATTRIBUTES = %w(
      onabort onafterprint onbeforeprint onbeforeunload onblur oncanplay
      oncanplaythrough onchange onclick oncontextmenu oncuechange ondblclick
      ondrag ondragdrop ondragend ondragenter ondragleave ondragover
      ondragstart ondrop ondurationchange onemptied onended onerror onfocus
      onformchange onforminput onhashchange oninput oninvalid onkeydown
      onkeypress onkeyup onload onloadeddata onloadedmetadata onloadstart
      onmessage onmousedown onmousemove onmouseout onmouseover onmouseup
      onmousewheel onmove onoffline ononline onpagehide onpageshow onpause
      onplay onplaying onpopstate onprogress onratechange onreadystatechange
      onredo onreset onresize onscroll onseeked onseeking onselect onshow
      onstalled onstorage onsubmit onsuspend ontimeupdate onundo onunload
      onvolumechange onwaiting
    )

    # Word list that maps every name in EVENT_ATTRIBUTES to :script.
    IN_ATTRIBUTE = WordList::CaseIgnoring.new(nil).
      add(EVENT_ATTRIBUTES, :script)

    ATTR_NAME = /[\w.:-]+/  # :nodoc:
    TAG_END = /\/?>/  # :nodoc:
    HEX = /[0-9a-fA-F]/  # :nodoc:
    ENTITY = /
      &
      (?:
        \w+
      |
        \#
        (?:
          \d+
        |
          x#{HEX}+
        )
      )
      ;
    /ox  # :nodoc:

    # Pattern for a run of ordinary string content, keyed by the quote
    # character that opened the string.
    PLAIN_STRING_CONTENT = {
      "'" => /[^&'>\n]+/,
      '"' => /[^&">\n]+/,
    }  # :nodoc:

    # Resets the scanner and clears all state that #scan_tokens may have
    # stored through the :keep_state option.
    def reset
      super
      @state = :initial
      @plain_string_content = nil
      @in_tag = nil
    end

  protected

    # Initializes the state that #scan_tokens persists between calls.
    def setup
      @state = :initial
      @plain_string_content = nil
      @in_tag = nil
    end

    # Tokenizes +code+ as JavaScript and sends the tokens to +encoder+.
    # Does nothing when +code+ is nil or empty. The JavaScript scanner is
    # created on first use and reused afterwards. Callers open and close
    # the surrounding :inline group themselves.
    def scan_java_script encoder, code
      if code && !code.empty?
        @java_script_scanner ||= Scanners::JavaScript.new '', :keep_tokens => true
        @java_script_scanner.tokenize code, :tokens => encoder
      end
    end

    # Scans the remaining input and sends the tokens to +encoder+.
    # 
    # Options read here:
    # * <tt>:state</tt> - state to start in; defaults to the stored state.
    # * <tt>:keep_state</tt> - when true, the final state (including the
    #   open special tag and the string content pattern) is stored so that
    #   a later call continues where this one stopped.
    # 
    # Returns +encoder+.
    def scan_tokens encoder, options
      state = options[:state] || @state
      plain_string_content = @plain_string_content
      # Restored from the previous call; without it, resuming inside a
      # <script> tag would hit the "unknown special tag" error below.
      in_tag = @in_tag
      in_attribute = nil

      # A string left open by the previous call is reopened for this one.
      encoder.begin_group :string if state == :attribute_value_string

      until eos?

        # Whitespace is a plain :space token everywhere except inside a
        # special tag, where it belongs to the embedded code.
        if state != :in_special_tag && match = scan(/\s+/m)
          encoder.text_token match, :space

        else

          case state

          when :initial
            if match = scan(/<!--(?:.*?-->|.*)/m)
              encoder.text_token match, :comment
            elsif match = scan(/<!DOCTYPE(?:.*?>|.*)/m)
              encoder.text_token match, :doctype
            elsif match = scan(/<\?xml(?:.*?\?>|.*)/m)
              encoder.text_token match, :preprocessor
            elsif match = scan(/<\?(?:.*?\?>|.*)/m)
              encoder.text_token match, :comment
            elsif match = scan(/<\/[-\w.:]*>?/m)
              in_tag = nil
              encoder.text_token match, :tag
            elsif match = scan(/<(?:(script)|[-\w.:]+)(>)?/m)
              encoder.text_token match, :tag
              # self[1] is "script" for a script tag and nil otherwise;
              # self[2] is the ">" if the tag has no attributes.
              in_tag = self[1]
              if self[2]
                state = :in_special_tag if in_tag
              else
                state = :attribute
              end
            elsif match = scan(/[^<>&]+/)
              encoder.text_token match, :plain
            elsif match = scan(/#{ENTITY}/ox)
              encoder.text_token match, :entity
            elsif match = scan(/[<>&]/)
              in_tag = nil
              encoder.text_token match, :error
            else
              raise_inspect '[BUG] else-case reached with state %p' % [state], encoder
            end

          when :attribute
            if match = scan(/#{TAG_END}/o)
              encoder.text_token match, :tag
              in_attribute = nil
              if in_tag
                state = :in_special_tag
              else
                state = :initial
              end
            elsif match = scan(/#{ATTR_NAME}/o)
              in_attribute = IN_ATTRIBUTE[match]
              encoder.text_token match, :attribute_name
              state = :attribute_equal
            else
              in_tag = nil
              encoder.text_token getch, :error
            end

          when :attribute_equal
            if match = scan(/=/)
              encoder.text_token match, :operator
              state = :attribute_value
            elsif check(/#{ATTR_NAME}/o) || check(/#{TAG_END}/o)
              # Attribute without a value. Only peek here: consuming the
              # text would drop it from the token stream. The :attribute
              # state tokenizes the next attribute name or the tag end.
              state = :attribute
              next
            else
              encoder.text_token getch, :error
              state = :attribute
            end

          when :attribute_value
            if match = scan(/#{ATTR_NAME}/o)
              # Unquoted value.
              encoder.text_token match, :attribute_value
              state = :attribute
            elsif match = scan(/["']/)
              if in_attribute == :script
                # Event handler: the whole quoted value is JavaScript.
                encoder.begin_group :inline
                encoder.text_token match, :inline_delimiter
                if scan(/javascript:[ \t]*/)
                  encoder.text_token matched, :comment
                end
                # Everything up to the matching quote (or end of input).
                code = scan_until(match == '"' ? /(?="|\z)/ : /(?='|\z)/)
                scan_java_script encoder, code
                match = scan(/["']/)
                encoder.text_token match, :inline_delimiter if match
                encoder.end_group :inline
                state = :attribute
                in_attribute = nil
              else
                encoder.begin_group :string
                state = :attribute_value_string
                plain_string_content = PLAIN_STRING_CONTENT[match]
                encoder.text_token match, :delimiter
              end
            elsif match = scan(/#{TAG_END}/o)
              encoder.text_token match, :tag
              # Same transition as a tag end in the :attribute state, so
              # that a script tag ending here still has its content
              # scanned as JavaScript.
              if in_tag
                state = :in_special_tag
              else
                state = :initial
              end
            else
              encoder.text_token getch, :error
            end

          when :attribute_value_string
            if match = scan(plain_string_content)
              encoder.text_token match, :content
            elsif match = scan(/['"]/)
              encoder.text_token match, :delimiter
              encoder.end_group :string
              state = :attribute
            elsif match = scan(/#{ENTITY}/ox)
              encoder.text_token match, :entity
            elsif match = scan(/&/)
              encoder.text_token match, :content
            elsif match = scan(/[\n>]/)
              # Unterminated string: close the group and flag the character.
              encoder.end_group :string
              state = :initial
              encoder.text_token match, :error
            end

          when :in_special_tag
            case in_tag
            when 'script'
              if match = scan(/[ \t]*\n/)
                encoder.text_token match, :space
              end
              if scan(/(\s*<!--)(?:(.*?)(-->)|(.*))/m)
                # Script wrapped in an HTML comment: the comment markers
                # are :comment tokens, the part in between is JavaScript.
                code = self[2] || self[4]
                closing = self[3]
                encoder.text_token self[1], :comment
              else
                # Everything up to the closing script tag or end of input.
                code = scan_until(/(?=(?:\n\s*)?<\/script>)|\z/)
                closing = false
              end
              unless code.empty?
                encoder.begin_group :inline
                scan_java_script encoder, code
                encoder.end_group :inline
              end
              encoder.text_token closing, :comment if closing
              state = :initial
            else
              raise 'unknown special tag: %p' % [in_tag]
            end

          else
            raise_inspect 'Unknown state: %p' % [state], encoder

          end

        end

      end

      if options[:keep_state]
        @state = state
        @plain_string_content = plain_string_content
        @in_tag = in_tag
      end

      # Balance the group opened for a string that is still unterminated.
      encoder.end_group :string if state == :attribute_value_string

      encoder
    end

  end

end
end