Mercurial > hg > isophonics-drupal-site
comparison vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php @ 17:129ea1e6d783
Update, including to Drupal core 8.6.10
author | Chris Cannam |
---|---|
date | Thu, 28 Feb 2019 13:21:36 +0000 |
parents | 4c8ae668cc8c |
children |
comparison
equal
deleted
inserted
replaced
16:c2387f117808 | 17:129ea1e6d783 |
---|---|
1 <?php | 1 <?php |
2 | |
2 namespace Masterminds\HTML5\Parser; | 3 namespace Masterminds\HTML5\Parser; |
3 | 4 |
4 use Masterminds\HTML5\Elements; | 5 use Masterminds\HTML5\Elements; |
5 | 6 |
6 /** | 7 /** |
23 * | 24 * |
24 * @see http://www.w3.org/TR/2012/CR-html5-20121217/ | 25 * @see http://www.w3.org/TR/2012/CR-html5-20121217/ |
25 */ | 26 */ |
26 class Tokenizer | 27 class Tokenizer |
27 { | 28 { |
28 | |
29 protected $scanner; | 29 protected $scanner; |
30 | 30 |
31 protected $events; | 31 protected $events; |
32 | 32 |
33 protected $tok; | 33 protected $tok; |
45 | 45 |
46 const CONFORMANT_XML = 'xml'; | 46 const CONFORMANT_XML = 'xml'; |
47 const CONFORMANT_HTML = 'html'; | 47 const CONFORMANT_HTML = 'html'; |
48 protected $mode = self::CONFORMANT_HTML; | 48 protected $mode = self::CONFORMANT_HTML; |
49 | 49 |
50 const WHITE = "\t\n\f "; | |
51 | |
52 /** | 50 /** |
53 * Create a new tokenizer. | 51 * Create a new tokenizer. |
54 * | 52 * |
55 * Typically, parsing a document involves creating a new tokenizer, giving | 53 * Typically, parsing a document involves creating a new tokenizer, giving |
56 * it a scanner (input) and an event handler (output), and then calling | 54 * it a scanner (input) and an event handler (output), and then calling |
57 * the Tokenizer::parse() method. | 55 * the Tokenizer::parse() method. |
58 * | 56 * |
59 * @param \Masterminds\HTML5\Parser\Scanner $scanner | 57 * @param Scanner $scanner A scanner initialized with an input stream. |
60 * A scanner initialized with an input stream. | 58 * @param EventHandler $eventHandler An event handler, initialized and ready to receive events. |
61 * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler | 59 * @param string $mode |
62 * An event handler, initialized and ready to receive | |
63 * events. | |
64 * @param string $mode | |
65 */ | 60 */ |
66 public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML) | 61 public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML) |
67 { | 62 { |
68 $this->scanner = $scanner; | 63 $this->scanner = $scanner; |
69 $this->events = $eventHandler; | 64 $this->events = $eventHandler; |
101 * This allows those modes to be set. | 96 * This allows those modes to be set. |
102 * | 97 * |
103 * Normally, setting is done by the event handler via a special return code on | 98 * Normally, setting is done by the event handler via a special return code on |
104 * startTag(), but it can also be set manually using this function. | 99 * startTag(), but it can also be set manually using this function. |
105 * | 100 * |
106 * @param integer $textmode | 101 * @param int $textmode One of Elements::TEXT_*. |
107 * One of Elements::TEXT_* | 102 * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not |
108 * @param string $untilTag | 103 * use this indicator. |
109 * The tag that should stop RAW or RCDATA mode. Normal mode does not | |
110 * use this indicator. | |
111 */ | 104 */ |
112 public function setTextMode($textmode, $untilTag = null) | 105 public function setTextMode($textmode, $untilTag = null) |
113 { | 106 { |
114 $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); | 107 $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); |
115 $this->untilTag = $untilTag; | 108 $this->untilTag = $untilTag; |
116 } | 109 } |
117 | 110 |
118 /** | 111 /** |
119 * Consume a character and make a move. | 112 * Consume a character and make a move. |
120 * HTML5 8.2.4.1 | 113 * HTML5 8.2.4.1. |
121 */ | 114 */ |
122 protected function consumeData() | 115 protected function consumeData() |
123 { | 116 { |
124 // Character Ref | 117 $tok = $this->scanner->current(); |
125 /* | 118 |
126 * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData(); | 119 if ('&' === $tok) { |
127 */ | 120 // Character reference |
128 $this->characterReference(); | 121 $ref = $this->decodeCharacterReference(); |
129 $this->tagOpen(); | 122 $this->buffer($ref); |
130 $this->eof(); | 123 |
131 $this->characterData(); | 124 $tok = $this->scanner->current(); |
125 } | |
126 | |
127 // Parse tag | |
128 if ('<' === $tok) { | |
129 // Any buffered text data can go out now. | |
130 $this->flushBuffer(); | |
131 | |
132 $tok = $this->scanner->next(); | |
133 | |
134 if ('!' === $tok) { | |
135 $this->markupDeclaration(); | |
136 } elseif ('/' === $tok) { | |
137 $this->endTag(); | |
138 } elseif ('?' === $tok) { | |
139 $this->processingInstruction(); | |
140 } elseif (ctype_alpha($tok)) { | |
141 $this->tagName(); | |
142 } else { | |
143 $this->parseError('Illegal tag opening'); | |
144 // TODO is this necessary ? | |
145 $this->characterData(); | |
146 } | |
147 | |
148 $tok = $this->scanner->current(); | |
149 } | |
150 | |
151 if (false === $tok) { | |
152 // Handle end of document | |
153 $this->eof(); | |
154 } else { | |
155 // Parse character | |
156 switch ($this->textMode) { | |
157 case Elements::TEXT_RAW: | |
158 $this->rawText($tok); | |
159 break; | |
160 | |
161 case Elements::TEXT_RCDATA: | |
162 $this->rcdata($tok); | |
163 break; | |
164 | |
165 default: | |
166 if ('<' === $tok || '&' === $tok) { | |
167 break; | |
168 } | |
169 | |
170 // NULL character | |
171 if ("\00" === $tok) { | |
172 $this->parseError('Received null character.'); | |
173 | |
174 $this->text .= $tok; | |
175 $this->scanner->consume(); | |
176 | |
177 break; | |
178 } | |
179 | |
180 $this->text .= $this->scanner->charsUntil("<&\0"); | |
181 } | |
182 } | |
132 | 183 |
133 return $this->carryOn; | 184 return $this->carryOn; |
134 } | 185 } |
135 | 186 |
136 /** | 187 /** |
141 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. | 192 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. |
142 */ | 193 */ |
143 protected function characterData() | 194 protected function characterData() |
144 { | 195 { |
145 $tok = $this->scanner->current(); | 196 $tok = $this->scanner->current(); |
146 if ($tok === false) { | 197 if (false === $tok) { |
147 return false; | 198 return false; |
148 } | 199 } |
149 switch ($this->textMode) { | 200 switch ($this->textMode) { |
150 case Elements::TEXT_RAW: | 201 case Elements::TEXT_RAW: |
151 return $this->rawText(); | 202 return $this->rawText($tok); |
152 case Elements::TEXT_RCDATA: | 203 case Elements::TEXT_RCDATA: |
153 return $this->rcdata(); | 204 return $this->rcdata($tok); |
154 default: | 205 default: |
155 if (strspn($tok, "<&")) { | 206 if ('<' === $tok || '&' === $tok) { |
156 return false; | 207 return false; |
157 } | 208 } |
158 return $this->text(); | 209 |
210 return $this->text($tok); | |
159 } | 211 } |
160 } | 212 } |
161 | 213 |
162 /** | 214 /** |
163 * This buffers the current token as character data. | 215 * This buffers the current token as character data. |
164 */ | 216 * |
165 protected function text() | 217 * @param string $tok The current token. |
166 { | 218 * |
167 $tok = $this->scanner->current(); | 219 * @return bool |
168 | 220 */ |
221 protected function text($tok) | |
222 { | |
169 // This should never happen... | 223 // This should never happen... |
170 if ($tok === false) { | 224 if (false === $tok) { |
171 return false; | 225 return false; |
172 } | 226 } |
173 // Null | 227 |
174 if ($tok === "\00") { | 228 // NULL character |
175 $this->parseError("Received null character."); | 229 if ("\00" === $tok) { |
176 } | 230 $this->parseError('Received null character.'); |
177 // fprintf(STDOUT, "Writing '%s'", $tok); | 231 } |
232 | |
178 $this->buffer($tok); | 233 $this->buffer($tok); |
179 $this->scanner->next(); | 234 $this->scanner->consume(); |
235 | |
180 return true; | 236 return true; |
181 } | 237 } |
182 | 238 |
183 /** | 239 /** |
184 * Read text in RAW mode. | 240 * Read text in RAW mode. |
185 */ | 241 * |
186 protected function rawText() | 242 * @param string $tok The current token. |
243 * | |
244 * @return bool | |
245 */ | |
246 protected function rawText($tok) | |
187 { | 247 { |
188 if (is_null($this->untilTag)) { | 248 if (is_null($this->untilTag)) { |
189 return $this->text(); | 249 return $this->text($tok); |
190 } | 250 } |
251 | |
191 $sequence = '</' . $this->untilTag . '>'; | 252 $sequence = '</' . $this->untilTag . '>'; |
192 $txt = $this->readUntilSequence($sequence); | 253 $txt = $this->readUntilSequence($sequence); |
193 $this->events->text($txt); | 254 $this->events->text($txt); |
194 $this->setTextMode(0); | 255 $this->setTextMode(0); |
256 | |
195 return $this->endTag(); | 257 return $this->endTag(); |
196 } | 258 } |
197 | 259 |
198 /** | 260 /** |
199 * Read text in RCDATA mode. | 261 * Read text in RCDATA mode. |
200 */ | 262 * |
201 protected function rcdata() | 263 * @param string $tok The current token. |
264 * | |
265 * @return bool | |
266 */ | |
267 protected function rcdata($tok) | |
202 { | 268 { |
203 if (is_null($this->untilTag)) { | 269 if (is_null($this->untilTag)) { |
204 return $this->text(); | 270 return $this->text($tok); |
205 } | 271 } |
272 | |
206 $sequence = '</' . $this->untilTag; | 273 $sequence = '</' . $this->untilTag; |
207 $txt = ''; | 274 $txt = ''; |
208 $tok = $this->scanner->current(); | |
209 | 275 |
210 $caseSensitive = !Elements::isHtml5Element($this->untilTag); | 276 $caseSensitive = !Elements::isHtml5Element($this->untilTag); |
211 while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { | 277 while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) { |
212 if ($tok == '&') { | 278 if ('&' == $tok) { |
213 $txt .= $this->decodeCharacterReference(); | 279 $txt .= $this->decodeCharacterReference(); |
214 $tok = $this->scanner->current(); | 280 $tok = $this->scanner->current(); |
215 } else { | 281 } else { |
216 $txt .= $tok; | 282 $txt .= $tok; |
217 $tok = $this->scanner->next(); | 283 $tok = $this->scanner->next(); |
218 } | 284 } |
219 } | 285 } |
220 $len = strlen($sequence); | 286 $len = strlen($sequence); |
221 $this->scanner->consume($len); | 287 $this->scanner->consume($len); |
222 $len += strlen($this->scanner->whitespace()); | 288 $len += $this->scanner->whitespace(); |
223 if ($this->scanner->current() !== '>') { | 289 if ('>' !== $this->scanner->current()) { |
224 $this->parseError("Unclosed RCDATA end tag"); | 290 $this->parseError('Unclosed RCDATA end tag'); |
225 } | 291 } |
292 | |
226 $this->scanner->unconsume($len); | 293 $this->scanner->unconsume($len); |
227 $this->events->text($txt); | 294 $this->events->text($txt); |
228 $this->setTextMode(0); | 295 $this->setTextMode(0); |
296 | |
229 return $this->endTag(); | 297 return $this->endTag(); |
230 } | 298 } |
231 | 299 |
232 /** | 300 /** |
233 * If the document is read, emit an EOF event. | 301 * If the document is read, emit an EOF event. |
234 */ | 302 */ |
235 protected function eof() | 303 protected function eof() |
236 { | 304 { |
237 if ($this->scanner->current() === false) { | 305 // fprintf(STDOUT, "EOF"); |
238 // fprintf(STDOUT, "EOF"); | |
239 $this->flushBuffer(); | |
240 $this->events->eof(); | |
241 $this->carryOn = false; | |
242 return true; | |
243 } | |
244 return false; | |
245 } | |
246 | |
247 /** | |
248 * Handle character references (aka entities). | |
249 * | |
250 * This version is specific to PCDATA, as it buffers data into the | |
251 * text buffer. For a generic version, see decodeCharacterReference(). | |
252 * | |
253 * HTML5 8.2.4.2 | |
254 */ | |
255 protected function characterReference() | |
256 { | |
257 $ref = $this->decodeCharacterReference(); | |
258 if ($ref !== false) { | |
259 $this->buffer($ref); | |
260 return true; | |
261 } | |
262 return false; | |
263 } | |
264 | |
265 /** | |
266 * Emit a tagStart event on encountering a tag. | |
267 * | |
268 * 8.2.4.8 | |
269 */ | |
270 protected function tagOpen() | |
271 { | |
272 if ($this->scanner->current() != '<') { | |
273 return false; | |
274 } | |
275 | |
276 // Any buffered text data can go out now. | |
277 $this->flushBuffer(); | 306 $this->flushBuffer(); |
278 | 307 $this->events->eof(); |
279 $this->scanner->next(); | 308 $this->carryOn = false; |
280 | |
281 return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() || | |
282 /* This always returns false. */ | |
283 $this->parseError("Illegal tag opening") || $this->characterData(); | |
284 } | 309 } |
285 | 310 |
286 /** | 311 /** |
287 * Look for markup. | 312 * Look for markup. |
288 */ | 313 */ |
289 protected function markupDeclaration() | 314 protected function markupDeclaration() |
290 { | 315 { |
291 if ($this->scanner->current() != '!') { | |
292 return false; | |
293 } | |
294 | |
295 $tok = $this->scanner->next(); | 316 $tok = $this->scanner->next(); |
296 | 317 |
297 // Comment: | 318 // Comment: |
298 if ($tok == '-' && $this->scanner->peek() == '-') { | 319 if ('-' == $tok && '-' == $this->scanner->peek()) { |
299 $this->scanner->next(); // Consume the other '-' | 320 $this->scanner->consume(2); |
300 $this->scanner->next(); // Next char. | 321 |
301 return $this->comment(); | 322 return $this->comment(); |
302 } | 323 } elseif ('D' == $tok || 'd' == $tok) { // Doctype |
303 | |
304 elseif ($tok == 'D' || $tok == 'd') { // Doctype | |
305 return $this->doctype(); | 324 return $this->doctype(); |
306 } | 325 } elseif ('[' == $tok) { // CDATA section |
307 | |
308 elseif ($tok == '[') { // CDATA section | |
309 return $this->cdataSection(); | 326 return $this->cdataSection(); |
310 } | 327 } |
311 | 328 |
312 // FINISH | 329 // FINISH |
313 $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok); | 330 $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok); |
314 $this->bogusComment('<!'); | 331 $this->bogusComment('<!'); |
332 | |
315 return true; | 333 return true; |
316 } | 334 } |
317 | 335 |
318 /** | 336 /** |
319 * Consume an end tag. | 337 * Consume an end tag. See section 8.2.4.9. |
320 * 8.2.4.9 | |
321 */ | 338 */ |
322 protected function endTag() | 339 protected function endTag() |
323 { | 340 { |
324 if ($this->scanner->current() != '/') { | 341 if ('/' != $this->scanner->current()) { |
325 return false; | 342 return false; |
326 } | 343 } |
327 $tok = $this->scanner->next(); | 344 $tok = $this->scanner->next(); |
328 | 345 |
329 // a-zA-Z -> tagname | 346 // a-zA-Z -> tagname |
330 // > -> parse error | 347 // > -> parse error |
331 // EOF -> parse error | 348 // EOF -> parse error |
332 // -> parse error | 349 // -> parse error |
333 if (! ctype_alpha($tok)) { | 350 if (!ctype_alpha($tok)) { |
334 $this->parseError("Expected tag name, got '%s'", $tok); | 351 $this->parseError("Expected tag name, got '%s'", $tok); |
335 if ($tok == "\0" || $tok === false) { | 352 if ("\0" == $tok || false === $tok) { |
336 return false; | 353 return false; |
337 } | 354 } |
355 | |
338 return $this->bogusComment('</'); | 356 return $this->bogusComment('</'); |
339 } | 357 } |
340 | 358 |
341 $name = $this->scanner->charsUntil("\n\f \t>"); | 359 $name = $this->scanner->charsUntil("\n\f \t>"); |
342 $name = $this->mode === self::CONFORMANT_XML ? $name: strtolower($name); | 360 $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); |
343 // Trash whitespace. | 361 // Trash whitespace. |
344 $this->scanner->whitespace(); | 362 $this->scanner->whitespace(); |
345 | 363 |
346 if ($this->scanner->current() != '>') { | 364 $tok = $this->scanner->current(); |
347 $this->parseError("Expected >, got '%s'", $this->scanner->current()); | 365 if ('>' != $tok) { |
366 $this->parseError("Expected >, got '%s'", $tok); | |
348 // We just trash stuff until we get to the next tag close. | 367 // We just trash stuff until we get to the next tag close. |
349 $this->scanner->charsUntil('>'); | 368 $this->scanner->charsUntil('>'); |
350 } | 369 } |
351 | 370 |
352 $this->events->endTag($name); | 371 $this->events->endTag($name); |
353 $this->scanner->next(); | 372 $this->scanner->consume(); |
373 | |
354 return true; | 374 return true; |
355 } | 375 } |
356 | 376 |
357 /** | 377 /** |
358 * Consume a tag name and body. | 378 * Consume a tag name and body. See section 8.2.4.10. |
359 * 8.2.4.10 | |
360 */ | 379 */ |
361 protected function tagName() | 380 protected function tagName() |
362 { | 381 { |
363 $tok = $this->scanner->current(); | |
364 if (! ctype_alpha($tok)) { | |
365 return false; | |
366 } | |
367 | |
368 // We know this is at least one char. | 382 // We know this is at least one char. |
369 $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); | 383 $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'); |
370 $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name); | 384 $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); |
371 $attributes = array(); | 385 $attributes = array(); |
372 $selfClose = false; | 386 $selfClose = false; |
373 | 387 |
374 // Handle attribute parse exceptions here so that we can | 388 // Handle attribute parse exceptions here so that we can |
375 // react by trying to build a sensible parse tree. | 389 // react by trying to build a sensible parse tree. |
376 try { | 390 try { |
377 do { | 391 do { |
378 $this->scanner->whitespace(); | 392 $this->scanner->whitespace(); |
379 $this->attribute($attributes); | 393 $this->attribute($attributes); |
380 } while (! $this->isTagEnd($selfClose)); | 394 } while (!$this->isTagEnd($selfClose)); |
381 } catch (ParseError $e) { | 395 } catch (ParseError $e) { |
382 $selfClose = false; | 396 $selfClose = false; |
383 } | 397 } |
384 | 398 |
385 $mode = $this->events->startTag($name, $attributes, $selfClose); | 399 $mode = $this->events->startTag($name, $attributes, $selfClose); |
386 // Should we do this? What does this buy that selfClose doesn't? | 400 |
387 if ($selfClose) { | 401 if (is_int($mode)) { |
388 $this->events->endTag($name); | |
389 } elseif (is_int($mode)) { | |
390 // fprintf(STDOUT, "Event response says move into mode %d for tag %s", $mode, $name); | |
391 $this->setTextMode($mode, $name); | 402 $this->setTextMode($mode, $name); |
392 } | 403 } |
393 | 404 |
394 $this->scanner->next(); | 405 $this->scanner->consume(); |
395 | 406 |
396 return true; | 407 return true; |
397 } | 408 } |
398 | 409 |
399 /** | 410 /** |
400 * Check if the scanner has reached the end of a tag. | 411 * Check if the scanner has reached the end of a tag. |
401 */ | 412 */ |
402 protected function isTagEnd(&$selfClose) | 413 protected function isTagEnd(&$selfClose) |
403 { | 414 { |
404 $tok = $this->scanner->current(); | 415 $tok = $this->scanner->current(); |
405 if ($tok == '/') { | 416 if ('/' == $tok) { |
406 $this->scanner->next(); | 417 $this->scanner->consume(); |
407 $this->scanner->whitespace(); | 418 $this->scanner->whitespace(); |
408 $tok = $this->scanner->current(); | 419 $tok = $this->scanner->current(); |
409 | 420 |
410 if ($tok == '>') { | 421 if ('>' == $tok) { |
411 $selfClose = true; | 422 $selfClose = true; |
423 | |
412 return true; | 424 return true; |
413 } | 425 } |
414 if ($tok === false) { | 426 if (false === $tok) { |
415 $this->parseError("Unexpected EOF inside of tag."); | 427 $this->parseError('Unexpected EOF inside of tag.'); |
428 | |
416 return true; | 429 return true; |
417 } | 430 } |
418 // Basically, we skip the / token and go on. | 431 // Basically, we skip the / token and go on. |
419 // See 8.2.4.43. | 432 // See 8.2.4.43. |
420 $this->parseError("Unexpected '%s' inside of a tag.", $tok); | 433 $this->parseError("Unexpected '%s' inside of a tag.", $tok); |
434 | |
421 return false; | 435 return false; |
422 } | 436 } |
423 | 437 |
424 if ($tok == '>') { | 438 if ('>' == $tok) { |
425 return true; | 439 return true; |
426 } | 440 } |
427 if ($tok === false) { | 441 if (false === $tok) { |
428 $this->parseError("Unexpected EOF inside of tag."); | 442 $this->parseError('Unexpected EOF inside of tag.'); |
443 | |
429 return true; | 444 return true; |
430 } | 445 } |
431 | 446 |
432 return false; | 447 return false; |
433 } | 448 } |
434 | 449 |
435 /** | 450 /** |
436 * Parse attributes from inside of a tag. | 451 * Parse attributes from inside of a tag. |
452 * | |
453 * @param string[] $attributes | |
454 * | |
455 * @return bool | |
456 * | |
457 * @throws ParseError | |
437 */ | 458 */ |
438 protected function attribute(&$attributes) | 459 protected function attribute(&$attributes) |
439 { | 460 { |
440 $tok = $this->scanner->current(); | 461 $tok = $this->scanner->current(); |
441 if ($tok == '/' || $tok == '>' || $tok === false) { | 462 if ('/' == $tok || '>' == $tok || false === $tok) { |
442 return false; | 463 return false; |
443 } | 464 } |
444 | 465 |
445 if ($tok == '<') { | 466 if ('<' == $tok) { |
446 $this->parseError("Unexepcted '<' inside of attributes list."); | 467 $this->parseError("Unexpected '<' inside of attributes list."); |
447 // Push the < back onto the stack. | 468 // Push the < back onto the stack. |
448 $this->scanner->unconsume(); | 469 $this->scanner->unconsume(); |
449 // Let the caller figure out how to handle this. | 470 // Let the caller figure out how to handle this. |
450 throw new ParseError("Start tag inside of attribute."); | 471 throw new ParseError('Start tag inside of attribute.'); |
451 } | 472 } |
452 | 473 |
453 $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); | 474 $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); |
454 | 475 |
455 if (strlen($name) == 0) { | 476 if (0 == strlen($name)) { |
456 $this->parseError("Expected an attribute name, got %s.", $this->scanner->current()); | 477 $tok = $this->scanner->current(); |
478 $this->parseError('Expected an attribute name, got %s.', $tok); | |
457 // Really, only '=' can be the char here. Everything else gets absorbed | 479 // Really, only '=' can be the char here. Everything else gets absorbed |
458 // under one rule or another. | 480 // under one rule or another. |
459 $name = $this->scanner->current(); | 481 $name = $tok; |
460 $this->scanner->next(); | 482 $this->scanner->consume(); |
461 } | 483 } |
462 | 484 |
463 $isValidAttribute = true; | 485 $isValidAttribute = true; |
464 // Attribute names can contain most Unicode characters for HTML5. | 486 // Attribute names can contain most Unicode characters for HTML5. |
465 // But method "DOMElement::setAttribute" is throwing exception | 487 // But method "DOMElement::setAttribute" is throwing exception |
466 // because of its own internal restriction so these have to be filtered. | 488 // because of its own internal restriction so these have to be filtered. |
467 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 | 489 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 |
468 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name | 490 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name |
469 if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { | 491 if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { |
470 $this->parseError("Unexpected characters in attribute name: %s", $name); | 492 $this->parseError('Unexpected characters in attribute name: %s', $name); |
471 $isValidAttribute = false; | 493 $isValidAttribute = false; |
472 } // There is no limitation for 1st character in HTML5. | 494 } // There is no limitation for 1st character in HTML5. |
473 // But method "DOMElement::setAttribute" is throwing exception for the | 495 // But method "DOMElement::setAttribute" is throwing exception for the |
474 // characters below so they have to be filtered. | 496 // characters below so they have to be filtered. |
475 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 | 497 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 |
476 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name | 498 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name |
477 else | 499 elseif (preg_match('/^[0-9.-]/u', $name)) { |
478 if (preg_match("/^[0-9.-]/u", $name)) { | 500 $this->parseError('Unexpected character at the begining of attribute name: %s', $name); |
479 $this->parseError("Unexpected character at the begining of attribute name: %s", $name); | 501 $isValidAttribute = false; |
480 $isValidAttribute = false; | 502 } |
481 } | |
482 // 8.1.2.3 | 503 // 8.1.2.3 |
483 $this->scanner->whitespace(); | 504 $this->scanner->whitespace(); |
484 | 505 |
485 $val = $this->attributeValue(); | 506 $val = $this->attributeValue(); |
486 if ($isValidAttribute) { | 507 if ($isValidAttribute) { |
487 $attributes[$name] = $val; | 508 $attributes[$name] = $val; |
488 } | 509 } |
510 | |
489 return true; | 511 return true; |
490 } | 512 } |
491 | 513 |
492 /** | 514 /** |
493 * Consume an attribute value. | 515 * Consume an attribute value. See section 8.2.4.37 and after. |
494 * 8.2.4.37 and after. | 516 * |
517 * @return string|null | |
495 */ | 518 */ |
496 protected function attributeValue() | 519 protected function attributeValue() |
497 { | 520 { |
498 if ($this->scanner->current() != '=') { | 521 if ('=' != $this->scanner->current()) { |
499 return null; | 522 return null; |
500 } | 523 } |
501 $this->scanner->next(); | 524 $this->scanner->consume(); |
502 // 8.1.2.3 | 525 // 8.1.2.3 |
503 $this->scanner->whitespace(); | 526 $this->scanner->whitespace(); |
504 | 527 |
505 $tok = $this->scanner->current(); | 528 $tok = $this->scanner->current(); |
506 switch ($tok) { | 529 switch ($tok) { |
507 case "\n": | 530 case "\n": |
508 case "\f": | 531 case "\f": |
509 case " ": | 532 case ' ': |
510 case "\t": | 533 case "\t": |
511 // Whitespace here indicates an empty value. | 534 // Whitespace here indicates an empty value. |
512 return null; | 535 return null; |
513 case '"': | 536 case '"': |
514 case "'": | 537 case "'": |
515 $this->scanner->next(); | 538 $this->scanner->consume(); |
539 | |
516 return $this->quotedAttributeValue($tok); | 540 return $this->quotedAttributeValue($tok); |
517 case '>': | 541 case '>': |
518 // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr. | 542 // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr. |
519 $this->parseError("Expected attribute value, got tag end."); | 543 $this->parseError('Expected attribute value, got tag end.'); |
544 | |
520 return null; | 545 return null; |
521 case '=': | 546 case '=': |
522 case '`': | 547 case '`': |
523 $this->parseError("Expecting quotes, got %s.", $tok); | 548 $this->parseError('Expecting quotes, got %s.', $tok); |
549 | |
524 return $this->unquotedAttributeValue(); | 550 return $this->unquotedAttributeValue(); |
525 default: | 551 default: |
526 return $this->unquotedAttributeValue(); | 552 return $this->unquotedAttributeValue(); |
527 } | 553 } |
528 } | 554 } |
529 | 555 |
530 /** | 556 /** |
531 * Get an attribute value string. | 557 * Get an attribute value string. |
532 * | 558 * |
533 * @param string $quote | 559 * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered |
534 * IMPORTANT: This is a series of chars! Any one of which will be considered | 560 * termination of an attribute's value. E.g. "\"'" will stop at either |
535 * termination of an attribute's value. E.g. "\"'" will stop at either | 561 * ' or ". |
536 * ' or ". | 562 * |
537 * @return string The attribute value. | 563 * @return string The attribute value. |
538 */ | 564 */ |
539 protected function quotedAttributeValue($quote) | 565 protected function quotedAttributeValue($quote) |
540 { | 566 { |
541 $stoplist = "\f" . $quote; | 567 $stoplist = "\f" . $quote; |
542 $val = ''; | 568 $val = ''; |
543 | 569 |
544 while (true) { | 570 while (true) { |
545 $tokens = $this->scanner->charsUntil($stoplist.'&'); | 571 $tokens = $this->scanner->charsUntil($stoplist . '&'); |
546 if ($tokens !== false) { | 572 if (false !== $tokens) { |
547 $val .= $tokens; | 573 $val .= $tokens; |
548 } else { | 574 } else { |
549 break; | 575 break; |
550 } | 576 } |
551 | 577 |
552 $tok = $this->scanner->current(); | 578 $tok = $this->scanner->current(); |
553 if ($tok == '&') { | 579 if ('&' == $tok) { |
554 $val .= $this->decodeCharacterReference(true, $tok); | 580 $val .= $this->decodeCharacterReference(true); |
555 continue; | 581 continue; |
556 } | 582 } |
557 break; | 583 break; |
558 } | 584 } |
559 $this->scanner->next(); | 585 $this->scanner->consume(); |
586 | |
560 return $val; | 587 return $val; |
561 } | 588 } |
562 | 589 |
563 protected function unquotedAttributeValue() | 590 protected function unquotedAttributeValue() |
564 { | 591 { |
565 $stoplist = "\t\n\f >"; | |
566 $val = ''; | 592 $val = ''; |
567 $tok = $this->scanner->current(); | 593 $tok = $this->scanner->current(); |
568 while (strspn($tok, $stoplist) == 0 && $tok !== false) { | 594 while (false !== $tok) { |
569 if ($tok == '&') { | 595 switch ($tok) { |
570 $val .= $this->decodeCharacterReference(true); | 596 case "\n": |
571 $tok = $this->scanner->current(); | 597 case "\f": |
572 } else { | 598 case ' ': |
573 if (strspn($tok, "\"'<=`") > 0) { | 599 case "\t": |
574 $this->parseError("Unexpected chars in unquoted attribute value %s", $tok); | 600 case '>': |
575 } | 601 break 2; |
576 $val .= $tok; | 602 |
577 $tok = $this->scanner->next(); | 603 case '&': |
578 } | 604 $val .= $this->decodeCharacterReference(true); |
579 } | 605 $tok = $this->scanner->current(); |
606 | |
607 break; | |
608 | |
609 case "'": | |
610 case '"': | |
611 case '<': | |
612 case '=': | |
613 case '`': | |
614 $this->parseError('Unexpected chars in unquoted attribute value %s', $tok); | |
615 $val .= $tok; | |
616 $tok = $this->scanner->next(); | |
617 break; | |
618 | |
619 default: | |
620 $val .= $this->scanner->charsUntil("\t\n\f >&\"'<=`"); | |
621 | |
622 $tok = $this->scanner->current(); | |
623 } | |
624 } | |
625 | |
580 return $val; | 626 return $val; |
581 } | 627 } |
582 | 628 |
583 /** | 629 /** |
584 * Consume malformed markup as if it were a comment. | 630 * Consume malformed markup as if it were a comment. |
585 * 8.2.4.44 | 631 * 8.2.4.44. |
586 * | 632 * |
587 * The spec requires that the ENTIRE tag-like thing be enclosed inside of | 633 * The spec requires that the ENTIRE tag-like thing be enclosed inside of |
588 * the comment. So this will generate comments like: | 634 * the comment. So this will generate comments like: |
589 * | 635 * |
590 * <!--</+foo>--> | 636 * <!--</+foo>--> |
591 * | 637 * |
592 * @param string $leading | 638 * @param string $leading Prepend any leading characters. This essentially |
593 * Prepend any leading characters. This essentially | 639 * negates the need to backtrack, but it's sort of a hack. |
594 * negates the need to backtrack, but it's sort of | 640 * |
595 * a hack. | 641 * @return bool |
596 */ | 642 */ |
597 protected function bogusComment($leading = '') | 643 protected function bogusComment($leading = '') |
598 { | 644 { |
599 $comment = $leading; | 645 $comment = $leading; |
600 $tokens = $this->scanner->charsUntil('>'); | 646 $tokens = $this->scanner->charsUntil('>'); |
601 if ($tokens !== false) { | 647 if (false !== $tokens) { |
602 $comment .= $tokens; | 648 $comment .= $tokens; |
603 } | 649 } |
604 $tok = $this->scanner->current(); | 650 $tok = $this->scanner->current(); |
605 if ($tok !== false) { | 651 if (false !== $tok) { |
606 $comment .= $tok; | 652 $comment .= $tok; |
607 } | 653 } |
608 | 654 |
609 $this->flushBuffer(); | 655 $this->flushBuffer(); |
610 $this->events->comment($comment); | 656 $this->events->comment($comment); |
611 $this->scanner->next(); | 657 $this->scanner->consume(); |
612 | 658 |
613 return true; | 659 return true; |
614 } | 660 } |
615 | 661 |
616 /** | 662 /** |
617 * Read a comment. | 663 * Read a comment. |
618 * | |
619 * Expects the first tok to be inside of the comment. | 664 * Expects the first tok to be inside of the comment. |
665 * | |
666 * @return bool | |
620 */ | 667 */ |
621 protected function comment() | 668 protected function comment() |
622 { | 669 { |
623 $tok = $this->scanner->current(); | 670 $tok = $this->scanner->current(); |
624 $comment = ''; | 671 $comment = ''; |
625 | 672 |
626 // <!-->. Emit an empty comment because 8.2.4.46 says to. | 673 // <!-->. Emit an empty comment because 8.2.4.46 says to. |
627 if ($tok == '>') { | 674 if ('>' == $tok) { |
628 // Parse error. Emit the comment token. | 675 // Parse error. Emit the comment token. |
629 $this->parseError("Expected comment data, got '>'"); | 676 $this->parseError("Expected comment data, got '>'"); |
630 $this->events->comment(''); | 677 $this->events->comment(''); |
631 $this->scanner->next(); | 678 $this->scanner->consume(); |
679 | |
632 return true; | 680 return true; |
633 } | 681 } |
634 | 682 |
635 // Replace NULL with the replacement char. | 683 // Replace NULL with the replacement char. |
636 if ($tok == "\0") { | 684 if ("\0" == $tok) { |
637 $tok = UTF8Utils::FFFD; | 685 $tok = UTF8Utils::FFFD; |
638 } | 686 } |
639 while (! $this->isCommentEnd()) { | 687 while (!$this->isCommentEnd()) { |
640 $comment .= $tok; | 688 $comment .= $tok; |
641 $tok = $this->scanner->next(); | 689 $tok = $this->scanner->next(); |
642 } | 690 } |
643 | 691 |
644 $this->events->comment($comment); | 692 $this->events->comment($comment); |
645 $this->scanner->next(); | 693 $this->scanner->consume(); |
694 | |
646 return true; | 695 return true; |
647 } | 696 } |
648 | 697 |
649 /** | 698 /** |
650 * Check if the scanner has reached the end of a comment. | 699 * Check if the scanner has reached the end of a comment. |
700 * | |
701 * @return bool | |
651 */ | 702 */ |
652 protected function isCommentEnd() | 703 protected function isCommentEnd() |
653 { | 704 { |
654 $tok = $this->scanner->current(); | 705 $tok = $this->scanner->current(); |
655 | 706 |
656 // EOF | 707 // EOF |
657 if ($tok === false) { | 708 if (false === $tok) { |
658 // Hit the end. | 709 // Hit the end. |
659 $this->parseError("Unexpected EOF in a comment."); | 710 $this->parseError('Unexpected EOF in a comment.'); |
711 | |
660 return true; | 712 return true; |
661 } | 713 } |
662 | 714 |
663 // If it doesn't start with -, not the end. | 715 // If it doesn't start with -, not the end. |
664 if ($tok != '-') { | 716 if ('-' != $tok) { |
665 return false; | 717 return false; |
666 } | 718 } |
667 | 719 |
668 // Advance one, and test for '->' | 720 // Advance one, and test for '->' |
669 if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { | 721 if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) { |
670 $this->scanner->next(); // Consume the last '>' | 722 $this->scanner->consume(); // Consume the last '>' |
671 return true; | 723 return true; |
672 } | 724 } |
673 // Unread '-'; | 725 // Unread '-'; |
674 $this->scanner->unconsume(1); | 726 $this->scanner->unconsume(1); |
727 | |
675 return false; | 728 return false; |
676 } | 729 } |
677 | 730 |
678 /** | 731 /** |
679 * Parse a DOCTYPE. | 732 * Parse a DOCTYPE. |
680 * | 733 * |
681 * Parse a DOCTYPE declaration. This method has strong bearing on whether or | 734 * Parse a DOCTYPE declaration. This method has strong bearing on whether or |
682 * not Quirksmode is enabled on the event handler. | 735 * not Quirksmode is enabled on the event handler. |
683 * | 736 * |
684 * @todo This method is a little long. Should probably refactor. | 737 * @todo This method is a little long. Should probably refactor. |
738 * | |
739 * @return bool | |
685 */ | 740 */ |
686 protected function doctype() | 741 protected function doctype() |
687 { | 742 { |
688 if (strcasecmp($this->scanner->current(), 'D')) { | |
689 return false; | |
690 } | |
691 // Check that string is DOCTYPE. | 743 // Check that string is DOCTYPE. |
692 $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); | 744 if ($this->scanner->sequenceMatches('DOCTYPE', false)) { |
693 if (strcasecmp($chars, 'DOCTYPE')) { | 745 $this->scanner->consume(7); |
746 } else { | |
747 $chars = $this->scanner->charsWhile('DOCTYPEdoctype'); | |
694 $this->parseError('Expected DOCTYPE, got %s', $chars); | 748 $this->parseError('Expected DOCTYPE, got %s', $chars); |
749 | |
695 return $this->bogusComment('<!' . $chars); | 750 return $this->bogusComment('<!' . $chars); |
696 } | 751 } |
697 | 752 |
698 $this->scanner->whitespace(); | 753 $this->scanner->whitespace(); |
699 $tok = $this->scanner->current(); | 754 $tok = $this->scanner->current(); |
700 | 755 |
701 // EOF: die. | 756 // EOF: die. |
702 if ($tok === false) { | 757 if (false === $tok) { |
703 $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); | 758 $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); |
704 return $this->eof(); | 759 $this->eof(); |
705 } | 760 |
706 | 761 return true; |
707 $doctypeName = ''; | 762 } |
708 | 763 |
709 // NULL char: convert. | 764 // NULL char: convert. |
710 if ($tok === "\0") { | 765 if ("\0" === $tok) { |
711 $this->parseError("Unexpected null character in DOCTYPE."); | 766 $this->parseError('Unexpected null character in DOCTYPE.'); |
712 $doctypeName .= UTF8::FFFD; | |
713 $tok = $this->scanner->next(); | |
714 } | 767 } |
715 | 768 |
716 $stop = " \n\f>"; | 769 $stop = " \n\f>"; |
717 $doctypeName = $this->scanner->charsUntil($stop); | 770 $doctypeName = $this->scanner->charsUntil($stop); |
718 // Lowercase ASCII, replace \0 with FFFD | 771 // Lowercase ASCII, replace \0 with FFFD |
719 $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); | 772 $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); |
720 | 773 |
721 $tok = $this->scanner->current(); | 774 $tok = $this->scanner->current(); |
722 | 775 |
723 // If false, emit a parse error, DOCTYPE, and return. | 776 // If false, emit a parse error, DOCTYPE, and return. |
724 if ($tok === false) { | 777 if (false === $tok) { |
725 $this->parseError('Unexpected EOF in DOCTYPE declaration.'); | 778 $this->parseError('Unexpected EOF in DOCTYPE declaration.'); |
726 $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); | 779 $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); |
780 | |
727 return true; | 781 return true; |
728 } | 782 } |
729 | 783 |
730 // Short DOCTYPE, like <!DOCTYPE html> | 784 // Short DOCTYPE, like <!DOCTYPE html> |
731 if ($tok == '>') { | 785 if ('>' == $tok) { |
732 // DOCTYPE without a name. | 786 // DOCTYPE without a name. |
733 if (strlen($doctypeName) == 0) { | 787 if (0 == strlen($doctypeName)) { |
734 $this->parseError("Expected a DOCTYPE name. Got nothing."); | 788 $this->parseError('Expected a DOCTYPE name. Got nothing.'); |
735 $this->events->doctype($doctypeName, 0, null, true); | 789 $this->events->doctype($doctypeName, 0, null, true); |
736 $this->scanner->next(); | 790 $this->scanner->consume(); |
791 | |
737 return true; | 792 return true; |
738 } | 793 } |
739 $this->events->doctype($doctypeName); | 794 $this->events->doctype($doctypeName); |
740 $this->scanner->next(); | 795 $this->scanner->consume(); |
796 | |
741 return true; | 797 return true; |
742 } | 798 } |
743 $this->scanner->whitespace(); | 799 $this->scanner->whitespace(); |
744 | 800 |
745 $pub = strtoupper($this->scanner->getAsciiAlpha()); | 801 $pub = strtoupper($this->scanner->getAsciiAlpha()); |
746 $white = strlen($this->scanner->whitespace()); | 802 $white = $this->scanner->whitespace(); |
747 | 803 |
748 // Get ID, and flag it as pub or system. | 804 // Get ID, and flag it as pub or system. |
749 if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { | 805 if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) { |
750 // Get the sys ID. | 806 // Get the sys ID. |
751 $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; | 807 $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; |
752 $id = $this->quotedString("\0>"); | 808 $id = $this->quotedString("\0>"); |
753 if ($id === false) { | 809 if (false === $id) { |
754 $this->events->doctype($doctypeName, $type, $pub, false); | 810 $this->events->doctype($doctypeName, $type, $pub, false); |
755 return false; | 811 |
812 return true; | |
756 } | 813 } |
757 | 814 |
758 // Premature EOF. | 815 // Premature EOF. |
759 if ($this->scanner->current() === false) { | 816 if (false === $this->scanner->current()) { |
760 $this->parseError("Unexpected EOF in DOCTYPE"); | 817 $this->parseError('Unexpected EOF in DOCTYPE'); |
761 $this->events->doctype($doctypeName, $type, $id, true); | 818 $this->events->doctype($doctypeName, $type, $id, true); |
819 | |
762 return true; | 820 return true; |
763 } | 821 } |
764 | 822 |
765 // Well-formed complete DOCTYPE. | 823 // Well-formed complete DOCTYPE. |
766 $this->scanner->whitespace(); | 824 $this->scanner->whitespace(); |
767 if ($this->scanner->current() == '>') { | 825 if ('>' == $this->scanner->current()) { |
768 $this->events->doctype($doctypeName, $type, $id, false); | 826 $this->events->doctype($doctypeName, $type, $id, false); |
769 $this->scanner->next(); | 827 $this->scanner->consume(); |
828 | |
770 return true; | 829 return true; |
771 } | 830 } |
772 | 831 |
773 // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK | 832 // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK |
774 // Throw away the junk, parse error, quirks mode, return true. | 833 // Throw away the junk, parse error, quirks mode, return true. |
775 $this->scanner->charsUntil(">"); | 834 $this->scanner->charsUntil('>'); |
776 $this->parseError("Malformed DOCTYPE."); | 835 $this->parseError('Malformed DOCTYPE.'); |
777 $this->events->doctype($doctypeName, $type, $id, true); | 836 $this->events->doctype($doctypeName, $type, $id, true); |
778 $this->scanner->next(); | 837 $this->scanner->consume(); |
838 | |
779 return true; | 839 return true; |
780 } | 840 } |
781 | 841 |
782 // Else it's a bogus DOCTYPE. | 842 // Else it's a bogus DOCTYPE. |
783 // Consume to > and trash. | 843 // Consume to > and trash. |
784 $this->scanner->charsUntil('>'); | 844 $this->scanner->charsUntil('>'); |
785 | 845 |
786 $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub); | 846 $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub); |
787 $this->events->doctype($doctypeName, 0, null, true); | 847 $this->events->doctype($doctypeName, 0, null, true); |
788 $this->scanner->next(); | 848 $this->scanner->consume(); |
849 | |
789 return true; | 850 return true; |
790 } | 851 } |
791 | 852 |
792 /** | 853 /** |
793 * Utility for reading a quoted string. | 854 * Utility for reading a quoted string. |
794 * | 855 * |
795 * @param string $stopchars | 856 * @param string $stopchars Characters (in addition to a close-quote) that should stop the string. |
796 * Characters (in addition to a close-quote) that should stop the string. | 857 * E.g. sometimes '>' is higher precedence than '"' or "'". |
797 * E.g. sometimes '>' is higher precedence than '"' or "'". | 858 * |
798 * @return mixed String if one is found (quotations omitted) | 859 * @return mixed String if one is found (quotations omitted). |
799 */ | 860 */ |
800 protected function quotedString($stopchars) | 861 protected function quotedString($stopchars) |
801 { | 862 { |
802 $tok = $this->scanner->current(); | 863 $tok = $this->scanner->current(); |
803 if ($tok == '"' || $tok == "'") { | 864 if ('"' == $tok || "'" == $tok) { |
804 $this->scanner->next(); | 865 $this->scanner->consume(); |
805 $ret = $this->scanner->charsUntil($tok . $stopchars); | 866 $ret = $this->scanner->charsUntil($tok . $stopchars); |
806 if ($this->scanner->current() == $tok) { | 867 if ($this->scanner->current() == $tok) { |
807 $this->scanner->next(); | 868 $this->scanner->consume(); |
808 } else { | 869 } else { |
809 // Parse error because no close quote. | 870 // Parse error because no close quote. |
810 $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); | 871 $this->parseError('Expected %s, got %s', $tok, $this->scanner->current()); |
811 } | 872 } |
873 | |
812 return $ret; | 874 return $ret; |
813 } | 875 } |
876 | |
814 return false; | 877 return false; |
815 } | 878 } |
816 | 879 |
817 /** | 880 /** |
818 * Handle a CDATA section. | 881 * Handle a CDATA section. |
882 * | |
883 * @return bool | |
819 */ | 884 */ |
820 protected function cdataSection() | 885 protected function cdataSection() |
821 { | 886 { |
822 if ($this->scanner->current() != '[') { | |
823 return false; | |
824 } | |
825 $cdata = ''; | 887 $cdata = ''; |
826 $this->scanner->next(); | 888 $this->scanner->consume(); |
827 | 889 |
828 $chars = $this->scanner->charsWhile('CDAT'); | 890 $chars = $this->scanner->charsWhile('CDAT'); |
829 if ($chars != 'CDATA' || $this->scanner->current() != '[') { | 891 if ('CDATA' != $chars || '[' != $this->scanner->current()) { |
830 $this->parseError('Expected [CDATA[, got %s', $chars); | 892 $this->parseError('Expected [CDATA[, got %s', $chars); |
893 | |
831 return $this->bogusComment('<![' . $chars); | 894 return $this->bogusComment('<![' . $chars); |
832 } | 895 } |
833 | 896 |
834 $tok = $this->scanner->next(); | 897 $tok = $this->scanner->next(); |
835 do { | 898 do { |
836 if ($tok === false) { | 899 if (false === $tok) { |
837 $this->parseError('Unexpected EOF inside CDATA.'); | 900 $this->parseError('Unexpected EOF inside CDATA.'); |
838 $this->bogusComment('<![CDATA[' . $cdata); | 901 $this->bogusComment('<![CDATA[' . $cdata); |
902 | |
839 return true; | 903 return true; |
840 } | 904 } |
841 $cdata .= $tok; | 905 $cdata .= $tok; |
842 $tok = $this->scanner->next(); | 906 $tok = $this->scanner->next(); |
843 } while (! $this->sequenceMatches(']]>')); | 907 } while (!$this->scanner->sequenceMatches(']]>')); |
844 | 908 |
845 // Consume ]]> | 909 // Consume ]]> |
846 $this->scanner->consume(3); | 910 $this->scanner->consume(3); |
847 | 911 |
848 $this->events->cdata($cdata); | 912 $this->events->cdata($cdata); |
913 | |
849 return true; | 914 return true; |
850 } | 915 } |
851 | 916 |
852 // ================================================================ | 917 // ================================================================ |
853 // Non-HTML5 | 918 // Non-HTML5 |
854 // ================================================================ | 919 // ================================================================ |
920 | |
855 /** | 921 /** |
856 * Handle a processing instruction. | 922 * Handle a processing instruction. |
857 * | 923 * |
858 * XML processing instructions are supposed to be ignored in HTML5, | 924 * XML processing instructions are supposed to be ignored in HTML5, |
859 * treated as "bogus comments". However, since we're not a user | 925 * treated as "bogus comments". However, since we're not a user |
860 * agent, we allow them. We consume until ?> and then issue a | 926 * agent, we allow them. We consume until ?> and then issue a |
861 * EventListener::processingInstruction() event. | 927 * EventListener::processingInstruction() event. |
928 * | |
929 * @return bool | |
862 */ | 930 */ |
863 protected function processingInstruction() | 931 protected function processingInstruction() |
864 { | 932 { |
865 if ($this->scanner->current() != '?') { | 933 if ('?' != $this->scanner->current()) { |
866 return false; | 934 return false; |
867 } | 935 } |
868 | 936 |
869 $tok = $this->scanner->next(); | 937 $tok = $this->scanner->next(); |
870 $procName = $this->scanner->getAsciiAlpha(); | 938 $procName = $this->scanner->getAsciiAlpha(); |
871 $white = strlen($this->scanner->whitespace()); | 939 $white = $this->scanner->whitespace(); |
872 | 940 |
873 // If not a PI, send to bogusComment. | 941 // If not a PI, send to bogusComment. |
874 if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) { | 942 if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) { |
875 $this->parseError("Expected processing instruction name, got $tok"); | 943 $this->parseError("Expected processing instruction name, got $tok"); |
876 $this->bogusComment('<?' . $tok . $procName); | 944 $this->bogusComment('<?' . $tok . $procName); |
945 | |
877 return true; | 946 return true; |
878 } | 947 } |
879 | 948 |
880 $data = ''; | 949 $data = ''; |
881 // As long as it's not the case that the next two chars are ? and >. | 950 // As long as it's not the case that the next two chars are ? and >. |
882 while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) { | 951 while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) { |
883 $data .= $this->scanner->current(); | 952 $data .= $this->scanner->current(); |
884 | 953 |
885 $tok = $this->scanner->next(); | 954 $tok = $this->scanner->next(); |
886 if ($tok === false) { | 955 if (false === $tok) { |
887 $this->parseError("Unexpected EOF in processing instruction."); | 956 $this->parseError('Unexpected EOF in processing instruction.'); |
888 $this->events->processingInstruction($procName, $data); | 957 $this->events->processingInstruction($procName, $data); |
958 | |
889 return true; | 959 return true; |
890 } | 960 } |
891 } | 961 } |
892 | 962 |
893 $this->scanner->next(); // > | 963 $this->scanner->consume(2); // Consume the closing tag |
894 $this->scanner->next(); // Next token. | |
895 $this->events->processingInstruction($procName, $data); | 964 $this->events->processingInstruction($procName, $data); |
965 | |
896 return true; | 966 return true; |
897 } | 967 } |
898 | 968 |
899 // ================================================================ | 969 // ================================================================ |
900 // UTILITY FUNCTIONS | 970 // UTILITY FUNCTIONS |
901 // ================================================================ | 971 // ================================================================ |
902 | 972 |
903 /** | 973 /** |
904 * Read from the input stream until we get to the desired sequene | 974 * Read from the input stream until we get to the desired sequene |
905 * or hit the end of the input stream. | 975 * or hit the end of the input stream. |
976 * | |
977 * @param string $sequence | |
978 * | |
979 * @return string | |
906 */ | 980 */ |
907 protected function readUntilSequence($sequence) | 981 protected function readUntilSequence($sequence) |
908 { | 982 { |
909 $buffer = ''; | 983 $buffer = ''; |
910 | 984 |
911 // Optimization for reading larger blocks faster. | 985 // Optimization for reading larger blocks faster. |
912 $first = substr($sequence, 0, 1); | 986 $first = substr($sequence, 0, 1); |
913 while ($this->scanner->current() !== false) { | 987 while (false !== $this->scanner->current()) { |
914 $buffer .= $this->scanner->charsUntil($first); | 988 $buffer .= $this->scanner->charsUntil($first); |
915 | 989 |
916 // Stop as soon as we hit the stopping condition. | 990 // Stop as soon as we hit the stopping condition. |
917 if ($this->sequenceMatches($sequence, false)) { | 991 if ($this->scanner->sequenceMatches($sequence, false)) { |
918 return $buffer; | 992 return $buffer; |
919 } | 993 } |
920 $buffer .= $this->scanner->current(); | 994 $buffer .= $this->scanner->current(); |
921 $this->scanner->next(); | 995 $this->scanner->consume(); |
922 } | 996 } |
923 | 997 |
924 // If we get here, we hit the EOF. | 998 // If we get here, we hit the EOF. |
925 $this->parseError("Unexpected EOF during text read."); | 999 $this->parseError('Unexpected EOF during text read.'); |
1000 | |
926 return $buffer; | 1001 return $buffer; |
927 } | 1002 } |
928 | 1003 |
929 /** | 1004 /** |
930 * Check if upcomming chars match the given sequence. | 1005 * Check if upcomming chars match the given sequence. |
933 * found, this will return true. If not, return false. | 1008 * found, this will return true. If not, return false. |
934 * Since this unconsumes any chars it reads, the caller | 1009 * Since this unconsumes any chars it reads, the caller |
935 * will still need to read the next sequence, even if | 1010 * will still need to read the next sequence, even if |
936 * this returns true. | 1011 * this returns true. |
937 * | 1012 * |
938 * Example: $this->sequenceMatches('</script>') will | 1013 * Example: $this->scanner->sequenceMatches('</script>') will |
939 * see if the input stream is at the start of a | 1014 * see if the input stream is at the start of a |
940 * '</script>' string. | 1015 * '</script>' string. |
1016 * | |
1017 * @param string $sequence | |
1018 * @param bool $caseSensitive | |
1019 * | |
1020 * @return bool | |
941 */ | 1021 */ |
942 protected function sequenceMatches($sequence, $caseSensitive = true) | 1022 protected function sequenceMatches($sequence, $caseSensitive = true) |
943 { | 1023 { |
944 $len = strlen($sequence); | 1024 @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED); |
945 $buffer = ''; | 1025 |
946 for ($i = 0; $i < $len; ++ $i) { | 1026 return $this->scanner->sequenceMatches($sequence, $caseSensitive); |
947 $tok = $this->scanner->current(); | |
948 $buffer .= $tok; | |
949 | |
950 // EOF. Rewind and let the caller handle it. | |
951 if ($tok === false) { | |
952 $this->scanner->unconsume($i); | |
953 return false; | |
954 } | |
955 $this->scanner->next(); | |
956 } | |
957 | |
958 $this->scanner->unconsume($len); | |
959 return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0; | |
960 } | 1027 } |
961 | 1028 |
962 /** | 1029 /** |
963 * Send a TEXT event with the contents of the text buffer. | 1030 * Send a TEXT event with the contents of the text buffer. |
964 * | 1031 * |
966 * temporary text buffer. (The buffer is used to group as much PCDATA | 1033 * temporary text buffer. (The buffer is used to group as much PCDATA |
967 * as we can instead of emitting lots and lots of TEXT events.) | 1034 * as we can instead of emitting lots and lots of TEXT events.) |
968 */ | 1035 */ |
969 protected function flushBuffer() | 1036 protected function flushBuffer() |
970 { | 1037 { |
971 if ($this->text === '') { | 1038 if ('' === $this->text) { |
972 return; | 1039 return; |
973 } | 1040 } |
974 $this->events->text($this->text); | 1041 $this->events->text($this->text); |
975 $this->text = ''; | 1042 $this->text = ''; |
976 } | 1043 } |
977 | 1044 |
978 /** | 1045 /** |
979 * Add text to the temporary buffer. | 1046 * Add text to the temporary buffer. |
980 * | 1047 * |
981 * @see flushBuffer() | 1048 * @see flushBuffer() |
1049 * | |
1050 * @param string $str | |
982 */ | 1051 */ |
983 protected function buffer($str) | 1052 protected function buffer($str) |
984 { | 1053 { |
985 $this->text .= $str; | 1054 $this->text .= $str; |
986 } | 1055 } |
988 /** | 1057 /** |
989 * Emit a parse error. | 1058 * Emit a parse error. |
990 * | 1059 * |
991 * A parse error always returns false because it never consumes any | 1060 * A parse error always returns false because it never consumes any |
992 * characters. | 1061 * characters. |
1062 * | |
1063 * @param string $msg | |
1064 * | |
1065 * @return string | |
993 */ | 1066 */ |
994 protected function parseError($msg) | 1067 protected function parseError($msg) |
995 { | 1068 { |
996 $args = func_get_args(); | 1069 $args = func_get_args(); |
997 | 1070 |
1001 } | 1074 } |
1002 | 1075 |
1003 $line = $this->scanner->currentLine(); | 1076 $line = $this->scanner->currentLine(); |
1004 $col = $this->scanner->columnOffset(); | 1077 $col = $this->scanner->columnOffset(); |
1005 $this->events->parseError($msg, $line, $col); | 1078 $this->events->parseError($msg, $line, $col); |
1079 | |
1006 return false; | 1080 return false; |
1007 } | 1081 } |
1008 | 1082 |
1009 /** | 1083 /** |
1010 * Decode a character reference and return the string. | 1084 * Decode a character reference and return the string. |
1011 * | 1085 * |
1012 * Returns false if the entity could not be found. If $inAttribute is set | 1086 * If $inAttribute is set to true, a bare & will be returned as-is. |
1013 * to true, a bare & will be returned as-is. | 1087 * |
1014 * | 1088 * @param bool $inAttribute Set to true if the text is inside of an attribute value. |
1015 * @param boolean $inAttribute | 1089 * false otherwise. |
1016 * Set to true if the text is inside of an attribute value. | 1090 * |
1017 * false otherwise. | 1091 * @return string |
1018 */ | 1092 */ |
1019 protected function decodeCharacterReference($inAttribute = false) | 1093 protected function decodeCharacterReference($inAttribute = false) |
1020 { | 1094 { |
1021 | |
1022 // If it fails this, it's definitely not an entity. | |
1023 if ($this->scanner->current() != '&') { | |
1024 return false; | |
1025 } | |
1026 | |
1027 // Next char after &. | 1095 // Next char after &. |
1028 $tok = $this->scanner->next(); | 1096 $tok = $this->scanner->next(); |
1029 $entity = ''; | |
1030 $start = $this->scanner->position(); | 1097 $start = $this->scanner->position(); |
1031 | 1098 |
1032 if ($tok == false) { | 1099 if (false === $tok) { |
1033 return '&'; | 1100 return '&'; |
1034 } | 1101 } |
1035 | 1102 |
1036 // These indicate not an entity. We return just | 1103 // These indicate not an entity. We return just |
1037 // the &. | 1104 // the &. |
1038 if (strspn($tok, static::WHITE . "&<") == 1) { | 1105 if ("\t" === $tok || "\n" === $tok || "\f" === $tok || ' ' === $tok || '&' === $tok || '<' === $tok) { |
1039 // $this->scanner->next(); | 1106 // $this->scanner->next(); |
1040 return '&'; | 1107 return '&'; |
1041 } | 1108 } |
1042 | 1109 |
1043 // Numeric entity | 1110 // Numeric entity |
1044 if ($tok == '#') { | 1111 if ('#' === $tok) { |
1045 $tok = $this->scanner->next(); | 1112 $tok = $this->scanner->next(); |
1046 | 1113 |
1047 // Hexidecimal encoding. | 1114 // Hexidecimal encoding. |
1048 // X[0-9a-fA-F]+; | 1115 // X[0-9a-fA-F]+; |
1049 // x[0-9a-fA-F]+; | 1116 // x[0-9a-fA-F]+; |
1050 if ($tok == 'x' || $tok == 'X') { | 1117 if ('x' === $tok || 'X' === $tok) { |
1051 $tok = $this->scanner->next(); // Consume x | 1118 $tok = $this->scanner->next(); // Consume x |
1052 | 1119 |
1053 // Convert from hex code to char. | 1120 // Convert from hex code to char. |
1054 $hex = $this->scanner->getHex(); | 1121 $hex = $this->scanner->getHex(); |
1055 if (empty($hex)) { | 1122 if (empty($hex)) { |
1056 $this->parseError("Expected &#xHEX;, got &#x%s", $tok); | 1123 $this->parseError('Expected &#xHEX;, got &#x%s', $tok); |
1057 // We unconsume because we don't know what parser rules might | 1124 // We unconsume because we don't know what parser rules might |
1058 // be in effect for the remaining chars. For example. '&#>' | 1125 // be in effect for the remaining chars. For example. '&#>' |
1059 // might result in a specific parsing rule inside of tag | 1126 // might result in a specific parsing rule inside of tag |
1060 // contexts, while not inside of pcdata context. | 1127 // contexts, while not inside of pcdata context. |
1061 $this->scanner->unconsume(2); | 1128 $this->scanner->unconsume(2); |
1129 | |
1062 return '&'; | 1130 return '&'; |
1063 } | 1131 } |
1064 $entity = CharacterReference::lookupHex($hex); | 1132 $entity = CharacterReference::lookupHex($hex); |
1065 } // Decimal encoding. | 1133 } // Decimal encoding. |
1066 // [0-9]+; | 1134 // [0-9]+; |
1067 else { | 1135 else { |
1068 // Convert from decimal to char. | 1136 // Convert from decimal to char. |
1069 $numeric = $this->scanner->getNumeric(); | 1137 $numeric = $this->scanner->getNumeric(); |
1070 if ($numeric === false) { | 1138 if (false === $numeric) { |
1071 $this->parseError("Expected &#DIGITS;, got &#%s", $tok); | 1139 $this->parseError('Expected &#DIGITS;, got &#%s', $tok); |
1072 $this->scanner->unconsume(2); | 1140 $this->scanner->unconsume(2); |
1141 | |
1073 return '&'; | 1142 return '&'; |
1074 } | 1143 } |
1075 $entity = CharacterReference::lookupDecimal($numeric); | 1144 $entity = CharacterReference::lookupDecimal($numeric); |
1076 } | 1145 } |
1077 } elseif ($tok === '=' && $inAttribute) { | 1146 } elseif ('=' === $tok && $inAttribute) { |
1078 return '&'; | 1147 return '&'; |
1079 } else { // String entity. | 1148 } else { // String entity. |
1080 | |
1081 // Attempt to consume a string up to a ';'. | 1149 // Attempt to consume a string up to a ';'. |
1082 // [a-zA-Z0-9]+; | 1150 // [a-zA-Z0-9]+; |
1083 $cname = $this->scanner->getAsciiAlphaNum(); | 1151 $cname = $this->scanner->getAsciiAlphaNum(); |
1084 $entity = CharacterReference::lookupName($cname); | 1152 $entity = CharacterReference::lookupName($cname); |
1085 | 1153 |
1086 // When no entity is found provide the name of the unmatched string | 1154 // When no entity is found provide the name of the unmatched string |
1087 // and continue on as the & is not part of an entity. The & will | 1155 // and continue on as the & is not part of an entity. The & will |
1088 // be converted to &amp; elsewhere. | 1156 // be converted to &amp; elsewhere. |
1089 if ($entity == null) { | 1157 if (null === $entity) { |
1090 if (!$inAttribute || strlen($cname) === 0) { | 1158 if (!$inAttribute || '' === $cname) { |
1091 $this->parseError("No match in entity table for '%s'", $cname); | 1159 $this->parseError("No match in entity table for '%s'", $cname); |
1092 } | 1160 } |
1093 $this->scanner->unconsume($this->scanner->position() - $start); | 1161 $this->scanner->unconsume($this->scanner->position() - $start); |
1162 | |
1094 return '&'; | 1163 return '&'; |
1095 } | 1164 } |
1096 } | 1165 } |
1097 | 1166 |
1098 // The scanner has advanced the cursor for us. | 1167 // The scanner has advanced the cursor for us. |
1099 $tok = $this->scanner->current(); | 1168 $tok = $this->scanner->current(); |
1100 | 1169 |
1101 // We have an entity. We're done here. | 1170 // We have an entity. We're done here. |
1102 if ($tok == ';') { | 1171 if (';' === $tok) { |
1103 $this->scanner->next(); | 1172 $this->scanner->consume(); |
1173 | |
1104 return $entity; | 1174 return $entity; |
1105 } | 1175 } |
1106 | 1176 |
1107 // If in an attribute, then failing to match ; means unconsume the | 1177 // If in an attribute, then failing to match ; means unconsume the |
1108 // entire string. Otherwise, failure to match is an error. | 1178 // entire string. Otherwise, failure to match is an error. |
1109 if ($inAttribute) { | 1179 if ($inAttribute) { |
1110 $this->scanner->unconsume($this->scanner->position() - $start); | 1180 $this->scanner->unconsume($this->scanner->position() - $start); |
1181 | |
1111 return '&'; | 1182 return '&'; |
1112 } | 1183 } |
1113 | 1184 |
1114 $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok); | 1185 $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok); |
1186 | |
1115 return '&' . $entity; | 1187 return '&' . $entity; |
1116 } | 1188 } |
1117 } | 1189 } |