Mercurial > hg > isophonics-drupal-site
comparison vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 129ea1e6d783 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4c8ae668cc8c |
---|---|
1 <?php | |
2 namespace Masterminds\HTML5\Parser; | |
3 | |
4 use Masterminds\HTML5\Elements; | |
5 | |
6 /** | |
7 * The HTML5 tokenizer. | |
8 * | |
9 * The tokenizer's role is reading data from the scanner and gathering it into | |
10 * semantic units. From the tokenizer, data is emitted to an event handler, | |
11 * which may (for example) create a DOM tree. | |
12 * | |
13 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We | |
14 * follow that specification to the maximum extent that we can. If you find | |
15 * a discrepancy that is not documented, please file a bug and/or submit a | |
16 * patch. | |
17 * | |
18 * This tokenizer is implemented as a recursive descent parser. | |
19 * | |
20 * Within the API documentation, you may see references to the specific section | |
21 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1. | |
22 * This refers to section 8.2.4.1 of the HTML5 CR specification. | |
23 * | |
24 * @see http://www.w3.org/TR/2012/CR-html5-20121217/ | |
25 */ | |
26 class Tokenizer | |
27 { | |
28 | |
// Input scanner the tokenizer reads characters from (set by the constructor).
protected $scanner;

// Event handler that receives tokens and parse errors (set by the constructor).
protected $events;

// NOTE(review): no method visible in this part of the file reads or writes $tok — confirm it is still needed.
protected $tok;

/**
 * Buffer for text.
 */
protected $text = '';

// When this goes to false, the parser stops.
protected $carryOn = true;

// Current text mode; setTextMode() masks it against Elements::TEXT_RAW | Elements::TEXT_RCDATA.
protected $textMode = 0; // TEXTMODE_NORMAL;
// Tag name whose end tag terminates RAW/RCDATA reading; null when none is set.
protected $untilTag = null;

const CONFORMANT_XML = 'xml';
const CONFORMANT_HTML = 'html';
// Conformance mode: tag names keep their case in CONFORMANT_XML and are lowercased otherwise.
protected $mode = self::CONFORMANT_HTML;

// Whitespace characters: tab, line feed, form feed and space.
const WHITE = "\t\n\f ";
51 | |
/**
 * Create a new tokenizer.
 *
 * Typically, parsing a document involves creating a new tokenizer, giving
 * it a scanner (input) and an event handler (output), and then calling
 * the Tokenizer::parse() method.
 *
 * @param \Masterminds\HTML5\Parser\Scanner $scanner
 *            A scanner initialized with an input stream.
 * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
 *            An event handler, initialized and ready to receive
 *            events.
 * @param string $mode
 *            One of self::CONFORMANT_HTML or self::CONFORMANT_XML.
 */
public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
{
    $this->mode = $mode;
    $this->events = $eventHandler;
    $this->scanner = $scanner;
}
72 | |
/**
 * Begin parsing.
 *
 * Repeatedly calls consumeData() until the carryOn flag is cleared.
 * Tokens and parse errors are emitted into the event handler as they
 * are produced; errors do not stop the loop by themselves.
 */
public function parse()
{
    // FIXME: Add infinite loop protection.
    while (true) {
        $this->consumeData();
        if (! $this->carryOn) {
            break;
        }
    }
}
91 | |
/**
 * Set the text mode for the character data reader.
 *
 * HTML5 defines three different modes for reading text:
 * - Normal: Read until a tag is encountered.
 * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
 *   special characters.
 * - Raw: Read until a special closing tag is encountered (viz. pre, script)
 *
 * Normally the mode is set from the return value of the event handler's
 * startTag(), but it can also be set manually through this method.
 *
 * @param integer $textmode
 *            One of Elements::TEXT_*. Bits other than TEXT_RAW and
 *            TEXT_RCDATA are discarded.
 * @param string $untilTag
 *            The tag that should stop RAW or RCDATA mode. Normal mode does not
 *            use this indicator.
 */
public function setTextMode($textmode, $untilTag = null)
{
    $supportedModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

    $this->textMode = $textmode & $supportedModes;
    $this->untilTag = $untilTag;
}
117 | |
/**
 * Consume a character and make a move.
 * HTML5 8.2.4.1
 *
 * Every step is attempted on each call, in this fixed order; a step that
 * does not apply at the current position simply returns without consuming.
 *
 * @return boolean The carryOn flag: false once parsing should stop.
 */
protected function consumeData()
{
    $steps = array('characterReference', 'tagOpen', 'eof', 'characterData');
    foreach ($steps as $step) {
        $this->$step();
    }

    return $this->carryOn;
}
135 | |
/**
 * Parse anything that looks like character data.
 *
 * Dispatches on the current text mode: RAW and RCDATA have dedicated
 * readers; in any other mode a single character is buffered unless it
 * is '<' or '&'.
 *
 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
 *
 * @return boolean false at EOF or when the character is '<' or '&';
 *         otherwise whatever the selected reader returns.
 */
protected function characterData()
{
    $current = $this->scanner->current();
    if ($current === false) {
        return false;
    }

    if ($this->textMode == Elements::TEXT_RAW) {
        return $this->rawText();
    }
    if ($this->textMode == Elements::TEXT_RCDATA) {
        return $this->rcdata();
    }

    // '<' and '&' are left for tagOpen() and characterReference().
    if (strspn($current, "<&") > 0) {
        return false;
    }

    return $this->text();
}
161 | |
/**
 * This buffers the current token as character data.
 *
 * A NUL character is reported as a parse error but is still buffered.
 *
 * @return boolean false at EOF, true once the character has been
 *         buffered and the scanner advanced.
 */
protected function text()
{
    $char = $this->scanner->current();

    // This should never happen...
    if (false === $char) {
        return false;
    }

    if ("\00" === $char) {
        $this->parseError("Received null character.");
    }

    $this->buffer($char);
    $this->scanner->next();

    return true;
}
182 | |
/**
 * Read text in RAW mode.
 *
 * Without a terminating tag name this degrades to text(). Otherwise
 * everything up to '</untilTag>' is emitted as a single text event and
 * the text mode is reset to normal.
 *
 * @return boolean The result of text() or endTag().
 */
protected function rawText()
{
    if ($this->untilTag === null) {
        return $this->text();
    }

    $closing = '</' . $this->untilTag . '>';
    $this->events->text($this->readUntilSequence($closing));
    $this->setTextMode(0);

    return $this->endTag();
}
197 | |
/**
 * Read text in RCDATA mode.
 *
 * Without a terminating tag name this degrades to text(). Otherwise text
 * is collected, with character references decoded, until '</untilTag' or
 * EOF is reached. The end tag itself is only peeked at: it is consumed to
 * check for a closing '>' and then pushed back.
 *
 * @return boolean The result of text() or endTag().
 */
protected function rcdata()
{
    if ($this->untilTag === null) {
        return $this->text();
    }

    $closing = '</' . $this->untilTag;
    $collected = '';
    $tok = $this->scanner->current();

    // Tags unknown to HTML5 are matched case-sensitively.
    $caseSensitive = ! Elements::isHtml5Element($this->untilTag);

    while ($tok !== false) {
        if ($tok == '<' && $this->sequenceMatches($closing, $caseSensitive)) {
            break;
        }

        if ($tok == '&') {
            $collected .= $this->decodeCharacterReference();
            $tok = $this->scanner->current();
            continue;
        }

        $collected .= $tok;
        $tok = $this->scanner->next();
    }

    // Look past the end tag name (and any whitespace) for the '>'.
    $consumed = strlen($closing);
    $this->scanner->consume($consumed);
    $consumed += strlen($this->scanner->whitespace());
    if ('>' !== $this->scanner->current()) {
        $this->parseError("Unclosed RCDATA end tag");
    }
    // Rewind what was consumed for the check.
    $this->scanner->unconsume($consumed);

    $this->events->text($collected);
    $this->setTextMode(0);

    return $this->endTag();
}
231 | |
/**
 * If the document is read, emit an EOF event.
 *
 * At EOF any buffered text is flushed first, the eof event is emitted
 * and the carryOn flag is cleared.
 *
 * @return boolean true at EOF, false otherwise.
 */
protected function eof()
{
    if ($this->scanner->current() !== false) {
        return false;
    }

    $this->flushBuffer();
    $this->events->eof();
    $this->carryOn = false;

    return true;
}
246 | |
/**
 * Handle character references (aka entities).
 *
 * This version is specific to PCDATA, as it buffers data into the
 * text buffer. For a generic version, see decodeCharacterReference().
 *
 * HTML5 8.2.4.2
 *
 * @return boolean true when a reference was decoded and buffered,
 *         false when decodeCharacterReference() returned false.
 */
protected function characterReference()
{
    $decoded = $this->decodeCharacterReference();
    if ($decoded === false) {
        return false;
    }

    $this->buffer($decoded);

    return true;
}
264 | |
/**
 * Emit a tagStart event on encountering a tag.
 *
 * 8.2.4.8
 *
 * After the '<' the handlers are tried in order: markup declaration,
 * end tag, processing instruction, start tag. If none accepts the
 * input a parse error is raised and the character is handed to
 * characterData().
 *
 * @return boolean false when not positioned at '<'; otherwise whether
 *         one of the handlers consumed the input.
 */
protected function tagOpen()
{
    if ($this->scanner->current() != '<') {
        return false;
    }

    // Any buffered text data can go out now.
    $this->flushBuffer();

    $this->scanner->next();

    $handlers = array('markupDeclaration', 'endTag', 'processingInstruction', 'tagName');
    foreach ($handlers as $handler) {
        if ($this->$handler()) {
            return true;
        }
    }

    /* parseError() always returns false. */
    return $this->parseError("Illegal tag opening") || $this->characterData();
}
285 | |
/**
 * Look for markup.
 *
 * Handles what follows '<!': a comment ('--'), a DOCTYPE ('D' or 'd')
 * or a CDATA section ('['). Anything else is reported as a parse error
 * and consumed as a bogus comment.
 *
 * @return boolean false when not positioned at '!'; otherwise the result
 *         of the selected handler, or true for the bogus-comment fallback.
 */
protected function markupDeclaration()
{
    if ('!' != $this->scanner->current()) {
        return false;
    }

    $tok = $this->scanner->next();

    // Comment:
    if ($tok == '-' && $this->scanner->peek() == '-') {
        $this->scanner->next(); // Consume the other '-'
        $this->scanner->next(); // Next char.

        return $this->comment();
    }

    // Doctype
    if ($tok == 'D' || $tok == 'd') {
        return $this->doctype();
    }

    // CDATA section
    if ($tok == '[') {
        return $this->cdataSection();
    }

    // FINISH
    $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok);
    $this->bogusComment('<!');

    return true;
}
317 | |
/**
 * Consume an end tag.
 * 8.2.4.9
 *
 * Expects the scanner to be positioned on the '/' of '</name>'. The tag
 * name is lowercased unless the tokenizer runs in CONFORMANT_XML mode.
 *
 * @return boolean false when not positioned at '/', or when the character
 *         after '/' is NUL or EOF; otherwise true (an endTag event or a
 *         bogus comment was emitted).
 */
protected function endTag()
{
    if ($this->scanner->current() != '/') {
        return false;
    }
    $tok = $this->scanner->next();

    // a-zA-Z -> tagname
    // > -> parse error
    // EOF -> parse error
    // -> parse error
    //
    // The scanner yields false at EOF; ctype_alpha() must only be given
    // strings (non-string arguments are deprecated as of PHP 8.1).
    if (! is_string($tok) || ! ctype_alpha($tok)) {
        $this->parseError("Expected tag name, got '%s'", $tok);
        if ($tok === false || $tok === "\0") {
            return false;
        }
        return $this->bogusComment('</');
    }

    $name = $this->scanner->charsUntil("\n\f \t>");
    $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
    // Trash whitespace.
    $this->scanner->whitespace();

    if ($this->scanner->current() != '>') {
        $this->parseError("Expected >, got '%s'", $this->scanner->current());
        // We just trash stuff until we get to the next tag close.
        $this->scanner->charsUntil('>');
    }

    $this->events->endTag($name);
    $this->scanner->next();
    return true;
}
356 | |
/**
 * Consume a tag name and body.
 * 8.2.4.10
 *
 * Reads the tag name and its attributes, then emits a startTag event.
 * For a self-closing tag an endTag event follows immediately. Otherwise,
 * if the handler returns an integer, it is used as the new text mode
 * with this tag as the terminating tag.
 *
 * @return boolean false when the current character is not an ASCII
 *         letter (including EOF), true once the tag has been consumed.
 */
protected function tagName()
{
    $tok = $this->scanner->current();
    // The scanner yields false at EOF; ctype_alpha() must only be given
    // strings (non-string arguments are deprecated as of PHP 8.1).
    if (! is_string($tok) || ! ctype_alpha($tok)) {
        return false;
    }

    // We know this is at least one char.
    $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
    $attributes = array();
    $selfClose = false;

    // Handle attribute parse exceptions here so that we can
    // react by trying to build a sensible parse tree.
    try {
        do {
            $this->scanner->whitespace();
            $this->attribute($attributes);
        } while (! $this->isTagEnd($selfClose));
    } catch (ParseError $e) {
        $selfClose = false;
    }

    $mode = $this->events->startTag($name, $attributes, $selfClose);
    // Should we do this? What does this buy that selfClose doesn't?
    if ($selfClose) {
        $this->events->endTag($name);
    } elseif (is_int($mode)) {
        // The handler asked for a different text mode for this element's content.
        $this->setTextMode($mode, $name);
    }

    $this->scanner->next();

    return true;
}
398 | |
/**
 * Check if the scanner has reached the end of a tag.
 *
 * A '/' is consumed together with any whitespace after it; if '>'
 * follows, the tag is flagged as self-closing.
 *
 * @param boolean $selfClose
 *            Set to true (by reference) when '/>' is found.
 * @return boolean true at '>' or EOF (EOF also raises a parse error),
 *         false when more tag content follows.
 */
protected function isTagEnd(&$selfClose)
{
    $tok = $this->scanner->current();

    $afterSlash = ($tok == '/');
    if ($afterSlash) {
        $this->scanner->next();
        $this->scanner->whitespace();
        $tok = $this->scanner->current();
    }

    if ($tok == '>') {
        if ($afterSlash) {
            $selfClose = true;
        }

        return true;
    }

    if ($tok === false) {
        $this->parseError("Unexpected EOF inside of tag.");

        return true;
    }

    if ($afterSlash) {
        // Basically, we skip the / token and go on.
        // See 8.2.4.43.
        $this->parseError("Unexpected '%s' inside of a tag.", $tok);
    }

    return false;
}
434 | |
/**
 * Parse attributes from inside of a tag.
 *
 * Reads one attribute (name and optional value) at the scanner position.
 * The attribute is stored in $attributes only when its name passes the
 * character checks below; the value is consumed either way.
 *
 * @param array $attributes
 *            Map of attribute name to value, extended by reference.
 * @return boolean false when positioned at '/', '>' or EOF; true once an
 *         attribute has been consumed.
 * @throws ParseError When '<' is found where an attribute name was expected.
 */
protected function attribute(&$attributes)
{
    $first = $this->scanner->current();
    if ($first === false || $first == '>' || $first == '/') {
        return false;
    }

    if ($first == '<') {
        $this->parseError("Unexepcted '<' inside of attributes list.");
        // Push the < back onto the stack.
        $this->scanner->unconsume();
        // Let the caller figure out how to handle this.
        throw new ParseError("Start tag inside of attribute.");
    }

    $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

    if ($name === '') {
        $this->parseError("Expected an attribute name, got %s.", $this->scanner->current());
        // Really, only '=' can be the char here. Everything else gets absorbed
        // under one rule or another.
        $name = $this->scanner->current();
        $this->scanner->next();
    }

    // Attribute names can contain most Unicode characters for HTML5, and
    // there is no restriction on the first character. But
    // "DOMElement::setAttribute" throws for the characters matched below,
    // so such names have to be filtered out.
    // see issue #23: https://github.com/Masterminds/html5-php/issues/23
    // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
    $rejectedAnywhere = "/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u";
    $rejectedAtStart = "/^[0-9.-]/u";

    $isValidAttribute = true;
    if (preg_match($rejectedAnywhere, $name)) {
        $this->parseError("Unexpected characters in attribute name: %s", $name);
        $isValidAttribute = false;
    } elseif (preg_match($rejectedAtStart, $name)) {
        $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
        $isValidAttribute = false;
    }

    // 8.1.2.3
    $this->scanner->whitespace();

    $val = $this->attributeValue();
    if ($isValidAttribute) {
        $attributes[$name] = $val;
    }

    return true;
}
491 | |
/**
 * Consume an attribute value.
 * 8.2.4.37 and after.
 *
 * Expects the scanner at the '=' that follows an attribute name.
 *
 * @return string|null The value, or null when there is no '=', the value
 *         is empty, or the tag ends right after the '='.
 */
protected function attributeValue()
{
    if ($this->scanner->current() != '=') {
        return null;
    }
    $this->scanner->next();
    // 8.1.2.3
    $this->scanner->whitespace();

    $tok = $this->scanner->current();

    // Whitespace here indicates an empty value.
    if ($tok == "\n" || $tok == "\f" || $tok == " " || $tok == "\t") {
        return null;
    }

    if ($tok == '"' || $tok == "'") {
        $this->scanner->next();

        return $this->quotedAttributeValue($tok);
    }

    // '/' is deliberately not handled here: 8.2.4.37 seems to allow foo=/
    // as a valid attr.
    if ($tok == '>') {
        $this->parseError("Expected attribute value, got tag end.");

        return null;
    }

    if ($tok == '=' || $tok == '`') {
        $this->parseError("Expecting quotes, got %s.", $tok);
    }

    return $this->unquotedAttributeValue();
}
529 | |
/**
 * Get an attribute value string.
 *
 * Reads up to the terminating quote (or a form feed), decoding character
 * references along the way, and then steps past the terminator.
 *
 * @param string $quote
 *            IMPORTANT: This is a series of chars! Any one of which will be considered
 *            termination of an attribute's value. E.g. "\"'" will stop at either
 *            ' or ".
 * @return string The attribute value.
 */
protected function quotedAttributeValue($quote)
{
    $stoplist = "\f" . $quote;
    $val = '';

    for (;;) {
        $chunk = $this->scanner->charsUntil($stoplist . '&');
        if ($chunk === false) {
            break;
        }
        $val .= $chunk;

        $tok = $this->scanner->current();
        if ($tok != '&') {
            break;
        }

        // An '&' interrupts the run: decode the reference and keep reading.
        $val .= $this->decodeCharacterReference(true, $tok);
    }

    $this->scanner->next();

    return $val;
}
562 | |
/**
 * Consume an unquoted attribute value.
 *
 * Reads until whitespace, '>' or EOF, decoding character references.
 * The characters ", ', <, = and ` raise a parse error but are kept.
 *
 * @return string The attribute value.
 */
protected function unquotedAttributeValue()
{
    $terminators = "\t\n\f >";
    $value = '';

    $tok = $this->scanner->current();
    while ($tok !== false && strspn($tok, $terminators) == 0) {
        if ($tok == '&') {
            $value .= $this->decodeCharacterReference(true);
            $tok = $this->scanner->current();
            continue;
        }

        if (strspn($tok, "\"'<=`") > 0) {
            $this->parseError("Unexpected chars in unquoted attribute value %s", $tok);
        }
        $value .= $tok;
        $tok = $this->scanner->next();
    }

    return $value;
}
582 | |
/**
 * Consume malformed markup as if it were a comment.
 * 8.2.4.44
 *
 * The spec requires that the ENTIRE tag-like thing be enclosed inside of
 * the comment. So this will generate comments like:
 *
 * <!--</+foo>-->
 *
 * @param string $leading
 *            Prepend any leading characters. This essentially
 *            negates the need to backtrack, but it's sort of
 *            a hack.
 * @return boolean Always true.
 */
protected function bogusComment($leading = '')
{
    $body = $leading;

    $upToClose = $this->scanner->charsUntil('>');
    if (false !== $upToClose) {
        $body .= $upToClose;
    }

    // Include the character the scan stopped on, unless it is EOF.
    $stoppedOn = $this->scanner->current();
    if (false !== $stoppedOn) {
        $body .= $stoppedOn;
    }

    $this->flushBuffer();
    $this->events->comment($body);
    $this->scanner->next();

    return true;
}
615 | |
/**
 * Read a comment.
 *
 * Expects the first tok to be inside of the comment.
 *
 * @return boolean Always true.
 */
protected function comment()
{
    $tok = $this->scanner->current();

    // <!-->. Emit an empty comment because 8.2.4.46 says to.
    if ($tok == '>') {
        // Parse error. Emit the comment token.
        $this->parseError("Expected comment data, got '>'");
        $this->events->comment('');
        $this->scanner->next();

        return true;
    }

    // Replace NULL with the replacement char (first character only).
    if ($tok == "\0") {
        $tok = UTF8Utils::FFFD;
    }

    $body = '';
    while (! $this->isCommentEnd()) {
        $body .= $tok;
        $tok = $this->scanner->next();
    }

    $this->events->comment($body);
    $this->scanner->next();

    return true;
}
648 | |
/**
 * Check if the scanner has reached the end of a comment.
 *
 * EOF counts as the end and raises a parse error. Otherwise the comment
 * ends at '-->'; when that is not what follows a '-', the look-ahead
 * step is undone.
 *
 * @return boolean true at EOF or at the comment terminator.
 */
protected function isCommentEnd()
{
    $tok = $this->scanner->current();

    // EOF
    if (false === $tok) {
        // Hit the end.
        $this->parseError("Unexpected EOF in a comment.");

        return true;
    }

    // If it doesn't start with -, not the end.
    if ('-' != $tok) {
        return false;
    }

    // Advance one, and test for '->'
    $second = $this->scanner->next();
    if ($second == '-' && $this->scanner->peek() == '>') {
        $this->scanner->next(); // Consume the last '>'

        return true;
    }

    // Unread '-';
    $this->scanner->unconsume(1);

    return false;
}
677 | |
/**
 * Parse a DOCTYPE.
 *
 * Parse a DOCTYPE declaration. This method has strong bearing on whether or
 * not Quirksmode is enabled on the event handler: the fourth argument passed
 * to the handler's doctype() is set to true on every malformed path below.
 *
 * A NUL in the name is replaced with UTF8Utils::FFFD; a leading NUL yields a
 * name that starts with that replacement.
 *
 * @todo This method is a little long. Should probably refactor.
 *
 * @return boolean false when the input does not start with 'D'/'d', or when
 *         PUBLIC/SYSTEM is not followed by a quoted identifier; true otherwise
 *         (bogus-comment and eof() results are returned as-is).
 */
protected function doctype()
{
    if (strcasecmp($this->scanner->current(), 'D')) {
        return false;
    }
    // Check that string is DOCTYPE.
    $chars = $this->scanner->charsWhile("DOCTYPEdoctype");
    if (strcasecmp($chars, 'DOCTYPE')) {
        $this->parseError('Expected DOCTYPE, got %s', $chars);
        return $this->bogusComment('<!' . $chars);
    }

    $this->scanner->whitespace();
    $tok = $this->scanner->current();

    // EOF: die.
    if ($tok === false) {
        $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
        return $this->eof();
    }

    // NULL char: convert. The replacement is kept as the first character
    // of the name and the NUL itself is skipped.
    $namePrefix = '';
    if ($tok === "\0") {
        $this->parseError("Unexpected null character in DOCTYPE.");
        $namePrefix = UTF8Utils::FFFD;
        $this->scanner->next();
    }

    $stop = " \n\f>";
    $rawName = $this->scanner->charsUntil($stop);
    // Lowercase ASCII first, then replace \0 with FFFD. str_replace() is
    // used because strtr() with string arguments maps byte-for-byte and
    // would truncate a multi-byte replacement to its first byte.
    $doctypeName = $namePrefix . str_replace("\0", UTF8Utils::FFFD, strtolower($rawName));

    $tok = $this->scanner->current();

    // If false, emit a parse error, DOCTYPE, and return.
    if ($tok === false) {
        $this->parseError('Unexpected EOF in DOCTYPE declaration.');
        $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);
        return true;
    }

    // Short DOCTYPE, like <!DOCTYPE html>
    if ($tok == '>') {
        // DOCTYPE without a name.
        if (strlen($doctypeName) == 0) {
            $this->parseError("Expected a DOCTYPE name. Got nothing.");
            $this->events->doctype($doctypeName, 0, null, true);
            $this->scanner->next();
            return true;
        }
        $this->events->doctype($doctypeName);
        $this->scanner->next();
        return true;
    }
    $this->scanner->whitespace();

    $pub = strtoupper($this->scanner->getAsciiAlpha());
    $white = strlen($this->scanner->whitespace());

    // Get ID, and flag it as pub or system.
    if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
        // Get the sys ID.
        $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
        $id = $this->quotedString("\0>");
        if ($id === false) {
            // No quoted identifier followed the keyword.
            $this->events->doctype($doctypeName, $type, $pub, false);
            return false;
        }

        // Premature EOF.
        if ($this->scanner->current() === false) {
            $this->parseError("Unexpected EOF in DOCTYPE");
            $this->events->doctype($doctypeName, $type, $id, true);
            return true;
        }

        // Well-formed complete DOCTYPE.
        $this->scanner->whitespace();
        if ($this->scanner->current() == '>') {
            $this->events->doctype($doctypeName, $type, $id, false);
            $this->scanner->next();
            return true;
        }

        // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
        // Throw away the junk, parse error, quirks mode, return true.
        $this->scanner->charsUntil(">");
        $this->parseError("Malformed DOCTYPE.");
        $this->events->doctype($doctypeName, $type, $id, true);
        $this->scanner->next();
        return true;
    }

    // Else it's a bogus DOCTYPE.
    // Consume to > and trash.
    $this->scanner->charsUntil('>');

    $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
    $this->events->doctype($doctypeName, 0, null, true);
    $this->scanner->next();
    return true;
}
791 | |
/**
 * Utility for reading a quoted string.
 *
 * The opening quote character (single or double) decides which closing
 * quote is expected. A missing closing quote raises a parse error, but
 * the text read so far is still returned.
 *
 * @param string $stopchars
 *            Characters (in addition to a close-quote) that should stop the string.
 *            E.g. sometimes '>' is higher precedence than '"' or "'".
 * @return mixed String if one is found (quotations omitted); false when
 *         the scanner is not positioned on a quote.
 */
protected function quotedString($stopchars)
{
    $quote = $this->scanner->current();
    if ($quote != '"' && $quote != "'") {
        return false;
    }

    $this->scanner->next();
    $content = $this->scanner->charsUntil($quote . $stopchars);

    if ($this->scanner->current() != $quote) {
        // Parse error because no close quote.
        $this->parseError("Expected %s, got %s", $quote, $this->scanner->current());

        return $content;
    }

    $this->scanner->next();

    return $content;
}
816 | |
/**
 * Handle a CDATA section.
 *
 * Expects the scanner on the first '[' of '<![CDATA['. Content up to the
 * terminating ']]>' is emitted as a single cdata event; an empty section
 * yields an empty string. Malformed openers and EOF inside the section are
 * reported as parse errors and emitted as bogus comments.
 *
 * @return boolean false when not positioned at '['; otherwise true (the
 *         bogus-comment result is returned as-is).
 */
protected function cdataSection()
{
    if ($this->scanner->current() != '[') {
        return false;
    }
    $cdata = '';
    $this->scanner->next();

    $chars = $this->scanner->charsWhile('CDAT');
    if ($chars != 'CDATA' || $this->scanner->current() != '[') {
        $this->parseError('Expected [CDATA[, got %s', $chars);
        return $this->bogusComment('<![' . $chars);
    }

    $tok = $this->scanner->next();
    // Check for the terminator BEFORE consuming a character, so that an
    // empty section (<![CDATA[]]>) ends immediately instead of swallowing
    // the first ']' and missing the ']]>' sequence.
    while (! $this->sequenceMatches(']]>')) {
        if ($tok === false) {
            $this->parseError('Unexpected EOF inside CDATA.');
            $this->bogusComment('<![CDATA[' . $cdata);
            return true;
        }
        $cdata .= $tok;
        $tok = $this->scanner->next();
    }

    // Consume ]]>
    $this->scanner->consume(3);

    $this->events->cdata($cdata);
    return true;
}
851 | |
852 // ================================================================ | |
853 // Non-HTML5 | |
854 // ================================================================ | |
/**
 * Handle a processing instruction.
 *
 * XML processing instructions are supposed to be ignored in HTML5,
 * treated as "bogus comments". However, since we're not a user
 * agent, we allow them. We consume until ?> and then issue a
 * EventListener::processingInstruction() event.
 *
 * Input that lacks a name, lacks whitespace after the name, or ends right
 * after that whitespace is reported as a parse error and emitted as a
 * bogus comment.
 *
 * @return boolean false when not positioned at '?', true otherwise.
 */
protected function processingInstruction()
{
    if ($this->scanner->current() != '?') {
        return false;
    }

    $tok = $this->scanner->next();
    $procName = $this->scanner->getAsciiAlpha();
    $white = strlen($this->scanner->whitespace());

    // If not a PI, send to bogusComment.
    // The EOF test must be strict: a loose comparison would also treat
    // data starting with the character "0" as EOF.
    if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() === false) {
        $this->parseError("Expected processing instruction name, got $tok");
        $this->bogusComment('<?' . $tok . $procName);
        return true;
    }

    $data = '';
    // As long as it's not the case that the next two chars are ? and >.
    while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
        $data .= $this->scanner->current();

        $tok = $this->scanner->next();
        if ($tok === false) {
            $this->parseError("Unexpected EOF in processing instruction.");
            $this->events->processingInstruction($procName, $data);
            return true;
        }
    }

    $this->scanner->next(); // >
    $this->scanner->next(); // Next token.
    $this->events->processingInstruction($procName, $data);
    return true;
}
898 | |
899 // ================================================================ | |
900 // UTILITY FUNCTIONS | |
901 // ================================================================ | |
902 | |
/**
 * Read from the input stream until we get to the desired sequence
 * or hit the end of the input stream.
 *
 * The sequence is matched case-insensitively. Reaching EOF first raises
 * a parse error.
 *
 * @param string $sequence
 *            The terminating sequence.
 * @return string The text read before the sequence (or before EOF).
 */
protected function readUntilSequence($sequence)
{
    $collected = '';

    // Optimization for reading larger blocks faster: jump straight to the
    // next occurrence of the sequence's first character.
    $lead = substr($sequence, 0, 1);

    for (;;) {
        if ($this->scanner->current() === false) {
            break;
        }

        $collected .= $this->scanner->charsUntil($lead);

        // Stop as soon as we hit the stopping condition.
        if ($this->sequenceMatches($sequence, false)) {
            return $collected;
        }

        $collected .= $this->scanner->current();
        $this->scanner->next();
    }

    // If we get here, we hit the EOF.
    $this->parseError("Unexpected EOF during text read.");

    return $collected;
}
928 | |
/**
 * Check if upcoming chars match the given sequence.
 *
 * This will read the stream for the $sequence. If it's
 * found, this will return true. If not, return false.
 * Since this unconsumes any chars it reads, the caller
 * will still need to read the next sequence, even if
 * this returns true.
 *
 * Example: $this->sequenceMatches('</script>') will
 * see if the input stream is at the start of a
 * '</script>' string.
 *
 * @param string $sequence
 *            The sequence to look for.
 * @param boolean $caseSensitive
 *            When false, the comparison uses strcasecmp().
 * @return boolean Whether the upcoming characters match.
 */
protected function sequenceMatches($sequence, $caseSensitive = true)
{
    $len = strlen($sequence);
    $seen = '';

    $read = 0;
    while ($read < $len) {
        $tok = $this->scanner->current();

        // EOF. Rewind and let the caller handle it.
        if ($tok === false) {
            $this->scanner->unconsume($read);

            return false;
        }

        $seen .= $tok;
        $this->scanner->next();
        ++ $read;
    }

    $this->scanner->unconsume($len);

    if ($caseSensitive) {
        return $seen == $sequence;
    }

    return strcasecmp($seen, $sequence) === 0;
}
961 | |
/**
 * Send a TEXT event with the contents of the text buffer.
 *
 * This emits an EventHandler::text() event with the current contents of the
 * temporary text buffer and then empties it. (The buffer is used to group
 * as much PCDATA as we can instead of emitting lots and lots of TEXT
 * events.) Nothing is emitted when the buffer is empty.
 */
protected function flushBuffer()
{
    if ($this->text !== '') {
        $this->events->text($this->text);
        $this->text = '';
    }
}
977 | |
/**
 * Add text to the temporary buffer.
 *
 * @param string $str
 *            Text to append.
 *
 * @see flushBuffer()
 */
protected function buffer($str)
{
    $this->text = $this->text . $str;
}
987 | |
/**
 * Emit a parse error.
 *
 * Any arguments after $msg are substituted into it with vsprintf(). The
 * error is sent to the event handler together with the scanner's current
 * line and column.
 *
 * @param string $msg
 *            The message, or a format string when further arguments follow.
 * @return boolean Always false, because a parse error never consumes any
 *         characters.
 */
protected function parseError($msg)
{
    $params = array_slice(func_get_args(), 1);
    if (count($params) > 0) {
        $msg = vsprintf($msg, $params);
    }

    $line = $this->scanner->currentLine();
    $col = $this->scanner->columnOffset();
    $this->events->parseError($msg, $line, $col);

    return false;
}
1008 | |
/**
 * Decode a character reference and return the string.
 *
 * Expects the scanner to be positioned on an '&'. When what follows is not
 * a usable reference, the scanner is rewound to the character right after
 * the '&' and a bare '&' is returned so the caller can treat it as text.
 *
 * @param boolean $inAttribute
 *   Set to true if the text is inside of an attribute value,
 *   false otherwise.
 *
 * @return string|false
 *   - false if the current character is not '&' (nothing is consumed);
 *   - '&' if no character reference could be decoded;
 *   - the decoded entity if the reference is terminated by ';';
 *   - '&' followed by the decoded entity if the trailing ';' is missing
 *     outside of an attribute (a parse error is emitted as well).
 */
protected function decodeCharacterReference($inAttribute = false)
{
    // If it fails this, it's definitely not an entity.
    if ($this->scanner->current() != '&') {
        return false;
    }

    // Next char after &.
    $tok = $this->scanner->next();
    $entity = '';
    // Position of the first character after the '&'; every failure path
    // below rewinds to here.
    $start = $this->scanner->position();

    // Nothing follows the '&'. The checks are strict because a loose
    // "== false" is also true for the character '0', which would make
    // "&0" look like end of input.
    if ($tok === false || $tok === null || $tok === '') {
        return '&';
    }

    // These indicate not an entity. We return just
    // the &.
    if (strspn($tok, static::WHITE . "&<") == 1) {
        return '&';
    }

    // Numeric entity
    if ($tok === '#') {
        $tok = $this->scanner->next();

        // Hexadecimal encoding.
        // X[0-9a-fA-F]+;
        // x[0-9a-fA-F]+;
        if ($tok === 'x' || $tok === 'X') {
            $tok = $this->scanner->next(); // Consume x

            // Convert from hex code to char.
            $hex = $this->scanner->getHex();
            // Only "no digits at all" is an error. empty() must not be
            // used here because empty('0') is true, which would reject
            // the reference "&#x0;".
            if ($hex === false || $hex === null || $hex === '') {
                $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
                // We unconsume because we don't know what parser rules might
                // be in effect for the remaining chars. For example. '&#>'
                // might result in a specific parsing rule inside of tag
                // contexts, while not inside of pcdata context.
                // Rewind to $start rather than by a fixed count so the
                // cursor lands just after the '&' however much was read.
                $this->scanner->unconsume($this->scanner->position() - $start);
                return '&';
            }
            $entity = CharacterReference::lookupHex($hex);
        } else {
            // Decimal encoding.
            // [0-9]+;

            // Convert from decimal to char.
            $numeric = $this->scanner->getNumeric();
            if ($numeric === false) {
                $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
                // Same rewind target as the other failure paths: the
                // character right after the '&'.
                $this->scanner->unconsume($this->scanner->position() - $start);
                return '&';
            }
            $entity = CharacterReference::lookupDecimal($numeric);
        }
    } elseif ($tok === '=' && $inAttribute) {
        // "&=" inside an attribute value is left as a literal ampersand.
        return '&';
    } else { // String entity.

        // Attempt to consume a string up to a ';'.
        // [a-zA-Z0-9]+;
        $cname = $this->scanner->getAsciiAlphaNum();
        $entity = CharacterReference::lookupName($cname);

        // When no entity is found provide the name of the unmatched string
        // and continue on as the & is not part of an entity. The & will
        // be converted to &amp; elsewhere.
        if ($entity == null) {
            if (!$inAttribute || strlen($cname) === 0) {
                $this->parseError("No match in entity table for '%s'", $cname);
            }
            $this->scanner->unconsume($this->scanner->position() - $start);
            return '&';
        }
    }

    // The scanner has advanced the cursor for us.
    $tok = $this->scanner->current();

    // We have an entity. We're done here.
    if ($tok === ';') {
        $this->scanner->next();
        return $entity;
    }

    // If in an attribute, then failing to match ; means unconsume the
    // entire string. Otherwise, failure to match is an error.
    if ($inAttribute) {
        $this->scanner->unconsume($this->scanner->position() - $start);
        return '&';
    }

    $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
    return '&' . $entity;
}
1117 } |