comparison vendor/masterminds/html5/src/HTML5/Parser/Tokenizer.php @ 17:129ea1e6d783

Update, including to Drupal core 8.6.10
author Chris Cannam
date Thu, 28 Feb 2019 13:21:36 +0000
parents 4c8ae668cc8c
children
comparison
equal deleted inserted replaced
16:c2387f117808 17:129ea1e6d783
1 <?php 1 <?php
2
2 namespace Masterminds\HTML5\Parser; 3 namespace Masterminds\HTML5\Parser;
3 4
4 use Masterminds\HTML5\Elements; 5 use Masterminds\HTML5\Elements;
5 6
6 /** 7 /**
23 * 24 *
24 * @see http://www.w3.org/TR/2012/CR-html5-20121217/ 25 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
25 */ 26 */
26 class Tokenizer 27 class Tokenizer
27 { 28 {
28
29 protected $scanner; 29 protected $scanner;
30 30
31 protected $events; 31 protected $events;
32 32
33 protected $tok; 33 protected $tok;
45 45
46 const CONFORMANT_XML = 'xml'; 46 const CONFORMANT_XML = 'xml';
47 const CONFORMANT_HTML = 'html'; 47 const CONFORMANT_HTML = 'html';
48 protected $mode = self::CONFORMANT_HTML; 48 protected $mode = self::CONFORMANT_HTML;
49 49
50 const WHITE = "\t\n\f ";
51
52 /** 50 /**
53 * Create a new tokenizer. 51 * Create a new tokenizer.
54 * 52 *
55 * Typically, parsing a document involves creating a new tokenizer, giving 53 * Typically, parsing a document involves creating a new tokenizer, giving
56 * it a scanner (input) and an event handler (output), and then calling 54 * it a scanner (input) and an event handler (output), and then calling
57 * the Tokenizer::parse() method.` 55 * the Tokenizer::parse() method.`
58 * 56 *
59 * @param \Masterminds\HTML5\Parser\Scanner $scanner 57 * @param Scanner $scanner A scanner initialized with an input stream.
60 * A scanner initialized with an input stream. 58 * @param EventHandler $eventHandler An event handler, initialized and ready to receive events.
61 * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler 59 * @param string $mode
62 * An event handler, initialized and ready to receive
63 * events.
64 * @param string $mode
65 */ 60 */
66 public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML) 61 public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
67 { 62 {
68 $this->scanner = $scanner; 63 $this->scanner = $scanner;
69 $this->events = $eventHandler; 64 $this->events = $eventHandler;
101 * This allows those modes to be set. 96 * This allows those modes to be set.
102 * 97 *
103 * Normally, setting is done by the event handler via a special return code on 98 * Normally, setting is done by the event handler via a special return code on
104 * startTag(), but it can also be set manually using this function. 99 * startTag(), but it can also be set manually using this function.
105 * 100 *
106 * @param integer $textmode 101 * @param int $textmode One of Elements::TEXT_*.
107 * One of Elements::TEXT_* 102 * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
108 * @param string $untilTag 103 * use this indicator.
109 * The tag that should stop RAW or RCDATA mode. Normal mode does not
110 * use this indicator.
111 */ 104 */
112 public function setTextMode($textmode, $untilTag = null) 105 public function setTextMode($textmode, $untilTag = null)
113 { 106 {
114 $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); 107 $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA);
115 $this->untilTag = $untilTag; 108 $this->untilTag = $untilTag;
116 } 109 }
117 110
118 /** 111 /**
119 * Consume a character and make a move. 112 * Consume a character and make a move.
120 * HTML5 8.2.4.1 113 * HTML5 8.2.4.1.
121 */ 114 */
122 protected function consumeData() 115 protected function consumeData()
123 { 116 {
124 // Character Ref 117 $tok = $this->scanner->current();
125 /* 118
126 * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData(); 119 if ('&' === $tok) {
127 */ 120 // Character reference
128 $this->characterReference(); 121 $ref = $this->decodeCharacterReference();
129 $this->tagOpen(); 122 $this->buffer($ref);
130 $this->eof(); 123
131 $this->characterData(); 124 $tok = $this->scanner->current();
125 }
126
127 // Parse tag
128 if ('<' === $tok) {
129 // Any buffered text data can go out now.
130 $this->flushBuffer();
131
132 $tok = $this->scanner->next();
133
134 if ('!' === $tok) {
135 $this->markupDeclaration();
136 } elseif ('/' === $tok) {
137 $this->endTag();
138 } elseif ('?' === $tok) {
139 $this->processingInstruction();
140 } elseif (ctype_alpha($tok)) {
141 $this->tagName();
142 } else {
143 $this->parseError('Illegal tag opening');
144 // TODO is this necessary ?
145 $this->characterData();
146 }
147
148 $tok = $this->scanner->current();
149 }
150
151 if (false === $tok) {
152 // Handle end of document
153 $this->eof();
154 } else {
155 // Parse character
156 switch ($this->textMode) {
157 case Elements::TEXT_RAW:
158 $this->rawText($tok);
159 break;
160
161 case Elements::TEXT_RCDATA:
162 $this->rcdata($tok);
163 break;
164
165 default:
166 if ('<' === $tok || '&' === $tok) {
167 break;
168 }
169
170 // NULL character
171 if ("\00" === $tok) {
172 $this->parseError('Received null character.');
173
174 $this->text .= $tok;
175 $this->scanner->consume();
176
177 break;
178 }
179
180 $this->text .= $this->scanner->charsUntil("<&\0");
181 }
182 }
132 183
133 return $this->carryOn; 184 return $this->carryOn;
134 } 185 }
135 186
136 /** 187 /**
141 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. 192 * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
142 */ 193 */
143 protected function characterData() 194 protected function characterData()
144 { 195 {
145 $tok = $this->scanner->current(); 196 $tok = $this->scanner->current();
146 if ($tok === false) { 197 if (false === $tok) {
147 return false; 198 return false;
148 } 199 }
149 switch ($this->textMode) { 200 switch ($this->textMode) {
150 case Elements::TEXT_RAW: 201 case Elements::TEXT_RAW:
151 return $this->rawText(); 202 return $this->rawText($tok);
152 case Elements::TEXT_RCDATA: 203 case Elements::TEXT_RCDATA:
153 return $this->rcdata(); 204 return $this->rcdata($tok);
154 default: 205 default:
155 if (strspn($tok, "<&")) { 206 if ('<' === $tok || '&' === $tok) {
156 return false; 207 return false;
157 } 208 }
158 return $this->text(); 209
210 return $this->text($tok);
159 } 211 }
160 } 212 }
161 213
162 /** 214 /**
163 * This buffers the current token as character data. 215 * This buffers the current token as character data.
164 */ 216 *
165 protected function text() 217 * @param string $tok The current token.
166 { 218 *
167 $tok = $this->scanner->current(); 219 * @return bool
168 220 */
221 protected function text($tok)
222 {
169 // This should never happen... 223 // This should never happen...
170 if ($tok === false) { 224 if (false === $tok) {
171 return false; 225 return false;
172 } 226 }
173 // Null 227
174 if ($tok === "\00") { 228 // NULL character
175 $this->parseError("Received null character."); 229 if ("\00" === $tok) {
176 } 230 $this->parseError('Received null character.');
177 // fprintf(STDOUT, "Writing '%s'", $tok); 231 }
232
178 $this->buffer($tok); 233 $this->buffer($tok);
179 $this->scanner->next(); 234 $this->scanner->consume();
235
180 return true; 236 return true;
181 } 237 }
182 238
183 /** 239 /**
184 * Read text in RAW mode. 240 * Read text in RAW mode.
185 */ 241 *
186 protected function rawText() 242 * @param string $tok The current token.
243 *
244 * @return bool
245 */
246 protected function rawText($tok)
187 { 247 {
188 if (is_null($this->untilTag)) { 248 if (is_null($this->untilTag)) {
189 return $this->text(); 249 return $this->text($tok);
190 } 250 }
251
191 $sequence = '</' . $this->untilTag . '>'; 252 $sequence = '</' . $this->untilTag . '>';
192 $txt = $this->readUntilSequence($sequence); 253 $txt = $this->readUntilSequence($sequence);
193 $this->events->text($txt); 254 $this->events->text($txt);
194 $this->setTextMode(0); 255 $this->setTextMode(0);
256
195 return $this->endTag(); 257 return $this->endTag();
196 } 258 }
197 259
198 /** 260 /**
199 * Read text in RCDATA mode. 261 * Read text in RCDATA mode.
200 */ 262 *
201 protected function rcdata() 263 * @param string $tok The current token.
264 *
265 * @return bool
266 */
267 protected function rcdata($tok)
202 { 268 {
203 if (is_null($this->untilTag)) { 269 if (is_null($this->untilTag)) {
204 return $this->text(); 270 return $this->text($tok);
205 } 271 }
272
206 $sequence = '</' . $this->untilTag; 273 $sequence = '</' . $this->untilTag;
207 $txt = ''; 274 $txt = '';
208 $tok = $this->scanner->current();
209 275
210 $caseSensitive = !Elements::isHtml5Element($this->untilTag); 276 $caseSensitive = !Elements::isHtml5Element($this->untilTag);
211 while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { 277 while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
212 if ($tok == '&') { 278 if ('&' == $tok) {
213 $txt .= $this->decodeCharacterReference(); 279 $txt .= $this->decodeCharacterReference();
214 $tok = $this->scanner->current(); 280 $tok = $this->scanner->current();
215 } else { 281 } else {
216 $txt .= $tok; 282 $txt .= $tok;
217 $tok = $this->scanner->next(); 283 $tok = $this->scanner->next();
218 } 284 }
219 } 285 }
220 $len = strlen($sequence); 286 $len = strlen($sequence);
221 $this->scanner->consume($len); 287 $this->scanner->consume($len);
222 $len += strlen($this->scanner->whitespace()); 288 $len += $this->scanner->whitespace();
223 if ($this->scanner->current() !== '>') { 289 if ('>' !== $this->scanner->current()) {
224 $this->parseError("Unclosed RCDATA end tag"); 290 $this->parseError('Unclosed RCDATA end tag');
225 } 291 }
292
226 $this->scanner->unconsume($len); 293 $this->scanner->unconsume($len);
227 $this->events->text($txt); 294 $this->events->text($txt);
228 $this->setTextMode(0); 295 $this->setTextMode(0);
296
229 return $this->endTag(); 297 return $this->endTag();
230 } 298 }
231 299
232 /** 300 /**
233 * If the document is read, emit an EOF event. 301 * If the document is read, emit an EOF event.
234 */ 302 */
235 protected function eof() 303 protected function eof()
236 { 304 {
237 if ($this->scanner->current() === false) { 305 // fprintf(STDOUT, "EOF");
238 // fprintf(STDOUT, "EOF");
239 $this->flushBuffer();
240 $this->events->eof();
241 $this->carryOn = false;
242 return true;
243 }
244 return false;
245 }
246
247 /**
248 * Handle character references (aka entities).
249 *
250 * This version is specific to PCDATA, as it buffers data into the
251 * text buffer. For a generic version, see decodeCharacterReference().
252 *
253 * HTML5 8.2.4.2
254 */
255 protected function characterReference()
256 {
257 $ref = $this->decodeCharacterReference();
258 if ($ref !== false) {
259 $this->buffer($ref);
260 return true;
261 }
262 return false;
263 }
264
265 /**
266 * Emit a tagStart event on encountering a tag.
267 *
268 * 8.2.4.8
269 */
270 protected function tagOpen()
271 {
272 if ($this->scanner->current() != '<') {
273 return false;
274 }
275
276 // Any buffered text data can go out now.
277 $this->flushBuffer(); 306 $this->flushBuffer();
278 307 $this->events->eof();
279 $this->scanner->next(); 308 $this->carryOn = false;
280
281 return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
282 /* This always returns false. */
283 $this->parseError("Illegal tag opening") || $this->characterData();
284 } 309 }
285 310
286 /** 311 /**
287 * Look for markup. 312 * Look for markup.
288 */ 313 */
289 protected function markupDeclaration() 314 protected function markupDeclaration()
290 { 315 {
291 if ($this->scanner->current() != '!') {
292 return false;
293 }
294
295 $tok = $this->scanner->next(); 316 $tok = $this->scanner->next();
296 317
297 // Comment: 318 // Comment:
298 if ($tok == '-' && $this->scanner->peek() == '-') { 319 if ('-' == $tok && '-' == $this->scanner->peek()) {
299 $this->scanner->next(); // Consume the other '-' 320 $this->scanner->consume(2);
300 $this->scanner->next(); // Next char. 321
301 return $this->comment(); 322 return $this->comment();
302 } 323 } elseif ('D' == $tok || 'd' == $tok) { // Doctype
303
304 elseif ($tok == 'D' || $tok == 'd') { // Doctype
305 return $this->doctype(); 324 return $this->doctype();
306 } 325 } elseif ('[' == $tok) { // CDATA section
307
308 elseif ($tok == '[') { // CDATA section
309 return $this->cdataSection(); 326 return $this->cdataSection();
310 } 327 }
311 328
312 // FINISH 329 // FINISH
313 $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok); 330 $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok);
314 $this->bogusComment('<!'); 331 $this->bogusComment('<!');
332
315 return true; 333 return true;
316 } 334 }
317 335
318 /** 336 /**
319 * Consume an end tag. 337 * Consume an end tag. See section 8.2.4.9.
320 * 8.2.4.9
321 */ 338 */
322 protected function endTag() 339 protected function endTag()
323 { 340 {
324 if ($this->scanner->current() != '/') { 341 if ('/' != $this->scanner->current()) {
325 return false; 342 return false;
326 } 343 }
327 $tok = $this->scanner->next(); 344 $tok = $this->scanner->next();
328 345
329 // a-zA-Z -> tagname 346 // a-zA-Z -> tagname
330 // > -> parse error 347 // > -> parse error
331 // EOF -> parse error 348 // EOF -> parse error
332 // -> parse error 349 // -> parse error
333 if (! ctype_alpha($tok)) { 350 if (!ctype_alpha($tok)) {
334 $this->parseError("Expected tag name, got '%s'", $tok); 351 $this->parseError("Expected tag name, got '%s'", $tok);
335 if ($tok == "\0" || $tok === false) { 352 if ("\0" == $tok || false === $tok) {
336 return false; 353 return false;
337 } 354 }
355
338 return $this->bogusComment('</'); 356 return $this->bogusComment('</');
339 } 357 }
340 358
341 $name = $this->scanner->charsUntil("\n\f \t>"); 359 $name = $this->scanner->charsUntil("\n\f \t>");
342 $name = $this->mode === self::CONFORMANT_XML ? $name: strtolower($name); 360 $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
343 // Trash whitespace. 361 // Trash whitespace.
344 $this->scanner->whitespace(); 362 $this->scanner->whitespace();
345 363
346 if ($this->scanner->current() != '>') { 364 $tok = $this->scanner->current();
347 $this->parseError("Expected >, got '%s'", $this->scanner->current()); 365 if ('>' != $tok) {
366 $this->parseError("Expected >, got '%s'", $tok);
348 // We just trash stuff until we get to the next tag close. 367 // We just trash stuff until we get to the next tag close.
349 $this->scanner->charsUntil('>'); 368 $this->scanner->charsUntil('>');
350 } 369 }
351 370
352 $this->events->endTag($name); 371 $this->events->endTag($name);
353 $this->scanner->next(); 372 $this->scanner->consume();
373
354 return true; 374 return true;
355 } 375 }
356 376
357 /** 377 /**
358 * Consume a tag name and body. 378 * Consume a tag name and body. See section 8.2.4.10.
359 * 8.2.4.10
360 */ 379 */
361 protected function tagName() 380 protected function tagName()
362 { 381 {
363 $tok = $this->scanner->current();
364 if (! ctype_alpha($tok)) {
365 return false;
366 }
367
368 // We know this is at least one char. 382 // We know this is at least one char.
369 $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); 383 $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
370 $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name); 384 $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
371 $attributes = array(); 385 $attributes = array();
372 $selfClose = false; 386 $selfClose = false;
373 387
374 // Handle attribute parse exceptions here so that we can 388 // Handle attribute parse exceptions here so that we can
375 // react by trying to build a sensible parse tree. 389 // react by trying to build a sensible parse tree.
376 try { 390 try {
377 do { 391 do {
378 $this->scanner->whitespace(); 392 $this->scanner->whitespace();
379 $this->attribute($attributes); 393 $this->attribute($attributes);
380 } while (! $this->isTagEnd($selfClose)); 394 } while (!$this->isTagEnd($selfClose));
381 } catch (ParseError $e) { 395 } catch (ParseError $e) {
382 $selfClose = false; 396 $selfClose = false;
383 } 397 }
384 398
385 $mode = $this->events->startTag($name, $attributes, $selfClose); 399 $mode = $this->events->startTag($name, $attributes, $selfClose);
386 // Should we do this? What does this buy that selfClose doesn't? 400
387 if ($selfClose) { 401 if (is_int($mode)) {
388 $this->events->endTag($name);
389 } elseif (is_int($mode)) {
390 // fprintf(STDOUT, "Event response says move into mode %d for tag %s", $mode, $name);
391 $this->setTextMode($mode, $name); 402 $this->setTextMode($mode, $name);
392 } 403 }
393 404
394 $this->scanner->next(); 405 $this->scanner->consume();
395 406
396 return true; 407 return true;
397 } 408 }
398 409
399 /** 410 /**
400 * Check if the scanner has reached the end of a tag. 411 * Check if the scanner has reached the end of a tag.
401 */ 412 */
402 protected function isTagEnd(&$selfClose) 413 protected function isTagEnd(&$selfClose)
403 { 414 {
404 $tok = $this->scanner->current(); 415 $tok = $this->scanner->current();
405 if ($tok == '/') { 416 if ('/' == $tok) {
406 $this->scanner->next(); 417 $this->scanner->consume();
407 $this->scanner->whitespace(); 418 $this->scanner->whitespace();
408 $tok = $this->scanner->current(); 419 $tok = $this->scanner->current();
409 420
410 if ($tok == '>') { 421 if ('>' == $tok) {
411 $selfClose = true; 422 $selfClose = true;
423
412 return true; 424 return true;
413 } 425 }
414 if ($tok === false) { 426 if (false === $tok) {
415 $this->parseError("Unexpected EOF inside of tag."); 427 $this->parseError('Unexpected EOF inside of tag.');
428
416 return true; 429 return true;
417 } 430 }
418 // Basically, we skip the / token and go on. 431 // Basically, we skip the / token and go on.
419 // See 8.2.4.43. 432 // See 8.2.4.43.
420 $this->parseError("Unexpected '%s' inside of a tag.", $tok); 433 $this->parseError("Unexpected '%s' inside of a tag.", $tok);
434
421 return false; 435 return false;
422 } 436 }
423 437
424 if ($tok == '>') { 438 if ('>' == $tok) {
425 return true; 439 return true;
426 } 440 }
427 if ($tok === false) { 441 if (false === $tok) {
428 $this->parseError("Unexpected EOF inside of tag."); 442 $this->parseError('Unexpected EOF inside of tag.');
443
429 return true; 444 return true;
430 } 445 }
431 446
432 return false; 447 return false;
433 } 448 }
434 449
435 /** 450 /**
436 * Parse attributes from inside of a tag. 451 * Parse attributes from inside of a tag.
452 *
453 * @param string[] $attributes
454 *
455 * @return bool
456 *
457 * @throws ParseError
437 */ 458 */
438 protected function attribute(&$attributes) 459 protected function attribute(&$attributes)
439 { 460 {
440 $tok = $this->scanner->current(); 461 $tok = $this->scanner->current();
441 if ($tok == '/' || $tok == '>' || $tok === false) { 462 if ('/' == $tok || '>' == $tok || false === $tok) {
442 return false; 463 return false;
443 } 464 }
444 465
445 if ($tok == '<') { 466 if ('<' == $tok) {
446 $this->parseError("Unexepcted '<' inside of attributes list."); 467 $this->parseError("Unexpected '<' inside of attributes list.");
447 // Push the < back onto the stack. 468 // Push the < back onto the stack.
448 $this->scanner->unconsume(); 469 $this->scanner->unconsume();
449 // Let the caller figure out how to handle this. 470 // Let the caller figure out how to handle this.
450 throw new ParseError("Start tag inside of attribute."); 471 throw new ParseError('Start tag inside of attribute.');
451 } 472 }
452 473
453 $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); 474 $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
454 475
455 if (strlen($name) == 0) { 476 if (0 == strlen($name)) {
456 $this->parseError("Expected an attribute name, got %s.", $this->scanner->current()); 477 $tok = $this->scanner->current();
478 $this->parseError('Expected an attribute name, got %s.', $tok);
457 // Really, only '=' can be the char here. Everything else gets absorbed 479 // Really, only '=' can be the char here. Everything else gets absorbed
458 // under one rule or another. 480 // under one rule or another.
459 $name = $this->scanner->current(); 481 $name = $tok;
460 $this->scanner->next(); 482 $this->scanner->consume();
461 } 483 }
462 484
463 $isValidAttribute = true; 485 $isValidAttribute = true;
464 // Attribute names can contain most Unicode characters for HTML5. 486 // Attribute names can contain most Unicode characters for HTML5.
465 // But method "DOMElement::setAttribute" is throwing exception 487 // But method "DOMElement::setAttribute" is throwing exception
466 // because of it's own internal restriction so these have to be filtered. 488 // because of it's own internal restriction so these have to be filtered.
467 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 489 // see issue #23: https://github.com/Masterminds/html5-php/issues/23
468 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name 490 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
469 if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { 491 if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
470 $this->parseError("Unexpected characters in attribute name: %s", $name); 492 $this->parseError('Unexpected characters in attribute name: %s', $name);
471 $isValidAttribute = false; 493 $isValidAttribute = false;
472 } // There is no limitation for 1st character in HTML5. 494 } // There is no limitation for 1st character in HTML5.
473 // But method "DOMElement::setAttribute" is throwing exception for the 495 // But method "DOMElement::setAttribute" is throwing exception for the
474 // characters below so they have to be filtered. 496 // characters below so they have to be filtered.
475 // see issue #23: https://github.com/Masterminds/html5-php/issues/23 497 // see issue #23: https://github.com/Masterminds/html5-php/issues/23
476 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name 498 // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
477 else 499 elseif (preg_match('/^[0-9.-]/u', $name)) {
478 if (preg_match("/^[0-9.-]/u", $name)) { 500 $this->parseError('Unexpected character at the begining of attribute name: %s', $name);
479 $this->parseError("Unexpected character at the begining of attribute name: %s", $name); 501 $isValidAttribute = false;
480 $isValidAttribute = false; 502 }
481 }
482 // 8.1.2.3 503 // 8.1.2.3
483 $this->scanner->whitespace(); 504 $this->scanner->whitespace();
484 505
485 $val = $this->attributeValue(); 506 $val = $this->attributeValue();
486 if ($isValidAttribute) { 507 if ($isValidAttribute) {
487 $attributes[$name] = $val; 508 $attributes[$name] = $val;
488 } 509 }
510
489 return true; 511 return true;
490 } 512 }
491 513
492 /** 514 /**
493 * Consume an attribute value. 515 * Consume an attribute value. See section 8.2.4.37 and after.
494 * 8.2.4.37 and after. 516 *
517 * @return string|null
495 */ 518 */
496 protected function attributeValue() 519 protected function attributeValue()
497 { 520 {
498 if ($this->scanner->current() != '=') { 521 if ('=' != $this->scanner->current()) {
499 return null; 522 return null;
500 } 523 }
501 $this->scanner->next(); 524 $this->scanner->consume();
502 // 8.1.2.3 525 // 8.1.2.3
503 $this->scanner->whitespace(); 526 $this->scanner->whitespace();
504 527
505 $tok = $this->scanner->current(); 528 $tok = $this->scanner->current();
506 switch ($tok) { 529 switch ($tok) {
507 case "\n": 530 case "\n":
508 case "\f": 531 case "\f":
509 case " ": 532 case ' ':
510 case "\t": 533 case "\t":
511 // Whitespace here indicates an empty value. 534 // Whitespace here indicates an empty value.
512 return null; 535 return null;
513 case '"': 536 case '"':
514 case "'": 537 case "'":
515 $this->scanner->next(); 538 $this->scanner->consume();
539
516 return $this->quotedAttributeValue($tok); 540 return $this->quotedAttributeValue($tok);
517 case '>': 541 case '>':
518 // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr. 542 // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr.
519 $this->parseError("Expected attribute value, got tag end."); 543 $this->parseError('Expected attribute value, got tag end.');
544
520 return null; 545 return null;
521 case '=': 546 case '=':
522 case '`': 547 case '`':
523 $this->parseError("Expecting quotes, got %s.", $tok); 548 $this->parseError('Expecting quotes, got %s.', $tok);
549
524 return $this->unquotedAttributeValue(); 550 return $this->unquotedAttributeValue();
525 default: 551 default:
526 return $this->unquotedAttributeValue(); 552 return $this->unquotedAttributeValue();
527 } 553 }
528 } 554 }
529 555
530 /** 556 /**
531 * Get an attribute value string. 557 * Get an attribute value string.
532 * 558 *
533 * @param string $quote 559 * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered
534 * IMPORTANT: This is a series of chars! Any one of which will be considered 560 * termination of an attribute's value. E.g. "\"'" will stop at either
535 * termination of an attribute's value. E.g. "\"'" will stop at either 561 * ' or ".
536 * ' or ". 562 *
537 * @return string The attribute value. 563 * @return string The attribute value.
538 */ 564 */
539 protected function quotedAttributeValue($quote) 565 protected function quotedAttributeValue($quote)
540 { 566 {
541 $stoplist = "\f" . $quote; 567 $stoplist = "\f" . $quote;
542 $val = ''; 568 $val = '';
543 569
544 while (true) { 570 while (true) {
545 $tokens = $this->scanner->charsUntil($stoplist.'&'); 571 $tokens = $this->scanner->charsUntil($stoplist . '&');
546 if ($tokens !== false) { 572 if (false !== $tokens) {
547 $val .= $tokens; 573 $val .= $tokens;
548 } else { 574 } else {
549 break; 575 break;
550 } 576 }
551 577
552 $tok = $this->scanner->current(); 578 $tok = $this->scanner->current();
553 if ($tok == '&') { 579 if ('&' == $tok) {
554 $val .= $this->decodeCharacterReference(true, $tok); 580 $val .= $this->decodeCharacterReference(true);
555 continue; 581 continue;
556 } 582 }
557 break; 583 break;
558 } 584 }
559 $this->scanner->next(); 585 $this->scanner->consume();
586
560 return $val; 587 return $val;
561 } 588 }
562 589
563 protected function unquotedAttributeValue() 590 protected function unquotedAttributeValue()
564 { 591 {
565 $stoplist = "\t\n\f >";
566 $val = ''; 592 $val = '';
567 $tok = $this->scanner->current(); 593 $tok = $this->scanner->current();
568 while (strspn($tok, $stoplist) == 0 && $tok !== false) { 594 while (false !== $tok) {
569 if ($tok == '&') { 595 switch ($tok) {
570 $val .= $this->decodeCharacterReference(true); 596 case "\n":
571 $tok = $this->scanner->current(); 597 case "\f":
572 } else { 598 case ' ':
573 if (strspn($tok, "\"'<=`") > 0) { 599 case "\t":
574 $this->parseError("Unexpected chars in unquoted attribute value %s", $tok); 600 case '>':
575 } 601 break 2;
576 $val .= $tok; 602
577 $tok = $this->scanner->next(); 603 case '&':
578 } 604 $val .= $this->decodeCharacterReference(true);
579 } 605 $tok = $this->scanner->current();
606
607 break;
608
609 case "'":
610 case '"':
611 case '<':
612 case '=':
613 case '`':
614 $this->parseError('Unexpected chars in unquoted attribute value %s', $tok);
615 $val .= $tok;
616 $tok = $this->scanner->next();
617 break;
618
619 default:
620 $val .= $this->scanner->charsUntil("\t\n\f >&\"'<=`");
621
622 $tok = $this->scanner->current();
623 }
624 }
625
580 return $val; 626 return $val;
581 } 627 }
582 628
583 /** 629 /**
584 * Consume malformed markup as if it were a comment. 630 * Consume malformed markup as if it were a comment.
585 * 8.2.4.44 631 * 8.2.4.44.
586 * 632 *
587 * The spec requires that the ENTIRE tag-like thing be enclosed inside of 633 * The spec requires that the ENTIRE tag-like thing be enclosed inside of
588 * the comment. So this will generate comments like: 634 * the comment. So this will generate comments like:
589 * 635 *
590 * &lt;!--&lt/+foo&gt;--&gt; 636 * &lt;!--&lt/+foo&gt;--&gt;
591 * 637 *
592 * @param string $leading 638 * @param string $leading Prepend any leading characters. This essentially
593 * Prepend any leading characters. This essentially 639 * negates the need to backtrack, but it's sort of a hack.
594 * negates the need to backtrack, but it's sort of 640 *
595 * a hack. 641 * @return bool
596 */ 642 */
597 protected function bogusComment($leading = '') 643 protected function bogusComment($leading = '')
598 { 644 {
599 $comment = $leading; 645 $comment = $leading;
600 $tokens = $this->scanner->charsUntil('>'); 646 $tokens = $this->scanner->charsUntil('>');
601 if ($tokens !== false) { 647 if (false !== $tokens) {
602 $comment .= $tokens; 648 $comment .= $tokens;
603 } 649 }
604 $tok = $this->scanner->current(); 650 $tok = $this->scanner->current();
605 if ($tok !== false) { 651 if (false !== $tok) {
606 $comment .= $tok; 652 $comment .= $tok;
607 } 653 }
608 654
609 $this->flushBuffer(); 655 $this->flushBuffer();
610 $this->events->comment($comment); 656 $this->events->comment($comment);
611 $this->scanner->next(); 657 $this->scanner->consume();
612 658
613 return true; 659 return true;
614 } 660 }
615 661
616 /** 662 /**
617 * Read a comment. 663 * Read a comment.
618 *
619 * Expects the first tok to be inside of the comment. 664 * Expects the first tok to be inside of the comment.
665 *
666 * @return bool
620 */ 667 */
621 protected function comment() 668 protected function comment()
622 { 669 {
623 $tok = $this->scanner->current(); 670 $tok = $this->scanner->current();
624 $comment = ''; 671 $comment = '';
625 672
626 // <!-->. Emit an empty comment because 8.2.4.46 says to. 673 // <!-->. Emit an empty comment because 8.2.4.46 says to.
627 if ($tok == '>') { 674 if ('>' == $tok) {
628 // Parse error. Emit the comment token. 675 // Parse error. Emit the comment token.
629 $this->parseError("Expected comment data, got '>'"); 676 $this->parseError("Expected comment data, got '>'");
630 $this->events->comment(''); 677 $this->events->comment('');
631 $this->scanner->next(); 678 $this->scanner->consume();
679
632 return true; 680 return true;
633 } 681 }
634 682
635 // Replace NULL with the replacement char. 683 // Replace NULL with the replacement char.
636 if ($tok == "\0") { 684 if ("\0" == $tok) {
637 $tok = UTF8Utils::FFFD; 685 $tok = UTF8Utils::FFFD;
638 } 686 }
639 while (! $this->isCommentEnd()) { 687 while (!$this->isCommentEnd()) {
640 $comment .= $tok; 688 $comment .= $tok;
641 $tok = $this->scanner->next(); 689 $tok = $this->scanner->next();
642 } 690 }
643 691
644 $this->events->comment($comment); 692 $this->events->comment($comment);
645 $this->scanner->next(); 693 $this->scanner->consume();
694
646 return true; 695 return true;
647 } 696 }
648 697
649 /** 698 /**
650 * Check if the scanner has reached the end of a comment. 699 * Check if the scanner has reached the end of a comment.
700 *
701 * @return bool
651 */ 702 */
652 protected function isCommentEnd() 703 protected function isCommentEnd()
653 { 704 {
654 $tok = $this->scanner->current(); 705 $tok = $this->scanner->current();
655 706
656 // EOF 707 // EOF
657 if ($tok === false) { 708 if (false === $tok) {
658 // Hit the end. 709 // Hit the end.
659 $this->parseError("Unexpected EOF in a comment."); 710 $this->parseError('Unexpected EOF in a comment.');
711
660 return true; 712 return true;
661 } 713 }
662 714
663 // If it doesn't start with -, not the end. 715 // If it doesn't start with -, not the end.
664 if ($tok != '-') { 716 if ('-' != $tok) {
665 return false; 717 return false;
666 } 718 }
667 719
668 // Advance one, and test for '->' 720 // Advance one, and test for '->'
669 if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { 721 if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) {
670 $this->scanner->next(); // Consume the last '>' 722 $this->scanner->consume(); // Consume the last '>'
671 return true; 723 return true;
672 } 724 }
673 // Unread '-'; 725 // Unread '-';
674 $this->scanner->unconsume(1); 726 $this->scanner->unconsume(1);
727
675 return false; 728 return false;
676 } 729 }
677 730
678 /** 731 /**
679 * Parse a DOCTYPE. 732 * Parse a DOCTYPE.
680 * 733 *
681 * Parse a DOCTYPE declaration. This method has strong bearing on whether or 734 * Parse a DOCTYPE declaration. This method has strong bearing on whether or
682 * not Quirksmode is enabled on the event handler. 735 * not Quirksmode is enabled on the event handler.
683 * 736 *
684 * @todo This method is a little long. Should probably refactor. 737 * @todo This method is a little long. Should probably refactor.
738 *
739 * @return bool
685 */ 740 */
686 protected function doctype() 741 protected function doctype()
687 { 742 {
688 if (strcasecmp($this->scanner->current(), 'D')) {
689 return false;
690 }
691 // Check that string is DOCTYPE. 743 // Check that string is DOCTYPE.
692 $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); 744 if ($this->scanner->sequenceMatches('DOCTYPE', false)) {
693 if (strcasecmp($chars, 'DOCTYPE')) { 745 $this->scanner->consume(7);
746 } else {
747 $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
694 $this->parseError('Expected DOCTYPE, got %s', $chars); 748 $this->parseError('Expected DOCTYPE, got %s', $chars);
749
695 return $this->bogusComment('<!' . $chars); 750 return $this->bogusComment('<!' . $chars);
696 } 751 }
697 752
698 $this->scanner->whitespace(); 753 $this->scanner->whitespace();
699 $tok = $this->scanner->current(); 754 $tok = $this->scanner->current();
700 755
701 // EOF: die. 756 // EOF: die.
702 if ($tok === false) { 757 if (false === $tok) {
703 $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); 758 $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
704 return $this->eof(); 759 $this->eof();
705 } 760
706 761 return true;
707 $doctypeName = ''; 762 }
708 763
709 // NULL char: convert. 764 // NULL char: convert.
710 if ($tok === "\0") { 765 if ("\0" === $tok) {
711 $this->parseError("Unexpected null character in DOCTYPE."); 766 $this->parseError('Unexpected null character in DOCTYPE.');
712 $doctypeName .= UTF8::FFFD;
713 $tok = $this->scanner->next();
714 } 767 }
715 768
716 $stop = " \n\f>"; 769 $stop = " \n\f>";
717 $doctypeName = $this->scanner->charsUntil($stop); 770 $doctypeName = $this->scanner->charsUntil($stop);
718 // Lowercase ASCII, replace \0 with FFFD 771 // Lowercase ASCII, replace \0 with FFFD
719 $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); 772 $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD));
720 773
721 $tok = $this->scanner->current(); 774 $tok = $this->scanner->current();
722 775
723 // If false, emit a parse error, DOCTYPE, and return. 776 // If false, emit a parse error, DOCTYPE, and return.
724 if ($tok === false) { 777 if (false === $tok) {
725 $this->parseError('Unexpected EOF in DOCTYPE declaration.'); 778 $this->parseError('Unexpected EOF in DOCTYPE declaration.');
726 $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); 779 $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);
780
727 return true; 781 return true;
728 } 782 }
729 783
730 // Short DOCTYPE, like <!DOCTYPE html> 784 // Short DOCTYPE, like <!DOCTYPE html>
731 if ($tok == '>') { 785 if ('>' == $tok) {
732 // DOCTYPE without a name. 786 // DOCTYPE without a name.
733 if (strlen($doctypeName) == 0) { 787 if (0 == strlen($doctypeName)) {
734 $this->parseError("Expected a DOCTYPE name. Got nothing."); 788 $this->parseError('Expected a DOCTYPE name. Got nothing.');
735 $this->events->doctype($doctypeName, 0, null, true); 789 $this->events->doctype($doctypeName, 0, null, true);
736 $this->scanner->next(); 790 $this->scanner->consume();
791
737 return true; 792 return true;
738 } 793 }
739 $this->events->doctype($doctypeName); 794 $this->events->doctype($doctypeName);
740 $this->scanner->next(); 795 $this->scanner->consume();
796
741 return true; 797 return true;
742 } 798 }
743 $this->scanner->whitespace(); 799 $this->scanner->whitespace();
744 800
745 $pub = strtoupper($this->scanner->getAsciiAlpha()); 801 $pub = strtoupper($this->scanner->getAsciiAlpha());
746 $white = strlen($this->scanner->whitespace()); 802 $white = $this->scanner->whitespace();
747 803
748 // Get ID, and flag it as pub or system. 804 // Get ID, and flag it as pub or system.
749 if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { 805 if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) {
750 // Get the sys ID. 806 // Get the sys ID.
751 $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; 807 $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
752 $id = $this->quotedString("\0>"); 808 $id = $this->quotedString("\0>");
753 if ($id === false) { 809 if (false === $id) {
754 $this->events->doctype($doctypeName, $type, $pub, false); 810 $this->events->doctype($doctypeName, $type, $pub, false);
755 return false; 811
812 return true;
756 } 813 }
757 814
758 // Premature EOF. 815 // Premature EOF.
759 if ($this->scanner->current() === false) { 816 if (false === $this->scanner->current()) {
760 $this->parseError("Unexpected EOF in DOCTYPE"); 817 $this->parseError('Unexpected EOF in DOCTYPE');
761 $this->events->doctype($doctypeName, $type, $id, true); 818 $this->events->doctype($doctypeName, $type, $id, true);
819
762 return true; 820 return true;
763 } 821 }
764 822
765 // Well-formed complete DOCTYPE. 823 // Well-formed complete DOCTYPE.
766 $this->scanner->whitespace(); 824 $this->scanner->whitespace();
767 if ($this->scanner->current() == '>') { 825 if ('>' == $this->scanner->current()) {
768 $this->events->doctype($doctypeName, $type, $id, false); 826 $this->events->doctype($doctypeName, $type, $id, false);
769 $this->scanner->next(); 827 $this->scanner->consume();
828
770 return true; 829 return true;
771 } 830 }
772 831
773 // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK 832 // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
774 // Throw away the junk, parse error, quirks mode, return true. 833 // Throw away the junk, parse error, quirks mode, return true.
775 $this->scanner->charsUntil(">"); 834 $this->scanner->charsUntil('>');
776 $this->parseError("Malformed DOCTYPE."); 835 $this->parseError('Malformed DOCTYPE.');
777 $this->events->doctype($doctypeName, $type, $id, true); 836 $this->events->doctype($doctypeName, $type, $id, true);
778 $this->scanner->next(); 837 $this->scanner->consume();
838
779 return true; 839 return true;
780 } 840 }
781 841
782 // Else it's a bogus DOCTYPE. 842 // Else it's a bogus DOCTYPE.
783 // Consume to > and trash. 843 // Consume to > and trash.
784 $this->scanner->charsUntil('>'); 844 $this->scanner->charsUntil('>');
785 845
786 $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub); 846 $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub);
787 $this->events->doctype($doctypeName, 0, null, true); 847 $this->events->doctype($doctypeName, 0, null, true);
788 $this->scanner->next(); 848 $this->scanner->consume();
849
789 return true; 850 return true;
790 } 851 }
791 852
792 /** 853 /**
793 * Utility for reading a quoted string. 854 * Utility for reading a quoted string.
794 * 855 *
795 * @param string $stopchars 856 * @param string $stopchars Characters (in addition to a close-quote) that should stop the string.
796 * Characters (in addition to a close-quote) that should stop the string. 857 * E.g. sometimes '>' is higher precedence than '"' or "'".
797 * E.g. sometimes '>' is higher precedence than '"' or "'". 858 *
798 * @return mixed String if one is found (quotations omitted) 859 * @return mixed String if one is found (quotations omitted).
799 */ 860 */
800 protected function quotedString($stopchars) 861 protected function quotedString($stopchars)
801 { 862 {
802 $tok = $this->scanner->current(); 863 $tok = $this->scanner->current();
803 if ($tok == '"' || $tok == "'") { 864 if ('"' == $tok || "'" == $tok) {
804 $this->scanner->next(); 865 $this->scanner->consume();
805 $ret = $this->scanner->charsUntil($tok . $stopchars); 866 $ret = $this->scanner->charsUntil($tok . $stopchars);
806 if ($this->scanner->current() == $tok) { 867 if ($this->scanner->current() == $tok) {
807 $this->scanner->next(); 868 $this->scanner->consume();
808 } else { 869 } else {
809 // Parse error because no close quote. 870 // Parse error because no close quote.
810 $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); 871 $this->parseError('Expected %s, got %s', $tok, $this->scanner->current());
811 } 872 }
873
812 return $ret; 874 return $ret;
813 } 875 }
876
814 return false; 877 return false;
815 } 878 }
816 879
817 /** 880 /**
818 * Handle a CDATA section. 881 * Handle a CDATA section.
882 *
883 * @return bool
819 */ 884 */
820 protected function cdataSection() 885 protected function cdataSection()
821 { 886 {
822 if ($this->scanner->current() != '[') {
823 return false;
824 }
825 $cdata = ''; 887 $cdata = '';
826 $this->scanner->next(); 888 $this->scanner->consume();
827 889
828 $chars = $this->scanner->charsWhile('CDAT'); 890 $chars = $this->scanner->charsWhile('CDAT');
829 if ($chars != 'CDATA' || $this->scanner->current() != '[') { 891 if ('CDATA' != $chars || '[' != $this->scanner->current()) {
830 $this->parseError('Expected [CDATA[, got %s', $chars); 892 $this->parseError('Expected [CDATA[, got %s', $chars);
893
831 return $this->bogusComment('<![' . $chars); 894 return $this->bogusComment('<![' . $chars);
832 } 895 }
833 896
834 $tok = $this->scanner->next(); 897 $tok = $this->scanner->next();
835 do { 898 do {
836 if ($tok === false) { 899 if (false === $tok) {
837 $this->parseError('Unexpected EOF inside CDATA.'); 900 $this->parseError('Unexpected EOF inside CDATA.');
838 $this->bogusComment('<![CDATA[' . $cdata); 901 $this->bogusComment('<![CDATA[' . $cdata);
902
839 return true; 903 return true;
840 } 904 }
841 $cdata .= $tok; 905 $cdata .= $tok;
842 $tok = $this->scanner->next(); 906 $tok = $this->scanner->next();
843 } while (! $this->sequenceMatches(']]>')); 907 } while (!$this->scanner->sequenceMatches(']]>'));
844 908
845 // Consume ]]> 909 // Consume ]]>
846 $this->scanner->consume(3); 910 $this->scanner->consume(3);
847 911
848 $this->events->cdata($cdata); 912 $this->events->cdata($cdata);
913
849 return true; 914 return true;
850 } 915 }
851 916
852 // ================================================================ 917 // ================================================================
853 // Non-HTML5 918 // Non-HTML5
854 // ================================================================ 919 // ================================================================
920
855 /** 921 /**
856 * Handle a processing instruction. 922 * Handle a processing instruction.
857 * 923 *
858 * XML processing instructions are supposed to be ignored in HTML5, 924 * XML processing instructions are supposed to be ignored in HTML5,
859 * treated as "bogus comments". However, since we're not a user 925 * treated as "bogus comments". However, since we're not a user
860 * agent, we allow them. We consume until ?> and then issue a 926 * agent, we allow them. We consume until ?> and then issue a
861 * EventListener::processingInstruction() event. 927 * EventListener::processingInstruction() event.
928 *
929 * @return bool
862 */ 930 */
863 protected function processingInstruction() 931 protected function processingInstruction()
864 { 932 {
865 if ($this->scanner->current() != '?') { 933 if ('?' != $this->scanner->current()) {
866 return false; 934 return false;
867 } 935 }
868 936
869 $tok = $this->scanner->next(); 937 $tok = $this->scanner->next();
870 $procName = $this->scanner->getAsciiAlpha(); 938 $procName = $this->scanner->getAsciiAlpha();
871 $white = strlen($this->scanner->whitespace()); 939 $white = $this->scanner->whitespace();
872 940
873 // If not a PI, send to bogusComment. 941 // If not a PI, send to bogusComment.
874 if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) { 942 if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) {
875 $this->parseError("Expected processing instruction name, got $tok"); 943 $this->parseError("Expected processing instruction name, got $tok");
876 $this->bogusComment('<?' . $tok . $procName); 944 $this->bogusComment('<?' . $tok . $procName);
945
877 return true; 946 return true;
878 } 947 }
879 948
880 $data = ''; 949 $data = '';
881 // As long as it's not the case that the next two chars are ? and >. 950 // As long as it's not the case that the next two chars are ? and >.
882 while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) { 951 while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) {
883 $data .= $this->scanner->current(); 952 $data .= $this->scanner->current();
884 953
885 $tok = $this->scanner->next(); 954 $tok = $this->scanner->next();
886 if ($tok === false) { 955 if (false === $tok) {
887 $this->parseError("Unexpected EOF in processing instruction."); 956 $this->parseError('Unexpected EOF in processing instruction.');
888 $this->events->processingInstruction($procName, $data); 957 $this->events->processingInstruction($procName, $data);
958
889 return true; 959 return true;
890 } 960 }
891 } 961 }
892 962
893 $this->scanner->next(); // > 963 $this->scanner->consume(2); // Consume the closing tag
894 $this->scanner->next(); // Next token.
895 $this->events->processingInstruction($procName, $data); 964 $this->events->processingInstruction($procName, $data);
965
896 return true; 966 return true;
897 } 967 }
898 968
899 // ================================================================ 969 // ================================================================
900 // UTILITY FUNCTIONS 970 // UTILITY FUNCTIONS
901 // ================================================================ 971 // ================================================================
902 972
903 /** 973 /**
904 * Read from the input stream until we get to the desired sequene 974 * Read from the input stream until we get to the desired sequene
905 * or hit the end of the input stream. 975 * or hit the end of the input stream.
976 *
977 * @param string $sequence
978 *
979 * @return string
906 */ 980 */
907 protected function readUntilSequence($sequence) 981 protected function readUntilSequence($sequence)
908 { 982 {
909 $buffer = ''; 983 $buffer = '';
910 984
911 // Optimization for reading larger blocks faster. 985 // Optimization for reading larger blocks faster.
912 $first = substr($sequence, 0, 1); 986 $first = substr($sequence, 0, 1);
913 while ($this->scanner->current() !== false) { 987 while (false !== $this->scanner->current()) {
914 $buffer .= $this->scanner->charsUntil($first); 988 $buffer .= $this->scanner->charsUntil($first);
915 989
916 // Stop as soon as we hit the stopping condition. 990 // Stop as soon as we hit the stopping condition.
917 if ($this->sequenceMatches($sequence, false)) { 991 if ($this->scanner->sequenceMatches($sequence, false)) {
918 return $buffer; 992 return $buffer;
919 } 993 }
920 $buffer .= $this->scanner->current(); 994 $buffer .= $this->scanner->current();
921 $this->scanner->next(); 995 $this->scanner->consume();
922 } 996 }
923 997
924 // If we get here, we hit the EOF. 998 // If we get here, we hit the EOF.
925 $this->parseError("Unexpected EOF during text read."); 999 $this->parseError('Unexpected EOF during text read.');
1000
926 return $buffer; 1001 return $buffer;
927 } 1002 }
928 1003
929 /** 1004 /**
930 * Check if upcomming chars match the given sequence. 1005 * Check if upcomming chars match the given sequence.
933 * found, this will return true. If not, return false. 1008 * found, this will return true. If not, return false.
934 * Since this unconsumes any chars it reads, the caller 1009 * Since this unconsumes any chars it reads, the caller
935 * will still need to read the next sequence, even if 1010 * will still need to read the next sequence, even if
936 * this returns true. 1011 * this returns true.
937 * 1012 *
938 * Example: $this->sequenceMatches('</script>') will 1013 * Example: $this->scanner->sequenceMatches('</script>') will
939 * see if the input stream is at the start of a 1014 * see if the input stream is at the start of a
940 * '</script>' string. 1015 * '</script>' string.
1016 *
1017 * @param string $sequence
1018 * @param bool $caseSensitive
1019 *
1020 * @return bool
941 */ 1021 */
942 protected function sequenceMatches($sequence, $caseSensitive = true) 1022 protected function sequenceMatches($sequence, $caseSensitive = true)
943 { 1023 {
944 $len = strlen($sequence); 1024 @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED);
945 $buffer = ''; 1025
946 for ($i = 0; $i < $len; ++ $i) { 1026 return $this->scanner->sequenceMatches($sequence, $caseSensitive);
947 $tok = $this->scanner->current();
948 $buffer .= $tok;
949
950 // EOF. Rewind and let the caller handle it.
951 if ($tok === false) {
952 $this->scanner->unconsume($i);
953 return false;
954 }
955 $this->scanner->next();
956 }
957
958 $this->scanner->unconsume($len);
959 return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
960 } 1027 }
961 1028
962 /** 1029 /**
963 * Send a TEXT event with the contents of the text buffer. 1030 * Send a TEXT event with the contents of the text buffer.
964 * 1031 *
966 * temporary text buffer. (The buffer is used to group as much PCDATA 1033 * temporary text buffer. (The buffer is used to group as much PCDATA
967 * as we can instead of emitting lots and lots of TEXT events.) 1034 * as we can instead of emitting lots and lots of TEXT events.)
968 */ 1035 */
969 protected function flushBuffer() 1036 protected function flushBuffer()
970 { 1037 {
971 if ($this->text === '') { 1038 if ('' === $this->text) {
972 return; 1039 return;
973 } 1040 }
974 $this->events->text($this->text); 1041 $this->events->text($this->text);
975 $this->text = ''; 1042 $this->text = '';
976 } 1043 }
977 1044
978 /** 1045 /**
979 * Add text to the temporary buffer. 1046 * Add text to the temporary buffer.
980 * 1047 *
981 * @see flushBuffer() 1048 * @see flushBuffer()
1049 *
1050 * @param string $str
982 */ 1051 */
983 protected function buffer($str) 1052 protected function buffer($str)
984 { 1053 {
985 $this->text .= $str; 1054 $this->text .= $str;
986 } 1055 }
988 /** 1057 /**
989 * Emit a parse error. 1058 * Emit a parse error.
990 * 1059 *
991 * A parse error always returns false because it never consumes any 1060 * A parse error always returns false because it never consumes any
992 * characters. 1061 * characters.
1062 *
1063 * @param string $msg
1064 *
1065 * @return string
993 */ 1066 */
994 protected function parseError($msg) 1067 protected function parseError($msg)
995 { 1068 {
996 $args = func_get_args(); 1069 $args = func_get_args();
997 1070
1001 } 1074 }
1002 1075
1003 $line = $this->scanner->currentLine(); 1076 $line = $this->scanner->currentLine();
1004 $col = $this->scanner->columnOffset(); 1077 $col = $this->scanner->columnOffset();
1005 $this->events->parseError($msg, $line, $col); 1078 $this->events->parseError($msg, $line, $col);
1079
1006 return false; 1080 return false;
1007 } 1081 }
1008 1082
1009 /** 1083 /**
1010 * Decode a character reference and return the string. 1084 * Decode a character reference and return the string.
1011 * 1085 *
1012 * Returns false if the entity could not be found. If $inAttribute is set 1086 * If $inAttribute is set to true, a bare & will be returned as-is.
1013 * to true, a bare & will be returned as-is. 1087 *
1014 * 1088 * @param bool $inAttribute Set to true if the text is inside of an attribute value.
1015 * @param boolean $inAttribute 1089 * false otherwise.
1016 * Set to true if the text is inside of an attribute value. 1090 *
1017 * false otherwise. 1091 * @return string
1018 */ 1092 */
1019 protected function decodeCharacterReference($inAttribute = false) 1093 protected function decodeCharacterReference($inAttribute = false)
1020 { 1094 {
1021
1022 // If it fails this, it's definitely not an entity.
1023 if ($this->scanner->current() != '&') {
1024 return false;
1025 }
1026
1027 // Next char after &. 1095 // Next char after &.
1028 $tok = $this->scanner->next(); 1096 $tok = $this->scanner->next();
1029 $entity = '';
1030 $start = $this->scanner->position(); 1097 $start = $this->scanner->position();
1031 1098
1032 if ($tok == false) { 1099 if (false === $tok) {
1033 return '&'; 1100 return '&';
1034 } 1101 }
1035 1102
1036 // These indicate not an entity. We return just 1103 // These indicate not an entity. We return just
1037 // the &. 1104 // the &.
1038 if (strspn($tok, static::WHITE . "&<") == 1) { 1105 if ("\t" === $tok || "\n" === $tok || "\f" === $tok || ' ' === $tok || '&' === $tok || '<' === $tok) {
1039 // $this->scanner->next(); 1106 // $this->scanner->next();
1040 return '&'; 1107 return '&';
1041 } 1108 }
1042 1109
1043 // Numeric entity 1110 // Numeric entity
1044 if ($tok == '#') { 1111 if ('#' === $tok) {
1045 $tok = $this->scanner->next(); 1112 $tok = $this->scanner->next();
1046 1113
1047 // Hexidecimal encoding. 1114 // Hexidecimal encoding.
1048 // X[0-9a-fA-F]+; 1115 // X[0-9a-fA-F]+;
1049 // x[0-9a-fA-F]+; 1116 // x[0-9a-fA-F]+;
1050 if ($tok == 'x' || $tok == 'X') { 1117 if ('x' === $tok || 'X' === $tok) {
1051 $tok = $this->scanner->next(); // Consume x 1118 $tok = $this->scanner->next(); // Consume x
1052 1119
1053 // Convert from hex code to char. 1120 // Convert from hex code to char.
1054 $hex = $this->scanner->getHex(); 1121 $hex = $this->scanner->getHex();
1055 if (empty($hex)) { 1122 if (empty($hex)) {
1056 $this->parseError("Expected &#xHEX;, got &#x%s", $tok); 1123 $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
1057 // We unconsume because we don't know what parser rules might 1124 // We unconsume because we don't know what parser rules might
1058 // be in effect for the remaining chars. For example. '&#>' 1125 // be in effect for the remaining chars. For example. '&#>'
1059 // might result in a specific parsing rule inside of tag 1126 // might result in a specific parsing rule inside of tag
1060 // contexts, while not inside of pcdata context. 1127 // contexts, while not inside of pcdata context.
1061 $this->scanner->unconsume(2); 1128 $this->scanner->unconsume(2);
1129
1062 return '&'; 1130 return '&';
1063 } 1131 }
1064 $entity = CharacterReference::lookupHex($hex); 1132 $entity = CharacterReference::lookupHex($hex);
1065 } // Decimal encoding. 1133 } // Decimal encoding.
1066 // [0-9]+; 1134 // [0-9]+;
1067 else { 1135 else {
1068 // Convert from decimal to char. 1136 // Convert from decimal to char.
1069 $numeric = $this->scanner->getNumeric(); 1137 $numeric = $this->scanner->getNumeric();
1070 if ($numeric === false) { 1138 if (false === $numeric) {
1071 $this->parseError("Expected &#DIGITS;, got &#%s", $tok); 1139 $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
1072 $this->scanner->unconsume(2); 1140 $this->scanner->unconsume(2);
1141
1073 return '&'; 1142 return '&';
1074 } 1143 }
1075 $entity = CharacterReference::lookupDecimal($numeric); 1144 $entity = CharacterReference::lookupDecimal($numeric);
1076 } 1145 }
1077 } elseif ($tok === '=' && $inAttribute) { 1146 } elseif ('=' === $tok && $inAttribute) {
1078 return '&'; 1147 return '&';
1079 } else { // String entity. 1148 } else { // String entity.
1080
1081 // Attempt to consume a string up to a ';'. 1149 // Attempt to consume a string up to a ';'.
1082 // [a-zA-Z0-9]+; 1150 // [a-zA-Z0-9]+;
1083 $cname = $this->scanner->getAsciiAlphaNum(); 1151 $cname = $this->scanner->getAsciiAlphaNum();
1084 $entity = CharacterReference::lookupName($cname); 1152 $entity = CharacterReference::lookupName($cname);
1085 1153
1086 // When no entity is found provide the name of the unmatched string 1154 // When no entity is found provide the name of the unmatched string
1087 // and continue on as the & is not part of an entity. The & will 1155 // and continue on as the & is not part of an entity. The & will
1088 // be converted to &amp; elsewhere. 1156 // be converted to &amp; elsewhere.
1089 if ($entity == null) { 1157 if (null === $entity) {
1090 if (!$inAttribute || strlen($cname) === 0) { 1158 if (!$inAttribute || '' === $cname) {
1091 $this->parseError("No match in entity table for '%s'", $cname); 1159 $this->parseError("No match in entity table for '%s'", $cname);
1092 } 1160 }
1093 $this->scanner->unconsume($this->scanner->position() - $start); 1161 $this->scanner->unconsume($this->scanner->position() - $start);
1162
1094 return '&'; 1163 return '&';
1095 } 1164 }
1096 } 1165 }
1097 1166
1098 // The scanner has advanced the cursor for us. 1167 // The scanner has advanced the cursor for us.
1099 $tok = $this->scanner->current(); 1168 $tok = $this->scanner->current();
1100 1169
1101 // We have an entity. We're done here. 1170 // We have an entity. We're done here.
1102 if ($tok == ';') { 1171 if (';' === $tok) {
1103 $this->scanner->next(); 1172 $this->scanner->consume();
1173
1104 return $entity; 1174 return $entity;
1105 } 1175 }
1106 1176
1107 // If in an attribute, then failing to match ; means unconsume the 1177 // If in an attribute, then failing to match ; means unconsume the
1108 // entire string. Otherwise, failure to match is an error. 1178 // entire string. Otherwise, failure to match is an error.
1109 if ($inAttribute) { 1179 if ($inAttribute) {
1110 $this->scanner->unconsume($this->scanner->position() - $start); 1180 $this->scanner->unconsume($this->scanner->position() - $start);
1181
1111 return '&'; 1182 return '&';
1112 } 1183 }
1113 1184
1114 $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok); 1185 $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);
1186
1115 return '&' . $entity; 1187 return '&' . $entity;
1116 } 1188 }
1117 } 1189 }