annotate vendor/masterminds/html5/src/HTML5/Parser/DOMTreeBuilder.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 129ea1e6d783
children
rev   line source
Chris@0 1 <?php
Chris@17 2
Chris@0 3 namespace Masterminds\HTML5\Parser;
Chris@0 4
Chris@0 5 use Masterminds\HTML5\Elements;
Chris@17 6 use Masterminds\HTML5\InstructionProcessor;
Chris@0 7
Chris@0 8 /**
Chris@0 9 * Create an HTML5 DOM tree from events.
Chris@0 10 *
Chris@0 11 * This attempts to create a DOM from events emitted by a parser. This
Chris@0 12 * attempts (but does not guarantee) to up-convert older HTML documents
Chris@0 13 * to HTML5. It does this by applying HTML5's rules, but it will not
Chris@0 14 * change the architecture of the document itself.
Chris@0 15 *
Chris@0 16 * Many of the error correction and quirks features suggested in the specification
Chris@0 17 * are implemented herein; however, not all of them are. Since we do not
Chris@0 18 * assume a graphical user agent, no presentation-specific logic is conducted
Chris@0 19 * during tree building.
Chris@0 20 *
Chris@0 21 * FIXME: The present tree builder does not exactly follow the state machine rules
Chris@0 22 * for insert modes as outlined in the HTML5 spec. The processor needs to be
Chris@0 23 * re-written to accommodate this. See, for example, the Go language HTML5
Chris@0 24 * parser.
Chris@0 25 */
Chris@0 26 class DOMTreeBuilder implements EventHandler
Chris@0 27 {
Chris@0 28 /**
Chris@17 29 * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0.
Chris@0 30 */
Chris@0 31 const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';
Chris@0 32
Chris@0 33 const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';
Chris@0 34
Chris@0 35 const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';
Chris@0 36
Chris@0 37 const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';
Chris@0 38
Chris@0 39 const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';
Chris@0 40
Chris@0 41 const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
Chris@0 42
Chris@0 43 const OPT_DISABLE_HTML_NS = 'disable_html_ns';
Chris@0 44
Chris@0 45 const OPT_TARGET_DOC = 'target_document';
Chris@0 46
Chris@0 47 const OPT_IMPLICIT_NS = 'implicit_namespaces';
Chris@0 48
Chris@0 49 /**
Chris@17 50 * Holds the HTML5 element names that cause a namespace switch.
Chris@0 51 *
Chris@0 52 * @var array
Chris@0 53 */
Chris@0 54 protected $nsRoots = array(
Chris@0 55 'html' => self::NAMESPACE_HTML,
Chris@0 56 'svg' => self::NAMESPACE_SVG,
Chris@17 57 'math' => self::NAMESPACE_MATHML,
Chris@0 58 );
Chris@0 59
Chris@0 60 /**
Chris@0 61 * Holds the always-available namespaces (which do not require an XMLNS declaration).
Chris@0 62 *
Chris@0 63 * @var array
Chris@0 64 */
Chris@0 65 protected $implicitNamespaces = array(
Chris@0 66 'xml' => self::NAMESPACE_XML,
Chris@0 67 'xmlns' => self::NAMESPACE_XMLNS,
Chris@17 68 'xlink' => self::NAMESPACE_XLINK,
Chris@0 69 );
Chris@0 70
Chris@0 71 /**
Chris@0 72 * Holds a stack of currently active namespaces.
Chris@0 73 *
Chris@0 74 * @var array
Chris@0 75 */
Chris@0 76 protected $nsStack = array();
Chris@0 77
Chris@0 78 /**
Chris@0 79 * Holds the number of namespaces declared by a node.
Chris@0 80 *
Chris@0 81 * @var array
Chris@0 82 */
Chris@0 83 protected $pushes = array();
Chris@0 84
Chris@0 85 /**
Chris@0 86 * Defined in 8.2.5.
Chris@0 87 */
Chris@0 88 const IM_INITIAL = 0;
Chris@0 89
Chris@0 90 const IM_BEFORE_HTML = 1;
Chris@0 91
Chris@0 92 const IM_BEFORE_HEAD = 2;
Chris@0 93
Chris@0 94 const IM_IN_HEAD = 3;
Chris@0 95
Chris@0 96 const IM_IN_HEAD_NOSCRIPT = 4;
Chris@0 97
Chris@0 98 const IM_AFTER_HEAD = 5;
Chris@0 99
Chris@0 100 const IM_IN_BODY = 6;
Chris@0 101
Chris@0 102 const IM_TEXT = 7;
Chris@0 103
Chris@0 104 const IM_IN_TABLE = 8;
Chris@0 105
Chris@0 106 const IM_IN_TABLE_TEXT = 9;
Chris@0 107
Chris@0 108 const IM_IN_CAPTION = 10;
Chris@0 109
Chris@0 110 const IM_IN_COLUMN_GROUP = 11;
Chris@0 111
Chris@0 112 const IM_IN_TABLE_BODY = 12;
Chris@0 113
Chris@0 114 const IM_IN_ROW = 13;
Chris@0 115
Chris@0 116 const IM_IN_CELL = 14;
Chris@0 117
Chris@0 118 const IM_IN_SELECT = 15;
Chris@0 119
Chris@0 120 const IM_IN_SELECT_IN_TABLE = 16;
Chris@0 121
Chris@0 122 const IM_AFTER_BODY = 17;
Chris@0 123
Chris@0 124 const IM_IN_FRAMESET = 18;
Chris@0 125
Chris@0 126 const IM_AFTER_FRAMESET = 19;
Chris@0 127
Chris@0 128 const IM_AFTER_AFTER_BODY = 20;
Chris@0 129
Chris@0 130 const IM_AFTER_AFTER_FRAMESET = 21;
Chris@0 131
Chris@0 132 const IM_IN_SVG = 22;
Chris@0 133
Chris@0 134 const IM_IN_MATHML = 23;
Chris@0 135
Chris@0 136 protected $options = array();
Chris@0 137
Chris@0 138 protected $stack = array();
Chris@0 139
Chris@0 140 protected $current; // Pointer in the tag hierarchy.
Chris@17 141 protected $rules;
Chris@0 142 protected $doc;
Chris@0 143
Chris@0 144 protected $frag;
Chris@0 145
Chris@0 146 protected $processor;
Chris@0 147
Chris@0 148 protected $insertMode = 0;
Chris@0 149
Chris@0 150 /**
Chris@17 151 * Track if we are in an element that allows only inline child nodes.
Chris@17 152 *
Chris@0 153 * @var string|null
Chris@0 154 */
Chris@0 155 protected $onlyInline;
Chris@0 156
Chris@0 157 /**
Chris@0 158 * Quirks mode is enabled by default.
Chris@17 159 * Any document that is missing the DT will be considered to be in quirks mode.
Chris@0 160 */
Chris@0 161 protected $quirks = true;
Chris@0 162
Chris@0 163 protected $errors = array();
Chris@0 164
/**
 * Set up the builder: target document, tag rules and the namespace stack.
 *
 * @param bool  $isFragment When true, nodes are built into a new
 *                          DOMDocumentFragment and the insert mode starts
 *                          at IM_IN_BODY.
 * @param array $options    Builder options. Keys read here:
 *                          self::OPT_TARGET_DOC (an existing document to
 *                          build into) and self::OPT_IMPLICIT_NS, or its
 *                          alternative spelling 'implicitNamespaces'
 *                          (extra prefix => URI mappings).
 */
public function __construct($isFragment = false, array $options = array())
{
    $this->options = $options;

    if (isset($options[self::OPT_TARGET_DOC])) {
        $this->doc = $options[self::OPT_TARGET_DOC];
    } else {
        $impl = new \DOMImplementation();
        // XXX:
        // Create the doctype. For now, we are always creating HTML5
        // documents, and attempting to up-convert any older DTDs to HTML5.
        $dt = $impl->createDocumentType('html');
        // The qualified name is a string parameter; passing null is
        // deprecated as of PHP 8.1. An empty string likewise creates a
        // document without a document element.
        $this->doc = $impl->createDocument(null, '', $dt);
    }

    $this->errors = array();

    // Until an element is opened, new nodes are appended to the document.
    $this->current = $this->doc;

    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules();

    $implicitNS = array();
    if (isset($this->options[self::OPT_IMPLICIT_NS])) {
        $implicitNS = $this->options[self::OPT_IMPLICIT_NS];
    } elseif (isset($this->options['implicitNamespaces'])) {
        $implicitNS = $this->options['implicitNamespaces'];
    }

    // Seed $nsStack with the default HTML5 namespaces plus the implicit
    // namespaces taken from $options. With array union the left operand
    // wins, so caller-supplied mappings take precedence over the defaults.
    array_unshift($this->nsStack, $implicitNS + array('' => self::NAMESPACE_HTML) + $this->implicitNamespaces);

    if ($isFragment) {
        $this->insertMode = static::IM_IN_BODY;
        $this->frag = $this->doc->createDocumentFragment();
        $this->current = $this->frag;
    }
}
Chris@0 204
/**
 * Get the document that this builder writes into.
 *
 * This is either the document passed via the OPT_TARGET_DOC option or the
 * one created by the constructor.
 */
public function document()
{
    return $this->doc;
}
Chris@0 212
/**
 * Get the DOM fragment that nodes were built into.
 *
 * A fragment may hold zero or more nodes at its root. The fragment is only
 * created when the builder was constructed in fragment mode; otherwise the
 * property is never assigned.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null
 */
public function fragment()
{
    return $this->frag;
}
Chris@0 227
/**
 * Register the handler used for processing instructions.
 *
 * When a processor is set, processingInstruction() delegates to it instead
 * of inserting a PI node into the DOM tree.
 *
 * @param InstructionProcessor $proc
 */
public function setInstructionProcessor(InstructionProcessor $proc)
{
    $this->processor = $proc;
}
Chris@0 240
/**
 * Handle a DOCTYPE event.
 *
 * The inbound doctype is not preserved; it is used solely to set quirks
 * mode and to advance the insert mode to IM_BEFORE_HTML.
 *
 * @param string      $name   Doctype name; only used in the error message.
 * @param int         $idType Unused here.
 * @param string|null $id     Unused here.
 * @param bool        $quirks Whether the document is in quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
    if ($this->insertMode > static::IM_INITIAL) {
        // A misplaced DOCTYPE is ignored entirely, so it must not change
        // the quirks flag either.
        $this->parseError('Illegal placement of DOCTYPE tag. Ignoring: ' . $name);

        return;
    }

    // This is used solely for setting quirks mode. Currently we don't
    // try to preserve the inbound DT. We convert it to HTML5.
    $this->quirks = $quirks;

    $this->insertMode = static::IM_BEFORE_HTML;
}
Chris@0 255
/**
 * Process the start tag.
 *
 * Creates the element (namespaced where the namespace stack says so),
 * copies its attributes, attaches it to the tree and updates the insert
 * mode and the namespace stack.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 *       - XLink, MathML and SVG namespace handling
 *       - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name        Tag name as emitted by the parser.
 * @param array  $attributes  Attribute name => value pairs.
 * @param bool   $selfClosing Whether the tag was written as self-closing.
 *
 * @return int The element mask from Elements::element(), which the
 *             tokenizer can use to set various processing rules.
 */
public function startTag($name, $attributes = array(), $selfClosing = false)
{
    $lname = $this->normalizeTagName($name);

    // Make sure we have an html element.
    if (!$this->doc->documentElement && 'html' !== $name && !$this->frag) {
        $this->startTag('html');
    }

    // Set quirks mode if we're at IM_INITIAL with no doctype.
    if ($this->insertMode === static::IM_INITIAL) {
        $this->quirks = true;
        $this->parseError('No DOCTYPE specified.');
    }

    // SPECIAL TAG HANDLING:
    // Outside SVG/MathML an <image> tag is treated as <img> for the rule
    // lookups below. Only $name is rewritten; $lname (used to create the
    // element) keeps the original spelling.
    if ('image' === $name && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
        $name = 'img';
    }

    // Autoclose p tags where appropriate.
    if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
        $this->autoclose('p');
    }

    // Set insert mode:
    switch ($name) {
        case 'html':
            $this->insertMode = static::IM_BEFORE_HEAD;
            break;
        case 'head':
            if ($this->insertMode > static::IM_BEFORE_HEAD) {
                $this->parseError('Unexpected head tag outside of head context.');
            } else {
                $this->insertMode = static::IM_IN_HEAD;
            }
            break;
        case 'body':
            $this->insertMode = static::IM_IN_BODY;
            break;
        case 'svg':
            $this->insertMode = static::IM_IN_SVG;
            break;
        case 'math':
            $this->insertMode = static::IM_IN_MATHML;
            break;
        case 'noscript':
            if ($this->insertMode === static::IM_IN_HEAD) {
                $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
            }
            break;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Number of entries this tag pushes onto $nsStack.
    $pushes = 0;
    // When we find a tag that appears in $nsRoots, we have to switch the
    // default namespace.
    if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
        array_unshift($this->nsStack, array(
            '' => $this->nsRoots[$lname],
        ) + $this->nsStack[0]);
        ++$pushes;
    }

    // Holds the value of a plain "xmlns" attribute, or false when none was seen.
    $needsWorkaround = false;
    if (isset($this->options['xmlNamespaces']) && $this->options['xmlNamespaces']) {
        // When xmlNamespaces is enabled and we find an 'xmlns' or 'xmlns:*'
        // attribute, we add a new item to $nsStack.
        foreach ($attributes as $aName => $aVal) {
            if ('xmlns' === $aName) {
                $needsWorkaround = $aVal;
                array_unshift($this->nsStack, array(
                    '' => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            } elseif ('xmlns' === (($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '')) {
                array_unshift($this->nsStack, array(
                    substr($aName, $pos + 1) => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            }
        }
    }

    // A block-level tag closes the element that only allows inline children.
    if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
        $this->autoclose($this->onlyInline);
        $this->onlyInline = null;
    }

    try {
        $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

        if (false !== $needsWorkaround) {
            // The element is built by parsing a small XML snippet so that the
            // explicit default namespace is applied. Namespace URIs come
            // from attribute values in the input, so they are escaped before
            // being placed inside the quoted XML attributes.
            $xml = "<$lname xmlns=\"" . htmlspecialchars($needsWorkaround, ENT_QUOTES, 'UTF-8') . '" ';
            if (strlen($prefix) && isset($this->nsStack[0][$prefix])) {
                $xml .= "xmlns:$prefix=\"" . htmlspecialchars($this->nsStack[0][$prefix], ENT_QUOTES, 'UTF-8') . '"';
            }
            $xml .= '/>';

            $frag = new \DOMDocument('1.0', 'UTF-8');
            // If the snippet cannot be parsed (for instance, the tag name
            // is not a legal XML name) there is no element to import; route
            // that through the same fallback as any other illegal tag name.
            if (false === $frag->loadXML($xml) || null === $frag->documentElement) {
                throw new \DOMException("Unable to build element <$lname>.");
            }

            $ele = $this->doc->importNode($frag->documentElement, true);
        } else {
            if (!isset($this->nsStack[0][$prefix]) || ('' === $prefix && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                $ele = $this->doc->createElement($lname);
            } else {
                $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
            }
        }
    } catch (\DOMException $e) {
        $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
        $ele = $this->doc->createElement('invalid');
    }

    if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
        $this->onlyInline = $lname;
    }

    // When we add namespaces, we have to track them so that endTag() can
    // remove them again. On a void tag we do not need to care about
    // namespace nesting.
    if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
        // PHP tends to free the memory used by DOM; to avoid
        // spl_object_hash collisions we keep $ele alive by storing it
        // alongside the push count.
        // See https://bugs.php.net/bug.php?id=67459
        $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);

        // See https://github.com/facebook/hhvm/issues/2962
        if (defined('HHVM_VERSION')) {
            $ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
        }
    }

    foreach ($attributes as $aName => $aVal) {
        // xmlns attributes can't be set
        if ('xmlns' === $aName) {
            continue;
        }

        if ($this->insertMode === static::IM_IN_SVG) {
            $aName = Elements::normalizeSvgAttribute($aName);
        } elseif ($this->insertMode === static::IM_IN_MATHML) {
            $aName = Elements::normalizeMathMlAttribute($aName);
        }

        try {
            $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

            if ('xmlns' === $prefix) {
                $ele->setAttributeNS(self::NAMESPACE_XMLNS, $aName, $aVal);
            } elseif (false !== $prefix && isset($this->nsStack[0][$prefix])) {
                $ele->setAttributeNS($this->nsStack[0][$prefix], $aName, $aVal);
            } else {
                $ele->setAttribute($aName, $aVal);
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
            continue;
        }

        // This is necessary on a non-DTD schema, like HTML5.
        if ('id' === $aName) {
            $ele->setIdAttribute('id', true);
        }
    }

    if ($this->frag !== $this->current && $this->rules->hasRules($name)) {
        // Some elements have special processing rules. Handle those separately.
        $this->current = $this->rules->evaluate($ele, $this->current);
    } else {
        // Otherwise, it's a standard element.
        $this->current->appendChild($ele);

        if (!Elements::isA($name, Elements::VOID_TAG)) {
            $this->current = $ele;
        }

        // Self-closing tags should only be respected on foreign elements
        // (and are implied on void elements)
        // See: https://www.w3.org/TR/html5/syntax.html#start-tags
        if (Elements::isHtml5Element($name)) {
            $selfClosing = false;
        }
    }

    // This is sort of a last-ditch attempt to correct for cases where no head/body
    // elements are provided.
    if ($this->insertMode <= static::IM_BEFORE_HEAD && 'head' !== $name && 'html' !== $name) {
        $this->insertMode = static::IM_IN_BODY;
    }

    // On a void tag there will be no matching endTag() to pop the
    // namespaces pushed above, so remove them right away.
    if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
        for ($i = 0; $i < $pushes; ++$i) {
            array_shift($this->nsStack);
        }
    }

    if ($selfClosing) {
        $this->endTag($name);
    }

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
}
Chris@0 476
/**
 * Process an end tag.
 *
 * Pops any namespaces the current element pushed, closes the nearest open
 * element with a matching name and updates the insert mode.
 *
 * @param string $name Tag name as emitted by the parser.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title',
        ), true)) {
            // Open the implied <html> first; that moves the insert mode past
            // IM_BEFORE_HTML, so the nested call takes the normal path.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError('Illegal closing tag at global scope.');

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Key under which startTag() may have recorded namespace pushes for the
    // current element.
    // See https://github.com/facebook/hhvm/issues/2962
    if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
        $this->current->removeAttribute('html5-php-fake-id-attribute');
    } else {
        $cid = spl_object_hash($this->current);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ('html' === $lname) {
        return;
    }

    // Remove the namespaces defined by the current node.
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (!$this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case 'head':
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case 'body':
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case 'svg':
        // startTag() enters IM_IN_MATHML on 'math', so that is the name that
        // must leave it again; 'mathml' is kept for backward compatibility.
        case 'math':
        case 'mathml':
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
Chris@0 550
/**
 * Append a comment node to the current insertion point.
 *
 * @param string $cdata The comment text.
 */
public function comment($cdata)
{
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
Chris@0 557
/**
 * Append a text node to the current insertion point.
 *
 * Text that arrives before the head has been reached is dropped; a parse
 * error is recorded for it unless it consists only of " \t\n\r\f".
 *
 * @param string $data The character data.
 */
public function text($data)
{
    // XXX: Hmmm.... should we really be this strict?
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored but no mention of a parse error. This is
        // practical as most documents contain these characters. Other text is not
        // expected here so recording a parse error is necessary.
        $dataTmp = trim($data, " \t\n\r\f");
        // Compare against '' explicitly: empty() would also treat the text
        // "0" as empty and silently skip the error.
        if ('' !== $dataTmp) {
            $this->parseError('Unexpected text. Ignoring: ' . $dataTmp);
        }

        return;
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
Chris@0 578
/**
 * Handle the end of input.
 *
 * Nothing is done here at present.
 */
public function eof()
{
    // If the $current isn't the $root, do we need to do anything?
}
Chris@0 583
/**
 * Record a parse error.
 *
 * The error is stored as a formatted string; nothing is thrown.
 *
 * @param string $msg  Description of the problem.
 * @param int    $line Line number reported in the message.
 * @param int    $col  Column number reported in the message.
 */
public function parseError($msg, $line = 0, $col = 0)
{
    $formatted = sprintf('Line %d, Col %d: %s', $line, $col, $msg);
    array_push($this->errors, $formatted);
}
Chris@0 588
/**
 * Get the parse errors recorded so far.
 *
 * @return array List of formatted error strings.
 */
public function getErrors()
{
    return $this->errors;
}
Chris@0 593
/**
 * Append a CDATA section to the current insertion point.
 *
 * @param string $data The CDATA content.
 */
public function cdata($data)
{
    $this->current->appendChild($this->doc->createCDATASection($data));
}
Chris@0 599
/**
 * Handle a processing instruction.
 *
 * An initial XML declaration is ignored. If an InstructionProcessor has
 * been registered it handles the instruction; otherwise a plain PI node is
 * appended to the current insertion point.
 *
 * @param string      $name The PI target.
 * @param string|null $data The PI data.
 */
public function processingInstruction($name, $data = null)
{
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode === static::IM_INITIAL && 'xml' === strtolower($name)) {
        return;
    }

    if (!($this->processor instanceof InstructionProcessor)) {
        // No processor registered: this is just a dumb PI element.
        $this->current->appendChild($this->doc->createProcessingInstruction($name, $data));

        return;
    }

    // Important: The processor may modify the current DOM tree however it sees fit.
    // A non-empty result becomes the new insertion point.
    $replacement = $this->processor->process($this->current, $name, $data);
    if ($replacement) {
        $this->current = $replacement;
    }
}
Chris@0 622
Chris@0 623 // ==========================================================================
Chris@0 624 // UTILITIES
Chris@0 625 // ==========================================================================
Chris@0 626
/**
 * Apply normalization rules to a tag name.
 * See sections 2.9 and 8.1.2.
 *
 * Currently the name is returned unchanged. An earlier approach stripped
 * any "prefix:" part from the name, but section 2.9 suggests that this
 * should not be done.
 *
 * @param string $tagName
 *
 * @return string The normalized tag name.
 */
protected function normalizeTagName($tagName)
{
    return $tagName;
}
Chris@0 642
/**
 * Placeholder for quirks-mode tree resolution.
 *
 * @param string $name
 *
 * @throws \Exception Always; this is not implemented.
 */
protected function quirksTreeResolver($name)
{
    throw new \Exception('Not implemented.');
}
Chris@0 647
/**
 * Climb the tree from the current node and close the nearest element named $tagName.
 *
 * "Closing" means moving the insertion point to that element's parent. The
 * climb stops as soon as a non-element node is reached.
 *
 * @param string $tagName
 *
 * @return bool True if a matching element was found and closed.
 */
protected function autoclose($tagName)
{
    $node = $this->current;

    while (true) {
        if (XML_ELEMENT_NODE !== $node->nodeType) {
            return false;
        }

        if ($node->tagName === $tagName) {
            $this->current = $node->parentNode;

            return true;
        }

        $node = $node->parentNode;
        if (!$node) {
            return false;
        }
    }
}
Chris@0 671
/**
 * Check whether an element named $tagName is open at or above the current node.
 *
 * The walk starts at $this->current and continues upward for as long as
 * the node being inspected is an element.
 *
 * @param string $tagName
 *
 * @return bool
 */
protected function isAncestor($tagName)
{
    for ($node = $this->current; XML_ELEMENT_NODE === $node->nodeType; $node = $node->parentNode) {
        if ($node->tagName === $tagName) {
            return true;
        }
    }

    return false;
}
Chris@0 694
/**
 * Returns true if the current insertion point is an element of the given tagname.
 *
 * The current insertion point is the node that would become the parent of
 * the next node added. It may also be the document or a fragment, neither
 * of which has a tag name; in that case the answer is false.
 *
 * @param string $tagName
 *
 * @return bool
 */
protected function isParent($tagName)
{
    return $this->current instanceof \DOMElement && $this->current->tagName === $tagName;
}
Chris@0 706 }