annotate vendor/masterminds/html5/src/HTML5/Serializer/OutputRules.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 129ea1e6d783
children
rev   line source
Chris@0 1 <?php
Chris@0 2 /**
Chris@0 3 * @file
Chris@0 4 * The rules for generating output in the serializer.
Chris@0 5 *
Chris@0 6 * These output rules are likely to generate output similar to the document that
Chris@0 7 * was parsed. It is not intended to output exactly the document that was parsed.
Chris@0 8 */
Chris@17 9
Chris@0 10 namespace Masterminds\HTML5\Serializer;
Chris@0 11
Chris@0 12 use Masterminds\HTML5\Elements;
Chris@0 13
/**
 * Generate the output html5 based on element rules.
 *
 * Every rule method writes its serialized form straight to the stream that
 * was handed to the constructor.
 */
class OutputRules implements RulesInterface
{
    /**
     * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0.
     */
    const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';

    const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';

    const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';

    const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';

    const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';

    const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';

    /**
     * Namespace URIs that namespaceAttrs() never writes out as explicit
     * namespace declarations.
     *
     * @var array
     */
    protected $implicitNamespaces = array(
        self::NAMESPACE_HTML,
        self::NAMESPACE_SVG,
        self::NAMESPACE_MATHML,
        self::NAMESPACE_XML,
        self::NAMESPACE_XMLNS,
    );

    /**
     * Output modes stored in $outputMode.
     */
    const IM_IN_HTML = 1;

    const IM_IN_SVG = 2;

    const IM_IN_MATHML = 3;

    /**
     * Used as cache to detect if is available ENT_HTML5.
     *
     * @var bool
     */
    private $hasHTML5 = false;

    /**
     * The traverser set through setTraverser().
     */
    protected $traverser;

    /**
     * Value of the 'encode_entities' option; when truthy, enc() converts
     * text to named character references instead of minimal escaping.
     */
    protected $encode = false;

    /**
     * The stream every rule writes to (see wr() and nl()).
     */
    protected $out;

    /**
     * One of the IM_IN_* constants.
     */
    protected $outputMode;

    /**
     * Cached \DOMXPath instance, see getXPath().
     */
    private $xpath;

    /**
     * Rules describing attributes that must always be written as
     * name="value", even when the value is empty.
     *
     * Each rule is an array; every key is optional and a rule matches when
     * all the keys it defines match (see nonBooleanAttribute()):
     *  - 'nodeNamespace': namespace URI of the owner element.
     *  - 'attrNamespace': namespace URI of the attribute ('attNamespace' is
     *    accepted as a legacy spelling).
     *  - 'nodeName': local name of the owner element, or an array of names.
     *  - 'attrName': local name of the attribute, or an array of names.
     *  - 'xpath': XPath expression evaluated with the attribute as context.
     *  - 'prefixes': array of prefix => namespace URI registered before the
     *    'xpath' expression is evaluated.
     *
     * @var array
     */
    protected $nonBooleanAttributes = array(
        array(
            'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
            'attrName' => array(
                'href',
                'hreflang',
                'http-equiv',
                'icon',
                'id',
                'keytype',
                'kind',
                'label',
                'lang',
                'language',
                'list',
                'maxlength',
                'media',
                'method',
                'name',
                'placeholder',
                'rel',
                'rows',
                'rowspan',
                'sandbox',
                'spellcheck',
                'scope',
                'seamless',
                'shape',
                'size',
                'sizes',
                'span',
                'src',
                'srcdoc',
                'srclang',
                'srcset',
                'start',
                'step',
                'style',
                'summary',
                'tabindex',
                'target',
                'title',
                'type',
                'value',
                'width',
                'border',
                'charset',
                'cite',
                'class',
                'code',
                'codebase',
                'color',
                'cols',
                'colspan',
                'content',
                'coords',
                'data',
                'datetime',
                'default',
                'dir',
                'dirname',
                'enctype',
                'for',
                'form',
                'formaction',
                'headers',
                'height',
                'accept',
                'accept-charset',
                'accesskey',
                'action',
                'align',
                'alt',
                'bgcolor',
            ),
        ),
        array(
            'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
            'xpath' => 'starts-with(local-name(), \'data-\')',
        ),
    );

    const DOCTYPE = '<!DOCTYPE html>';

    /**
     * @param resource $output  Stream the serialized markup is written to.
     * @param array    $options Supported key: 'encode_entities' (see enc()).
     */
    public function __construct($output, $options = array())
    {
        if (isset($options['encode_entities'])) {
            $this->encode = $options['encode_entities'];
        }

        $this->outputMode = static::IM_IN_HTML;
        $this->out = $output;

        // If HHVM, see https://github.com/facebook/hhvm/issues/2727
        $this->hasHTML5 = defined('ENT_HTML5') && !defined('HHVM_VERSION');
    }

    /**
     * Register an additional non-boolean attribute rule.
     *
     * @param array $rule A rule in the format described on $nonBooleanAttributes.
     */
    public function addRule(array $rule)
    {
        $this->nonBooleanAttributes[] = $rule;
    }

    /**
     * Set the traverser used to walk child nodes.
     *
     * @param Traverser $traverser
     *
     * @return $this
     */
    public function setTraverser(Traverser $traverser)
    {
        $this->traverser = $traverser;

        return $this;
    }

    /**
     * Write the doctype followed by every child node of the document.
     *
     * Nothing but the doctype is written when the document has no
     * document element.
     *
     * @param \DOMNode $dom The document to write.
     */
    public function document($dom)
    {
        $this->doctype();
        if ($dom->documentElement) {
            foreach ($dom->childNodes as $node) {
                $this->traverser->node($node);
            }
            $this->nl();
        }
    }

    /**
     * Write the DOCTYPE constant and a new line.
     */
    protected function doctype()
    {
        $this->wr(static::DOCTYPE);
        $this->nl();
    }

    /**
     * Write an element: opening tag, children and (unless void) closing tag.
     *
     * @param \DOMNode $ele The element to write.
     */
    public function element($ele)
    {
        $name = $ele->tagName;

        // Per spec:
        // If the element has a declared namespace in the HTML, MathML or
        // SVG namespaces, we use the lname instead of the tagName.
        if ($this->traverser->isLocalElement($ele)) {
            $name = $ele->localName;
        }

        // Remember the mode of the surrounding content so it can be restored
        // once this element has been completely written.
        $previousMode = $this->outputMode;

        // If we are in SVG or MathML there is special handling.
        // Using if/elseif instead of switch because it's faster in PHP.
        if ('svg' == $name) {
            $this->outputMode = static::IM_IN_SVG;
            $name = Elements::normalizeSvgElement($name);
        } elseif ('math' == $name) {
            $this->outputMode = static::IM_IN_MATHML;
        }

        $this->openTag($ele);
        if (Elements::isA($name, Elements::TEXT_RAW)) {
            // Raw text elements: character data is written without escaping.
            foreach ($ele->childNodes as $child) {
                if ($child instanceof \DOMCharacterData) {
                    $this->wr($child->data);
                } elseif ($child instanceof \DOMElement) {
                    $this->element($child);
                }
            }
        } elseif ($ele->hasChildNodes()) {
            // Handle children.
            $this->traverser->children($ele->childNodes);
        }

        // If not unary, add a closing tag.
        if (!Elements::isA($name, Elements::VOID_TAG)) {
            $this->closeTag($ele);
        }

        // Close out the SVG or MathML special handling. This must happen
        // after closeTag(): openTag() self-closes an empty svg/math element
        // and closeTag() only skips the end tag while still in that mode.
        // Restoring the previous mode (rather than forcing HTML) keeps the
        // remainder of an enclosing svg/math subtree in its own mode.
        if ('svg' == $name || 'math' == $name) {
            $this->outputMode = $previousMode;
        }
    }

    /**
     * Write a text node.
     *
     * Text whose parent is a raw text element is written as is; any other
     * text goes through enc().
     *
     * @param \DOMText $ele The text node to write.
     */
    public function text($ele)
    {
        if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->localName, Elements::TEXT_RAW)) {
            $this->wr($ele->data);

            return;
        }

        // FIXME: This probably needs some flags set.
        $this->wr($this->enc($ele->data));
    }

    /**
     * Write a CDATA section using the owner document's XML serialization.
     *
     * @param \DOMNode $ele The CDATA node to write.
     */
    public function cdata($ele)
    {
        // This encodes CDATA.
        $this->wr($ele->ownerDocument->saveXML($ele));
    }

    /**
     * Write a comment using the owner document's XML serialization.
     *
     * @param \DOMNode $ele The comment node to write.
     */
    public function comment($ele)
    {
        // These produce identical output.
        // $this->wr('<!--')->wr($ele->data)->wr('-->');
        $this->wr($ele->ownerDocument->saveXML($ele));
    }

    /**
     * Write a processing instruction as <?target data?>.
     *
     * @param \DOMNode $ele The processing instruction to write.
     */
    public function processorInstruction($ele)
    {
        $this->wr('<?')
            ->wr($ele->target)
            ->wr(' ')
            ->wr($ele->data)
            ->wr('?>');
    }

    /**
     * Write the namespace attributes.
     *
     * Only namespaces that differ from those of the parent element and that
     * are not listed in $implicitNamespaces are written.
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function namespaceAttrs($ele)
    {
        $xpath = $this->getXPath($ele);

        foreach ($xpath->query('namespace::*[not(.=../../namespace::*)]', $ele) as $nsNode) {
            if (in_array($nsNode->nodeValue, $this->implicitNamespaces, true)) {
                continue;
            }

            // The URI ends up inside a double-quoted attribute value, so it
            // needs the same minimal escaping as any other attribute value.
            $this->wr(' ')
                ->wr($nsNode->nodeName)
                ->wr('="')
                ->wr($this->escape($nsNode->nodeValue, true))
                ->wr('"');
        }
    }

    /**
     * Write the opening tag.
     *
     * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
     * qualified name (8.3).
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function openTag($ele)
    {
        $this->wr('<')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName);

        $this->attrs($ele);
        $this->namespaceAttrs($ele);

        if ($this->outputMode == static::IM_IN_HTML) {
            $this->wr('>');
        } elseif ($ele->hasChildNodes()) {
            // Not in html mode: we are in SVG, MathML, or XML embedded content.
            $this->wr('>');
        } else {
            // If there are no children this is self closing.
            $this->wr(' />');
        }
    }

    /**
     * Write the attributes of an element.
     *
     * An attribute is written as a bare name only when its value is empty
     * and no rule in $nonBooleanAttributes matches it.
     *
     * @param \DOMNode $ele The element being written.
     *
     * @return $this
     */
    protected function attrs($ele)
    {
        // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
        if (!$ele->hasAttributes()) {
            return $this;
        }

        foreach ($ele->attributes as $node) {
            $val = $this->enc($node->value, true);

            // XXX: The spec says that we need to ensure that anything in
            // the XML, XMLNS, or XLink NS's should use the canonical
            // prefix. It seems that DOM does this for us already, but there
            // may be exceptions.
            $name = $node->nodeName;

            // Special handling for attributes in SVG and MathML.
            // Using if/elseif instead of switch because it's faster in PHP.
            if ($this->outputMode == static::IM_IN_SVG) {
                $name = Elements::normalizeSvgAttribute($name);
            } elseif ($this->outputMode == static::IM_IN_MATHML) {
                $name = Elements::normalizeMathMlAttribute($name);
            }

            $this->wr(' ')->wr($name);

            if ((null !== $val && '' !== $val) || $this->nonBooleanAttribute($node)) {
                $this->wr('="')->wr($val)->wr('"');
            }
        }

        return $this;
    }

    /**
     * Tell whether an attribute matches one of the $nonBooleanAttributes rules.
     *
     * @param \DOMAttr $attr The attribute to check.
     *
     * @return bool True when at least one rule matches.
     */
    protected function nonBooleanAttribute(\DOMAttr $attr)
    {
        $ele = $attr->ownerElement;
        foreach ($this->nonBooleanAttributes as $rule) {
            if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $ele->namespaceURI) {
                continue;
            }

            // 'attrNamespace' is the documented key; 'attNamespace' is the
            // spelling this method historically read, kept as an alias.
            $attrNamespace = null;
            if (isset($rule['attrNamespace'])) {
                $attrNamespace = $rule['attrNamespace'];
            } elseif (isset($rule['attNamespace'])) {
                $attrNamespace = $rule['attNamespace'];
            }
            if (null !== $attrNamespace && $attrNamespace !== $attr->namespaceURI) {
                continue;
            }

            if (isset($rule['nodeName']) && !is_array($rule['nodeName']) && $rule['nodeName'] !== $ele->localName) {
                continue;
            }
            if (isset($rule['nodeName']) && is_array($rule['nodeName']) && !in_array($ele->localName, $rule['nodeName'], true)) {
                continue;
            }
            if (isset($rule['attrName']) && !is_array($rule['attrName']) && $rule['attrName'] !== $attr->localName) {
                continue;
            }
            if (isset($rule['attrName']) && is_array($rule['attrName']) && !in_array($attr->localName, $rule['attrName'], true)) {
                continue;
            }
            if (isset($rule['xpath'])) {
                $xp = $this->getXPath($attr);
                if (isset($rule['prefixes'])) {
                    foreach ($rule['prefixes'] as $nsPrefix => $ns) {
                        $xp->registerNamespace($nsPrefix, $ns);
                    }
                }
                if (!$xp->evaluate($rule['xpath'], $attr)) {
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Get a \DOMXPath bound to the document that owns the given node.
     *
     * The instance is cached and only rebuilt when the node belongs to a
     * different document than the cached one.
     *
     * @param \DOMNode $node A node attached to a document.
     *
     * @return \DOMXPath
     */
    private function getXPath(\DOMNode $node)
    {
        if (!$this->xpath || $this->xpath->document !== $node->ownerDocument) {
            $this->xpath = new \DOMXPath($node->ownerDocument);
        }

        return $this->xpath;
    }

    /**
     * Write the closing tag.
     *
     * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
     * qualified name (8.3).
     *
     * Outside html mode no closing tag is written for an element without
     * children, because openTag() already wrote it as self closing.
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function closeTag($ele)
    {
        if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) {
            $this->wr('</')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName)->wr('>');
        }
    }

    /**
     * Write to the output.
     *
     * @param string $text The string to put into the output
     *
     * @return $this
     */
    protected function wr($text)
    {
        fwrite($this->out, $text);

        return $this;
    }

    /**
     * Write a new line character.
     *
     * @return $this
     */
    protected function nl()
    {
        fwrite($this->out, PHP_EOL);

        return $this;
    }

    /**
     * Encode text.
     *
     * When encode is set to false, the default value, the text passed in is
     * escaped per section 8.3 of the html5 spec. For details on how text is
     * escaped see the escape() method.
     *
     * When encoding is set to true the text is converted to named character
     * references where appropriate. Section 8.1.4 Character references of the
     * html5 spec refers to using named character references. This is useful for
     * characters that can't otherwise legally be used in the text.
     *
     * The named character references are listed in section 8.5.
     *
     * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references True encoding will turn all named character references into their entities.
     *      This includes such characters as +.# and many other common ones. By default
     *      encoding here will just escape &'<>".
     *
     * Note, PHP 5.4+ has better html5 encoding.
     *
     * @todo Use the Entities class in php 5.3 to have html5 entities.
     *
     * @param string $text      Text to encode.
     * @param bool   $attribute True if we are encoding an attribute, false otherwise.
     *
     * @return string The encoded text.
     */
    protected function enc($text, $attribute = false)
    {
        // Escape the text rather than convert to named character references.
        if (!$this->encode) {
            return $this->escape($text, $attribute);
        }

        // If we are in PHP 5.4+ we can use the native html5 entity functionality to
        // convert the named character references.
        if ($this->hasHTML5) {
            return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false);
        }

        // If a version earlier than 5.4 html5 entities are not entirely handled.
        // This manually handles them.
        return strtr($text, HTML5Entities::$map);
    }

    /**
     * Escape text.
     *
     * According to the html5 spec section 8.3 Serializing HTML fragments, text
     * within tags that are not style, script, xmp, iframe, noembed, and noframes
     * need to be properly escaped.
     *
     * The & should be converted to &amp;, no breaking space unicode characters
     * converted to &nbsp;, when in attribute mode the " should be converted to
     * &quot;, and when not in attribute mode the < and > should be converted to
     * &lt; and &gt;.
     *
     * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
     *
     * @param string $text      Text to escape.
     * @param bool   $attribute True if we are escaping an attribute, false otherwise.
     *
     * @return string The escaped text.
     */
    protected function escape($text, $attribute = false)
    {
        // Not using htmlspecialchars because, while it does escaping, it doesn't
        // match the requirements of section 8.5. For example, it doesn't handle
        // non-breaking spaces.
        if ($attribute) {
            $replace = array(
                '"' => '&quot;',
                '&' => '&amp;',
                "\xc2\xa0" => '&nbsp;',
            );
        } else {
            $replace = array(
                '<' => '&lt;',
                '>' => '&gt;',
                '&' => '&amp;',
                "\xc2\xa0" => '&nbsp;',
            );
        }

        return strtr($text, $replace);
    }
}