annotate vendor/masterminds/html5/src/HTML5/Serializer/OutputRules.php @ 0:4c8ae668cc8c

Initial import (non-working)
author Chris Cannam
date Wed, 29 Nov 2017 16:09:58 +0000
parents
children 129ea1e6d783
rev   line source
Chris@0 1 <?php
Chris@0 2 /**
Chris@0 3 * @file
Chris@0 4 * The rules for generating output in the serializer.
Chris@0 5 *
Chris@0 6 * These output rules are likely to generate output similar to the document that
Chris@0 7 * was parsed. It is not intended to output exactly the document that was parsed.
Chris@0 8 */
Chris@0 9 namespace Masterminds\HTML5\Serializer;
Chris@0 10
Chris@0 11 use Masterminds\HTML5\Elements;
Chris@0 12
Chris@0 13 /**
Chris@0 14 * Generate the output html5 based on element rules.
Chris@0 15 */
Chris@0 16 class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
Chris@0 17 {
/**
 * Namespace URI of HTML elements.
 *
 * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0
 */
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';

/** Namespace URI of MathML elements. */
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';

/** Namespace URI of SVG elements. */
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';

/** Namespace URI of XLink attributes. */
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';

/** Namespace URI bound to the "xml" prefix. */
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';

/** Namespace URI bound to the "xmlns" prefix. */
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';

/**
 * Namespace URIs that are never written out as explicit namespace
 * declarations by namespaceAttrs().
 *
 * @var array
 */
protected $implicitNamespaces = array(
    self::NAMESPACE_HTML,
    self::NAMESPACE_SVG,
    self::NAMESPACE_MATHML,
    self::NAMESPACE_XML,
    self::NAMESPACE_XMLNS,
);

/** Output mode: serializing regular HTML content. */
const IM_IN_HTML = 1;

/** Output mode: serializing inside an svg element. */
const IM_IN_SVG = 2;

/** Output mode: serializing inside a math element. */
const IM_IN_MATHML = 3;

/**
 * Cached result of the ENT_HTML5 availability check done in the constructor.
 *
 * @var boolean
 */
private $hasHTML5 = false;

/**
 * Traverser that drives this rule set; injected through setTraverser().
 */
protected $traverser;

/**
 * Whether enc() converts text to named character references (true) or only
 * applies the minimal escaping of escape() (false).
 */
protected $encode = false;

/**
 * Output handle that wr() and nl() pass to fwrite().
 */
protected $out;

/**
 * Current output mode; one of the IM_IN_* constants.
 */
protected $outputMode;

/**
 * Lazily created \DOMXPath instance, reused between calls.
 */
private $xpath;
Chris@0 67
/**
 * Rules describing attributes that must always be written as name="value",
 * even when their value is the empty string.
 *
 * Each rule is an array; nonBooleanAttribute() reads the following optional
 * keys, and an attribute matches a rule only if every key present matches:
 *
 *  - 'nodeNamespace': namespace URI of the owning element.
 *  - 'attNamespace':  namespace URI of the attribute (note the spelling; this
 *                     is the key nonBooleanAttribute() looks up).
 *  - 'nodeName':      local name of the owning element, string or array.
 *  - 'attrName':      local name of the attribute, string or array.
 *  - 'xpath':         XPath expression evaluated with the attribute as the
 *                     context node; must evaluate to a truthy value.
 *  - 'prefixes':      prefix => namespace URI map registered before the
 *                     'xpath' expression is evaluated.
 *
 * Example:
 *
 *     array(
 *         'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
 *         'nodeName'      => array('img', 'a'),
 *         'attrName'      => array('title', 'alt'),
 *     )
 *
 * @var array
 */
protected $nonBooleanAttributes = array(
    // Well-known HTML attributes that always carry a value.
    array(
        'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
        'attrName' => array(
            'href',
            'hreflang',
            'http-equiv',
            'icon',
            'id',
            'keytype',
            'kind',
            'label',
            'lang',
            'language',
            'list',
            'maxlength',
            'media',
            'method',
            'name',
            'placeholder',
            'rel',
            'rows',
            'rowspan',
            'sandbox',
            'spellcheck',
            'scope',
            'seamless',
            'shape',
            'size',
            'sizes',
            'span',
            'src',
            'srcdoc',
            'srclang',
            'srcset',
            'start',
            'step',
            'style',
            'summary',
            'tabindex',
            'target',
            'title',
            'type',
            'value',
            'width',
            'border',
            'charset',
            'cite',
            'class',
            'code',
            'codebase',
            'color',
            'cols',
            'colspan',
            'content',
            'coords',
            'data',
            'datetime',
            'default',
            'dir',
            'dirname',
            'enctype',
            'for',
            'form',
            'formaction',
            'headers',
            'height',
            'accept',
            'accept-charset',
            'accesskey',
            'action',
            'align',
            'alt',
            'bgcolor',
        ),
    ),
    // Any data-* attribute on an HTML element.
    array(
        'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
        'xpath' => 'starts-with(local-name(), \'data-\')',
    ),
);
Chris@0 157
/**
 * Doctype declaration written by doctype() at the start of a document.
 */
const DOCTYPE = '<!DOCTYPE html>';
Chris@0 159
/**
 * Set up the rule set for a given output handle.
 *
 * @param mixed $output
 *   Handle that wr() and nl() pass to fwrite().
 * @param array $options
 *   Optional settings. Only 'encode_entities' is read here; when set, its
 *   value is stored as the encode flag used by enc().
 */
public function __construct($output, $options = array())
{
    $this->out = $output;
    $this->outputMode = static::IM_IN_HTML;

    if (isset($options['encode_entities'])) {
        $this->encode = $options['encode_entities'];
    }

    // ENT_HTML5 is not used on HHVM,
    // see https://github.com/facebook/hhvm/issues/2727
    $this->hasHTML5 = !defined('HHVM_VERSION') && defined('ENT_HTML5');
}
/**
 * Register an additional non-boolean attribute rule.
 *
 * The rule is appended to the list consulted by nonBooleanAttribute().
 *
 * @param array $rule
 *   The rule to append.
 */
public function addRule(array $rule)
{
    array_push($this->nonBooleanAttributes, $rule);
}
Chris@0 176
/**
 * Inject the traverser that walks the DOM on behalf of these rules.
 *
 * @param \Masterminds\HTML5\Serializer\Traverser $traverser
 *   The traverser to delegate node and child traversal to.
 *
 * @return $this
 *   This rule set, so calls can be chained.
 */
public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser)
{
    $this->traverser = $traverser;

    return $this;
}
Chris@0 183
/**
 * Write a whole document.
 *
 * The doctype is always written. The document's child nodes, followed by a
 * trailing newline, are only written when the document has a root element.
 *
 * @param \DOMDocument $dom
 *   The document to serialize.
 */
public function document($dom)
{
    $this->doctype();

    if (!$dom->documentElement) {
        return;
    }

    foreach ($dom->childNodes as $child) {
        $this->traverser->node($child);
    }
    $this->nl();
}
Chris@0 194
/**
 * Write the doctype declaration followed by a newline.
 */
protected function doctype()
{
    $this->wr(static::DOCTYPE)->nl();
}
Chris@0 200
/**
 * Write an element: opening tag, children and (unless void) closing tag.
 *
 * Entering an svg or math element switches the output mode for the whole
 * subtree. The mode that was active before the element is restored only
 * after the closing tag has been handled, so that:
 *  - closeTag() still sees the SVG/MathML mode and does not emit a closing
 *    tag after an empty element that openTag() already wrote as " />";
 *  - a nested svg/math element does not force HTML mode onto the remaining
 *    children of an enclosing svg/math element.
 *
 * @param \DOMElement $ele
 *   The element to write.
 */
public function element($ele)
{
    // Per spec: if the element has a declared namespace in the HTML, MathML
    // or SVG namespaces, the local name is used instead of the tag name.
    $name = $this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName;

    // Remember the mode we came from so it can be restored afterwards.
    $previousMode = $this->outputMode;

    if ($name === 'svg') {
        $this->outputMode = static::IM_IN_SVG;
        $name = Elements::normalizeSvgElement($name);
    } elseif ($name === 'math') {
        $this->outputMode = static::IM_IN_MATHML;
    }

    $this->openTag($ele);

    if (Elements::isA($name, Elements::TEXT_RAW)) {
        // Raw text elements: character data is written verbatim (no
        // escaping); element children are serialized recursively.
        foreach ($ele->childNodes as $child) {
            if ($child instanceof \DOMCharacterData) {
                $this->wr($child->data);
            } elseif ($child instanceof \DOMElement) {
                $this->element($child);
            }
        }
    } elseif ($ele->hasChildNodes()) {
        $this->traverser->children($ele->childNodes);
    }

    // If not unary, add a closing tag.
    if (!Elements::isA($name, Elements::VOID_TAG)) {
        $this->closeTag($ele);
    }

    // Leave the SVG/MathML special handling (no-op for other elements).
    $this->outputMode = $previousMode;
}
Chris@0 247
/**
 * Write a text node.
 *
 * Text whose parent is a raw text element (Elements::TEXT_RAW) is written
 * unchanged; all other text is passed through enc() first.
 *
 * @param \DOMText $ele
 *   The text node to write.
 */
public function text($ele)
{
    $parent = isset($ele->parentNode) ? $ele->parentNode : null;

    $insideRawText = $parent !== null
        && isset($parent->tagName)
        && Elements::isA($parent->localName, Elements::TEXT_RAW);

    // FIXME: The encoded branch probably needs some flags set.
    $this->wr($insideRawText ? $ele->data : $this->enc($ele->data));
}
Chris@0 264
/**
 * Write a CDATA section.
 *
 * The markup is produced by the owner document's saveXML().
 *
 * @param \DOMCdataSection $ele
 *   The CDATA node to write.
 */
public function cdata($ele)
{
    $markup = $ele->ownerDocument->saveXML($ele);
    $this->wr($markup);
}
Chris@0 270
/**
 * Write a comment node.
 *
 * The markup is produced by the owner document's saveXML(), which yields the
 * same output as writing the comment delimiters around the node data by hand.
 *
 * @param \DOMComment $ele
 *   The comment node to write.
 */
public function comment($ele)
{
    $markup = $ele->ownerDocument->saveXML($ele);
    $this->wr($markup);
}
Chris@0 277
/**
 * Write a processing instruction.
 *
 * The output is the opening delimiter, the target, a single space, the data
 * and the closing delimiter, in that order.
 *
 * @param \DOMProcessingInstruction $ele
 *   The processing instruction to write.
 */
public function processorInstruction($ele)
{
    $this->wr('<?');
    $this->wr($ele->target);
    $this->wr(' ');
    $this->wr($ele->data);
    $this->wr('?>');
}
/**
 * Write the namespace declarations of an element.
 *
 * Only namespace nodes whose value differs from every namespace node of the
 * parent element are considered, and namespaces listed in
 * $implicitNamespaces are skipped.
 *
 * The namespace URI is escaped for use inside a double-quoted attribute
 * value, so a URI containing '"' or '&' cannot break the generated markup.
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function namespaceAttrs($ele)
{
    // The cached XPath object is bound to one document; rebuild it when the
    // element belongs to a different one.
    if (!$this->xpath || $this->xpath->document !== $ele->ownerDocument) {
        $this->xpath = new \DOMXPath($ele->ownerDocument);
    }

    $declarations = $this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele);
    foreach ($declarations as $nsNode) {
        if (in_array($nsNode->nodeValue, $this->implicitNamespaces, true)) {
            continue;
        }

        $this->wr(' ')
            ->wr($nsNode->nodeName)
            ->wr('="')
            ->wr($this->escape($nsNode->nodeValue, true))
            ->wr('"');
    }
}
Chris@0 305
/**
 * Write the opening tag of an element, including its attributes and
 * namespace declarations.
 *
 * The local name is used when the traverser reports the element as local;
 * otherwise the qualified tag name is used (8.3).
 *
 * In HTML mode the tag always ends with ">". In any other mode (SVG, MathML
 * or XML embedded content) an element without children is written as
 * self-closing (" />").
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function openTag($ele)
{
    $this->wr('<');
    $tagName = $this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName;
    $this->wr($tagName);

    $this->attrs($ele);
    $this->namespaceAttrs($ele);

    $selfClosing = $this->outputMode != static::IM_IN_HTML && !$ele->hasChildNodes();
    $this->wr($selfClosing ? ' />' : '>');
}
Chris@0 336
/**
 * Write the attributes of an element.
 *
 * Each attribute is written as ' name="value"'. The ="value" part is left
 * out (value-less attribute) only when the encoded value is empty and no
 * rule in $nonBooleanAttributes matches the attribute.
 *
 * In SVG and MathML mode the attribute name is normalized through the
 * corresponding Elements helper before it is written.
 *
 * @param \DOMElement $ele
 *   The element whose attributes are written.
 *
 * @return $this
 *   This rule set, on every code path, so calls can be chained.
 */
protected function attrs($ele)
{
    // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
    if (!$ele->hasAttributes()) {
        return $this;
    }

    foreach ($ele->attributes as $node) {
        $val = $this->enc($node->value, true);

        // XXX: The spec says that we need to ensure that anything in
        // the XML, XMLNS, or XLink NS's should use the canonical
        // prefix. It seems that DOM does this for us already, but there
        // may be exceptions.
        $name = $node->nodeName;

        // Special handling for attributes in SVG and MathML.
        if ($this->outputMode == static::IM_IN_SVG) {
            $name = Elements::normalizeSvgAttribute($name);
        } elseif ($this->outputMode == static::IM_IN_MATHML) {
            $name = Elements::normalizeMathMlAttribute($name);
        }

        $this->wr(' ')->wr($name);

        if ((isset($val) && $val !== '') || $this->nonBooleanAttribute($node)) {
            $this->wr('="')->wr($val)->wr('"');
        }
    }

    return $this;
}
Chris@0 373
Chris@0 374
/**
 * Tell whether an attribute must be written with an explicit value, even
 * when that value is empty.
 *
 * The attribute is checked against every rule in $nonBooleanAttributes; the
 * first rule whose present keys all match wins. Recognized keys:
 *
 *  - 'nodeNamespace': namespace URI of the owning element.
 *  - 'attrNamespace': namespace URI of the attribute. The historical
 *                     spelling 'attNamespace' is still honoured; when both
 *                     are present 'attrNamespace' takes precedence.
 *  - 'nodeName':      local name of the owning element, string or array.
 *  - 'attrName':      local name of the attribute, string or array.
 *  - 'xpath':         XPath expression evaluated with the attribute as
 *                     context node; must evaluate to a truthy value.
 *  - 'prefixes':      prefix => namespace URI map registered before the
 *                     'xpath' expression is evaluated.
 *
 * @param \DOMAttr $attr
 *   The attribute to check.
 *
 * @return boolean
 *   True if at least one rule matches, false otherwise.
 */
protected function nonBooleanAttribute(\DOMAttr $attr)
{
    $ele = $attr->ownerElement;

    foreach ($this->nonBooleanAttributes as $rule) {
        if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $ele->namespaceURI) {
            continue;
        }

        // Accept the documented key as well as the legacy misspelling.
        $attrNamespace = null;
        if (isset($rule['attrNamespace'])) {
            $attrNamespace = $rule['attrNamespace'];
        } elseif (isset($rule['attNamespace'])) {
            $attrNamespace = $rule['attNamespace'];
        }
        if ($attrNamespace !== null && $attrNamespace !== $attr->namespaceURI) {
            continue;
        }

        // A scalar name is treated as a one-element list of names.
        if (isset($rule['nodeName']) && !in_array($ele->localName, (array) $rule['nodeName'], true)) {
            continue;
        }
        if (isset($rule['attrName']) && !in_array($attr->localName, (array) $rule['attrName'], true)) {
            continue;
        }

        if (isset($rule['xpath'])) {
            $xp = $this->getXPath($attr);
            if (isset($rule['prefixes'])) {
                foreach ($rule['prefixes'] as $nsPrefix => $ns) {
                    $xp->registerNamespace($nsPrefix, $ns);
                }
            }
            if (!$xp->evaluate($rule['xpath'], $attr)) {
                continue;
            }
        }

        return true;
    }

    return false;
}
Chris@0 416
/**
 * Return an XPath object bound to the document that owns the given node.
 *
 * The instance is cached and reused as long as it belongs to the same
 * document; a new one is created when the node comes from another document,
 * since a \DOMXPath can only evaluate against the document it was built for.
 *
 * @param \DOMNode $node
 *   A node of the document to query.
 *
 * @return \DOMXPath
 *   XPath object for the node's owner document.
 */
private function getXPath(\DOMNode $node)
{
    $document = $node->ownerDocument;

    if (!$this->xpath || $this->xpath->document !== $document) {
        $this->xpath = new \DOMXPath($document);
    }

    return $this->xpath;
}
Chris@0 423
/**
 * Write the closing tag of an element.
 *
 * The local name is used when the traverser reports the element as local;
 * otherwise the qualified tag name is used (8.3).
 *
 * Nothing is written for a childless element outside HTML mode, because
 * openTag() writes such an element as self-closing.
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function closeTag($ele)
{
    if ($this->outputMode != static::IM_IN_HTML && !$ele->hasChildNodes()) {
        return;
    }

    $this->wr('</');
    $tagName = $this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName;
    $this->wr($tagName)->wr('>');
}
Chris@0 439
/**
 * Write to the output.
 *
 * @param string $text
 *   The string to put into the output.
 *
 * @return $this
 *   This rule set (not the traverser), so calls can be chained.
 */
protected function wr($text)
{
    fwrite($this->out, $text);

    return $this;
}
Chris@0 453
/**
 * Write a new line (PHP_EOL) to the output.
 *
 * @return $this
 *   This rule set (not the traverser), so calls can be chained.
 */
protected function nl()
{
    fwrite($this->out, PHP_EOL);

    return $this;
}
Chris@0 464
/**
 * Encode text.
 *
 * When the encode flag is false (the default), the text is only escaped as
 * described in section 8.3 of the html5 spec; see escape() for details.
 *
 * When the encode flag is true, the text is converted to named character
 * references where appropriate (section 8.1.4 of the html5 spec; the
 * references themselves are listed in section 8.5). This converts many
 * common characters such as "+", "." and "#", not only &'<>". Note that the
 * $attribute flag is not consulted in this case.
 *
 * PHP 5.4+ provides native html5 entity support (ENT_HTML5); without it a
 * lookup table from the HTML5Entities class is used instead.
 *
 * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references
 *
 * @todo Use the Entities class in php 5.3 to have html5 entities.
 *
 * @param string $text
 *   Text to encode.
 * @param boolean $attribute
 *   True if we are encoding an attribute, false otherwise.
 *
 * @return string
 *   The encoded text.
 */
protected function enc($text, $attribute = false)
{
    // Escape the text rather than convert to named character references.
    if (!$this->encode) {
        return $this->escape($text, $attribute);
    }

    // Without native html5 entity support, translate manually.
    if (!$this->hasHTML5) {
        return strtr($text, \Masterminds\HTML5\Serializer\HTML5Entities::$map);
    }

    // Native html5 entity conversion; existing entities are not re-encoded.
    return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false);
}
Chris@0 513
/**
 * Escape text.
 *
 * According to the html5 spec section 8.3 "Serializing HTML fragments", text
 * within tags other than style, script, xmp, iframe, noembed and noframes
 * needs to be escaped.
 *
 * In both modes "&" becomes "&amp;" and the UTF-8 encoded no-break space
 * becomes "&nbsp;". In attribute mode '"' additionally becomes "&quot;";
 * outside attribute mode "<" and ">" become "&lt;" and "&gt;".
 *
 * htmlspecialchars() is deliberately not used: it escapes, but does not
 * handle no-break spaces.
 *
 * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
 *
 * @param string $text
 *   Text to escape.
 * @param boolean $attribute
 *   True if we are escaping an attribute value, false otherwise.
 *
 * @return string
 *   The escaped text.
 */
protected function escape($text, $attribute = false)
{
    // Replacements shared by both modes.
    $replace = array(
        '&' => '&amp;',
        "\xc2\xa0" => '&nbsp;',
    );

    if ($attribute) {
        $replace['"'] = '&quot;';
    } else {
        $replace['<'] = '&lt;';
        $replace['>'] = '&gt;';
    }

    return strtr($text, $replace);
}
Chris@0 556 }