'http://www.w3.org/1999/xhtml',
            'attrNamespace'=>'http://www.w3.org/1999/xhtml',

            'nodeName'=>'img', 'nodeName'=>array('img', 'a'),
            'attrName'=>'alt', 'attrName'=>array('title', 'alt'),
        ),
        */
        array(
            'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
            'attrName' => array('href',
                'hreflang',
                'http-equiv',
                'icon',
                'id',
                'keytype',
                'kind',
                'label',
                'lang',
                'language',
                'list',
                'maxlength',
                'media',
                'method',
                'name',
                'placeholder',
                'rel',
                'rows',
                'rowspan',
                'sandbox',
                'spellcheck',
                'scope',
                'seamless',
                'shape',
                'size',
                'sizes',
                'span',
                'src',
                'srcdoc',
                'srclang',
                'srcset',
                'start',
                'step',
                'style',
                'summary',
                'tabindex',
                'target',
                'title',
                'type',
                'value',
                'width',
                'border',
                'charset',
                'cite',
                'class',
                'code',
                'codebase',
                'color',
                'cols',
                'colspan',
                'content',
                'coords',
                'data',
                'datetime',
                'default',
                'dir',
                'dirname',
                'enctype',
                'for',
                'form',
                'formaction',
                'headers',
                'height',
                'accept',
                'accept-charset',
                'accesskey',
                'action',
                'align',
                'alt',
                'bgcolor',
            ),
        ),
        array(
            'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
            'xpath' => 'starts-with(local-name(), \'data-\')',
        ),
    );

    /**
     * The doctype line written at the top of every serialized document.
     */
    const DOCTYPE = '<!DOCTYPE html>';

    /**
     * Create a new set of output rules.
     *
     * @param resource $output  Stream that all serialized markup is written to
     *                          (it is handed to fwrite()).
     * @param array    $options Supported key: 'encode_entities' - when truthy,
     *                          text is converted to named character references
     *                          instead of receiving only the minimal escaping.
     */
    public function __construct($output, $options = array())
    {
        if (isset($options['encode_entities'])) {
            $this->encode = $options['encode_entities'];
        }

        $this->outputMode = static::IM_IN_HTML;
        $this->out = $output;

        // If HHVM, see https://github.com/facebook/hhvm/issues/2727
        $this->hasHTML5 = defined('ENT_HTML5') && !defined('HHVM_VERSION');
    }

    /**
     * Register an additional rule describing attributes that must always be
     * written as name="value", even when the value is empty.
     *
     * @param array $rule A rule in the same format as the built-in entries
     *                    (keys such as nodeNamespace, attrNamespace, nodeName,
     *                    attrName, xpath, prefixes).
     */
    public function addRule(array $rule)
    {
        $this->nonBooleanAttributes[] = $rule;
    }

    /**
     * Set the traverser that walks the DOM and calls back into these rules.
     *
     * @param Traverser $traverser The traverser to use.
     *
     * @return $this
     */
    public function setTraverser(Traverser $traverser)
    {
        $this->traverser = $traverser;

        return $this;
    }

    /**
     * Write a whole document: the doctype followed by every top-level node.
     *
     * Nothing beyond the doctype is written when the document has no
     * document element.
     *
     * @param \DOMDocument $dom The document to serialize.
     */
    public function document($dom)
    {
        $this->doctype();
        if ($dom->documentElement) {
            foreach ($dom->childNodes as $node) {
                $this->traverser->node($node);
            }
            $this->nl();
        }
    }

    /**
     * Write the doctype declaration followed by a newline.
     */
    protected function doctype()
    {
        $this->wr(static::DOCTYPE);
        $this->nl();
    }

    /**
     * Write an element: opening tag, children, and (unless void) closing tag.
     *
     * @param \DOMElement $ele The element to write.
     */
    public function element($ele)
    {
        $name = $ele->tagName;

        // Per spec:
        // If the element has a declared namespace in the HTML, MathML or
        // SVG namespaces, we use the lname instead of the tagName.
        if ($this->traverser->isLocalElement($ele)) {
            $name = $ele->localName;
        }

        // If we are in SVG or MathML there is special handling.
        // Using if/elseif instead of switch because it's faster in PHP.
        if ('svg' === $name) {
            $this->outputMode = static::IM_IN_SVG;
            $name = Elements::normalizeSvgElement($name);
        } elseif ('math' === $name) {
            $this->outputMode = static::IM_IN_MATHML;
        }

        $this->openTag($ele);
        if (Elements::isA($name, Elements::TEXT_RAW)) {
            // Raw-text elements: character data is emitted verbatim, without
            // any escaping; nested elements are still serialized normally.
            foreach ($ele->childNodes as $child) {
                if ($child instanceof \DOMCharacterData) {
                    $this->wr($child->data);
                } elseif ($child instanceof \DOMElement) {
                    $this->element($child);
                }
            }
        } else {
            // Handle children.
            if ($ele->hasChildNodes()) {
                $this->traverser->children($ele->childNodes);
            }

            // Close out the SVG or MathML special handling.
            if ('svg' === $name || 'math' === $name) {
                $this->outputMode = static::IM_IN_HTML;
            }
        }

        // If not unary, add a closing tag.
        if (!Elements::isA($name, Elements::VOID_TAG)) {
            $this->closeTag($ele);
        }
    }

    /**
     * Write a text node.
     *
     * Text whose parent is a raw-text element is written verbatim; all other
     * text goes through enc().
     *
     * @param \DOMText $ele The text node to write.
     */
    public function text($ele)
    {
        if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->localName, Elements::TEXT_RAW)) {
            $this->wr($ele->data);

            return;
        }

        // FIXME: This probably needs some flags set.
        $this->wr($this->enc($ele->data));
    }

    /**
     * Write a CDATA section, using the owner document's XML serialization.
     *
     * @param \DOMNode $ele The CDATA node to write.
     */
    public function cdata($ele)
    {
        // This encodes CDATA.
        $this->wr($ele->ownerDocument->saveXML($ele));
    }

    /**
     * Write a comment node, using the owner document's XML serialization.
     *
     * @param \DOMNode $ele The comment node to write.
     */
    public function comment($ele)
    {
        // These produce identical output.
        // $this->wr('<!--')->wr($ele->data)->wr('-->');
        $this->wr($ele->ownerDocument->saveXML($ele));
    }

    /**
     * Write a processing instruction as "<?target data?>".
     *
     * @param \DOMProcessingInstruction $ele The processing instruction to write.
     */
    public function processorInstruction($ele)
    {
        $this->wr('<?')
            ->wr($ele->target)
            ->wr(' ')
            ->wr($ele->data)
            ->wr('?>');
    }

    /**
     * Write the namespace attributes.
     *
     * Only namespaces newly declared on this element (not inherited from the
     * parent) and not listed in $this->implicitNamespaces are written.
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function namespaceAttrs($ele)
    {
        $xpath = $this->getXPath($ele);

        foreach ($xpath->query('namespace::*[not(.=../../namespace::*)]', $ele) as $nsNode) {
            if (!in_array($nsNode->nodeValue, $this->implicitNamespaces, true)) {
                // The URI lands inside a double-quoted attribute value, so it
                // needs the same minimal escaping as any other attribute.
                $this->wr(' ')
                    ->wr($nsNode->nodeName)
                    ->wr('="')
                    ->wr($this->escape($nsNode->nodeValue, true))
                    ->wr('"');
            }
        }
    }

    /**
     * Write the opening tag.
     *
     * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
     * qualified name (8.3).
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function openTag($ele)
    {
        $this->wr('<')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName);

        $this->attrs($ele);
        $this->namespaceAttrs($ele);

        if ($this->outputMode == static::IM_IN_HTML) {
            $this->wr('>');
        } // If we are not in html mode we are in SVG, MathML, or XML embedded content.
        else {
            if ($ele->hasChildNodes()) {
                $this->wr('>');
            } // If there are no children this is self closing.
            else {
                $this->wr(' />');
            }
        }
    }

    /**
     * Write the attributes of an element.
     *
     * An attribute is written as a bare name only when its value is empty and
     * no non-boolean rule matches it; otherwise it is written as name="value".
     *
     * @param \DOMNode $ele The element whose attributes are written.
     *
     * @return $this
     */
    protected function attrs($ele)
    {
        // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
        if (!$ele->hasAttributes()) {
            return $this;
        }

        // TODO: Currently, this always writes name="value", and does not do
        // value-less attributes.
        $map = $ele->attributes;
        $len = $map->length;
        for ($i = 0; $i < $len; ++$i) {
            $node = $map->item($i);
            $val = $this->enc($node->value, true);

            // XXX: The spec says that we need to ensure that anything in
            // the XML, XMLNS, or XLink NS's should use the canonical
            // prefix. It seems that DOM does this for us already, but there
            // may be exceptions.
            $name = $node->nodeName;

            // Special handling for attributes in SVG and MathML.
            // Using if/elseif instead of switch because it's faster in PHP.
            if ($this->outputMode == static::IM_IN_SVG) {
                $name = Elements::normalizeSvgAttribute($name);
            } elseif ($this->outputMode == static::IM_IN_MATHML) {
                $name = Elements::normalizeMathMlAttribute($name);
            }

            $this->wr(' ')->wr($name);

            if ((isset($val) && '' !== $val) || $this->nonBooleanAttribute($node)) {
                $this->wr('="')->wr($val)->wr('"');
            }
        }

        return $this;
    }

    /**
     * Decide whether an attribute must be written with an explicit value.
     *
     * Each registered rule is a set of optional filters; a rule matches when
     * none of the filters it defines rejects the attribute. The first matching
     * rule wins.
     *
     * Recognised rule keys: nodeNamespace, attrNamespace (attNamespace is
     * accepted as a legacy alias), nodeName and attrName (string or array of
     * strings), xpath (expression evaluated with the attribute as context) and
     * prefixes (prefix => namespace map registered before evaluating xpath).
     *
     * @param \DOMAttr $attr The attribute to test.
     *
     * @return bool True when a rule matches the attribute.
     */
    protected function nonBooleanAttribute(\DOMAttr $attr)
    {
        $ele = $attr->ownerElement;
        foreach ($this->nonBooleanAttributes as $rule) {
            if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $ele->namespaceURI) {
                continue;
            }

            // 'attrNamespace' is the documented key; 'attNamespace' is kept so
            // rules written against the old misspelling continue to work.
            $attrNamespace = null;
            if (isset($rule['attrNamespace'])) {
                $attrNamespace = $rule['attrNamespace'];
            } elseif (isset($rule['attNamespace'])) {
                $attrNamespace = $rule['attNamespace'];
            }
            if (null !== $attrNamespace && $attrNamespace !== $attr->namespaceURI) {
                continue;
            }

            // The (array) cast lets a single name and a list of names share
            // one strict membership check.
            if (isset($rule['nodeName']) && !in_array($ele->localName, (array) $rule['nodeName'], true)) {
                continue;
            }
            if (isset($rule['attrName']) && !in_array($attr->localName, (array) $rule['attrName'], true)) {
                continue;
            }

            if (isset($rule['xpath'])) {
                $xp = $this->getXPath($attr);
                if (isset($rule['prefixes'])) {
                    foreach ($rule['prefixes'] as $nsPrefix => $ns) {
                        $xp->registerNamespace($nsPrefix, $ns);
                    }
                }
                if (!$xp->evaluate($rule['xpath'], $attr)) {
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Get a DOMXPath bound to the document that owns the given node.
     *
     * The instance is cached, and rebuilt whenever the node belongs to a
     * different document than the cached instance was created for.
     *
     * @param \DOMNode $node Any node of the document being serialized.
     *
     * @return \DOMXPath
     */
    private function getXPath(\DOMNode $node)
    {
        if (!$this->xpath || $this->xpath->document !== $node->ownerDocument) {
            $this->xpath = new \DOMXPath($node->ownerDocument);
        }

        return $this->xpath;
    }

    /**
     * Write the closing tag.
     *
     * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
     * qualified name (8.3).
     *
     * Outside HTML mode, a childless element was already written as a
     * self-closing tag by openTag(), so no closing tag is emitted for it.
     *
     * @param \DOMNode $ele The element being written.
     */
    protected function closeTag($ele)
    {
        if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) {
            $this->wr('</')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName)->wr('>');
        }
    }

    /**
     * Write to the output.
     *
     * @param string $text The string to put into the output
     *
     * @return $this
     */
    protected function wr($text)
    {
        fwrite($this->out, $text);

        return $this;
    }

    /**
     * Write a new line character.
     *
     * @return $this
     */
    protected function nl()
    {
        fwrite($this->out, PHP_EOL);

        return $this;
    }

    /**
     * Encode text.
     *
     * When encode is set to false, the default value, the text passed in is
     * escaped per section 8.3 of the html5 spec. For details on how text is
     * escaped see the escape() method.
     *
     * When encoding is set to true the text is converted to named character
     * references where appropriate. Section 8.1.4 Character references of the
     * html5 spec refers to using named character references. This is useful for
     * characters that can't otherwise legally be used in the text.
     *
     * The named character references are listed in section 8.5.
     *
     * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references
     *
     * True encoding will turn all named character references into their entities.
     * This includes such characters as +.# and many other common ones. By default
     * encoding here will just escape &'<>".
     *
     * Note, PHP 5.4+ has better html5 encoding.
     *
     * @todo Use the Entities class in php 5.3 to have html5 entities.
     *
     * @param string $text      Text to encode.
     * @param bool   $attribute True if we are encoding an attribute, false otherwise.
     *
     * @return string The encoded text.
     */
    protected function enc($text, $attribute = false)
    {
        // Escape the text rather than convert to named character references.
        if (!$this->encode) {
            return $this->escape($text, $attribute);
        }

        // If we are in PHP 5.4+ we can use the native html5 entity functionality to
        // convert the named character references.
        if ($this->hasHTML5) {
            return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false);
        }

        // If a version earlier than 5.4 html5 entities are not entirely handled.
        // This manually handles them.
        return strtr($text, HTML5Entities::$map);
    }

    /**
     * Escape text.
     *
     * According to the html5 spec section 8.3 Serializing HTML fragments, text
     * within tags that are not style, script, xmp, iframe, noembed, and noframes
     * need to be properly escaped.
     *
     * The & should be converted to &amp;, no breaking space unicode characters
     * converted to &nbsp;, when in attribute mode the " should be converted to
     * &quot;, and when not in attribute mode the < and > should be converted to
     * &lt; and &gt;.
     *
     * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
     *
     * @param string $text      Text to escape.
     * @param bool   $attribute True if we are escaping an attribute, false otherwise.
     *
     * @return string The escaped text.
     */
    protected function escape($text, $attribute = false)
    {
        // Not using htmlspecialchars because, while it does escaping, it doesn't
        // match the requirements of section 8.5. For example, it doesn't handle
        // non-breaking spaces.
        if ($attribute) {
            $replace = array(
                '"' => '&quot;',
                '&' => '&amp;',
                "\xc2\xa0" => '&nbsp;',
            );
        } else {
            $replace = array(
                '<' => '&lt;',
                '>' => '&gt;',
                '&' => '&amp;',
                "\xc2\xa0" => '&nbsp;',
            );
        }

        // strtr() with an array replaces in a single pass, so the '&' produced
        // by one replacement is never escaped a second time.
        return strtr($text, $replace);
    }
}