* tag. The time of Java applets has passed.
   * - The attribute 'icon' is omitted, because no browser implements the
   *   <command> tag anymore.
   *   See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
   * - The 'manifest' attribute is omitted because it only exists for the
   *   <html> tag. That tag only makes sense in a HTML-served-as-HTML context,
   *   in which case relative URLs are guaranteed to work.
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
   * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
   *
   * @var string[]
   */
  protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];

  /**
   * Converts a value into a string that is safe to use as a CSS class name.
   *
   * The value is cast to a string, lower-cased and passed through
   * static::cleanCssIdentifier(). Results are remembered in static::$classes,
   * keyed by the string form of the input, so repeated calls with the same
   * value do not repeat the cleaning work.
   *
   * Pass a single class at a time: a string holding several space-separated
   * classes is treated as one name, so "one two" becomes "one-two".
   *
   * @param mixed $class
   *   The class name to clean. It can be a string or anything that can be cast
   *   to string.
   *
   * @return string
   *   The cleaned class name.
   */
  public static function getClass($class) {
    $key = (string) $class;

    // Serve the cached result when this exact string was cleaned before.
    if (isset(static::$classes[$key])) {
      return static::$classes[$key];
    }

    $cleaned = static::cleanCssIdentifier(mb_strtolower($key));
    static::$classes[$key] = $cleaned;
    return $cleaned;
  }

  /**
   * Prepares a string for use as a CSS identifier (element, class, or ID name).
   *
   * Link below shows the syntax for valid CSS identifiers (including element
   * names, classes, and IDs in selectors).
*
   * @see http://www.w3.org/TR/CSS21/syndata.html#characters
   *
   * @param string $identifier
   *   The identifier to clean.
   * @param array $filter
   *   An array of string replacements to use on the identifier. Keys are the
   *   strings to search for, values are their replacements. Unless the array
   *   has a '__' key, double underscores in the identifier are kept as they
   *   are and only the text around them is filtered.
   *
   * @return string
   *   The cleaned identifier.
   */
  public static function cleanCssIdentifier($identifier, array $filter = [
    ' ' => '-',
    '_' => '-',
    '/' => '-',
    '[' => '-',
    ']' => '',
  ]) {
    $search = array_keys($filter);
    $replace = array_values($filter);

    if (isset($filter['__'])) {
      // The caller decides what happens to '__', so filter the whole string.
      $identifier = str_replace($search, $replace, $identifier);
    }
    else {
      // Keep '__' as '__': split on it, filter only the segments in between
      // and join them again. Unlike swapping '__' for a temporary placeholder
      // string, this cannot collide with text that is already part of the
      // identifier or of the filter (a literal '##' used to be turned into
      // '__' whenever the identifier also contained '__').
      $segments = explode('__', (string) $identifier);
      $identifier = implode('__', str_replace($search, $replace, $segments));
    }

    // Valid characters in a CSS identifier are:
    // - the hyphen (U+002D)
    // - 0-9 (U+0030 - U+0039)
    // - A-Z (U+0041 - U+005A)
    // - the underscore (U+005F)
    // - a-z (U+0061 - U+007A)
    // - ISO 10646 characters U+00A1 and higher (the pattern below keeps
    //   U+00A1 through U+FFFF)
    // We strip out any character not in the above list.
    $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);

    // Identifiers cannot start with a digit, two hyphens, or a hyphen followed
    // by a digit. A leading digit is replaced with '_'; a leading '--' or
    // hyphen-plus-digit is replaced with '__'.
    $identifier = preg_replace([
      '/^[0-9]/',
      '/^(-[0-9])|^(--)/',
    ], ['_', '__'], $identifier);
    return $identifier;
  }

  /**
   * Sets if this request is an Ajax request.
   *
   * @param bool $is_ajax
   *   TRUE if this request is an Ajax request, FALSE otherwise.
   */
  public static function setIsAjax($is_ajax) {
    static::$isAjax = $is_ajax;
  }

  /**
   * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
   *
   * This function ensures that each passed HTML ID value only exists once on
   * the page. By tracking the already returned ids, this function enables
   * forms, blocks, and other content to be output multiple times on the same
   * page, without breaking (X)HTML validation.
   *
   * For already existing IDs, a counter is appended to the ID string.
   * Therefore, JavaScript and CSS code should not rely on any value that was
   * generated by this function and instead should rely on manually added CSS
   * classes or similarly reliable constructs.
   *
   * Two consecutive hyphens separate the counter from the original ID. To
   * manage uniqueness across multiple Ajax requests on the same page, Ajax
   * requests POST an array of all IDs currently present on the page, which are
   * used to prime this function's cache upon first invocation.
   *
   * To allow reverse-parsing of IDs submitted via Ajax, any multiple
   * consecutive hyphens in the originally passed $id are replaced with a
   * single hyphen.
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
*/
  public static function getUniqueId($id) {
    // Content produced during an Ajax request is merged into a page that
    // already exists, so a per-request counter cannot guarantee uniqueness
    // there. Append a random suffix to the cleaned ID instead.
    if (static::$isAjax) {
      $random_suffix = Crypt::randomBytesBase64(8);
      return static::getId($id) . '--' . $random_suffix;
    }

    // @todo Remove all that code once we switch over to random IDs only,
    // see https://www.drupal.org/node/1090592.
    // Lazily set up the bookkeeping: the list of seen IDs starts out as a copy
    // of the initial list, which itself defaults to an empty array.
    if (!isset(static::$seenIdsInit)) {
      static::$seenIdsInit = [];
    }
    if (!isset(static::$seenIds)) {
      static::$seenIds = static::$seenIdsInit;
    }

    $base_id = static::getId($id);

    // First occurrence: remember the ID and hand it back unchanged.
    if (!isset(static::$seenIds[$base_id])) {
      static::$seenIds[$base_id] = 1;
      return $base_id;
    }

    // Repeated occurrence: bump the counter and append it. The '--' delimiter
    // cannot occur in the base ID because getId() collapses runs of hyphens,
    // which keeps the result unique and lets the list of seen IDs be rebuilt
    // from submitted IDs during Ajax requests.
    $occurrence = ++static::$seenIds[$base_id];
    return $base_id . '--' . $occurrence;
  }

  /**
   * Prepares a string for use as a valid HTML ID.
   *
   * Only use this function when you want to intentionally skip the uniqueness
   * guarantee of self::getUniqueId().
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
*
   * @see self::getUniqueId()
   */
  public static function getId($id) {
    // Lower-case the ID, then turn spaces, underscores and opening brackets
    // into hyphens and drop closing brackets.
    $separator_map = [
      ' ' => '-',
      '_' => '-',
      '[' => '-',
      ']' => '',
    ];
    $hyphenated = strtr(mb_strtolower($id), $separator_map);

    // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs
    // can only contain letters, digits ([0-9]), hyphens ("-"), underscores
    // ("_"), colons (":"), and periods ("."). The CSS spec does not allow
    // colons or periods in identifiers
    // (http://www.w3.org/TR/CSS21/syndata.html#characters), so those two are
    // stripped along with every other character outside that list. The second
    // pattern then collapses runs of hyphens into a single hyphen.
    return preg_replace(['/[^A-Za-z0-9\-_]/', '/\-+/'], ['', '-'], $hyphenated);
  }

  /**
   * Forgets every ID recorded so far by clearing the list of seen IDs.
   */
  public static function resetSeenIds() {
    static::$seenIds = NULL;
  }

  /**
   * Normalizes an HTML snippet.
   *
   * The snippet is parsed with static::load() and the resulting document is
   * turned back into a string with static::serialize(). This is essentially
   * \DOMDocument::normalizeDocument(), but operating on an HTML string instead
   * of a \DOMDocument.
   *
   * @param string $html
   *   The HTML string to normalize.
   *
   * @return string
   *   The normalized HTML string.
   */
  public static function normalize($html) {
    return static::serialize(static::load($html));
  }

  /**
   * Parses an HTML snippet and returns it as a DOM object.
   *
   * This function loads the body part of a partial (X)HTML document and returns
   * a full \DOMDocument object that represents this document.
   *
   * Use \Drupal\Component\Utility\Html::serialize() to serialize this
   * \DOMDocument back to a string.
Chris@0: * Chris@0: * @param string $html Chris@0: * The partial (X)HTML snippet to load. Invalid markup will be corrected on Chris@0: * import. Chris@0: * Chris@0: * @return \DOMDocument Chris@0: * A \DOMDocument that represents the loaded (X)HTML snippet. Chris@0: */ Chris@0: public static function load($html) { Chris@0: $document = << Chris@0: Chris@0: Chris@0: !html Chris@0: Chris@0: EOD; Chris@0: // PHP's \DOMDocument serialization adds extra whitespace when the markup Chris@0: // of the wrapping document contains newlines, so ensure we remove all Chris@0: // newlines before injecting the actual HTML body to be processed. Chris@0: $document = strtr($document, ["\n" => '', '!html' => $html]); Chris@0: Chris@0: $dom = new \DOMDocument(); Chris@0: // Ignore warnings during HTML soup loading. Chris@0: @$dom->loadHTML($document); Chris@0: Chris@0: return $dom; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Converts the body of a \DOMDocument back to an HTML snippet. Chris@0: * Chris@0: * The function serializes the body part of a \DOMDocument back to an (X)HTML Chris@0: * snippet. The resulting (X)HTML snippet will be properly formatted to be Chris@0: * compatible with HTML user agents. Chris@0: * Chris@0: * @param \DOMDocument $document Chris@0: * A \DOMDocument object to serialize, only the tags below the first Chris@0: * node will be converted. Chris@0: * Chris@0: * @return string Chris@0: * A valid (X)HTML snippet, as a string. 
Chris@0: */ Chris@0: public static function serialize(\DOMDocument $document) { Chris@0: $body_node = $document->getElementsByTagName('body')->item(0); Chris@0: $html = ''; Chris@0: Chris@0: if ($body_node !== NULL) { Chris@0: foreach ($body_node->getElementsByTagName('script') as $node) { Chris@0: static::escapeCdataElement($node); Chris@0: } Chris@0: foreach ($body_node->getElementsByTagName('style') as $node) { Chris@0: static::escapeCdataElement($node, '/*', '*/'); Chris@0: } Chris@0: foreach ($body_node->childNodes as $node) { Chris@0: $html .= $document->saveXML($node); Chris@0: } Chris@0: } Chris@0: return $html; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds comments around a childNodes as $child_node) { Chris@0: if ($child_node instanceof \DOMCdataSection) { Chris@0: $embed_prefix = "\n{$comment_end}\n"; Chris@0: Chris@0: // Prevent invalid cdata escaping as this would throw a DOM error. Chris@0: // This is the same behavior as found in libxml2. Chris@0: // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection Chris@0: // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting Chris@0: $data = str_replace(']]>', ']]]]>', $child_node->data); Chris@0: Chris@0: $fragment = $node->ownerDocument->createDocumentFragment(); Chris@0: $fragment->appendXML($embed_prefix . $data . $embed_suffix); Chris@0: $node->appendChild($fragment); Chris@0: $node->removeChild($child_node); Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Decodes all HTML entities including numerical ones to regular UTF-8 bytes. Chris@0: * Chris@0: * Double-escaped entities will only be decoded once ("&lt;" becomes Chris@0: * "<", not "<"). Be careful when using this function, as it will revert Chris@0: * previous sanitization efforts (<script> will become