comparison core/lib/Drupal/Component/Utility/Html.php @ 0:4c8ae668cc8c

Initial import (non-working)
author Chris Cannam
date Wed, 29 Nov 2017 16:09:58 +0000
parents
children 1fec387a4317
comparison
equal deleted inserted replaced
-1:000000000000 0:4c8ae668cc8c
1 <?php
2
3 namespace Drupal\Component\Utility;
4
/**
 * Provides DOMDocument helpers for parsing and serializing HTML strings.
 *
 * Besides the DOM helpers, the class offers static utilities for cleaning CSS
 * identifiers and HTML IDs, escaping and decoding HTML entities, and turning
 * root-relative URLs into absolute ones.
 *
 * @ingroup utility
 */
class Html {

  /**
   * An array of previously cleaned HTML classes.
   *
   * Keyed by the raw class string, valued by the cleaned class name.
   *
   * @var array
   */
  protected static $classes = [];

  /**
   * An array of the initial IDs used in one request.
   *
   * @var array
   */
  protected static $seenIdsInit;

  /**
   * An array of IDs, including incremented versions when an ID is duplicated.
   *
   * Keyed by the cleaned ID, valued by the number of times it was handed out.
   *
   * @var array
   */
  protected static $seenIds;

  /**
   * Stores whether the current request was sent via AJAX.
   *
   * @var bool
   */
  protected static $isAjax = FALSE;

  /**
   * All attributes that may contain URIs.
   *
   * - The attributes 'code' and 'codebase' are omitted, because they only exist
   *   for the <applet> tag. The time of Java applets has passed.
   * - The attribute 'icon' is omitted, because no browser implements the
   *   <command> tag anymore.
   *   See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
   * - The 'manifest' attribute is omitted because it only exists for the <html>
   *   tag. That tag only makes sense in a HTML-served-as-HTML context, in which
   *   case relative URLs are guaranteed to work.
   *
   * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
   * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
   *
   * @var string[]
   */
  protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];

  /**
   * Prepares a string for use as a valid class name.
   *
   * Do not pass one string containing multiple classes as they will be
   * incorrectly concatenated with dashes, i.e. "one two" will become "one-two".
   *
   * Results are memoized in static::$classes, keyed by the raw input string.
   *
   * @param mixed $class
   *   The class name to clean. It can be a string or anything that can be cast
   *   to string.
   *
   * @return string
   *   The cleaned class name.
   */
  public static function getClass($class) {
    $class = (string) $class;
    if (!isset(static::$classes[$class])) {
      static::$classes[$class] = static::cleanCssIdentifier(Unicode::strtolower($class));
    }
    return static::$classes[$class];
  }

  /**
   * Prepares a string for use as a CSS identifier (element, class, or ID name).
   *
   * http://www.w3.org/TR/CSS21/syndata.html#characters shows the syntax for
   * valid CSS identifiers (including element names, classes, and IDs in
   * selectors.)
   *
   * @param string $identifier
   *   The identifier to clean.
   * @param array $filter
   *   An array of string replacements to use on the identifier.
   *
   * @return string
   *   The cleaned identifier.
   */
  public static function cleanCssIdentifier($identifier, array $filter = [
    ' ' => '-',
    '_' => '-',
    '/' => '-',
    '[' => '-',
    ']' => '',
  ]) {
    // We could also use strtr() here but its much slower than str_replace(). In
    // order to keep '__' to stay '__' we first replace it with a different
    // placeholder after checking that it is not defined as a filter.
    // Note: when the identifier contains '__', a literal '##' that was already
    // present in the input is turned into '__' as well by the restore step
    // below.
    $double_underscore_replacements = 0;
    if (!isset($filter['__'])) {
      $identifier = str_replace('__', '##', $identifier, $double_underscore_replacements);
    }
    $identifier = str_replace(array_keys($filter), array_values($filter), $identifier);
    // Replace temporary placeholder '##' with '__' only if the original
    // $identifier contained '__'.
    if ($double_underscore_replacements > 0) {
      $identifier = str_replace('##', '__', $identifier);
    }

    // Valid characters in a CSS identifier are:
    // - the hyphen (U+002D)
    // - 0-9 (U+0030 - U+0039)
    // - A-Z (U+0041 - U+005A)
    // - the underscore (U+005F)
    // - a-z (U+0061 - U+007A)
    // - ISO 10646 characters U+00A1 and higher
    // We strip out any character not in the above list.
    $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);
    // Identifiers cannot start with a digit, two hyphens, or a hyphen followed
    // by a digit. A leading digit is replaced by '_'; a leading '--' or
    // hyphen-plus-digit is replaced by '__'.
    $identifier = preg_replace([
      '/^[0-9]/',
      '/^(-[0-9])|^(--)/'
    ], ['_', '__'], $identifier);
    return $identifier;
  }

  /**
   * Sets if this request is an Ajax request.
   *
   * @param bool $is_ajax
   *   TRUE if this request is an Ajax request, FALSE otherwise.
   */
  public static function setIsAjax($is_ajax) {
    static::$isAjax = $is_ajax;
  }

  /**
   * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
   *
   * This function ensures that each passed HTML ID value only exists once on
   * the page. By tracking the already returned ids, this function enables
   * forms, blocks, and other content to be output multiple times on the same
   * page, without breaking (X)HTML validation.
   *
   * For already existing IDs, a counter is appended to the ID string.
   * Therefore, JavaScript and CSS code should not rely on any value that was
   * generated by this function and instead should rely on manually added CSS
   * classes or similarly reliable constructs.
   *
   * Two consecutive hyphens separate the counter from the original ID. To
   * manage uniqueness across multiple Ajax requests on the same page, Ajax
   * requests POST an array of all IDs currently present on the page, which are
   * used to prime this function's cache upon first invocation.
   *
   * To allow reverse-parsing of IDs submitted via Ajax, any multiple
   * consecutive hyphens in the originally passed $id are replaced with a
   * single hyphen.
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   */
  public static function getUniqueId($id) {
    // If this is an Ajax request, then content returned by this page request
    // will be merged with content already on the base page. The HTML IDs must
    // be unique for the fully merged content. Therefore use unique IDs.
    if (static::$isAjax) {
      return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
    }

    // @todo Remove all that code once we switch over to random IDs only,
    // see https://www.drupal.org/node/1090592.
    if (!isset(static::$seenIdsInit)) {
      static::$seenIdsInit = [];
    }
    if (!isset(static::$seenIds)) {
      static::$seenIds = static::$seenIdsInit;
    }

    $id = static::getId($id);

    // Ensure IDs are unique by appending a counter after the first occurrence.
    // The counter needs to be appended with a delimiter that does not exist in
    // the base ID. Requiring a unique delimiter helps ensure that we really do
    // return unique IDs and also helps us re-create the $seen_ids array during
    // Ajax requests.
    if (isset(static::$seenIds[$id])) {
      $id = $id . '--' . ++static::$seenIds[$id];
    }
    else {
      static::$seenIds[$id] = 1;
    }
    return $id;
  }

  /**
   * Prepares a string for use as a valid HTML ID.
   *
   * Only use this function when you want to intentionally skip the uniqueness
   * guarantee of self::getUniqueId().
   *
   * @param string $id
   *   The ID to clean.
   *
   * @return string
   *   The cleaned ID.
   *
   * @see self::getUniqueId()
   */
  public static function getId($id) {
    $id = str_replace([' ', '_', '[', ']'], ['-', '-', '-', ''], Unicode::strtolower($id));

    // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can
    // only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"),
    // colons (":"), and periods ("."). We strip out any character not in that
    // list. Note that the CSS spec doesn't allow colons or periods in identifiers
    // (http://www.w3.org/TR/CSS21/syndata.html#characters), so we strip those two
    // characters as well.
    $id = preg_replace('/[^A-Za-z0-9\-_]/', '', $id);

    // Removing multiple consecutive hyphens.
    $id = preg_replace('/\-+/', '-', $id);
    return $id;
  }

  /**
   * Resets the list of seen IDs.
   *
   * The next call to getUniqueId() re-seeds the list from
   * static::$seenIdsInit.
   */
  public static function resetSeenIds() {
    static::$seenIds = NULL;
  }

  /**
   * Normalizes an HTML snippet.
   *
   * This function is essentially \DOMDocument::normalizeDocument(), but
   * operates on an HTML string instead of a \DOMDocument.
   *
   * @param string $html
   *   The HTML string to normalize.
   *
   * @return string
   *   The normalized HTML string.
   */
  public static function normalize($html) {
    $document = static::load($html);
    return static::serialize($document);
  }

  /**
   * Parses an HTML snippet and returns it as a DOM object.
   *
   * This function loads the body part of a partial (X)HTML document and returns
   * a full \DOMDocument object that represents this document.
   *
   * Use \Drupal\Component\Utility\Html::serialize() to serialize this
   * \DOMDocument back to a string.
   *
   * @param string $html
   *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
   *   import.
   *
   * @return \DOMDocument
   *   A \DOMDocument that represents the loaded (X)HTML snippet.
   */
  public static function load($html) {
    $document = <<<EOD
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
<body>!html</body>
</html>
EOD;
    // PHP's \DOMDocument serialization adds extra whitespace when the markup
    // of the wrapping document contains newlines, so ensure we remove all
    // newlines before injecting the actual HTML body to be processed.
    $document = strtr($document, ["\n" => '', '!html' => $html]);

    $dom = new \DOMDocument();
    // Ignore warnings during HTML soup loading.
    @$dom->loadHTML($document);

    return $dom;
  }

  /**
   * Converts the body of a \DOMDocument back to an HTML snippet.
   *
   * The function serializes the body part of a \DOMDocument back to an (X)HTML
   * snippet. The resulting (X)HTML snippet will be properly formatted to be
   * compatible with HTML user agents.
   *
   * @param \DOMDocument $document
   *   A \DOMDocument object to serialize, only the tags below the first <body>
   *   node will be converted.
   *
   * @return string
   *   A valid (X)HTML snippet, as a string. Empty when the document has no
   *   <body> element.
   */
  public static function serialize(\DOMDocument $document) {
    $body_node = $document->getElementsByTagName('body')->item(0);
    $html = '';

    if ($body_node !== NULL) {
      // Inline scripts get '//' line comments around their CDATA markers,
      // inline styles get '/* */' block comments.
      foreach ($body_node->getElementsByTagName('script') as $node) {
        static::escapeCdataElement($node);
      }
      foreach ($body_node->getElementsByTagName('style') as $node) {
        static::escapeCdataElement($node, '/*', '*/');
      }
      foreach ($body_node->childNodes as $node) {
        $html .= $document->saveXML($node);
      }
    }
    return $html;
  }

  /**
   * Adds comments around a <![CDATA[ section in a \DOMNode.
   *
   * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
   * CDATA sections from the contents of inline script and style tags. This can
   * cause HTML4 browsers to throw exceptions.
   *
   * This function attempts to solve the problem by creating a
   * \DOMDocumentFragment to comment the CDATA tag.
   *
   * @param \DOMNode $node
   *   The element potentially containing a CDATA node.
   * @param string $comment_start
   *   (optional) A string to use as a comment start marker to escape the CDATA
   *   declaration. Defaults to '//'.
   * @param string $comment_end
   *   (optional) A string to use as a comment end marker to escape the CDATA
   *   declaration. Defaults to an empty string.
   */
  public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
    // NOTE(review): the loop appends to and removes from $node->childNodes
    // while iterating over it; confirm the iteration semantics before
    // relying on this for elements with more than one CDATA child.
    foreach ($node->childNodes as $child_node) {
      if ($child_node instanceof \DOMCdataSection) {
        $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
        $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";

        // Prevent invalid cdata escaping as this would throw a DOM error.
        // This is the same behavior as found in libxml2.
        // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
        // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
        $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);

        $fragment = $node->ownerDocument->createDocumentFragment();
        $fragment->appendXML($embed_prefix . $data . $embed_suffix);
        $node->appendChild($fragment);
        $node->removeChild($child_node);
      }
    }
  }

  /**
   * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
   *
   * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
   * "&lt;", not "<"). Be careful when using this function, as it will revert
   * previous sanitization efforts (&lt;script&gt; will become <script>).
   *
   * This method is not the opposite of Html::escape(). For example, this method
   * will convert "&eacute;" to "é", whereas Html::escape() will not convert "é"
   * to "&eacute;".
   *
   * @param string $text
   *   The text to decode entities in.
   *
   * @return string
   *   The input $text, with all HTML entities decoded once.
   *
   * @see html_entity_decode()
   * @see \Drupal\Component\Utility\Html::escape()
   */
  public static function decodeEntities($text) {
    return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
  }

  /**
   * Escapes text by converting special characters to HTML entities.
   *
   * This method escapes HTML for sanitization purposes by replacing the
   * following special characters with their HTML entity equivalents:
   * - & (ampersand) becomes &amp;
   * - " (double quote) becomes &quot;
   * - ' (single quote) becomes &#039;
   * - < (less than) becomes &lt;
   * - > (greater than) becomes &gt;
   * Special characters that have already been escaped will be double-escaped
   * (for example, "&lt;" becomes "&amp;lt;"), and invalid UTF-8 encoding
   * will be converted to the Unicode replacement character ("�").
   *
   * This method is not the opposite of Html::decodeEntities(). For example,
   * this method will not encode "é" to "&eacute;", whereas
   * Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
   * including "&eacute;" and "&lt;" to "é" and "<".
   *
   * When constructing @link theme_render render arrays @endlink passing the
   * output of Html::escape() to '#markup' is not recommended. Use the
   * '#plain_text' key instead and the renderer will autoescape the text.
   *
   * @param string $text
   *   The input text.
   *
   * @return string
   *   The text with all HTML special characters converted.
   *
   * @see htmlspecialchars()
   * @see \Drupal\Component\Utility\Html::decodeEntities()
   *
   * @ingroup sanitization
   */
  public static function escape($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
  }

  /**
   * Converts all root-relative URLs to absolute URLs.
   *
   * Does not change any existing protocol-relative or absolute URLs. Does not
   * change other relative URLs because they would result in different absolute
   * URLs depending on the current path. For example: when the same content
   * containing such a relative URL (for example 'image.png'), is served from
   * its canonical URL (for example 'http://example.com/some-article') or from
   * a listing or feed (for example 'http://example.com/all-articles') their
   * "current path" differs, resulting in different absolute URLs:
   * 'http://example.com/some-article/image.png' versus
   * 'http://example.com/all-articles/image.png'. Only one can be correct.
   * Therefore relative URLs that are not root-relative cannot be safely
   * transformed and should generally be avoided.
   *
   * Necessary for HTML that is served outside of a website, for example, RSS
   * and e-mail.
   *
   * @param string $html
   *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
   *   import.
   * @param string $scheme_and_host
   *   The root URL, which has a URI scheme, host and optional port.
   *
   * @return string
   *   The updated (X)HTML snippet.
   */
  public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
    // Assertions are passed as expressions rather than code strings: string
    // assertions are deprecated and no longer evaluated by current PHP.
    assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ["scheme", "host", "port"])), '$scheme_and_host contains scheme, host and port at most.');
    assert(isset(parse_url($scheme_and_host)["scheme"]), '$scheme_and_host is absolute and hence has a scheme.');
    assert(isset(parse_url($scheme_and_host)["host"]), '$scheme_and_host is absolute and hence has a host.');

    $html_dom = Html::load($html);
    $xpath = new \DOMXPath($html_dom);

    // Update all root-relative URLs to absolute URLs in the given HTML. A
    // value is root-relative when it starts with exactly one slash.
    foreach (static::$uriAttributes as $attr) {
      foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
        $node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
      }
    }

    // 'srcset' holds a comma-separated list of image candidate strings, each
    // of which starts with its own URL, so every candidate is checked
    // individually. This pass does not depend on the attribute loop above and
    // therefore runs only once.
    // @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
    // @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
    foreach ($xpath->query("//*[@srcset]") as $node) {
      $image_candidate_strings = array_map('trim', explode(',', $node->getAttribute('srcset')));
      foreach ($image_candidate_strings as $key => $image_candidate_string) {
        // strncmp() is used instead of string offsets so that empty or
        // one-character candidates (empty attribute, trailing comma, a lone
        // '/') do not read an uninitialized offset.
        if (strncmp($image_candidate_string, '/', 1) === 0 && strncmp($image_candidate_string, '//', 2) !== 0) {
          $image_candidate_strings[$key] = $scheme_and_host . $image_candidate_string;
        }
      }
      $node->setAttribute('srcset', implode(', ', $image_candidate_strings));
    }

    return Html::serialize($html_dom);
  }

}