Mercurial > hg > isophonics-drupal-site
comparison core/lib/Drupal/Component/Utility/Html.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 1fec387a4317 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4c8ae668cc8c |
---|---|
1 <?php | |
2 | |
3 namespace Drupal\Component\Utility; | |
4 | |
5 /** | |
6 * Provides DOMDocument helpers for parsing and serializing HTML strings. | |
7 * | |
8 * @ingroup utility | |
9 */ | |
10 class Html { | |
11 | |
/**
 * Cache of cleaned HTML class names.
 *
 * Keyed by the class string as passed in, valued by its cleaned form.
 *
 * @var array
 */
protected static $classes = [];

/**
 * The initial set of IDs used to seed the seen-ID list for one request.
 *
 * @var array
 */
protected static $seenIdsInit;

/**
 * IDs handed out so far, including the counter of duplicated IDs.
 *
 * @var array
 */
protected static $seenIds;

/**
 * Whether the current request was sent via AJAX.
 *
 * @var bool
 */
protected static $isAjax = FALSE;

/**
 * All attributes that may contain URIs.
 *
 * - The attributes 'code' and 'codebase' are omitted, because they only exist
 *   for the <applet> tag. The time of Java applets has passed.
 * - The attribute 'icon' is omitted, because no browser implements the
 *   <command> tag anymore.
 *   See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
 * - The 'manifest' attribute is omitted because it only exists for the <html>
 *   tag. That tag only makes sense in a HTML-served-as-HTML context, in which
 *   case relative URLs are guaranteed to work.
 *
 * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
 * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
 *
 * @var string[]
 */
protected static $uriAttributes = [
  'href',
  'poster',
  'src',
  'cite',
  'data',
  'action',
  'formaction',
  'srcset',
  'about',
];
57 | |
/**
 * Returns a cleaned, lower-cased CSS class name for the given value.
 *
 * Results are memoized per input string for the lifetime of the class.
 *
 * Pass exactly one class per call: a string holding several classes is
 * treated as a single name, so "one two" comes back as "one-two".
 *
 * @param mixed $class
 *   The class name to clean. A string, or anything castable to string.
 *
 * @return string
 *   The cleaned class name.
 */
public static function getClass($class) {
  $raw = (string) $class;

  // Serve repeated lookups from the static cache.
  if (isset(static::$classes[$raw])) {
    return static::$classes[$raw];
  }

  $cleaned = static::cleanCssIdentifier(Unicode::strtolower($raw));
  static::$classes[$raw] = $cleaned;
  return static::$classes[$raw];
}
78 | |
/**
 * Prepares a string for use as a CSS identifier (element, class, or ID name).
 *
 * http://www.w3.org/TR/CSS21/syndata.html#characters shows the syntax for
 * valid CSS identifiers (including element names, classes, and IDs in
 * selectors.)
 *
 * @param string $identifier
 *   The identifier to clean.
 * @param array $filter
 *   An array of string replacements to use on the identifier. Keys are the
 *   strings to search for, values are their replacements.
 *
 * @return string
 *   The cleaned identifier. An empty string is returned when $identifier is
 *   not valid UTF-8.
 */
public static function cleanCssIdentifier($identifier, array $filter = [
  ' ' => '-',
  '_' => '-',
  '/' => '-',
  '[' => '-',
  ']' => '',
]) {
  // We could also use strtr() here but it's much slower than str_replace(). In
  // order to keep '__' to stay '__' we first replace it with a different
  // placeholder after checking that it is not defined as a filter.
  $double_underscore_replacements = 0;
  if (!isset($filter['__'])) {
    $identifier = str_replace('__', '##', $identifier, $double_underscore_replacements);
  }
  $identifier = str_replace(array_keys($filter), array_values($filter), $identifier);
  // Replace temporary placeholder '##' with '__' only if the original
  // $identifier contained '__'.
  if ($double_underscore_replacements > 0) {
    $identifier = str_replace('##', '__', $identifier);
  }

  // Valid characters in a CSS identifier are:
  // - the hyphen (U+002D)
  // - 0-9 (U+0030 - U+0039)
  // - A-Z (U+0041 - U+005A)
  // - the underscore (U+005F)
  // - a-z (U+0061 - U+007A)
  // - ISO 10646 characters U+00A1 and higher
  // We strip out any character not in the above list.
  $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);

  // With the /u modifier preg_replace() returns NULL when the subject is not
  // valid UTF-8. Return the empty string explicitly instead of handing NULL
  // to the next preg_replace() call, which would yield the same result but
  // triggers a deprecation notice on PHP 8.1 and later.
  if ($identifier === NULL) {
    return '';
  }

  // Identifiers cannot start with a digit, two hyphens, or a hyphen followed
  // by a digit.
  $identifier = preg_replace([
    '/^[0-9]/',
    '/^(-[0-9])|^(--)/',
  ], ['_', '__'], $identifier);
  return $identifier;
}
131 | |
/**
 * Records whether the current request is an Ajax request.
 *
 * The stored flag is consulted by getUniqueId() to decide how IDs are made
 * unique.
 *
 * @param bool $is_ajax
 *   TRUE if this request is an Ajax request, FALSE otherwise.
 */
public static function setIsAjax($is_ajax) {
  static::$isAjax = $is_ajax;
}
141 | |
/**
 * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
 *
 * Every ID returned by this method is tracked, so that forms, blocks, and
 * other content can be output several times on the same page without
 * producing duplicate IDs.
 *
 * The first time an ID is requested it is returned as cleaned by getId().
 * On later requests for the same ID a counter is appended, separated from the
 * base ID by two consecutive hyphens. During Ajax requests a random suffix is
 * appended instead of a counter. JavaScript and CSS code should therefore not
 * rely on values generated here; use manually added CSS classes or similarly
 * reliable constructs instead.
 *
 * Because getId() collapses consecutive hyphens in the passed $id into a
 * single hyphen, the '--' delimiter cannot occur inside the base ID.
 *
 * @param string $id
 *   The ID to clean.
 *
 * @return string
 *   The cleaned ID.
 */
public static function getUniqueId($id) {
  // Content produced by an Ajax request is merged into a page that already
  // exists, so a per-request counter cannot guarantee uniqueness. Use a
  // random suffix instead.
  if (static::$isAjax) {
    return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
  }

  // @todo Remove all that code once we switch over to random IDs only,
  // see https://www.drupal.org/node/1090592.
  if (!isset(static::$seenIdsInit)) {
    static::$seenIdsInit = [];
  }
  // Seed the list of seen IDs from the initial set on first use (and again
  // after resetSeenIds()).
  if (!isset(static::$seenIds)) {
    static::$seenIds = static::$seenIdsInit;
  }

  $base_id = static::getId($id);

  // First occurrence: remember it and hand it back unchanged.
  if (!isset(static::$seenIds[$base_id])) {
    static::$seenIds[$base_id] = 1;
    return $base_id;
  }

  // Repeated occurrence: bump the counter and append it after a delimiter
  // that cannot be part of the base ID.
  static::$seenIds[$base_id]++;
  return $base_id . '--' . static::$seenIds[$base_id];
}
202 | |
/**
 * Prepares a string for use as a valid HTML ID.
 *
 * Only use this function when you want to intentionally skip the uniqueness
 * guarantee of self::getUniqueId().
 *
 * @param string $id
 *   The ID to clean.
 *
 * @return string
 *   The cleaned ID: lower-cased, restricted to letters, digits, hyphens and
 *   underscores, with runs of hyphens collapsed to one.
 *
 * @see self::getUniqueId()
 */
public static function getId($id) {
  // Lower-case the input, then turn spaces, underscores and opening brackets
  // into hyphens and drop closing brackets.
  $separators = [
    ' ' => '-',
    '_' => '-',
    '[' => '-',
    ']' => '',
  ];
  $candidate = strtr(Unicode::strtolower($id), $separators);

  // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs
  // can only contain letters, digits ([0-9]), hyphens ("-"), underscores
  // ("_"), colons (":"), and periods ("."). The CSS spec does not allow colons
  // or periods in identifiers
  // (http://www.w3.org/TR/CSS21/syndata.html#characters), so those two are
  // stripped along with every other disallowed character. Afterwards any run
  // of consecutive hyphens is reduced to a single hyphen. The patterns are
  // applied in order.
  return preg_replace(['/[^A-Za-z0-9\-_]/', '/\-+/'], ['', '-'], $candidate);
}
232 | |
/**
 * Resets the list of seen IDs.
 *
 * The list is set back to NULL, so the next call to getUniqueId() seeds it
 * again from the initial ID set.
 */
public static function resetSeenIds() {
  static::$seenIds = NULL;
}
239 | |
/**
 * Normalizes an HTML snippet.
 *
 * The snippet is parsed with load() and written back out with serialize().
 * This function is essentially \DOMDocument::normalizeDocument(), but
 * operates on an HTML string instead of a \DOMDocument.
 *
 * @param string $html
 *   The HTML string to normalize.
 *
 * @return string
 *   The normalized HTML string.
 */
public static function normalize($html) {
  return static::serialize(static::load($html));
}
256 | |
/**
 * Parses an HTML snippet and returns it as a DOM object.
 *
 * This function loads the body part of a partial (X)HTML document and returns
 * a full \DOMDocument object that represents this document.
 *
 * Use \Drupal\Component\Utility\Html::serialize() to serialize this
 * \DOMDocument back to a string.
 *
 * @param string $html
 *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
 *   import.
 *
 * @return \DOMDocument
 *   A \DOMDocument that represents the loaded (X)HTML snippet.
 */
public static function load($html) {
  // PHP's \DOMDocument serialization adds extra whitespace when the markup of
  // the wrapping document contains newlines. The wrapper is therefore built
  // as a single line, which also keeps it independent of the line endings
  // this source file happens to be checked out with.
  $document = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
    . '<html xmlns="http://www.w3.org/1999/xhtml">'
    . '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
    . '<body>' . $html . '</body>'
    . '</html>';

  $dom = new \DOMDocument();

  // Tag soup produces libxml parse warnings. Collect them internally instead
  // of silencing every error with the @ operator, then restore the caller's
  // libxml error handling mode.
  $previous_setting = libxml_use_internal_errors(TRUE);
  $dom->loadHTML($document);
  if (!$previous_setting) {
    // The caller was not collecting libxml errors, so do not leave ours
    // behind in the buffer.
    libxml_clear_errors();
  }
  libxml_use_internal_errors($previous_setting);

  return $dom;
}
292 | |
/**
 * Converts the body of a \DOMDocument back to an HTML snippet.
 *
 * Only the children of the first <body> element are serialized. Before
 * serialization, CDATA sections inside <script> and <style> elements are
 * wrapped in comment markers via escapeCdataElement().
 *
 * @param \DOMDocument $document
 *   A \DOMDocument object to serialize, only the tags below the first <body>
 *   node will be converted.
 *
 * @return string
 *   A valid (X)HTML snippet, as a string. Empty when the document has no
 *   <body> element.
 */
public static function serialize(\DOMDocument $document) {
  $body = $document->getElementsByTagName('body')->item(0);
  if ($body === NULL) {
    return '';
  }

  // Scripts use the default '//' comment marker.
  foreach ($body->getElementsByTagName('script') as $script) {
    static::escapeCdataElement($script);
  }
  // Stylesheets need block comment markers.
  foreach ($body->getElementsByTagName('style') as $style) {
    static::escapeCdataElement($style, '/*', '*/');
  }

  // Serialize each direct child of <body> and glue the pieces together.
  $pieces = [];
  foreach ($body->childNodes as $child) {
    $pieces[] = $document->saveXML($child);
  }
  return implode('', $pieces);
}
324 | |
/**
 * Adds comments around a <!CDATA section in a \DOMNode.
 *
 * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
 * CDATA sections from the contents of inline script and style tags. This can
 * cause HTML4 browsers to throw exceptions.
 *
 * This function attempts to solve the problem by replacing each CDATA child
 * of $node with a \DOMDocumentFragment in which the CDATA markers are
 * commented out. The replacement is inserted at the position of the original
 * CDATA section, so the order of $node's children is preserved.
 *
 * @param \DOMNode $node
 *   The element potentially containing a CDATA node.
 * @param string $comment_start
 *   (optional) A string to use as a comment start marker to escape the CDATA
 *   declaration. Defaults to '//'.
 * @param string $comment_end
 *   (optional) A string to use as a comment end marker to escape the CDATA
 *   declaration. Defaults to an empty string.
 */
public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
  // The wrapper strings do not depend on the child being processed.
  $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
  $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";

  // $node->childNodes is a live list. Iterate over a snapshot so that
  // replacing children below neither ends the loop early nor revisits the
  // CDATA sections that the inserted fragments themselves contain.
  foreach (iterator_to_array($node->childNodes) as $child_node) {
    if (!$child_node instanceof \DOMCdataSection) {
      continue;
    }

    // Prevent invalid cdata escaping as this would throw a DOM error.
    // This is the same behavior as found in libxml2.
    // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
    // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
    $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);

    $fragment = $node->ownerDocument->createDocumentFragment();
    if ($fragment->appendXML($embed_prefix . $data . $embed_suffix) === FALSE) {
      // The wrapped markup could not be parsed. Keep the original CDATA
      // section rather than dropping the element's content.
      continue;
    }
    $node->replaceChild($fragment, $child_node);
  }
}
363 | |
/**
 * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
 *
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
 * "&lt;", not "<"). Be careful when using this function, as it will revert
 * previous sanitization efforts (&lt;script&gt; will become <script>).
 *
 * This method is not the opposite of Html::escape(). For example, this method
 * will convert "&eacute;" to "é", whereas Html::escape() will not convert "é"
 * to "&eacute;".
 *
 * @param string $text
 *   The text to decode entities in.
 *
 * @return string
 *   The input $text, with all HTML entities decoded once.
 *
 * @see html_entity_decode()
 * @see \Drupal\Component\Utility\Html::escape()
 */
public static function decodeEntities($text) {
  // ENT_QUOTES makes both double- and single-quote entities decode.
  $flags = ENT_QUOTES;
  $decoded = html_entity_decode($text, $flags, 'UTF-8');
  return $decoded;
}
387 | |
/**
 * Escapes text by converting special characters to HTML entities.
 *
 * This method escapes HTML for sanitization purposes by replacing the
 * following special characters with their HTML entity equivalents:
 * - & (ampersand) becomes &amp;
 * - " (double quote) becomes &quot;
 * - ' (single quote) becomes &#039;
 * - < (less than) becomes &lt;
 * - > (greater than) becomes &gt;
 * Special characters that have already been escaped will be double-escaped
 * (for example, "&lt;" becomes "&amp;lt;"), and invalid UTF-8 encoding
 * will be converted to the Unicode replacement character ("�").
 *
 * This method is not the opposite of Html::decodeEntities(). For example,
 * this method will not encode "é" to "&eacute;", whereas
 * Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
 * including "&eacute;" and "&lt;" to "é" and "<".
 *
 * When constructing @link theme_render render arrays @endlink passing the
 * output of Html::escape() to '#markup' is not recommended. Use the
 * '#plain_text' key instead and the renderer will autoescape the text.
 *
 * @param string $text
 *   The input text.
 *
 * @return string
 *   The text with all HTML special characters converted.
 *
 * @see htmlspecialchars()
 * @see \Drupal\Component\Utility\Html::decodeEntities()
 *
 * @ingroup sanitization
 */
public static function escape($text) {
  // Escape both quote styles and substitute invalid byte sequences instead of
  // returning an empty string for them.
  $flags = ENT_QUOTES | ENT_SUBSTITUTE;
  $escaped = htmlspecialchars($text, $flags, 'UTF-8');
  return $escaped;
}
425 | |
/**
 * Converts all root-relative URLs to absolute URLs.
 *
 * Does not change any existing protocol-relative or absolute URLs. Does not
 * change other relative URLs because they would result in different absolute
 * URLs depending on the current path. For example: when the same content
 * containing such a relative URL (for example 'image.png'), is served from
 * its canonical URL (for example 'http://example.com/some-article') or from
 * a listing or feed (for example 'http://example.com/all-articles') their
 * "current path" differs, resulting in different absolute URLs:
 * 'http://example.com/some-article/image.png' versus
 * 'http://example.com/all-articles/image.png'. Only one can be correct.
 * Therefore relative URLs that are not root-relative cannot be safely
 * transformed and should generally be avoided.
 *
 * Necessary for HTML that is served outside of a website, for example, RSS
 * and e-mail.
 *
 * @param string $html
 *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
 *   import.
 * @param string $scheme_and_host
 *   The root URL, which has a URI scheme, host and optional port.
 *
 * @return string
 *   The updated (X)HTML snippet.
 */
public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
  // Expression assertions are used on purpose: string assertions are
  // deprecated as of PHP 7.2 and are no longer evaluated as code in PHP 8, so
  // they would silently stop checking anything.
  assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ['scheme', 'host', 'port'])), '$scheme_and_host contains scheme, host and port at most.');
  assert(isset(parse_url($scheme_and_host)['scheme']), '$scheme_and_host is absolute and hence has a scheme.');
  assert(isset(parse_url($scheme_and_host)['host']), '$scheme_and_host is absolute and hence has a host.');

  $html_dom = Html::load($html);
  $xpath = new \DOMXpath($html_dom);

  // Update all root-relative URLs to absolute URLs in the given HTML. A value
  // starting with '//' is protocol-relative and is left alone.
  foreach (static::$uriAttributes as $attr) {
    foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
      $node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
    }
  }

  // A srcset attribute holds a comma-separated list of image candidate
  // strings, each of which starts with its own URL. The query above only sees
  // the start of the whole attribute value, so every candidate is checked
  // individually here. This pass is independent of the attribute being
  // processed above and therefore runs once, after that loop.
  // @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
  // @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
  foreach ($xpath->query("//*[@srcset]") as $node) {
    $image_candidate_strings = array_map('trim', explode(',', $node->getAttribute('srcset')));
    foreach ($image_candidate_strings as $i => $image_candidate_string) {
      // strpos() is used instead of string offsets so that empty or
      // one-character candidates (for example from srcset="" or a trailing
      // comma) do not raise "uninitialized string offset" errors.
      if (strpos($image_candidate_string, '/') === 0 && strpos($image_candidate_string, '//') !== 0) {
        $image_candidate_strings[$i] = $scheme_and_host . $image_candidate_string;
      }
    }
    $node->setAttribute('srcset', implode(', ', $image_candidate_strings));
  }

  return Html::serialize($html_dom);
}
482 | |
483 } |