annotate core/lib/Drupal/Component/Utility/Xss.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 4c8ae668cc8c
children
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Component\Utility;
Chris@0 4
/**
 * Provides helper to filter for cross-site scripting.
 *
 * @ingroup utility
 */
class Xss {

  /**
   * The list of HTML tags allowed by filterAdmin().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filterAdmin()
   */
  protected static $adminTags = ['a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu', 'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr'];

  /**
   * The default list of HTML tags allowed by filter().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filter()
   */
  protected static $htmlTags = ['a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd'];

  /**
   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
   *
   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
   *
   * This code does four things:
   * - Removes characters and constructs that can trick browsers.
   * - Makes sure all HTML entities are well-formed.
   * - Makes sure all HTML tags and attributes are well-formed.
   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
   *   javascript:).
   *
   * @param string $string
   *   The string with raw HTML in it. It will be stripped of everything that
   *   can cause an XSS attack.
   * @param array $html_tags
   *   (optional) An array of allowed HTML tag names. When omitted or NULL, the
   *   default list in static::$htmlTags is used.
   *
   * @return string
   *   An XSS safe version of $string, or an empty string if $string is not
   *   valid UTF-8.
   *
   * @see \Drupal\Component\Utility\Unicode::validateUtf8()
   *
   * @ingroup sanitization
   */
  public static function filter($string, array $html_tags = NULL) {
    if ($html_tags === NULL) {
      $html_tags = static::$htmlTags;
    }
    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
    // site scripting issues on Internet Explorer 6.
    if (!Unicode::validateUtf8($string)) {
      return '';
    }
    // Remove NULL characters (ignored by some browsers).
    $string = str_replace(chr(0), '', $string);
    // Remove Netscape 4 JS entities.
    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

    // Defuse all HTML entities.
    $string = str_replace('&', '&amp;', $string);
    // Change back only well-formed entities in our whitelist:
    // Decimal numeric entities.
    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
    // Hexadecimal numeric entities.
    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
    // Named entities.
    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);

    // Flip the list so tag names become keys; needsRemoval() then does a
    // constant-time isset() lookup instead of scanning the list.
    $html_tags = array_flip($html_tags);
    // Late static binding does not work inside anonymous functions, so the
    // called class is captured here and passed along explicitly.
    $class = get_called_class();
    $splitter = function ($matches) use ($html_tags, $class) {
      return $class::split($matches[1], $html_tags, $class);
    };
    // Strip any tags that are not in the whitelist.
    return preg_replace_callback('%
      (
      <(?=[^a-zA-Z!/])  # a lone <
      |                 # or
      <!--.*?-->        # a comment
      |                 # or
      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
      |                 # or
      >                 # just a >
      )%x', $splitter, $string);
  }

  /**
   * Applies a very permissive XSS/HTML filter for admin-only use.
   *
   * Use only for fields where it is impractical to use the
   * whole filter system, but where some (mainly inline) mark-up
   * is desired (so \Drupal\Component\Utility\Html::escape() is
   * not acceptable).
   *
   * Allows all tags that can be used inside an HTML body, save
   * for scripts and styles.
   *
   * @param string $string
   *   The string to apply the filter to.
   *
   * @return string
   *   The filtered string.
   *
   * @ingroup sanitization
   *
   * @see \Drupal\Component\Utility\Xss::getAdminTagList()
   */
  public static function filterAdmin($string) {
    return static::filter($string, static::$adminTags);
  }

  /**
   * Processes an HTML tag.
   *
   * @param string $string
   *   The HTML tag to process.
   * @param array $html_tags
   *   An array where the keys are the allowed tags and the values are not
   *   used.
   * @param string $class
   *   The called class. This method is called from an anonymous function which
   *   breaks late static binding. See https://bugs.php.net/bug.php?id=66622 for
   *   more information.
   *
   * @return string
   *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
   *   version of the HTML element.
   */
  protected static function split($string, $html_tags, $class) {
    if (substr($string, 0, 1) !== '<') {
      // We matched a lone ">" character.
      return '&gt;';
    }
    elseif (strlen($string) === 1) {
      // We matched a lone "<" character.
      return '&lt;';
    }

    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
      // Seriously malformed.
      return '';
    }
    $slash = trim($matches[1]);
    // Bound by reference: groups that did not take part in the match are
    // absent from $matches, and a reference assignment creates the missing
    // index as NULL instead of raising an undefined-offset notice.
    $elem = &$matches[2];
    $attrlist = &$matches[3];
    $comment = &$matches[4];

    if ($comment) {
      // Comments are looked up in the whitelist under the pseudo-tag "!--".
      $elem = '!--';
    }

    // When in whitelist mode, an element is disallowed when not listed.
    if ($class::needsRemoval($html_tags, $elem)) {
      return '';
    }

    if ($comment) {
      return $comment;
    }

    if ($slash !== '') {
      // Closing tags never carry attributes.
      return "</$elem>";
    }

    // Is there a closing XHTML slash at the end of the attributes?
    $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
    $xhtml_slash = $count ? ' /' : '';

    // Clean up attributes.
    $attr2 = implode(' ', $class::attributes($attrlist));
    $attr2 = preg_replace('/[<>]/', '', $attr2);
    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';

    return "<$elem$attr2$xhtml_slash>";
  }

  /**
   * Processes a string of HTML attributes.
   *
   * The string is consumed from the left by a small state machine:
   * - mode 0 expects an attribute name,
   * - mode 1 expects "=" or whitespace (a valueless attribute),
   * - mode 2 expects an attribute value (double-quoted, single-quoted or
   *   unquoted).
   * "style" and "on*" attributes are parsed but never emitted. Values are
   * passed through UrlHelper::filterBadProtocol() unless the attribute is
   * "data-*", "title", "alt", "rel" or "property".
   *
   * @param string $attributes
   *   The raw attribute portion of an HTML tag.
   *
   * @return array
   *   The cleaned attributes in source order, each entry being either
   *   name="value", name='value' or a bare name.
   */
  protected static function attributes($attributes) {
    $attributes_array = [];
    $mode = 0;
    $attribute_name = '';
    $skip = FALSE;
    $skip_protocol_filtering = FALSE;

    while (strlen($attributes) != 0) {
      // Was the last operation successful?
      $working = 0;

      switch ($mode) {
        case 0:
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z][-a-zA-Z0-9]*)/', $attributes, $match)) {
            $attribute_name = strtolower($match[1]);
            $skip = ($attribute_name === 'style' || substr($attribute_name, 0, 2) === 'on');

            // Values for attributes of type URI should be filtered for
            // potentially malicious protocols (for example, an href-attribute
            // starting with "javascript:"). However, for some non-URI
            // attributes performing this filtering causes valid and safe data
            // to be mangled. We prevent this by skipping protocol filtering on
            // such attributes.
            // @see \Drupal\Component\Utility\UrlHelper::filterBadProtocol()
            // @see http://www.w3.org/TR/html4/index/attributes.html
            $skip_protocol_filtering = substr($attribute_name, 0, 5) === 'data-' || in_array($attribute_name, [
              'title',
              'alt',
              'rel',
              'property',
            ], TRUE);

            $working = $mode = 1;
            $attributes = preg_replace('/^[-a-zA-Z][-a-zA-Z0-9]*/', '', $attributes);
          }
          break;

        case 1:
          // Equals sign or valueless ("selected").
          if (preg_match('/^\s*=\s*/', $attributes)) {
            $working = 1;
            $mode = 2;
            $attributes = preg_replace('/^\s*=\s*/', '', $attributes);
            break;
          }

          if (preg_match('/^\s+/', $attributes)) {
            $working = 1;
            $mode = 0;
            if (!$skip) {
              $attributes_array[] = $attribute_name;
            }
            $attributes = preg_replace('/^\s+/', '', $attributes);
          }
          break;

        case 2:
          // Attribute value, a URL after href= for instance.
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace('/^"[^"]*"(\s+|$)/', '', $attributes);
            break;
          }

          if (preg_match("/^'([^']*)'(\s+|$)/", $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name='$thisval'";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace("/^'[^']*'(\s+|$)/", '', $attributes);
            break;
          }

          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              // Unquoted values are normalized to double-quoted output.
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attributes);
          }
          break;
      }

      if ($working === 0) {
        // Not well formed; remove and try again.
        // NOTE(review): the group below contains an empty alternative (a "|"
        // immediately followed by another "|"), so it has four branches and
        // not the three the inline comments describe. It looks unintentional,
        // but it influences how much malformed input is discarded per pass.
        // The pattern is deliberately left unchanged here; confirm against
        // the filter's test suite before altering it.
        $attributes = preg_replace('/
          ^
          (
          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
          |               # or
          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
          |               # or
          \S              # - a non-whitespace character
          )*              # any number of the above three
          \s*             # any number of whitespaces
          /x', '', $attributes);
        $mode = 0;
      }
    }

    // The attribute list ends with a valueless attribute like "selected".
    if ($mode === 1 && !$skip) {
      $attributes_array[] = $attribute_name;
    }
    return $attributes_array;
  }

  /**
   * Whether this element needs to be removed altogether.
   *
   * @param array $html_tags
   *   The allowed HTML tags, keyed by lowercase tag name.
   * @param string $elem
   *   The name of the HTML element.
   *
   * @return bool
   *   TRUE if this element needs to be removed.
   */
  protected static function needsRemoval($html_tags, $elem) {
    return !isset($html_tags[strtolower($elem)]);
  }

  /**
   * Gets the list of HTML tags allowed by Xss::filterAdmin().
   *
   * @return array
   *   The list of HTML tags allowed by filterAdmin().
   */
  public static function getAdminTagList() {
    return static::$adminTags;
  }

  /**
   * Gets the standard list of HTML tags allowed by Xss::filter().
   *
   * @return array
   *   The list of HTML tags allowed by Xss::filter().
   */
  public static function getHtmlTagList() {
    return static::$htmlTags;
  }

}