isophonics-drupal-site: core/lib/Drupal/Component/Utility/Xss.php annotate

annotate core/lib/Drupal/Component/Utility/Xss.php @ 19:fa3358dc1485 tip

Add ndrum files

author	Chris Cannam
date	Wed, 28 Aug 2019 13:14:47 +0100
parents	4c8ae668cc8c
children

rev	line source
Chris@0	1 <?php
Chris@0	2
Chris@0	3 namespace Drupal\Component\Utility;
Chris@0	4
/**
 * Provides helper to filter for cross-site scripting.
 *
 * @ingroup utility
 */
class Xss {

  /**
   * The list of HTML tags allowed by filterAdmin().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filterAdmin()
   */
  protected static $adminTags = ['a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu', 'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr'];

  /**
   * The default list of HTML tags allowed by filter().
   *
   * @var array
   *
   * @see \Drupal\Component\Utility\Xss::filter()
   */
  protected static $htmlTags = ['a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd'];

  /**
   * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
   *
   * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
   * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
   *
   * This code does four things:
   * - Removes characters and constructs that can trick browsers.
   * - Makes sure all HTML entities are well-formed.
   * - Makes sure all HTML tags and attributes are well-formed.
   * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
   *   javascript:).
   *
   * @param string $string
   *   The string with raw HTML in it. It will be stripped of everything that
   *   can cause an XSS attack.
   * @param array|null $html_tags
   *   An array of HTML tags. When omitted or NULL, the default list in
   *   static::$htmlTags is used.
   *
   * @return string
   *   An XSS safe version of $string, or an empty string if $string is not
   *   valid UTF-8.
   *
   * @see \Drupal\Component\Utility\Unicode::validateUtf8()
   *
   * @ingroup sanitization
   */
  public static function filter($string, array $html_tags = NULL) {
    if (is_null($html_tags)) {
      $html_tags = static::$htmlTags;
    }
    // Only operate on valid UTF-8 strings. This is necessary to prevent cross
    // site scripting issues on Internet Explorer 6.
    if (!Unicode::validateUtf8($string)) {
      return '';
    }
    // Remove NULL characters (ignored by some browsers).
    $string = str_replace(chr(0), '', $string);
    // Remove Netscape 4 JS entities.
    $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

    // Defuse all HTML entities: every ampersand is escaped first, so that only
    // the well-formed entities matched below are turned back into entities.
    $string = str_replace('&', '&amp;', $string);
    // Change back only well-formed entities in our whitelist:
    // Decimal numeric entities.
    $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
    // Hexadecimal numeric entities.
    $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
    // Named entities.
    $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
    // Flip so that tag names become keys, which is what split() and
    // needsRemoval() look up.
    $html_tags = array_flip($html_tags);
    // Late static binding does not work inside anonymous functions.
    $class = get_called_class();
    $splitter = function ($matches) use ($html_tags, $class) {
      return $class::split($matches[1], $html_tags, $class);
    };
    // Strip any tags that are not in the whitelist.
    return preg_replace_callback('%
      (
      <(?=[^a-zA-Z!/])  # a lone <
      |                 # or
      <!--.*?-->        # a comment
      |                 # or
      <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
      |                 # or
      >                 # just a >
      )%x', $splitter, $string);
  }

  /**
   * Applies a very permissive XSS/HTML filter for admin-only use.
   *
   * Use only for fields where it is impractical to use the
   * whole filter system, but where some (mainly inline) mark-up
   * is desired (so \Drupal\Component\Utility\Html::escape() is
   * not acceptable).
   *
   * Allows all tags that can be used inside an HTML body, save
   * for scripts and styles.
   *
   * @param string $string
   *   The string to apply the filter to.
   *
   * @return string
   *   The filtered string.
   *
   * @ingroup sanitization
   *
   * @see \Drupal\Component\Utility\Xss::getAdminTagList()
   */
  public static function filterAdmin($string) {
    return static::filter($string, static::$adminTags);
  }

  /**
   * Processes an HTML tag.
   *
   * @param string $string
   *   The HTML tag to process.
   * @param array $html_tags
   *   An array where the keys are the allowed tags and the values are not
   *   used.
   * @param string $class
   *   The called class. This method is called from an anonymous function which
   *   breaks late static binding. See https://bugs.php.net/bug.php?id=66622 for
   *   more information.
   *
   * @return string
   *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
   *   version of the HTML element.
   */
  protected static function split($string, $html_tags, $class) {
    if (substr($string, 0, 1) != '<') {
      // We matched a lone ">" character.
      return '&gt;';
    }
    elseif (strlen($string) == 1) {
      // We matched a lone "<" character.
      return '&lt;';
    }

    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
      // Seriously malformed.
      return '';
    }
    $slash = trim($matches[1]);
    // Capture groups that did not participate in the match may be absent from
    // $matches. Binding by reference creates a missing index as NULL instead
    // of triggering an undefined-index notice.
    $elem = &$matches[2];
    $attrlist = &$matches[3];
    $comment = &$matches[4];

    if ($comment) {
      // Comments are looked up in the tag list under the pseudo-tag "!--".
      $elem = '!--';
    }

    // When in whitelist mode, an element is disallowed when not listed.
    if ($class::needsRemoval($html_tags, $elem)) {
      return '';
    }

    if ($comment) {
      return $comment;
    }

    if ($slash != '') {
      // Closing tags never carry attributes.
      return "</$elem>";
    }

    // Is there a closing XHTML slash at the end of the attributes?
    $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
    $xhtml_slash = $count ? ' /' : '';

    // Clean up attributes.
    $attr2 = implode(' ', $class::attributes($attrlist));
    $attr2 = preg_replace('/[<>]/', '', $attr2);
    $attr2 = strlen($attr2) ? ' ' . $attr2 : '';

    return "<$elem$attr2$xhtml_slash>";
  }

  /**
   * Processes a string of HTML attributes.
   *
   * The string is consumed from the left by a small state machine:
   * - mode 0 expects an attribute name,
   * - mode 1 expects an equals sign or whitespace (valueless attribute),
   * - mode 2 expects an attribute value (double-quoted, single-quoted or
   *   unquoted).
   * Whenever the current mode cannot consume anything, the malformed leading
   * chunk is discarded and parsing restarts in mode 0.
   *
   * "style" and "on*" attributes are dropped. Values are passed through
   * \Drupal\Component\Utility\UrlHelper::filterBadProtocol() unless the
   * attribute is "data-*", "title", "alt", "rel" or "property".
   *
   * @param string $attributes
   *   The html attribute to process.
   *
   * @return array
   *   The cleaned up attributes, one 'name', 'name="value"' or "name='value'"
   *   string per retained attribute.
   */
  protected static function attributes($attributes) {
    $attributes_array = [];
    $mode = 0;
    $attribute_name = '';
    $skip = FALSE;
    $skip_protocol_filtering = FALSE;

    while (strlen($attributes) != 0) {
      // Was the last operation successful?
      $working = 0;

      switch ($mode) {
        case 0:
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z][-a-zA-Z0-9]*)/', $attributes, $match)) {
            $attribute_name = strtolower($match[1]);
            $skip = ($attribute_name == 'style' || substr($attribute_name, 0, 2) == 'on');

            // Values for attributes of type URI should be filtered for
            // potentially malicious protocols (for example, an href-attribute
            // starting with "javascript:"). However, for some non-URI
            // attributes performing this filtering causes valid and safe data
            // to be mangled. We prevent this by skipping protocol filtering on
            // such attributes.
            // @see \Drupal\Component\Utility\UrlHelper::filterBadProtocol()
            // @see http://www.w3.org/TR/html4/index/attributes.html
            $skip_protocol_filtering = substr($attribute_name, 0, 5) === 'data-' || in_array($attribute_name, [
              'title',
              'alt',
              'rel',
              'property',
            ], TRUE);

            $working = $mode = 1;
            $attributes = preg_replace('/^[-a-zA-Z][-a-zA-Z0-9]*/', '', $attributes);
          }
          break;

        case 1:
          // Equals sign or valueless ("selected").
          if (preg_match('/^\s*=\s*/', $attributes)) {
            $working = 1;
            $mode = 2;
            $attributes = preg_replace('/^\s*=\s*/', '', $attributes);
            break;
          }

          if (preg_match('/^\s+/', $attributes)) {
            $working = 1;
            $mode = 0;
            if (!$skip) {
              $attributes_array[] = $attribute_name;
            }
            $attributes = preg_replace('/^\s+/', '', $attributes);
          }
          break;

        case 2:
          // Attribute value, a URL after href= for instance.
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace('/^"[^"]*"(\s+|$)/', '', $attributes);
            break;
          }

          if (preg_match("/^'([^']*)'(\s+|$)/", $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            if (!$skip) {
              $attributes_array[] = "$attribute_name='$thisval'";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace("/^'[^']*'(\s+|$)/", '', $attributes);
            break;
          }

          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attributes, $match)) {
            $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);

            // Unquoted values are re-emitted wrapped in double quotes.
            if (!$skip) {
              $attributes_array[] = "$attribute_name=\"$thisval\"";
            }
            $working = 1;
            $mode = 0;
            $attributes = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attributes);
          }
          break;
      }

      if ($working == 0) {
        // Not well formed; remove and try again.
        $attributes = preg_replace('/
          ^
          (
          "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
          |               # or
          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
          |               # or
          \S              # - a non-whitespace character
          )*              # any number of the above three
          \s*             # any number of whitespaces
          /x', '', $attributes);
        $mode = 0;
      }
    }

    // The attribute list ends with a valueless attribute like "selected".
    if ($mode == 1 && !$skip) {
      $attributes_array[] = $attribute_name;
    }
    return $attributes_array;
  }

  /**
   * Whether this element needs to be removed altogether.
   *
   * @param array $html_tags
   *   The list of HTML tags, keyed by lower-case tag name.
   * @param string $elem
   *   The name of the HTML element.
   *
   * @return bool
   *   TRUE if this element needs to be removed.
   */
  protected static function needsRemoval($html_tags, $elem) {
    return !isset($html_tags[strtolower($elem)]);
  }

  /**
   * Gets the list of HTML tags allowed by Xss::filterAdmin().
   *
   * @return array
   *   The list of HTML tags allowed by filterAdmin().
   */
  public static function getAdminTagList() {
    return static::$adminTags;
  }

  /**
   * Gets the standard list of HTML tags allowed by Xss::filter().
   *
   * @return array
   *   The list of HTML tags allowed by Xss::filter().
   */
  public static function getHtmlTagList() {
    return static::$htmlTags;
  }

}

Mercurial > hg > isophonics-drupal-site

annotate core/lib/Drupal/Component/Utility/Xss.php @ 19:fa3358dc1485 tip