Chris@0: 'quot', // quotation mark Chris@0: 38 => 'amp', // ampersand Chris@0: 60 => 'lt', // less-than sign Chris@0: 62 => 'gt', // greater-than sign Chris@0: ]; Chris@0: Chris@0: /** Chris@0: * Current encoding for escaping. If not UTF-8, we convert strings from this encoding Chris@0: * pre-escaping and back to this encoding post-escaping. Chris@0: * Chris@0: * @var string Chris@0: */ Chris@0: protected $encoding = 'utf-8'; Chris@0: Chris@0: /** Chris@0: * Holds the value of the special flags passed as second parameter to Chris@0: * htmlspecialchars(). Chris@0: * Chris@0: * @var int Chris@0: */ Chris@0: protected $htmlSpecialCharsFlags; Chris@0: Chris@0: /** Chris@0: * Static Matcher which escapes characters for HTML Attribute contexts Chris@0: * Chris@0: * @var callable Chris@0: */ Chris@0: protected $htmlAttrMatcher; Chris@0: Chris@0: /** Chris@0: * Static Matcher which escapes characters for Javascript contexts Chris@0: * Chris@0: * @var callable Chris@0: */ Chris@0: protected $jsMatcher; Chris@0: Chris@0: /** Chris@0: * Static Matcher which escapes characters for CSS Attribute contexts Chris@0: * Chris@0: * @var callable Chris@0: */ Chris@0: protected $cssMatcher; Chris@0: Chris@0: /** Chris@0: * List of all encoding supported by this class Chris@0: * Chris@0: * @var array Chris@0: */ Chris@0: protected $supportedEncodings = [ Chris@0: 'iso-8859-1', 'iso8859-1', 'iso-8859-5', 'iso8859-5', Chris@0: 'iso-8859-15', 'iso8859-15', 'utf-8', 'cp866', Chris@0: 'ibm866', '866', 'cp1251', 'windows-1251', Chris@0: 'win-1251', '1251', 'cp1252', 'windows-1252', Chris@0: '1252', 'koi8-r', 'koi8-ru', 'koi8r', Chris@0: 'big5', '950', 'gb2312', '936', Chris@0: 'big5-hkscs', 'shift_jis', 'sjis', 'sjis-win', Chris@0: 'cp932', '932', 'euc-jp', 'eucjp', Chris@0: 'eucjp-win', 'macroman' Chris@0: ]; Chris@0: Chris@0: /** Chris@0: * Constructor: Single parameter allows setting of global encoding for use by Chris@0: * the current object. Chris@0: * Chris@0: * @param string $encoding Chris@0: * @throws Exception\InvalidArgumentException Chris@0: */ Chris@0: public function __construct($encoding = null) Chris@0: { Chris@0: if ($encoding !== null) { Chris@15: if (! is_string($encoding)) { Chris@15: throw new Exception\InvalidArgumentException( Chris@15: get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding) Chris@15: ); Chris@15: } Chris@0: if ($encoding === '') { Chris@0: throw new Exception\InvalidArgumentException( Chris@0: get_class($this) . ' constructor parameter does not allow a blank value' Chris@0: ); Chris@0: } Chris@0: Chris@0: $encoding = strtolower($encoding); Chris@15: if (! in_array($encoding, $this->supportedEncodings)) { Chris@0: throw new Exception\InvalidArgumentException( Chris@0: 'Value of \'' . $encoding . '\' passed to ' . get_class($this) Chris@0: . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()' Chris@0: ); Chris@0: } Chris@0: Chris@0: $this->encoding = $encoding; Chris@0: } Chris@0: Chris@0: // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences. Chris@0: $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE; Chris@0: Chris@0: // set matcher callbacks Chris@0: $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher']; Chris@0: $this->jsMatcher = [$this, 'jsMatcher']; Chris@0: $this->cssMatcher = [$this, 'cssMatcher']; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Return the encoding that all output/input is expected to be encoded in. Chris@0: * Chris@0: * @return string Chris@0: */ Chris@0: public function getEncoding() Chris@0: { Chris@0: return $this->encoding; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Escape a string for the HTML Body context where there are very few characters Chris@0: * of special meaning. Internally this will use htmlspecialchars(). Chris@0: * Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: public function escapeHtml($string) Chris@0: { Chris@0: return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Escape a string for the HTML Attribute context. We use an extended set of characters Chris@0: * to escape that are not covered by htmlspecialchars() to cover cases where an attribute Chris@0: * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE). Chris@0: * Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: public function escapeHtmlAttr($string) Chris@0: { Chris@0: $string = $this->toUtf8($string); Chris@0: if ($string === '' || ctype_digit($string)) { Chris@0: return $string; Chris@0: } Chris@0: Chris@0: $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string); Chris@0: return $this->fromUtf8($result); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Escape a string for the Javascript context. This does not use json_encode(). An extended Chris@0: * set of characters are escaped beyond ECMAScript's rules for Javascript literal string Chris@0: * escaping in order to prevent misinterpretation of Javascript as HTML leading to the Chris@0: * injection of special characters and entities. The escaping used should be tolerant Chris@0: * of cases where HTML escaping was not applied on top of Javascript escaping correctly. Chris@0: * Backslash escaping is not used as it still leaves the escaped character as-is and so Chris@0: * is not useful in a HTML context. Chris@0: * Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: public function escapeJs($string) Chris@0: { Chris@0: $string = $this->toUtf8($string); Chris@0: if ($string === '' || ctype_digit($string)) { Chris@0: return $string; Chris@0: } Chris@0: Chris@0: $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string); Chris@0: return $this->fromUtf8($result); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Escape a string for the URI or Parameter contexts. This should not be used to escape Chris@0: * an entire URI - only a subcomponent being inserted. The function is a simple proxy Chris@0: * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely. Chris@0: * Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: public function escapeUrl($string) Chris@0: { Chris@0: return rawurlencode($string); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Escape a string for the CSS context. CSS escaping can be applied to any string being Chris@0: * inserted into CSS and escapes everything except alphanumerics. Chris@0: * Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: public function escapeCss($string) Chris@0: { Chris@0: $string = $this->toUtf8($string); Chris@0: if ($string === '' || ctype_digit($string)) { Chris@0: return $string; Chris@0: } Chris@0: Chris@0: $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string); Chris@0: return $this->fromUtf8($result); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Callback function for preg_replace_callback that applies HTML Attribute Chris@0: * escaping to all matches. Chris@0: * Chris@0: * @param array $matches Chris@0: * @return string Chris@0: */ Chris@0: protected function htmlAttrMatcher($matches) Chris@0: { Chris@0: $chr = $matches[0]; Chris@0: $ord = ord($chr); Chris@0: Chris@0: /** Chris@0: * The following replaces characters undefined in HTML with the Chris@0: * hex entity for the Unicode replacement character. Chris@0: */ Chris@0: if (($ord <= 0x1f && $chr != "\t" && $chr != "\n" && $chr != "\r") Chris@0: || ($ord >= 0x7f && $ord <= 0x9f) Chris@0: ) { Chris@0: return '�'; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Check if the current character to escape has a name entity we should Chris@0: * replace it with while grabbing the integer value of the character. Chris@0: */ Chris@0: if (strlen($chr) > 1) { Chris@0: $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8'); Chris@0: } Chris@0: Chris@0: $hex = bin2hex($chr); Chris@0: $ord = hexdec($hex); Chris@0: if (isset(static::$htmlNamedEntityMap[$ord])) { Chris@0: return '&' . static::$htmlNamedEntityMap[$ord] . ';'; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Per OWASP recommendations, we'll use upper hex entities Chris@0: * for any other characters where a named entity does not exist. Chris@0: */ Chris@0: if ($ord > 255) { Chris@0: return sprintf('&#x%04X;', $ord); Chris@0: } Chris@0: return sprintf('&#x%02X;', $ord); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Callback function for preg_replace_callback that applies Javascript Chris@0: * escaping to all matches. Chris@0: * Chris@0: * @param array $matches Chris@0: * @return string Chris@0: */ Chris@0: protected function jsMatcher($matches) Chris@0: { Chris@0: $chr = $matches[0]; Chris@0: if (strlen($chr) == 1) { Chris@0: return sprintf('\\x%02X', ord($chr)); Chris@0: } Chris@0: $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8'); Chris@0: $hex = strtoupper(bin2hex($chr)); Chris@0: if (strlen($hex) <= 4) { Chris@0: return sprintf('\\u%04s', $hex); Chris@0: } Chris@0: $highSurrogate = substr($hex, 0, 4); Chris@0: $lowSurrogate = substr($hex, 4, 4); Chris@0: return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Callback function for preg_replace_callback that applies CSS Chris@0: * escaping to all matches. Chris@0: * Chris@0: * @param array $matches Chris@0: * @return string Chris@0: */ Chris@0: protected function cssMatcher($matches) Chris@0: { Chris@0: $chr = $matches[0]; Chris@0: if (strlen($chr) == 1) { Chris@0: $ord = ord($chr); Chris@0: } else { Chris@0: $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8'); Chris@0: $ord = hexdec(bin2hex($chr)); Chris@0: } Chris@0: return sprintf('\\%X ', $ord); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Converts a string to UTF-8 from the base encoding. The base encoding is set via this Chris@0: * class' constructor. Chris@0: * Chris@0: * @param string $string Chris@0: * @throws Exception\RuntimeException Chris@0: * @return string Chris@0: */ Chris@0: protected function toUtf8($string) Chris@0: { Chris@0: if ($this->getEncoding() === 'utf-8') { Chris@0: $result = $string; Chris@0: } else { Chris@0: $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding()); Chris@0: } Chris@0: Chris@15: if (! $this->isUtf8($result)) { Chris@0: throw new Exception\RuntimeException( Chris@0: sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result) Chris@0: ); Chris@0: } Chris@0: Chris@0: return $result; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Converts a string from UTF-8 to the base encoding. The base encoding is set via this Chris@0: * class' constructor. Chris@0: * @param string $string Chris@0: * @return string Chris@0: */ Chris@0: protected function fromUtf8($string) Chris@0: { Chris@0: if ($this->getEncoding() === 'utf-8') { Chris@0: return $string; Chris@0: } Chris@0: Chris@0: return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8'); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Checks if a given string appears to be valid UTF-8 or not. Chris@0: * Chris@0: * @param string $string Chris@0: * @return bool Chris@0: */ Chris@0: protected function isUtf8($string) Chris@0: { Chris@0: return ($string === '' || preg_match('/^./su', $string)); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Encoding conversion helper which wraps iconv and mbstring where they exist or throws Chris@0: * and exception where neither is available. Chris@0: * Chris@0: * @param string $string Chris@0: * @param string $to Chris@0: * @param array|string $from Chris@0: * @throws Exception\RuntimeException Chris@0: * @return string Chris@0: */ Chris@0: protected function convertEncoding($string, $to, $from) Chris@0: { Chris@0: if (function_exists('iconv')) { Chris@0: $result = iconv($from, $to, $string); Chris@0: } elseif (function_exists('mb_convert_encoding')) { Chris@0: $result = mb_convert_encoding($string, $to, $from); Chris@0: } else { Chris@0: throw new Exception\RuntimeException( Chris@0: get_class($this) Chris@0: . ' requires either the iconv or mbstring extension to be installed' Chris@0: . ' when escaping for non UTF-8 strings.' Chris@0: ); Chris@0: } Chris@0: Chris@0: if ($result === false) { Chris@0: return ''; // return non-fatal blank string on encoding errors from users Chris@0: } Chris@0: return $result; Chris@0: } Chris@0: }