Chris@0
|
1 <?php
|
Chris@0
|
2 /**
|
Chris@0
|
3 * Zend Framework (http://framework.zend.com/)
|
Chris@0
|
4 *
|
Chris@0
|
5 * @link http://github.com/zendframework/zf2 for the canonical source repository
|
Chris@0
|
6 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
|
Chris@0
|
7 * @license http://framework.zend.com/license/new-bsd New BSD License
|
Chris@0
|
8 */
|
Chris@0
|
9
|
Chris@0
|
10 namespace Zend\Escaper;
|
Chris@0
|
11
|
Chris@0
|
/**
 * Context specific methods for use in secure output escaping.
 *
 * One escaping method is provided per output context (HTML body, HTML
 * attribute, Javascript, CSS, URL). Except for escapeHtml() and escapeUrl(),
 * which delegate directly to PHP built-ins, input is converted to UTF-8,
 * escaped character by character, and converted back to the configured
 * encoding.
 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation which is restricted to those named
     * entities that XML supports. Using HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = [
        34 => 'quot', // quotation mark
        38 => 'amp',  // ampersand
        60 => 'lt',   // less-than sign
        62 => 'gt',   // greater-than sign
    ];

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars().
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags;

    /**
     * Static Matcher which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Static Matcher which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Static Matcher which escapes characters for CSS Attribute contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class (lower-case names).
     *
     * @var array
     */
    protected $supportedEncodings = [
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman'
    ];

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object.
     *
     * The encoding name is lower-cased and must then exactly match one of the
     * entries in $supportedEncodings. When omitted (null), 'utf-8' is used.
     *
     * @param string $encoding
     * @throws Exception\InvalidArgumentException When the value is not a string,
     *     is blank, or is not a supported encoding.
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            if (! is_string($encoding)) {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding)
                );
            }
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            $encoding = strtolower($encoding);
            // Strict comparison is required: the list contains numeric strings
            // ('866', '1251', ...) and a loose comparison would also accept any
            // numerically equal spelling such as '0866' or '8.66e2'.
            if (! in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences.
        $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE;

        // set matcher callbacks
        $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher'];
        $this->jsMatcher       = [$this, 'jsMatcher'];
        $this->cssMatcher      = [$this, 'cssMatcher'];
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars().
     *
     * @param string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * Everything except ASCII alphanumerics, comma, period, hyphen and underscore
     * is escaped. Empty and all-digit strings are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException When the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * Everything except ASCII alphanumerics, comma, period and underscore is
     * escaped. Empty and all-digit strings are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException When the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
     *
     * @param string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * Empty and all-digit strings are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException When the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * @param array $matches Match array; element 0 is a single UTF-8 character.
     * @return string The named or hexadecimal entity replacing the character.
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];

        /**
         * Resolve the full Unicode code point first. For a multi-byte character
         * ord() would only see the lead byte, which would hide the C1 control
         * range (U+0080 - U+009F) from the check below.
         */
        if (strlen($chr) > 1) {
            $ord = hexdec(bin2hex($this->convertEncoding($chr, 'UTF-32BE', 'UTF-8')));
        } else {
            $ord = ord($chr);
        }

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character. The entity form
         * (rather than the raw character) is used so that the result remains
         * representable when converted back to a non UTF-8 encoding.
         */
        $isC0Control = $ord <= 0x1f && $chr !== "\t" && $chr !== "\n" && $chr !== "\r";
        $isDelOrC1   = $ord >= 0x7f && $ord <= 0x9f;
        if ($isC0Control || $isDelOrC1) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a name entity we should
         * replace it with.
         */
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * Single-byte characters become \xHH. Multi-byte characters are converted
     * to UTF-16BE and emitted as one \uHHHH unit, or as two units (a surrogate
     * pair) when the conversion yields more than four hex digits.
     *
     * @param array $matches Match array; element 0 is a single UTF-8 character.
     * @return string
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            return sprintf('\\x%02X', ord($chr));
        }
        $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
        $hex = strtoupper(bin2hex($chr));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }
        $highSurrogate = substr($hex, 0, 4);
        $lowSurrogate = substr($hex, 4, 4);
        return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * The character is replaced by a backslash, its code point in upper-case
     * hexadecimal, and a trailing space.
     *
     * @param array $matches Match array; element 0 is a single UTF-8 character.
     * @return string
     */
    protected function cssMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            $ord = ord($chr);
        } else {
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
            $ord = hexdec(bin2hex($chr));
        }
        return sprintf('\\%X ', $ord);
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @throws Exception\RuntimeException When the (converted) string is not valid UTF-8.
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (! $this->isUtf8($result)) {
            throw new Exception\RuntimeException(
                sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
            );
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * An empty string is accepted; otherwise the string is matched against a
     * pattern using the PCRE "u" modifier.
     *
     * @param string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        return ($string === '' || preg_match('/^./su', $string));
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available.
     *
     * iconv is preferred when both extensions are present. A failed conversion
     * yields an empty string rather than an error.
     *
     * @param string $string
     * @param string $to
     * @param array|string $from Source encoding. NOTE(review): an array is only
     *     meaningful for the mbstring fallback; every call in this class passes a string.
     * @throws Exception\RuntimeException When neither iconv nor mbstring is available.
     * @return string
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}
|