comparison core/lib/Drupal/Component/Utility/UrlHelper.php @ 0:4c8ae668cc8c

Initial import (non-working)
author Chris Cannam
date Wed, 29 Nov 2017 16:09:58 +0000
parents
children 129ea1e6d783
comparison
equal deleted inserted replaced
-1:000000000000 0:4c8ae668cc8c
1 <?php
2
3 namespace Drupal\Component\Utility;
4
/**
 * Helper class for URL based methods.
 *
 * @ingroup utility
 */
class UrlHelper {

  /**
   * The list of allowed protocols.
   *
   * Consulted by stripDangerousProtocols(); replaced wholesale by
   * setAllowedProtocols().
   *
   * @var array
   */
  protected static $allowedProtocols = ['http', 'https'];

  /**
   * Parses an array into a valid, rawurlencoded query string.
   *
   * rawurlencode() is RFC3986 compliant, and as a consequence RFC3987
   * compliant. The latter defines the required format of "URLs" in HTML5.
   * urlencode() is almost the same as rawurlencode(), except that it encodes
   * spaces as "+" instead of "%20". This makes its result non compliant to
   * RFC3986 and as a consequence non compliant to RFC3987 and as a consequence
   * not valid as a "URL" in HTML5.
   *
   * Nested arrays are flattened into "parent[child]" keys (with the brackets
   * percent-encoded). Empty nested arrays contribute nothing to the result.
   *
   * @todo Remove this function once PHP 5.4 is required as we can use just
   *   http_build_query() directly.
   *
   * @param array $query
   *   The query parameter array to be processed; for instance,
   *   \Drupal::request()->query->all().
   * @param string $parent
   *   (optional) Internal use only. Used to build the $query array key for
   *   nested items. Defaults to an empty string.
   *
   * @return string
   *   A rawurlencoded string which can be used as or appended to the URL query
   *   string.
   *
   * @ingroup php_wrappers
   */
  public static function buildQuery(array $query, $parent = '') {
    $params = [];
    // Compare against the empty string instead of testing truthiness: a
    // top-level key of 0 produces the parent "0", which PHP treats as falsy
    // and which would otherwise drop the "0[...]" prefix from nested keys.
    $has_parent = ((string) $parent !== '');

    foreach ($query as $key => $value) {
      $key = ($has_parent ? $parent . rawurlencode('[' . $key . ']') : rawurlencode($key));

      // Recurse into children.
      if (is_array($value)) {
        $child = static::buildQuery($value, $key);
        // An empty nested array yields an empty string; skip it so that the
        // result does not contain a stray "&".
        if ($child !== '') {
          $params[] = $child;
        }
      }
      // If a query parameter value is NULL, only append its key.
      elseif (!isset($value)) {
        $params[] = $key;
      }
      else {
        // For better readability of paths in query strings, we decode slashes.
        $params[] = $key . '=' . str_replace('%2F', '/', rawurlencode($value));
      }
    }

    return implode('&', $params);
  }

  /**
   * Filters a URL query parameter array to remove unwanted elements.
   *
   * @param array $query
   *   An array to be processed.
   * @param array $exclude
   *   (optional) A list of $query array keys to remove. Use "parent[child]" to
   *   exclude nested items.
   * @param string $parent
   *   Internal use only. Used to build the $query array key for nested items.
   *   When non-empty, $exclude is expected to be keyed by the names to remove
   *   (that is, already flipped by the top-level call).
   *
   * @return array
   *   An array containing query parameters.
   */
  public static function filterQueryParameters(array $query, array $exclude = [], $parent = '') {
    // If $exclude is empty, there is nothing to filter.
    if (empty($exclude)) {
      return $query;
    }

    // Only the top-level call (empty $parent) receives $exclude as a list of
    // names. Compare against the empty string rather than testing truthiness,
    // because recursing below a top-level key of 0 passes the parent "0".
    $has_parent = ((string) $parent !== '');
    if (!$has_parent) {
      $exclude = array_flip($exclude);
    }

    $params = [];
    foreach ($query as $key => $value) {
      $string_key = ($has_parent ? $parent . '[' . $key . ']' : (string) $key);
      if (isset($exclude[$string_key])) {
        continue;
      }

      if (is_array($value)) {
        $params[$key] = static::filterQueryParameters($value, $exclude, $string_key);
      }
      else {
        $params[$key] = $value;
      }
    }

    return $params;
  }

  /**
   * Parses a URL string into its path, query, and fragment components.
   *
   * This function splits both internal paths like @code node?b=c#d @endcode and
   * external URLs like @code https://example.com/a?b=c#d @endcode into their
   * component parts. See
   * @link http://tools.ietf.org/html/rfc3986#section-3 RFC 3986 @endlink for an
   * explanation of what the component parts are.
   *
   * Note that, unlike the RFC, when passed an external URL, this function
   * groups the scheme, authority, and path together into the path component.
   *
   * @param string $url
   *   The internal path or external URL string to parse.
   *
   * @return array
   *   An associative array containing:
   *   - path: The path component of $url. If $url is an external URL, this
   *     includes the scheme, authority, and path. NULL for an external URL
   *     that has nothing after the '://' delimiter.
   *   - query: An array of query parameters from $url, if they exist.
   *   - fragment: The fragment component from $url, if it exists.
   *
   * @see \Drupal\Core\Utility\LinkGenerator
   * @see http://tools.ietf.org/html/rfc3986
   *
   * @ingroup php_wrappers
   */
  public static function parse($url) {
    $options = [
      'path' => NULL,
      'query' => [],
      'fragment' => '',
    ];

    // External URLs: not using parse_url() here, so we do not have to rebuild
    // the scheme, host, and path without having any use for it.
    // The URL is considered external if it contains the '://' delimiter. Since
    // a URL can also be passed as a query argument or appear inside the
    // fragment, we check that this delimiter appears in front of both the '?'
    // query delimiter and the '#' fragment delimiter.
    $scheme_delimiter_position = strpos($url, '://');
    $query_delimiter_position = strpos($url, '?');
    $fragment_delimiter_position = strpos($url, '#');
    $is_external = $scheme_delimiter_position !== FALSE
      && ($query_delimiter_position === FALSE || $scheme_delimiter_position < $query_delimiter_position)
      && ($fragment_delimiter_position === FALSE || $scheme_delimiter_position < $fragment_delimiter_position);

    if ($is_external) {
      // Split off the fragment, if any.
      if ($fragment_delimiter_position !== FALSE) {
        list($url, $options['fragment']) = explode('#', $url, 2);
      }

      // Split off everything before the query string into 'path'. The limit
      // keeps any further '?' characters inside the query string instead of
      // silently discarding what follows them.
      $parts = explode('?', $url, 2);

      // Don't support URLs without a path, like 'http://'.
      list(, $path) = explode('://', $parts[0], 2);
      if ($path != '') {
        $options['path'] = $parts[0];
      }
      // If there is a query string, transform it into keyed query parameters.
      if (isset($parts[1])) {
        parse_str($parts[1], $options['query']);
      }
    }
    // Internal URLs.
    else {
      // parse_url() does not support relative URLs, so make it absolute. For
      // instance, the relative URL "foo/bar:1" isn't properly parsed.
      $parts = parse_url('http://example.com/' . $url);
      // Strip the leading slash that was just added.
      $options['path'] = substr($parts['path'], 1);
      if (isset($parts['query'])) {
        parse_str($parts['query'], $options['query']);
      }
      if (isset($parts['fragment'])) {
        $options['fragment'] = $parts['fragment'];
      }
    }

    return $options;
  }

  /**
   * Encodes a Drupal path for use in a URL.
   *
   * For aesthetic reasons slashes are not escaped.
   *
   * @param string $path
   *   The Drupal path to encode.
   *
   * @return string
   *   The encoded path.
   */
  public static function encodePath($path) {
    return str_replace('%2F', '/', rawurlencode($path));
  }

  /**
   * Determines whether a path is external to Drupal.
   *
   * An example of an external path is http://example.com. If a path cannot be
   * assessed by Drupal's menu handler, then we must treat it as potentially
   * insecure.
   *
   * @param string $path
   *   The internal path or external URL being linked to, such as "node/34" or
   *   "http://example.com/foo".
   *
   * @return bool
   *   TRUE or FALSE, where TRUE indicates an external path.
   */
  public static function isExternal($path) {
    // Replacing "\" with "/" below does not change string length, so the
    // colon position computed here stays valid for the normalized path.
    $colonpos = strpos($path, ':');
    // Some browsers treat \ as / so normalize to forward slashes.
    $path = str_replace('\\', '/', $path);
    // If the path starts with 2 slashes then it is always considered an
    // external URL without an explicit protocol part.
    return (strpos($path, '//') === 0)
      // Leading control characters may be ignored or mishandled by browsers,
      // so assume such a path may lead to an external location. The \p{C}
      // character class matches all UTF-8 control, unassigned, and private
      // characters.
      || (preg_match('/^\p{C}/u', $path) !== 0)
      // Avoid calling static::stripDangerousProtocols() if there is any slash
      // (/), hash (#) or question_mark (?) before the colon (:) occurrence -
      // if any - as this would clearly mean it is not a URL.
      || ($colonpos !== FALSE
        && !preg_match('![/?#]!', substr($path, 0, $colonpos))
        && static::stripDangerousProtocols($path) === $path);
  }

  /**
   * Determines if an external URL points to this installation.
   *
   * Only the host and the path are compared; the path comparison is
   * case-insensitive and only matches on whole path segments.
   *
   * @param string $url
   *   A string containing an external URL, such as "http://example.com/foo".
   * @param string $base_url
   *   The base URL string to check against, such as "http://example.com/"
   *
   * @return bool
   *   TRUE if the URL has the same domain and base path.
   *
   * @throws \InvalidArgumentException
   *   Exception thrown when either $url or $base_url is not fully qualified.
   */
  public static function externalIsLocal($url, $base_url) {
    $url_parts = parse_url($url);
    $base_parts = parse_url($base_url);

    if (empty($base_parts['host']) || empty($url_parts['host'])) {
      throw new \InvalidArgumentException('A path was passed when a fully qualified domain was expected.');
    }

    if (!isset($url_parts['path']) || !isset($base_parts['path'])) {
      return (!isset($base_parts['path']) || $base_parts['path'] == '/')
        && ($url_parts['host'] == $base_parts['host']);
    }

    if ($url_parts['host'] != $base_parts['host']) {
      return FALSE;
    }

    $url_path = $url_parts['path'];
    $base_path = $base_parts['path'];
    if (stripos($url_path, $base_path) !== 0) {
      return FALSE;
    }

    // The URL path starts with the base path. Make sure the match ends on a
    // path segment boundary so that a base path given without a trailing
    // slash (for example "/drupal") does not match "/drupal-other/...".
    $base_length = strlen($base_path);
    return substr($base_path, -1) === '/'
      || strlen($url_path) === $base_length
      || $url_path[$base_length] === '/';
  }

  /**
   * Processes an HTML attribute value and strips dangerous protocols from URLs.
   *
   * @param string $string
   *   The string with the attribute value.
   *
   * @return string
   *   Cleaned up and HTML-escaped version of $string.
   */
  public static function filterBadProtocol($string) {
    // Get the plain text representation of the attribute value (i.e. its
    // meaning).
    $string = Html::decodeEntities($string);
    return Html::escape(static::stripDangerousProtocols($string));
  }

  /**
   * Gets the allowed protocols.
   *
   * @return array
   *   An array of protocols, for example http, https and irc.
   */
  public static function getAllowedProtocols() {
    return static::$allowedProtocols;
  }

  /**
   * Sets the allowed protocols.
   *
   * @param array $protocols
   *   An array of protocols, for example http, https and irc. This replaces
   *   the current list; calling without arguments empties it.
   */
  public static function setAllowedProtocols(array $protocols = []) {
    static::$allowedProtocols = $protocols;
  }

  /**
   * Strips dangerous protocols (for example, 'javascript:') from a URI.
   *
   * This function must be called for all URIs within user-entered input prior
   * to being output to an HTML attribute value. It is often called as part of
   * \Drupal\Component\Utility\UrlHelper::filterBadProtocol() or
   * \Drupal\Component\Utility\Xss::filter(), but those functions return an
   * HTML-encoded string, so this function can be called independently when the
   * output needs to be a plain-text string for passing to functions that will
   * call Html::escape() separately. The exact behavior depends on the value:
   * - If the value is a well-formed (per RFC 3986) relative URL or
   *   absolute URL that does not use a dangerous protocol (like
   *   "javascript:"), then the URL remains unchanged. This includes all
   *   URLs generated via Url::toString() and UrlGeneratorTrait::url().
   * - If the value is a well-formed absolute URL with a dangerous protocol,
   *   the protocol is stripped. This process is repeated on the remaining URL
   *   until it is stripped down to a safe protocol.
   * - If the value is not a well-formed URL, the same sanitization behavior as
   *   for well-formed URLs will be invoked, which strips most substrings that
   *   precede a ":". The result can be used in URL attributes such as "href"
   *   or "src" (only after calling Html::escape() separately), but this may not
   *   produce valid HTML (for example, malformed URLs within "href" attributes
   *   fail HTML validation). This can be avoided by using
   *   Url::fromUri($possibly_not_a_url)->toString(), which either throws an
   *   exception or returns a well-formed URL.
   *
   * @param string $uri
   *   A plain-text URI that might contain dangerous protocols.
   *
   * @return string
   *   A plain-text URI stripped of dangerous protocols. As with all plain-text
   *   strings, this return value must not be output to an HTML page without
   *   being sanitized first. However, it can be passed to functions
   *   expecting plain-text strings.
   *
   * @see \Drupal\Component\Utility\Html::escape()
   * @see \Drupal\Core\Url::toString()
   * @see \Drupal\Core\Routing\UrlGeneratorTrait::url()
   * @see \Drupal\Core\Url::fromUri()
   */
  public static function stripDangerousProtocols($uri) {
    // Flip the list so that protocol lookups are key-based.
    $allowed_protocols = array_flip(static::$allowedProtocols);

    // Iteratively remove any invalid protocol found.
    do {
      $before = $uri;
      $colonpos = strpos($uri, ':');
      // A colon at position 0 has no protocol in front of it, and FALSE (no
      // colon at all) also fails this test.
      if ($colonpos > 0) {
        // We found a colon, possibly a protocol. Verify.
        $protocol = substr($uri, 0, $colonpos);
        // If a colon is preceded by a slash, question mark or hash, it cannot
        // possibly be part of the URL scheme. This must be a relative URL, which
        // inherits the (safe) protocol of the base document.
        if (preg_match('![/?#]!', $protocol)) {
          break;
        }
        // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
        // (URI Comparison) scheme comparison must be case-insensitive.
        if (!isset($allowed_protocols[strtolower($protocol)])) {
          $uri = substr($uri, $colonpos + 1);
        }
      }
    } while ($before !== $uri);

    return $uri;
  }

  /**
   * Verifies the syntax of the given URL.
   *
   * This function should only be used on actual URLs. It should not be used for
   * Drupal menu paths, which can contain arbitrary characters.
   * Valid values per RFC 3986.
   *
   * The whole string must match: a trailing newline makes the URL invalid.
   *
   * @param string $url
   *   The URL to verify.
   * @param bool $absolute
   *   Whether the URL is absolute (beginning with a scheme such as "http:").
   *
   * @return bool
   *   TRUE if the URL is in a valid format, FALSE otherwise.
   */
  public static function isValid($url, $absolute = FALSE) {
    // Both patterns use the D (PCRE_DOLLAR_ENDONLY) modifier so that "$" only
    // matches at the very end of the subject, not before a final newline.
    if ($absolute) {
      return (bool) preg_match("
        /^                                                      # Start at the beginning of the text
        (?:ftp|https?|feed):\/\/                                # Look for ftp, http, https or feed schemes
        (?:                                                     # Userinfo (optional) which is typically
          (?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)*      # a username or a username and password
          (?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@          # combination
        )?
        (?:
          (?:[a-z0-9\-\.]|%[0-9a-f]{2})+                        # A domain name or a IPv4 address
          |(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\])         # or a well formed IPv6 address
        )
        (?::[0-9]+)?                                            # Server port number (optional)
        (?:[\/|\?]
          (?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})   # The path and query (optional)
        *)?
      $/xiD", $url);
    }
    else {
      return (bool) preg_match("/^(?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})+$/iD", $url);
    }
  }

}