annotate core/lib/Drupal/Component/Utility/UrlHelper.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 129ea1e6d783
children
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Component\Utility;
Chris@0 4
/**
 * Helper class for URL-based methods.
 *
 * @ingroup utility
 */
class UrlHelper {

  /**
   * The list of allowed protocols.
   *
   * Consulted by stripDangerousProtocols(); any scheme not in this list is
   * removed from a URI.
   *
   * @var array
   */
  protected static $allowedProtocols = ['http', 'https'];

  /**
   * Parses an array into a valid, rawurlencoded query string.
   *
   * Function rawurlencode() is RFC3986 compliant, and as a consequence RFC3987
   * compliant. The latter defines the required format of "URLs" in HTML5.
   * urlencode() is almost the same as rawurlencode(), except that it encodes
   * spaces as "+" instead of "%20". This makes its result non compliant to
   * RFC3986 and as a consequence non compliant to RFC3987 and as a consequence
   * not valid as a "URL" in HTML5.
   *
   * @todo Remove this function once PHP 5.4 is required as we can use just
   *   http_build_query() directly.
   *
   * @param array $query
   *   The query parameter array to be processed; for instance,
   *   \Drupal::request()->query->all().
   * @param string $parent
   *   (optional) Internal use only. Used to build the $query array key for
   *   nested items. Defaults to an empty string.
   *
   * @return string
   *   A rawurlencoded string which can be used as or appended to the URL query
   *   string.
   *
   * @ingroup php_wrappers
   */
  public static function buildQuery(array $query, $parent = '') {
    $params = [];
    // Test for an empty string instead of relying on truthiness: a parent key
    // of "0" is falsy in PHP but is still a real parent whose children must be
    // emitted as "0[child]".
    $has_parent = ((string) $parent !== '');

    foreach ($query as $key => $value) {
      $key = $has_parent ? $parent . rawurlencode('[' . $key . ']') : rawurlencode($key);

      // Recurse into children.
      if (is_array($value)) {
        $params[] = static::buildQuery($value, $key);
      }
      // If a query parameter value is NULL, only append its key.
      elseif (!isset($value)) {
        $params[] = $key;
      }
      else {
        // For better readability of paths in query strings, we decode slashes.
        $params[] = $key . '=' . str_replace('%2F', '/', rawurlencode($value));
      }
    }

    return implode('&', $params);
  }

  /**
   * Filters a URL query parameter array to remove unwanted elements.
   *
   * @param array $query
   *   An array to be processed.
   * @param array $exclude
   *   (optional) A list of $query array keys to remove. Use "parent[child]" to
   *   exclude nested items.
   * @param string $parent
   *   Internal use only. Used to build the $query array key for nested items.
   *
   * @return array
   *   An array containing query parameters.
   */
  public static function filterQueryParameters(array $query, array $exclude = [], $parent = '') {
    // If $exclude is empty, there is nothing to filter.
    if (empty($exclude)) {
      return $query;
    }

    // A parent key of "0" is falsy but still denotes a nested level, so test
    // for the empty string explicitly.
    $has_parent = ((string) $parent !== '');

    // Only the outermost call receives the plain list of keys; flip it once so
    // that lookups below are simple isset() checks. Recursive calls are handed
    // the already-flipped array.
    if (!$has_parent) {
      $exclude = array_flip($exclude);
    }

    $params = [];
    foreach ($query as $key => $value) {
      $string_key = $has_parent ? $parent . '[' . $key . ']' : $key;
      if (isset($exclude[$string_key])) {
        continue;
      }

      if (is_array($value)) {
        $params[$key] = static::filterQueryParameters($value, $exclude, $string_key);
      }
      else {
        $params[$key] = $value;
      }
    }

    return $params;
  }

  /**
   * Parses a URL string into its path, query, and fragment components.
   *
   * This function splits both internal paths like @code node?b=c#d @endcode and
   * external URLs like @code https://example.com/a?b=c#d @endcode into their
   * component parts. See
   * @link http://tools.ietf.org/html/rfc3986#section-3 RFC 3986 @endlink for an
   * explanation of what the component parts are.
   *
   * Note that, unlike the RFC, when passed an external URL, this function
   * groups the scheme, authority, and path together into the path component.
   *
   * @param string $url
   *   The internal path or external URL string to parse.
   *
   * @return array
   *   An associative array containing:
   *   - path: The path component of $url. If $url is an external URL, this
   *     includes the scheme, authority, and path. NULL for an external URL
   *     that has nothing after the '://' delimiter.
   *   - query: An array of query parameters from $url, if they exist.
   *   - fragment: The fragment component from $url, if it exists.
   *
   * @see \Drupal\Core\Utility\LinkGenerator
   * @see http://tools.ietf.org/html/rfc3986
   *
   * @ingroup php_wrappers
   */
  public static function parse($url) {
    $options = [
      'path' => NULL,
      'query' => [],
      'fragment' => '',
    ];

    // External URLs: not using parse_url() here, so we do not have to rebuild
    // the scheme, host, and path without having any use for it.
    // The URL is considered external if it contains the '://' delimiter. Since
    // a URL can also be passed as a query argument or appear inside a
    // fragment, the delimiter only counts when it appears in front of both the
    // '?' query delimiter and the '#' fragment delimiter.
    $scheme_delimiter_position = strpos($url, '://');
    $query_delimiter_position = strpos($url, '?');
    $fragment_delimiter_position = strpos($url, '#');
    $is_external = $scheme_delimiter_position !== FALSE
      && ($query_delimiter_position === FALSE || $scheme_delimiter_position < $query_delimiter_position)
      && ($fragment_delimiter_position === FALSE || $scheme_delimiter_position < $fragment_delimiter_position);

    if ($is_external) {
      // Split off the fragment, if any.
      if ($fragment_delimiter_position !== FALSE) {
        list($url, $options['fragment']) = explode('#', $url, 2);
      }

      // Split off everything before the query string into 'path'. Limit the
      // split to two parts so that a literal '?' inside the query string is
      // kept, matching what parse_url() does for internal paths.
      $parts = explode('?', $url, 2);

      // Don't support URLs without a path, like 'http://'.
      list(, $path) = explode('://', $parts[0], 2);
      if ($path != '') {
        $options['path'] = $parts[0];
      }
      // If there is a query string, transform it into keyed query parameters.
      if (isset($parts[1])) {
        parse_str($parts[1], $options['query']);
      }
    }
    // Internal URLs.
    else {
      // parse_url() does not support relative URLs, so make it absolute. For
      // instance, the relative URL "foo/bar:1" isn't properly parsed.
      $parts = parse_url('http://example.com/' . $url);
      // Strip the leading slash that was just added.
      $options['path'] = substr($parts['path'], 1);
      if (isset($parts['query'])) {
        parse_str($parts['query'], $options['query']);
      }
      if (isset($parts['fragment'])) {
        $options['fragment'] = $parts['fragment'];
      }
    }

    return $options;
  }

  /**
   * Encodes a Drupal path for use in a URL.
   *
   * For aesthetic reasons slashes are not escaped.
   *
   * @param string $path
   *   The Drupal path to encode.
   *
   * @return string
   *   The encoded path.
   */
  public static function encodePath($path) {
    return str_replace('%2F', '/', rawurlencode($path));
  }

  /**
   * Determines whether a path is external to Drupal.
   *
   * An example of an external path is http://example.com. If a path cannot be
   * assessed by Drupal's menu handler, then we must treat it as potentially
   * insecure.
   *
   * @param string $path
   *   The internal path or external URL being linked to, such as "node/34" or
   *   "http://example.com/foo".
   *
   * @return bool
   *   TRUE or FALSE, where TRUE indicates an external path.
   */
  public static function isExternal($path) {
    $colonpos = strpos($path, ':');
    // Some browsers treat \ as / so normalize to forward slashes.
    $path = str_replace('\\', '/', $path);

    // If the path starts with 2 slashes then it is always considered an
    // external URL without an explicit protocol part.
    if (strpos($path, '//') === 0) {
      return TRUE;
    }

    // Leading control characters may be ignored or mishandled by browsers,
    // so assume such a path may lead to an external location. The \p{C}
    // character class matches all UTF-8 control, unassigned, and private
    // characters. preg_match() returns FALSE (not 0) on failure, which this
    // comparison also treats as external.
    if (preg_match('/^\p{C}/u', $path) !== 0) {
      return TRUE;
    }

    // Avoid calling static::stripDangerousProtocols() if there is any slash
    // (/), hash (#) or question_mark (?) before the colon (:) occurrence -
    // if any - as this would clearly mean it is not a URL.
    return $colonpos !== FALSE
      && !preg_match('![/?#]!', substr($path, 0, $colonpos))
      && static::stripDangerousProtocols($path) === $path;
  }

  /**
   * Determines if an external URL points to this installation.
   *
   * @param string $url
   *   A string containing an external URL, such as "http://example.com/foo".
   * @param string $base_url
   *   The base URL string to check against, such as "http://example.com/"
   *
   * @return bool
   *   TRUE if the URL has the same domain and base path.
   *
   * @throws \InvalidArgumentException
   *   Exception thrown when either $url or $base_url is not fully qualified.
   */
  public static function externalIsLocal($url, $base_url) {
    // Some browsers treat \ as / so normalize to forward slashes.
    $url = str_replace('\\', '/', $url);

    // Leading control characters may be ignored or mishandled by browsers, so
    // assume such a path may lead to a non-local location. The \p{C} character
    // class matches all UTF-8 control, unassigned, and private characters.
    if (preg_match('/^\p{C}/u', $url) !== 0) {
      return FALSE;
    }

    $url_parts = parse_url($url);
    $base_parts = parse_url($base_url);

    if (empty($base_parts['host']) || empty($url_parts['host'])) {
      throw new \InvalidArgumentException('A path was passed when a fully qualified domain was expected.');
    }

    // Compare hosts strictly: with a loose comparison PHP compares
    // numeric-looking strings as numbers, so for example "1e3" would equal
    // "1000".
    $same_host = ($url_parts['host'] === $base_parts['host']);

    if (!isset($url_parts['path']) || !isset($base_parts['path'])) {
      return (!isset($base_parts['path']) || $base_parts['path'] === '/')
        && $same_host;
    }

    // When comparing base paths, we need a trailing slash to make sure a
    // partial URL match isn't occurring. Since base_path() always returns
    // with a trailing slash, we don't need to add the trailing slash here.
    return $same_host && stripos($url_parts['path'], $base_parts['path']) === 0;
  }

  /**
   * Processes an HTML attribute value and strips dangerous protocols from URLs.
   *
   * @param string $string
   *   The string with the attribute value.
   *
   * @return string
   *   Cleaned up and HTML-escaped version of $string.
   */
  public static function filterBadProtocol($string) {
    // Get the plain text representation of the attribute value (i.e. its
    // meaning).
    $string = Html::decodeEntities($string);
    return Html::escape(static::stripDangerousProtocols($string));
  }

  /**
   * Gets the allowed protocols.
   *
   * @return array
   *   An array of protocols, for example http, https and irc.
   */
  public static function getAllowedProtocols() {
    return static::$allowedProtocols;
  }

  /**
   * Sets the allowed protocols.
   *
   * @param array $protocols
   *   An array of protocols, for example http, https and irc.
   */
  public static function setAllowedProtocols(array $protocols = []) {
    static::$allowedProtocols = $protocols;
  }

  /**
   * Strips dangerous protocols (for example, 'javascript:') from a URI.
   *
   * This function must be called for all URIs within user-entered input prior
   * to being output to an HTML attribute value. It is often called as part of
   * \Drupal\Component\Utility\UrlHelper::filterBadProtocol() or
   * \Drupal\Component\Utility\Xss::filter(), but those functions return an
   * HTML-encoded string, so this function can be called independently when the
   * output needs to be a plain-text string for passing to functions that will
   * call Html::escape() separately. The exact behavior depends on the value:
   * - If the value is a well-formed (per RFC 3986) relative URL or
   *   absolute URL that does not use a dangerous protocol (like
   *   "javascript:"), then the URL remains unchanged. This includes all
   *   URLs generated via Url::toString() and UrlGeneratorTrait::url().
   * - If the value is a well-formed absolute URL with a dangerous protocol,
   *   the protocol is stripped. This process is repeated on the remaining URL
   *   until it is stripped down to a safe protocol.
   * - If the value is not a well-formed URL, the same sanitization behavior as
   *   for well-formed URLs will be invoked, which strips most substrings that
   *   precede a ":". The result can be used in URL attributes such as "href"
   *   or "src" (only after calling Html::escape() separately), but this may not
   *   produce valid HTML (for example, malformed URLs within "href" attributes
   *   fail HTML validation). This can be avoided by using
   *   Url::fromUri($possibly_not_a_url)->toString(), which either throws an
   *   exception or returns a well-formed URL.
   *
   * @param string $uri
   *   A plain-text URI that might contain dangerous protocols.
   *
   * @return string
   *   A plain-text URI stripped of dangerous protocols. As with all plain-text
   *   strings, this return value must not be output to an HTML page without
   *   being sanitized first. However, it can be passed to functions
   *   expecting plain-text strings.
   *
   * @see \Drupal\Component\Utility\Html::escape()
   * @see \Drupal\Core\Url::toString()
   * @see \Drupal\Core\Routing\UrlGeneratorTrait::url()
   * @see \Drupal\Core\Url::fromUri()
   */
  public static function stripDangerousProtocols($uri) {
    // Flip the list so that membership is a cheap isset() lookup.
    $allowed_protocols = array_flip(static::$allowedProtocols);

    // Iteratively remove any invalid protocol found.
    do {
      $before = $uri;
      $colonpos = strpos($uri, ':');
      if ($colonpos > 0) {
        // We found a colon, possibly a protocol. Verify.
        $protocol = substr($uri, 0, $colonpos);
        // If a colon is preceded by a slash, question mark or hash, it cannot
        // possibly be part of the URL scheme. This must be a relative URL,
        // which inherits the (safe) protocol of the base document.
        if (preg_match('![/?#]!', $protocol)) {
          break;
        }
        // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
        // (URI Comparison) scheme comparison must be case-insensitive.
        if (!isset($allowed_protocols[strtolower($protocol)])) {
          $uri = substr($uri, $colonpos + 1);
        }
      }
      // Keep going for as long as the previous pass removed something.
    } while ($before !== $uri);

    return $uri;
  }

  /**
   * Verifies the syntax of the given URL.
   *
   * This function should only be used on actual URLs. It should not be used for
   * Drupal menu paths, which can contain arbitrary characters.
   * Valid values per RFC 3986.
   *
   * @param string $url
   *   The URL to verify.
   * @param bool $absolute
   *   Whether the URL is absolute (beginning with a scheme such as "http:").
   *   When TRUE, only the ftp, http, https and feed schemes are accepted.
   *
   * @return bool
   *   TRUE if the URL is in a valid format, FALSE otherwise.
   */
  public static function isValid($url, $absolute = FALSE) {
    if ($absolute) {
      // The path/query part must start with "/" or "?". These are listed in a
      // character class, so no "|" separator is used there: inside a class a
      // pipe would be matched literally.
      return (bool) preg_match("
        /^                                                      # Start at the beginning of the text
        (?:ftp|https?|feed):\/\/                                # Look for ftp, http, https or feed schemes
        (?:                                                     # Userinfo (optional) which is typically
          (?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)*      # a username or a username and password
          (?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@          # combination
        )?
        (?:
          (?:[a-z0-9\-\.]|%[0-9a-f]{2})+                        # A domain name or a IPv4 address
          |(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\])         # or a well formed IPv6 address
        )
        (?::[0-9]+)?                                            # Server port number (optional)
        (?:[\/\?]
          (?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})   # The path and query (optional)
        *)?
      $/xi", $url);
    }

    return (bool) preg_match("/^(?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})+$/i", $url);
  }

}