comparison core/lib/Drupal/Component/Utility/Unicode.php @ 17:129ea1e6d783

Update, including to Drupal core 8.6.10
author Chris Cannam
date Thu, 28 Feb 2019 13:21:36 +0000
parents 1fec387a4317
children
comparison
equal deleted inserted replaced
16:c2387f117808 17:129ea1e6d783
86 * Indicates an error during check for PHP unicode support. 86 * Indicates an error during check for PHP unicode support.
87 */ 87 */
88 const STATUS_ERROR = -1; 88 const STATUS_ERROR = -1;
89 89
90 /** 90 /**
91 * Holds the multibyte capabilities of the current environment.
92 *
93 * @var int
94 */
95 protected static $status = 0;
96
97 /**
98 * Gets the current status of unicode/multibyte support on this environment. 91 * Gets the current status of unicode/multibyte support on this environment.
99 * 92 *
100 * @return int 93 * @return int
101 * The status of multibyte support. It can be one of: 94 * The status of multibyte support. It can be one of:
102 * - \Drupal\Component\Utility\Unicode::STATUS_MULTIBYTE 95 * - \Drupal\Component\Utility\Unicode::STATUS_MULTIBYTE
105 * Standard PHP (emulated) unicode support. 98 * Standard PHP (emulated) unicode support.
106 * - \Drupal\Component\Utility\Unicode::STATUS_ERROR 99 * - \Drupal\Component\Utility\Unicode::STATUS_ERROR
107 * An error occurred. No unicode support. 100 * An error occurred. No unicode support.
108 */ 101 */
109 public static function getStatus() { 102 public static function getStatus() {
110 return static::$status; 103 switch (static::check()) {
104 case 'mb_strlen':
105 return Unicode::STATUS_SINGLEBYTE;
106 case '':
107 return Unicode::STATUS_MULTIBYTE;
108 }
109 return Unicode::STATUS_ERROR;
111 } 110 }
112 111
113 /** 112 /**
114 * Sets the value for multibyte support status for the current environment. 113 * Sets the value for multibyte support status for the current environment.
115 * 114 *
121 * - \Drupal\Component\Utility\Unicode::STATUS_ERROR 120 * - \Drupal\Component\Utility\Unicode::STATUS_ERROR
122 * An error occurred. No unicode support. 121 * An error occurred. No unicode support.
123 * 122 *
124 * @param int $status 123 * @param int $status
125 * The new status of multibyte support. 124 * The new status of multibyte support.
125 *
126 * @deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. In
127 * Drupal 9 there will be no way to set the status and in Drupal 8 this
128 * ability has been removed because mb_*() functions are supplied using
129 * Symfony's polyfill.
130 *
131 * @see https://www.drupal.org/node/2850048
126 */ 132 */
127 public static function setStatus($status) { 133 public static function setStatus($status) {
128 if (!in_array($status, [static::STATUS_SINGLEBYTE, static::STATUS_MULTIBYTE, static::STATUS_ERROR])) { 134 @trigger_error('\Drupal\Component\Utility\Unicode::setStatus() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. In Drupal 9 there will be no way to set the status and in Drupal 8 this ability has been removed because mb_*() functions are supplied using Symfony\'s polyfill. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
129 throw new \InvalidArgumentException('Invalid status value for unicode support.');
130 }
131 static::$status = $status;
132 } 135 }
133 136
134 /** 137 /**
135 * Checks for Unicode support in PHP and sets the proper settings if possible. 138 * Checks for Unicode support in PHP and sets the proper settings if possible.
136 * 139 *
141 * @return string 144 * @return string
142 * A string identifier of a failed multibyte extension check, if any. 145 * A string identifier of a failed multibyte extension check, if any.
143 * Otherwise, an empty string. 146 * Otherwise, an empty string.
144 */ 147 */
145 public static function check() { 148 public static function check() {
149 // Set appropriate configuration.
150 mb_internal_encoding('utf-8');
151 mb_language('uni');
152
146 // Check for mbstring extension. 153 // Check for mbstring extension.
147 if (!function_exists('mb_strlen')) { 154 if (!extension_loaded('mbstring')) {
148 static::$status = static::STATUS_SINGLEBYTE;
149 return 'mb_strlen'; 155 return 'mb_strlen';
150 } 156 }
151 157
152 // Check mbstring configuration. 158 // Check mbstring configuration.
153 if (ini_get('mbstring.func_overload') != 0) { 159 if (ini_get('mbstring.func_overload') != 0) {
154 static::$status = static::STATUS_ERROR;
155 return 'mbstring.func_overload'; 160 return 'mbstring.func_overload';
156 } 161 }
157 if (ini_get('mbstring.encoding_translation') != 0) { 162 if (ini_get('mbstring.encoding_translation') != 0) {
158 static::$status = static::STATUS_ERROR;
159 return 'mbstring.encoding_translation'; 163 return 'mbstring.encoding_translation';
160 } 164 }
161 // mbstring.http_input and mbstring.http_output are deprecated and empty by 165 // mbstring.http_input and mbstring.http_output are deprecated and empty by
162 // default in PHP 5.6. 166 // default in PHP 5.6.
163 if (version_compare(PHP_VERSION, '5.6.0') == -1) { 167 if (version_compare(PHP_VERSION, '5.6.0') == -1) {
164 if (ini_get('mbstring.http_input') != 'pass') { 168 if (ini_get('mbstring.http_input') != 'pass') {
165 static::$status = static::STATUS_ERROR;
166 return 'mbstring.http_input'; 169 return 'mbstring.http_input';
167 } 170 }
168 if (ini_get('mbstring.http_output') != 'pass') { 171 if (ini_get('mbstring.http_output') != 'pass') {
169 static::$status = static::STATUS_ERROR;
170 return 'mbstring.http_output'; 172 return 'mbstring.http_output';
171 } 173 }
172 } 174 }
173 175
174 // Set appropriate configuration.
175 mb_internal_encoding('utf-8');
176 mb_language('uni');
177 static::$status = static::STATUS_MULTIBYTE;
178 return ''; 176 return '';
179 } 177 }
180 178
181 /** 179 /**
182 * Decodes UTF byte-order mark (BOM) into the encoding's name. 180 * Decodes UTF byte-order mark (BOM) into the encoding's name.
222 * 220 *
223 * @return string|bool 221 * @return string|bool
224 * Converted data or FALSE. 222 * Converted data or FALSE.
225 */ 223 */
226 public static function convertToUtf8($data, $encoding) { 224 public static function convertToUtf8($data, $encoding) {
227 if (function_exists('iconv')) { 225 return @iconv($encoding, 'utf-8', $data);
228 return @iconv($encoding, 'utf-8', $data);
229 }
230 elseif (function_exists('mb_convert_encoding')) {
231 return @mb_convert_encoding($data, 'utf-8', $encoding);
232 }
233 elseif (function_exists('recode_string')) {
234 return @recode_string($encoding . '..utf-8', $data);
235 }
236 // Cannot convert.
237 return FALSE;
238 } 226 }
239 227
240 /** 228 /**
241 * Truncates a UTF-8-encoded string safely to a number of bytes. 229 * Truncates a UTF-8-encoded string safely to a number of bytes.
242 * 230 *
279 * @param string $text 267 * @param string $text
280 * The string to run the operation on. 268 * The string to run the operation on.
281 * 269 *
282 * @return int 270 * @return int
283 * The length of the string. 271 * The length of the string.
272 *
273 * @deprecated in Drupal 8.6.0, will be removed before Drupal 9.0.0. Use
274 * mb_strlen() instead.
275 *
276 * @see https://www.drupal.org/node/2850048
284 */ 277 */
285 public static function strlen($text) { 278 public static function strlen($text) {
286 if (static::getStatus() == static::STATUS_MULTIBYTE) { 279 @trigger_error('\Drupal\Component\Utility\Unicode::strlen() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. Use mb_strlen() instead. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
287 return mb_strlen($text); 280 return mb_strlen($text);
288 }
289 else {
290 // Do not count UTF-8 continuation bytes.
291 return strlen(preg_replace("/[\x80-\xBF]/", '', $text));
292 }
293 } 281 }
294 282
295 /** 283 /**
296 * Converts a UTF-8 string to uppercase. 284 * Converts a UTF-8 string to uppercase.
297 * 285 *
298 * @param string $text 286 * @param string $text
299 * The string to run the operation on. 287 * The string to run the operation on.
300 * 288 *
301 * @return string 289 * @return string
302 * The string in uppercase. 290 * The string in uppercase.
291 *
292 * @deprecated in Drupal 8.6.0, will be removed before Drupal 9.0.0. Use
293 * mb_strtoupper() instead.
294 *
295 * @see https://www.drupal.org/node/2850048
303 */ 296 */
304 public static function strtoupper($text) { 297 public static function strtoupper($text) {
305 if (static::getStatus() == static::STATUS_MULTIBYTE) { 298 @trigger_error('\Drupal\Component\Utility\Unicode::strtoupper() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. Use mb_strtoupper() instead. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
306 return mb_strtoupper($text); 299 return mb_strtoupper($text);
307 }
308 else {
309 // Use C-locale for ASCII-only uppercase.
310 $text = strtoupper($text);
311 // Case flip Latin-1 accented letters.
312 $text = preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '\Drupal\Component\Utility\Unicode::caseFlip', $text);
313 return $text;
314 }
315 } 300 }
316 301
317 /** 302 /**
318 * Converts a UTF-8 string to lowercase. 303 * Converts a UTF-8 string to lowercase.
319 * 304 *
320 * @param string $text 305 * @param string $text
321 * The string to run the operation on. 306 * The string to run the operation on.
322 * 307 *
323 * @return string 308 * @return string
324 * The string in lowercase. 309 * The string in lowercase.
310 *
311 * @deprecated in Drupal 8.6.0, will be removed before Drupal 9.0.0. Use
312 * mb_strtolower() instead.
313 *
314 * @see https://www.drupal.org/node/2850048
325 */ 315 */
326 public static function strtolower($text) { 316 public static function strtolower($text) {
327 if (static::getStatus() == static::STATUS_MULTIBYTE) { 317 @trigger_error('\Drupal\Component\Utility\Unicode::strtolower() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. Use mb_strtolower() instead. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
328 return mb_strtolower($text); 318 return mb_strtolower($text);
329 }
330 else {
331 // Use C-locale for ASCII-only lowercase.
332 $text = strtolower($text);
333 // Case flip Latin-1 accented letters.
334 $text = preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '\Drupal\Component\Utility\Unicode::caseFlip', $text);
335 return $text;
336 }
337 } 319 }
338 320
339 /** 321 /**
340 * Capitalizes the first character of a UTF-8 string. 322 * Capitalizes the first character of a UTF-8 string.
341 * 323 *
344 * 326 *
345 * @return string 327 * @return string
346 * The string with the first character as uppercase. 328 * The string with the first character as uppercase.
347 */ 329 */
348 public static function ucfirst($text) { 330 public static function ucfirst($text) {
349 return static::strtoupper(static::substr($text, 0, 1)) . static::substr($text, 1); 331 return mb_strtoupper(mb_substr($text, 0, 1)) . mb_substr($text, 1);
350 } 332 }
351 333
352 /** 334 /**
353 * Converts the first character of a UTF-8 string to lowercase. 335 * Converts the first character of a UTF-8 string to lowercase.
354 * 336 *
360 * 342 *
361 * @ingroup php_wrappers 343 * @ingroup php_wrappers
362 */ 344 */
363 public static function lcfirst($text) { 345 public static function lcfirst($text) {
364 // Note: no mbstring equivalent! 346 // Note: no mbstring equivalent!
365 return static::strtolower(static::substr($text, 0, 1)) . static::substr($text, 1); 347 return mb_strtolower(mb_substr($text, 0, 1)) . mb_substr($text, 1);
366 } 348 }
367 349
368 /** 350 /**
369 * Capitalizes the first character of each word in a UTF-8 string. 351 * Capitalizes the first character of each word in a UTF-8 string.
370 * 352 *
377 * @ingroup php_wrappers 359 * @ingroup php_wrappers
378 */ 360 */
379 public static function ucwords($text) { 361 public static function ucwords($text) {
380 $regex = '/(^|[' . static::PREG_CLASS_WORD_BOUNDARY . '])([^' . static::PREG_CLASS_WORD_BOUNDARY . '])/u'; 362 $regex = '/(^|[' . static::PREG_CLASS_WORD_BOUNDARY . '])([^' . static::PREG_CLASS_WORD_BOUNDARY . '])/u';
381 return preg_replace_callback($regex, function (array $matches) { 363 return preg_replace_callback($regex, function (array $matches) {
382 return $matches[1] . Unicode::strtoupper($matches[2]); 364 return $matches[1] . mb_strtoupper($matches[2]);
383 }, $text); 365 }, $text);
384 } 366 }
385 367
386 /** 368 /**
387 * Cuts off a piece of a string based on character indices and counts. 369 * Cuts off a piece of a string based on character indices and counts.
397 * @param int $length 379 * @param int $length
398 * The number of characters to read. 380 * The number of characters to read.
399 * 381 *
400 * @return string 382 * @return string
401 * The shortened string. 383 * The shortened string.
384 *
385 * @deprecated in Drupal 8.6.0, will be removed before Drupal 9.0.0. Use
386 * mb_substr() instead.
387 *
388 * @see https://www.drupal.org/node/2850048
402 */ 389 */
403 public static function substr($text, $start, $length = NULL) { 390 public static function substr($text, $start, $length = NULL) {
404 if (static::getStatus() == static::STATUS_MULTIBYTE) { 391 @trigger_error('\Drupal\Component\Utility\Unicode::substr() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. Use mb_substr() instead. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
405 return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length); 392 return mb_substr($text, $start, $length);
406 }
407 else {
408 $strlen = strlen($text);
409 // Find the starting byte offset.
410 $bytes = 0;
411 if ($start > 0) {
412 // Count all the characters except continuation bytes from the start
413 // until we have found $start characters or the end of the string.
414 $bytes = -1; $chars = -1;
415 while ($bytes < $strlen - 1 && $chars < $start) {
416 $bytes++;
417 $c = ord($text[$bytes]);
418 if ($c < 0x80 || $c >= 0xC0) {
419 $chars++;
420 }
421 }
422 }
423 elseif ($start < 0) {
424 // Count all the characters except continuation bytes from the end
425 // until we have found abs($start) characters.
426 $start = abs($start);
427 $bytes = $strlen; $chars = 0;
428 while ($bytes > 0 && $chars < $start) {
429 $bytes--;
430 $c = ord($text[$bytes]);
431 if ($c < 0x80 || $c >= 0xC0) {
432 $chars++;
433 }
434 }
435 }
436 $istart = $bytes;
437
438 // Find the ending byte offset.
439 if ($length === NULL) {
440 $iend = $strlen;
441 }
442 elseif ($length > 0) {
443 // Count all the characters except continuation bytes from the starting
444 // index until we have found $length characters or reached the end of
445 // the string, then backtrace one byte.
446 $iend = $istart - 1;
447 $chars = -1;
448 $last_real = FALSE;
449 while ($iend < $strlen - 1 && $chars < $length) {
450 $iend++;
451 $c = ord($text[$iend]);
452 $last_real = FALSE;
453 if ($c < 0x80 || $c >= 0xC0) {
454 $chars++;
455 $last_real = TRUE;
456 }
457 }
458 // Backtrace one byte if the last character we found was a real
459 // character and we don't need it.
460 if ($last_real && $chars >= $length) {
461 $iend--;
462 }
463 }
464 elseif ($length < 0) {
465 // Count all the characters except continuation bytes from the end
466 // until we have found abs($start) characters, then backtrace one byte.
467 $length = abs($length);
468 $iend = $strlen; $chars = 0;
469 while ($iend > 0 && $chars < $length) {
470 $iend--;
471 $c = ord($text[$iend]);
472 if ($c < 0x80 || $c >= 0xC0) {
473 $chars++;
474 }
475 }
476 // Backtrace one byte if we are not at the beginning of the string.
477 if ($iend > 0) {
478 $iend--;
479 }
480 }
481 else {
482 // $length == 0, return an empty string.
483 return '';
484 }
485
486 return substr($text, $istart, max(0, $iend - $istart + 1));
487 }
488 } 393 }
489 394
490 /** 395 /**
491 * Truncates a UTF-8-encoded string safely to a number of characters. 396 * Truncates a UTF-8-encoded string safely to a number of characters.
492 * 397 *
524 public static function truncate($string, $max_length, $wordsafe = FALSE, $add_ellipsis = FALSE, $min_wordsafe_length = 1) { 429 public static function truncate($string, $max_length, $wordsafe = FALSE, $add_ellipsis = FALSE, $min_wordsafe_length = 1) {
525 $ellipsis = ''; 430 $ellipsis = '';
526 $max_length = max($max_length, 0); 431 $max_length = max($max_length, 0);
527 $min_wordsafe_length = max($min_wordsafe_length, 0); 432 $min_wordsafe_length = max($min_wordsafe_length, 0);
528 433
529 if (static::strlen($string) <= $max_length) { 434 if (mb_strlen($string) <= $max_length) {
530 // No truncation needed, so don't add ellipsis, just return. 435 // No truncation needed, so don't add ellipsis, just return.
531 return $string; 436 return $string;
532 } 437 }
533 438
534 if ($add_ellipsis) { 439 if ($add_ellipsis) {
535 // Truncate ellipsis in case $max_length is small. 440 // Truncate ellipsis in case $max_length is small.
536 $ellipsis = static::substr('…', 0, $max_length); 441 $ellipsis = mb_substr('…', 0, $max_length);
537 $max_length -= static::strlen($ellipsis); 442 $max_length -= mb_strlen($ellipsis);
538 $max_length = max($max_length, 0); 443 $max_length = max($max_length, 0);
539 } 444 }
540 445
541 if ($max_length <= $min_wordsafe_length) { 446 if ($max_length <= $min_wordsafe_length) {
542 // Do not attempt word-safe if lengths are bad. 447 // Do not attempt word-safe if lengths are bad.
551 $found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']/us', $string, $matches); 456 $found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']/us', $string, $matches);
552 if ($found) { 457 if ($found) {
553 $string = $matches[1]; 458 $string = $matches[1];
554 } 459 }
555 else { 460 else {
556 $string = static::substr($string, 0, $max_length); 461 $string = mb_substr($string, 0, $max_length);
557 } 462 }
558 } 463 }
559 else { 464 else {
560 $string = static::substr($string, 0, $max_length); 465 $string = mb_substr($string, 0, $max_length);
561 } 466 }
562 467
563 if ($add_ellipsis) { 468 if ($add_ellipsis) {
564 // If we're adding an ellipsis, remove any trailing periods. 469 // If we're adding an ellipsis, remove any trailing periods.
565 $string = rtrim($string, '.'); 470 $string = rtrim($string, '.');
581 * @return int 486 * @return int
582 * Returns < 0 if $str1 is less than $str2; > 0 if $str1 is greater than 487 * Returns < 0 if $str1 is less than $str2; > 0 if $str1 is greater than
583 * $str2, and 0 if they are equal. 488 * $str2, and 0 if they are equal.
584 */ 489 */
585 public static function strcasecmp($str1, $str2) { 490 public static function strcasecmp($str1, $str2) {
586 return strcmp(static::strtoupper($str1), static::strtoupper($str2)); 491 return strcmp(mb_strtoupper($str1), mb_strtoupper($str2));
587 } 492 }
588 493
589 /** 494 /**
590 * Encodes MIME/HTTP headers that contain incorrectly encoded characters. 495 * Encodes MIME/HTTP headers that contain incorrectly encoded characters.
591 * 496 *
713 * 618 *
714 * @return int|false 619 * @return int|false
715 * The position where $needle occurs in $haystack, always relative to the 620 * The position where $needle occurs in $haystack, always relative to the
716 * beginning (independent of $offset), or FALSE if not found. Note that 621 * beginning (independent of $offset), or FALSE if not found. Note that
717 * a return value of 0 is not the same as FALSE. 622 * a return value of 0 is not the same as FALSE.
623 *
624 * @deprecated in Drupal 8.6.0, will be removed before Drupal 9.0.0. Use
625 * mb_strpos() instead.
626 *
627 * @see https://www.drupal.org/node/2850048
718 */ 628 */
719 public static function strpos($haystack, $needle, $offset = 0) { 629 public static function strpos($haystack, $needle, $offset = 0) {
720 if (static::getStatus() == static::STATUS_MULTIBYTE) { 630 @trigger_error('\Drupal\Component\Utility\Unicode::strpos() is deprecated in Drupal 8.6.0 and will be removed before Drupal 9.0.0. Use mb_strpos() instead. See https://www.drupal.org/node/2850048.', E_USER_DEPRECATED);
721 return mb_strpos($haystack, $needle, $offset); 631 return mb_strpos($haystack, $needle, $offset);
722 }
723 else {
724 // Remove Unicode continuation characters, to be compatible with
725 // Unicode::strlen() and Unicode::substr().
726 $haystack = preg_replace("/[\x80-\xBF]/", '', $haystack);
727 $needle = preg_replace("/[\x80-\xBF]/", '', $needle);
728 return strpos($haystack, $needle, $offset);
729 }
730 } 632 }
731 633
732 } 634 }