annotate core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 129ea1e6d783
children
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Component\Transliteration;
Chris@0 4
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the Unicode character code shifted
   * right by eight bits), and whose values are arrays keyed by the lowest byte
   * of the character code, giving the transliteration of each character in
   * that bank. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // preg_split() with the /u modifier returns FALSE when the subject is not
    // valid UTF-8. Characters cannot be identified in that case, so hand the
    // input back untouched instead of iterating over FALSE.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return (string) $string;
    }

    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded
    // characters are constant, so define them once outside the loop.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // The three-character placeholder guarantees that a character without
        // a table entry fails the single-character test below and is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    $hash = FALSE;

    // Replace question marks with a unique hash if necessary. This because
    // mb_convert_encoding() replaces all invalid characters with a question
    // mark.
    if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
      $hash = hash('sha256', $string);
      $string = str_replace('?', $hash, $string);
    }

    // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
    // be replaced by a question mark.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

    // Use the provided unknown character instead of a question mark.
    if ($unknown_character != '?') {
      $string = str_replace('?', $unknown_character, $string);
      // Restore original question marks if necessary.
      if ($hash !== FALSE) {
        $string = str_replace($hash, '?', $string);
      }
    }

    // Split into Unicode characters and transliterate each one.
    foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
      $code = self::ordUTF8($character);
      if ($code == -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. The length is
      // counted in bytes of the output, and a replacement is never split: if
      // it does not fit completely, the result built so far is returned.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, an invalid lead byte, and a sequence with fewer bytes than its
   *   lead byte announces are all reported as illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      // Nothing to decode; avoid reading an uninitialized string offset.
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($available < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($available < 3) {
        return -1;
      }
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($available < 4) {
        return -1;
      }
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character. The
    // override table for a language is loaded the first time it is needed.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters: the bank
    // number is everything above the lowest byte of the character code.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // Within a bank, entries are keyed by the lowest byte of the code.
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file does not exist, or does not provide an array of overrides
   * under the $langcode key, an empty override table is stored for $langcode.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // The file name is built from the sanitized code while the table is keyed
    // by the code as given, so the expected key can be missing even when a
    // file was loaded. Always store an array, so that the isset() check in
    // replace() succeeds afterwards and the file is not included again for
    // every character.
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = [];
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest byte of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file does not exist or does not set up an array called $base, an
   * empty table is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by eight bits, or 0 for the bank that
   *   contains the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}