Mercurial > hg > isophonics-drupal-site
comparison core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c
Initial import (non-working)
| author | Chris Cannam |
|---|---|
| date | Wed, 29 Nov 2017 16:09:58 +0000 |
| parents | |
| children | 129ea1e6d783 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4c8ae668cc8c |
|---|---|
| 1 <?php | |
| 2 | |
| 3 namespace Drupal\Component\Transliteration; | |
| 4 | |
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Input that is not well-formed UTF-8 is still processed: every byte that is
 * not part of a well-formed sequence is treated as one illegal character.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers, that is, the Unicode character code
   * shifted right by 8 bits, and whose values are arrays of transliterations
   * indexed by the lowest byte (0-255) of the character code. This is set up
   * as needed in PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // These two Unicode ranges (0x00c0-0x017e and 0x01cd-0x024f) include the
    // accented US-ASCII letters, with a few characters that aren't accented
    // letters mixed in. The excluded characters are listed here, once, rather
    // than being rebuilt for every character of the input.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';

    foreach (self::splitCharacters($string) as $character) {
      // Illegal bytes yield -1, which is outside both ranges, so they are
      // copied to the result unchanged.
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a sentinel: it is longer than one byte, so a character
        // without a table entry fails the length test below and is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        // Only a one-letter transliteration is a plain "accent removed" form.
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    // Split into Unicode characters and transliterate each one.
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        // Illegal byte or truncated sequence.
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space. A multi-character replacement is never
          // cut in half: it is either appended whole or not at all.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Splits a string into its UTF-8 characters.
   *
   * preg_split() with the "u" modifier returns FALSE for a subject that is not
   * well-formed UTF-8. Iterating over that FALSE triggers a warning and loses
   * the entire input, so in that case the string is split with a byte-oriented
   * pattern instead: well-formed UTF-8 sequences stay together and every other
   * byte becomes an element of its own.
   *
   * @param string $string
   *   The string to split.
   *
   * @return string[]
   *   The characters of $string, in order. Empty if $string is empty.
   */
  private static function splitCharacters($string) {
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if (is_array($characters)) {
      return $characters;
    }

    // Well-formed UTF-8 byte sequences (Unicode Standard, Table 3-7), tried in
    // order, followed by a catch-all for any stray non-ASCII byte. The pattern
    // has no "u" modifier, so it matches bytes and cannot fail on bad UTF-8.
    $pattern = '/[\x00-\x7F]'
      . '|[\xC2-\xDF][\x80-\xBF]'
      . '|\xE0[\xA0-\xBF][\x80-\xBF]'
      . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
      . '|\xED[\x80-\x9F][\x80-\xBF]'
      . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
      . '|[\xF1-\xF3][\x80-\xBF]{3}'
      . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
      . '|[\x80-\xFF]/';
    if (preg_match_all($pattern, $string, $matches)) {
      return $matches[0];
    }

    return [];
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * The length of the sequence is taken from the first byte. If the string is
   * longer than that sequence, the extra bytes are ignored.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, a first byte that cannot start a sequence, and a sequence with
   *   fewer bytes than its first byte announces are all illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($available < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($available < 3) {
        return -1;
      }
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($available < 4) {
        return -1;
      }
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character. The
    // overrides for a language are loaded the first time it is used.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // See if there is a generic mapping for this character. Tables are split
    // into banks of 256 characters; a bank is loaded the first time it is
    // needed.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // Index within the bank: the lowest byte of the character code.
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file does not exist, or does not provide an array for $langcode, an
   * empty table is stored for $langcode.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // The sanitized file name can differ from $langcode, in which case the
    // file defines a different key. Always store an array, so that the isset()
    // check in replace() sees this language as loaded and the file is not
    // included again for every character.
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = [];
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest byte of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file does not exist, or does not set up an array, an empty table is
   * stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}
