<?php

namespace Drupal\Component\Transliteration;

/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the Unicode character code shifted
   * right by 8 bits), and whose values are arrays of transliterations keyed by
   * the low 8 bits of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // preg_split() with the /u modifier returns FALSE when $string is not
    // valid UTF-8. No accented letter can be identified in such input, so
    // return it unchanged rather than iterating over FALSE.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return $string;
    }

    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded
    // characters are the same for every character, so define them once.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // The multi-byte placeholder 'xyz' is returned when the tables have no
        // entry; since only one-byte replacements are accepted below, a
        // missing entry leaves the original character in place.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    $hash = FALSE;

    // Replace question marks with a unique hash if necessary. This is because
    // mb_convert_encoding() replaces all invalid characters with a question
    // mark.
    if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
      $hash = hash('sha256', $string);
      $string = str_replace('?', $hash, $string);
    }

    // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
    // be replaced by a question mark.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

    // Use the provided unknown character instead of a question mark.
    if ($unknown_character != '?') {
      $string = str_replace('?', $unknown_character, $string);
      // Restore original question marks if necessary.
      if ($hash !== FALSE) {
        $string = str_replace($hash, '?', $string);
      }
    }

    // Split into Unicode characters and transliterate each one.
    foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. The length is
      // counted in bytes of the transliterated output.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * Only the leading byte sequence of $character is decoded; any bytes after
   * the first encoded character are ignored.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, an invalid lead byte, and a sequence that is shorter than its
   *   lead byte announces are all treated as illegal.
   */
  protected static function ordUTF8($character) {
    $available = strlen($character);
    if ($available === 0) {
      // Nothing to decode; avoid reading an undefined string offset.
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) === 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }

    if (($first_byte & 0xe0) === 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      $size = 2;
      $code = $first_byte & 0x1f;
    }
    elseif (($first_byte & 0xf0) === 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      $size = 3;
      $code = $first_byte & 0x0f;
    }
    elseif (($first_byte & 0xf8) === 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      $size = 4;
      $code = $first_byte & 0x07;
    }
    else {
      // Other forms are not legal.
      return -1;
    }

    if ($available < $size) {
      // Truncated sequence: the continuation bytes are missing.
      return -1;
    }

    // Append the low six bits of each continuation byte.
    for ($i = 1; $i < $size; $i++) {
      $code = ($code << 6) + (ord($character[$i]) & 0x3f);
    }

    return $code;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a UTF-8 character code.
   *
   * @param int $code
   *   The UTF-8 character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The tables are split into banks of 256 characters: the bank number is
    // the character code without its low 8 bits.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // Index within the bank: the low 8 bits of the character code.
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not provide an array under the $langcode
   * key, an empty override table is stored for the language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // Always store an array. The file may define $overrides without an entry
    // for this exact $langcode (for example when sanitizing changed the file
    // name); storing NULL in that case would defeat the isset() check in
    // replace() and cause the file to be re-read for every character.
    if (isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = [];
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low 8 bits of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file is missing or does not set up an array, an empty table is
   * stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}