comparison core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c

Initial import (non-working)
author Chris Cannam
date Wed, 29 Nov 2017 16:09:58 +0000
parents
children 129ea1e6d783
comparison
equal deleted inserted replaced
-1:000000000000 0:4c8ae668cc8c
1 <?php
2
3 namespace Drupal\Component\Transliteration;
4
5 /**
6 * Implements transliteration without using the PECL extensions.
7 *
8 * Transliterations are done character-by-character, by looking up non-US-ASCII
9 * characters in a transliteration database.
10 *
11 * The database comes from two types of files, both of which are searched for in
12 * the PhpTransliteration::$dataDirectory directory. First, language-specific
13 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
14 * there is no language-specific override for a character, the generic
15 * transliteration character tables are searched (see
16 * PhpTransliteration::readGenericData()). If looking up the character in the
17 * generic table results in a NULL value, or an illegal character is
18 * encountered, then a substitute character is returned.
19 *
20 * Some parts of this code were derived from the MediaWiki project's UtfNormal
21 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
22 * http://www.mediawiki.org/
23 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "banks": the Unicode character code shifted right by
   * eight bits. Each value is an array of transliterations indexed by the low
   * eight bits (0-255) of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded
    // characters are loop-invariant, so define them once up front.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';

    // preg_split() returns FALSE when the subject is not valid UTF-8; fall
    // back to an empty list so the loop does not iterate over a boolean.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) ?: [];
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a multi-character sentinel: if there is no mapping, the
        // length check below fails and the original character is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        // Only single-character replacements count as "the same letter
        // without its diacritic"; anything longer keeps the original.
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;

    // Split into Unicode characters and transliterate each one. preg_split()
    // returns FALSE when the subject is not valid UTF-8; fall back to an empty
    // list so the loop does not iterate over a boolean.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) ?: [];
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. A replacement is
      // never truncated: if it does not fit entirely, stop before adding it.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string or a sequence shorter than its lead byte announces is illegal.
   */
  protected static function ordUTF8($character) {
    $length = strlen($character);
    if ($length === 0) {
      // Nothing to decode.
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($length < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($length < 3) {
        return -1;
      }
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($length < 4) {
        return -1;
      }
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character. The
    // override table is loaded lazily, the first time a language is used.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a UTF-8 character code.
   *
   * @param int $code
   *   The UTF-8 character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // See if there is a generic mapping for this character. Tables are stored
    // per bank (the code without its low eight bits) and loaded lazily.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // Index within the bank is the low eight bits of the code.
    $code = $code & 0xff;
    // isset() is also FALSE for a NULL entry, so NULL mappings yield the
    // substitute character.
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not provide an array for $langcode, an
   * empty override table is stored so the lookup is not repeated.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // Always store an array, even when the file did not define an entry for
    // this language code. Storing NULL would make the isset() check in
    // replace() fail every time and re-include the file for each character.
    $table = [];
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $table = $overrides[$langcode];
    }
    $this->languageOverrides[$langcode] = $table;
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low eight bits
   * of the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file is missing or does not define an array $base, an empty table
   * is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by eight bits, or 0 for the ASCII
   *   range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}