Chris@0
|
1 <?php
|
Chris@0
|
2
|
Chris@0
|
3 /**
|
Chris@0
|
4 * @file
|
Chris@0
|
5 * Unifies formats of transliteration data from various sources.
|
Chris@0
|
6 *
|
Chris@0
|
7 * A few notes about this script:
|
Chris@0
|
8 * - The functions in this file are NOT SECURE, because they use PHP functions
|
Chris@0
|
9 * like eval(). Absolutely do not run this script unless you trust the data
|
Chris@0
|
10 * files used for input.
|
Chris@0
|
11 * - You will need to change the name of this file to remove the .txt extension
|
Chris@0
|
12 * before running it (it has been given this name so that you cannot run it
|
Chris@0
|
13 * by mistake). When you do that, move it out of your web root as well so
|
Chris@0
|
14 * that it cannot be run via a URL, and run the script via the PHP command
|
Chris@0
|
15 * at a command prompt.
|
Chris@0
|
16 * - This script, depending on which portions of it you run, depends on having
|
Chris@0
|
17 * input data from various sources in sub-directories below where this file
|
Chris@0
|
18 * is located. The data inputs are as follows:
|
Chris@0
|
19 * - Existing Drupal Core transliteration data: Sub-directory 'data'; comes
|
Chris@0
|
20 * from core/lib/Drupal/Component/Transliteration/data
|
Chris@0
|
21 * - Midgardmvc data: Sub-directory 'utf8_to_ascii_db'; download from
|
Chris@0
|
22 * https://github.com/bergie/midgardmvc_helper_urlize/downloads
|
Chris@0
|
23 * - CPAN Text-Unidecode data: Sub-directory 'Unidecode'; download from
|
Chris@0
|
24 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
|
Chris@0
|
25 * - Node.js project: Sub-directory 'unidecoder_data'; download from
|
Chris@0
|
26 * https://github.com/bitwalker/stringex/downloads
|
Chris@0
|
27 * - JUnidecode project: Sub-directory 'junidecode'; download source from
|
Chris@0
|
28 * http://www.ippatsuman.com/projects/junidecode/index.html
|
Chris@0
|
29 * - You will also need to make directory 'outdata' to hold output.
|
Chris@0
|
30 * - If you plan to use the 'intl' data, you will also need to have the PECL
|
Chris@0
|
31 * packages 'yaml' and 'intl' installed. See
|
Chris@0
|
32 * http://php.net/manual/install.pecl.downloads.php for generic PECL
|
Chris@0
|
33 * package installation instructions. The following commands on Ubuntu Linux
|
Chris@0
|
34 * will install yaml and intl packages:
|
Chris@0
|
35 * @code
|
Chris@0
|
36 * sudo apt-get install libyaml-dev
|
Chris@0
|
37 * sudo pecl install yaml
|
Chris@0
|
38 * sudo apt-get install php5-intl
|
Chris@0
|
39 * sudo apt-get install libicu-dev
|
Chris@0
|
40 * sudo pecl install intl
|
Chris@0
|
41 * @endcode
|
Chris@0
|
42 * After running these commands, you will need to make sure
|
Chris@0
|
43 * 'extension=intl.so' and 'extension=yaml.so' are added to the php.ini file
|
Chris@0
|
44 * that is in use for the PHP command-line command.
|
Chris@0
|
45 * - When you have collected all of the data and installed the required
|
Chris@0
|
46 * packages, you will need to find the specific commands below that you want
|
Chris@0
|
47 * to use and un-comment them. The preferred data source for Drupal Core is
|
Chris@0
|
48 * the PECL 'intl' package, and the line that needs to be un-commented in
|
Chris@0
|
49 * order to make a Drupal Core patch is:
|
Chris@0
|
50 * @code
|
Chris@0
|
51 * patch_drupal('outdata');
|
Chris@0
|
52 * @endcode
|
Chris@0
|
53 * - The functions are documented in more detail in their headers where they
|
Chris@0
|
54 * are defined. Many have parameters that you can use to change the output.
|
Chris@0
|
55 */
|
Chris@0
|
56
|
Chris@0
|
// Commands to read various data sources. Un-comment one of these to load a
// single data set into $data:
// $data = read_drupal_data();
// $data = read_midgard_data();
// $data = read_cpan_data();
// $data = read_nodejs_data();
// $data = read_intl_data();
// $data = read_junidecode_data();

// After running a read_*_data() function, you can print out the data
// (it will make a LOT of output):
// print_r($data);

// Command to read in all of the data sources and output in CSV format,
// explaining the differences:
// read_all_to_csv();

// Command to patch Drupal Core data, using the intl data set, and put the
// resulting changed data files in the 'outdata' directory. Note that this
// call is active (not commented out), so running this script performs the
// patch step.
patch_drupal('outdata');
|
Chris@0
|
76
|
Chris@0
|
/**
 * Reads in all transliteration data and outputs differences in CSV format.
 *
 * Each data set is compared to the Drupal Core reference data set, and the
 * differences are noted. The data must be in the locations noted in the
 * file header above. The CSV output has several columns. The first one is the
 * Unicode character code. The next columns contain the transliteration of
 * that character in each of the data sets. The last column tells what the
 * differences are between the Drupal Core reference set and the other data
 * sets (when several apply, they are joined with ':'):
 * - missing: The target set is missing data that the Drupal set has.
 * - provided: The target set has provided data that Drupal does not have.
 * - case: The target and Drupal set output differ only in upper/lower case.
 * - different: The target and Drupal set output differ in more than just case.
 *
 * @param bool $print_all
 *   TRUE to print all data; FALSE (default) to print just data where there
 *   are differences between the Drupal set and other data sources.
 * @param bool $print_missing
 *   TRUE to also print rows where the only difference is that one of the
 *   non-Drupal sets is missing information; FALSE (default) to omit these
 *   rows.
 */
function read_all_to_csv($print_all = FALSE, $print_missing = FALSE) {
  $data = array();
  $types = array('drupal', 'midgard', 'cpan', 'nodejs', 'junidecode', 'intl');

  // Alternatively, if you just want to compare a couple of data sets, you can
  // uncomment and edit the following line:
  // $types = array('drupal', 'intl');

  // Read in all the data.
  foreach ($types as $type) {
    $data[$type] = call_user_func('read_' . $type . '_data');
  }

  // Print CSV header row.
  print "character,";
  print implode(',', $types);
  print ",why\n";

  // Go through all the banks of character data.
  for ($bank = 0; $bank < 256; $bank++) {

    // Go through characters in bank; skip pure ASCII characters.
    $start = ($bank == 0) ? 0x80 : 0;
    for ($chr = $start; $chr < 256; $chr++) {

      // Gather the data together for this character. Anything that is not a
      // string (missing entry, NULL, ...) is normalized to '' so that every
      // cell in $row is a string.
      $row = array();
      foreach ($types as $type) {
        $row[$type] = (isset($data[$type][$bank][$chr]) && is_string($data[$type][$bank][$chr])) ? $data[$type][$bank][$chr] : '';
      }

      // Only print if there are differences or we are printing all data.
      $print = $print_all;
      $ref = $row['drupal'];
      $why = array();
      foreach ($types as $type) {
        // Compare strictly: with loose comparison PHP treats numeric-looking
        // strings such as "1" and "01" as equal, which would hide real
        // differences between the data sets.
        if ($row[$type] === $ref) {
          continue;
        }

        // Try to characterize what the differences are.
        if ($row[$type] === '') {
          $why['missing'] = 'missing';
          if ($print_missing) {
            $print = TRUE;
          }
        }
        elseif ($ref === '') {
          $why['provided'] = 'provided';
          $print = TRUE;
        }
        elseif (strcasecmp($row[$type], $ref) === 0) {
          // Case-insensitive match, so the two differ only in letter case
          // (this also covers mixed-case pairs such as "Ae" vs "aE").
          $why['case'] = 'case';
          $print = TRUE;
        }
        else {
          $why['different'] = 'different';
          $print = TRUE;
        }
      }

      // Print the data line.
      if ($print) {
        print '0x' . sprintf('%04x', 256 * $bank + $chr) . ',';
        foreach ($row as $out) {
          print '"' . addcslashes($out, '"') . '", ';
        }
        print implode(':', $why);
        print "\n";
      }
    }
  }
}
|
Chris@0
|
170
|
Chris@0
|
/**
 * Reads in 'intl' transliteration data and writes out changed Drupal files.
 *
 * Writes out the Drupal data files that would have to change to make our data
 * match the intl data set. A bank is only written when at least one of its
 * characters has a non-empty intl transliteration that differs from (or is
 * absent in) the Drupal data.
 *
 * @param string $outdir
 *   Directory to put the patched data files in (under where the script is
 *   being run).
 */
function patch_drupal($outdir) {
  // The two data sets are hard-wired: the loop below reads exactly these keys.
  $data = array(
    'drupal' => read_drupal_data(),
    'intl' => read_intl_data(),
  );

  // Go through all the banks of character data.
  for ($bank = 0; $bank < 256; $bank++) {
    $print_bank = FALSE;

    // Go through characters in bank; skip pure ASCII characters.
    $start = ($bank == 0) ? 0x80 : 0;
    $newdata = array();
    for ($chr = 0; $chr < 256; $chr++) {
      // Fill up the start of the ASCII range.
      if ($chr < $start) {
        $newdata[$chr] = chr($chr);
        continue;
      }

      // Figure out what characters we actually have.
      $drupal = isset($data['drupal'][$bank][$chr]) ? $data['drupal'][$bank][$chr] : NULL;
      // Note that for intl, we only want to keep the transliteration if it
      // is a string with something other than '' in it.
      $intl = NULL;
      if (isset($data['intl'][$bank][$chr]) && is_string($data['intl'][$bank][$chr]) && $data['intl'][$bank][$chr] !== '') {
        $intl = $data['intl'][$bank][$chr];
      }
      // Make sure we have something in the Drupal data set, in case we need
      // to print.
      $newdata[$chr] = $drupal;

      if (!isset($intl)) {
        continue;
      }
      // Strict comparison: loosely compared, numeric-looking strings such as
      // "1" and "01" are equal in PHP and the change would be skipped.
      if ($drupal !== $intl) {
        $print_bank = TRUE;
        $newdata[$chr] = $intl;
      }
    }

    // If we found a difference, output a data file.
    if ($print_bank) {
      write_data_file($newdata, $bank, $outdir);
    }
  }
}
|
Chris@0
|
231
|
Chris@0
|
/**
 * Loads the Drupal Core generic transliteration tables.
 *
 * Looks for files named xNN.php (NN being the two-digit lower-case hex bank
 * number) in the 'data' directory next to this file and includes each one
 * that exists. An included file is expected to assign its table to $base.
 *
 * @return array
 *   Array keyed by bank number (0 through 255). Each value is whatever the
 *   matching data file assigned to $base, or an empty array when there is no
 *   file for that bank.
 *
 * @see PhpTransliteration::readGenericData()
 */
function read_drupal_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    // Reset before each include so a missing file yields an empty bank
    // instead of the previous bank's table.
    $base = array();
    $path = __DIR__ . '/data' . sprintf('/x%02x.php', $bank);
    if (is_file($path)) {
      include($path);
    }
    $banks[$bank] = $base;
  }

  return $banks;
}
|
Chris@0
|
261
|
Chris@0
|
/**
 * Loads the MidgardMVC transliteration tables.
 *
 * Looks for files named xNN.php in the 'utf8_to_ascii_db' directory next to
 * this file. The data can be downloaded from
 * https://github.com/bergie/midgardmvc_helper_urlize/downloads. An included
 * file is expected to fill in $UTF8_TO_ASCII[<bank>].
 *
 * @return array
 *   Array keyed by bank number (0 through 255). Each value is the table found
 *   for that bank (empty array when there is no file), with the '[?]'
 *   placeholder entries replaced by NULL.
 */
function read_midgard_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    // Pre-seed the slot for this bank so it exists even when no file does.
    $UTF8_TO_ASCII = array($bank => array());
    $path = __DIR__ . '/utf8_to_ascii_db' . sprintf('/x%02x.php', $bank);
    if (is_file($path)) {
      include($path);
    }

    // These files mark unknown characters with '[?]'; map those to NULL for
    // compatibility with our data.
    $banks[$bank] = array_map('_replace_question_with_null', $UTF8_TO_ASCII[$bank]);
  }

  return $banks;
}
|
Chris@0
|
295
|
Chris@0
|
/**
 * Loads the CPAN Text::Unidecode transliteration tables.
 *
 * Looks for files named xNN.pm in the 'Unidecode' directory next to this
 * file. The data can be downloaded from
 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
 *
 * @return array
 *   Array keyed by bank number (0 through 255). Each value is the array
 *   returned by _cpan_read_file() for that bank's file, or an empty array
 *   when the file does not exist.
 */
function read_cpan_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    $path = __DIR__ . '/Unidecode' . sprintf('/x%02x.pm', $bank);
    $banks[$bank] = is_file($path) ? _cpan_read_file($path) : array();
  }

  return $banks;
}
|
Chris@0
|
324
|
Chris@0
|
/**
 * Reads in the data in a single file from the Text::Unidecode CPAN project.
 *
 * The Perl source is rewritten line by line into something that PHP can
 * evaluate as the contents of an array() literal, and is then run through
 * eval(). This is NOT SECURE: only use it on data files you trust.
 *
 * If the rewritten text cannot be evaluated, a diagnostic (file name, the
 * generated PHP code and the error) is printed and the script exits.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file, with '[?]' placeholders replaced by NULL.
 *
 * @see read_cpan_data()
 */
function _cpan_read_file($file) {

  $contents = file($file);
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with # or $. The first line seems to have a
    // comment starting with #, the second has a Perl line like
    // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
    if (preg_match('|^\s*[#\$]|', $line)) {
      continue;
    }

    // Discard lines ending with semi-colons, which we also don't want
    // (there seem to be two of these lines at the end of the files).
    if (preg_match('|;\s*$|', $line)) {
      continue;
    }

    // Replace '[?]' with nothing (that means "don't know how to
    // transliterate"). In some files, this is encoded as qq{[?]} or
    // qq{[?] } instead.
    $line = str_replace('qq{[?]}', 'NULL', $line);
    $line = str_replace('qq{[?] }', 'NULL', $line);
    $line = str_replace("'[?]'", 'NULL', $line);

    // Replace qq{} with either "" or '' or nothing, depending on what is
    // inside it. The order of these replacements matters.
    $line = str_replace('qq{\{}', "'{'", $line);
    $line = str_replace('qq{\}}', "'}'", $line);
    $line = str_replace('qq{\} }', "'} '", $line);
    $line = str_replace("qq{\\\\}", '"\\\\"', $line);
    $line = str_replace("qq{\\", "qq{'", $line);
    $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
    $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
    $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);

    $save .= $line;
  }

  // Now we should have a string that looks like:
  // 'a', 'b', ...
  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  // On PHP 7 and later, invalid code makes eval() throw an Error (ParseError
  // for syntax problems) that the @ operator does not suppress, so it has to
  // be caught here for the diagnostic below to be reachable. On PHP 5 the
  // Error class does not exist, the catch never matches, and eval() returns
  // FALSE instead.
  $data = NULL;
  $error = NULL;
  try {
    $data = @eval($save);
  }
  catch (Error $e) {
    $error = $e->getMessage();
  }

  if (!is_array($data)) {
    // There was a problem, so report it and exit.
    print "Problem in evaluating $file\n";
    print $save;
    if (isset($error)) {
      print "\n" . $error . "\n";
    }
    else {
      // No exception was raised (PHP 5): evaluate again without error
      // suppression so that PHP displays the parse error.
      eval($save);
    }
    exit();
  }

  $data = array_map('_replace_hex_with_character', $data);

  // For unknown characters, these files may still have '[?]' in them. Replace
  // with NULL for compatibility with our data.
  $data = array_map('_replace_question_with_null', $data);

  return $data;
}
|
Chris@0
|
398
|
Chris@0
|
/**
 * Loads the Node.js transliteration tables.
 *
 * Looks for files named xNN.yml in the 'unidecoder_data' directory next to
 * this file. The data can be downloaded from
 * https://github.com/bitwalker/stringex/downloads. The PECL 'yaml' extension
 * must be installed, because the files are parsed with yaml_parse_file().
 *
 * @return array
 *   Array keyed by bank number (0 through 255). Each value is the parsed
 *   content of that bank's file with '[?]' placeholder entries replaced by
 *   NULL, or an empty array when the file does not exist.
 */
function read_nodejs_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    $path = __DIR__ . '/unidecoder_data' . sprintf('/x%02x.yml', $bank);
    if (!is_file($path)) {
      $banks[$bank] = array();
      continue;
    }

    // These files mark unknown characters with '[?]'; map those to NULL for
    // compatibility with our data.
    $banks[$bank] = array_map('_replace_question_with_null', yaml_parse_file($path));
  }

  return $banks;
}
|
Chris@0
|
431
|
Chris@0
|
/**
 * Loads the PECL 'intl' Transliterator class's transliteration data.
 *
 * You need to have the PECL 'intl' package installed for this to work. The
 * function also uses mb_convert_encoding(), so the 'mbstring' extension is
 * required as well.
 *
 * Every code point from 0x0000 to 0xFFFF is transliterated, except for the
 * UTF-16 surrogate range (banks 0xD8-0xDF) and the "private use" range
 * (banks 0xE0-0xF8), which are skipped. A result is only kept when it
 * consists purely of ASCII characters.
 *
 * If the transliterator cannot be created, an error message is printed and
 * the script exits.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the bank numbers
 *   (0 through 255, the high byte of the character code). Inner keys are the
 *   low byte (0 through 255), and the values are the transliterations, or
 *   NULL for characters that were skipped or could not be reduced to ASCII.
 */
function read_intl_data() {
  // In order to transliterate, you first have to create a transliterator
  // object. This needs a list of transliteration operations. You can get a
  // list of available operations with:
  // print_r(Transliterator::listIDs()); exit();
  // And a few of these are documented on
  // http://userguide.icu-project.org/transforms/general and
  // http://www.unicode.org/reports/tr15/ (for normalizations).
  // There are also maps to the Unicode characters at:
  // http://www.unicode.org/roadmaps/bmp/
  // http://www.unicode.org/charts/nameslist/
  $ops = '';

  // The first step in any transform: separate out accents and remove them.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  // Then you need to do a bunch of language-specific or script-specific
  // transliterations. Here is hopefully a representative set. There are
  // quite a few scripts that don't appear to have rules currently, such
  // as Ethiopic.
  $ops .= 'Greek-Latin; ';
  $ops .= 'Cyrillic-Latin; ';
  $ops .= 'Armenian-Latin; ';
  $ops .= 'Hebrew-Latin; ';
  $ops .= 'Arabic-Latin; ';
  $ops .= 'Syriac-Latin; ';
  $ops .= 'Thaana-Latin; ';
  $ops .= 'Devanagari-Latin; ';
  $ops .= 'Bengali-Latin; ';
  $ops .= 'Gurmukhi-Latin; ';
  $ops .= 'Gujarati-Latin; ';
  $ops .= 'Oriya-Latin; ';
  $ops .= 'Tamil-Latin; ';
  $ops .= 'Telugu-Latin; ';
  $ops .= 'Kannada-Latin; ';
  $ops .= 'Malayalam-Latin; ';
  $ops .= 'Thai-Latin; ';
  $ops .= 'Georgian-Latin; ';
  $ops .= 'Hangul-Latin; ';
  $ops .= 'Mongolian-Latin/BGN; ';
  $ops .= 'Jamo-Latin; ';
  $ops .= 'Katakana-Latin; ';
  $ops .= 'Any-Latin; ';

  // Finally, after transforming to Latin, transform to ASCII.
  $ops .= 'Latin-ASCII; ';

  // Remove any remaining accents and recompose.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  // Transliterator::create() returns NULL on failure; stop with a readable
  // message rather than a fatal "member function on null" error.
  $trans = Transliterator::create($ops);
  if (!$trans) {
    print "Could not create the intl transliterator: " . intl_get_error_message() . "\n";
    exit(1);
  }

  $out = array();

  // Transliterate all possible characters.
  for ($bank = 0; $bank < 256; $bank++) {
    // Skip the UTF-16 surrogate range (0xD800-0xDFFF) and the "private use"
    // range (0xE000-0xF8FF) completely. Bank 0xD8 is the first surrogate
    // bank, so it has to be excluded too.
    $skip = ($bank >= 0xd8 && $bank <= 0xf8);

    $data = array();
    for ($chr = 0; $chr < 256; $chr++) {
      if ($skip) {
        $data[$chr] = NULL;
        continue;
      }

      $character = mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE');
      $result = $trans->transliterate($character);

      // See if we have managed to transliterate this to ASCII or not. If not
      // (or if transliterate() failed and returned FALSE), store NULL instead
      // of the result.
      $is_ascii = is_string($result) && !preg_match('/[^\x00-\x7f]/', $result);
      $data[$chr] = $is_ascii ? $result : NULL;
    }
    $out[$bank] = $data;
  }

  return $out;
}
|
Chris@0
|
520
|
Chris@0
|
/**
 * Loads the JUnidecode transliteration tables.
 *
 * Looks for files named XNN.java in the 'junidecode' directory next to this
 * file. The source can be downloaded from
 * http://www.ippatsuman.com/projects/junidecode/index.html
 *
 * @return array
 *   Array keyed by bank number (0 through 255). Each value is the array
 *   returned by _junidecode_read_file() for that bank's file, or an empty
 *   array when the file does not exist.
 */
function read_junidecode_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    $path = __DIR__ . '/junidecode' . sprintf('/X%02x.java', $bank);
    $banks[$bank] = is_file($path) ? _junidecode_read_file($path) : array();
  }

  return $banks;
}
|
Chris@0
|
549
|
Chris@0
|
/**
 * Reads in the data in a single file from the JUnidecode project.
 *
 * The Java source is filtered and rewritten line by line into something that
 * PHP can evaluate as the contents of an array() literal, and is then run
 * through eval(). This is NOT SECURE: only use it on data files you trust.
 *
 * If the rewritten text cannot be evaluated, a diagnostic (file name, the
 * generated PHP code and the error) is printed and the script exits.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file, with '[?]' placeholders replaced by NULL.
 *
 * @see read_junidecode_data()
 */
function _junidecode_read_file($file) {
  $contents = file($file);
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with * or / or package or class or public or },
    // to get rid of comments and Java code.
    if (preg_match('|^\s*[\*/\}]|', $line)) {
      continue;
    }
    // NOTE(review): only the 'package' alternative is anchored to the start
    // of the line; 'public' and 'class' match anywhere in the line. Left
    // unchanged on purpose, since tightening it could let Java declaration
    // lines through to eval().
    if (preg_match('/^\s*package|public|class/', $line)) {
      continue;
    }

    // Some of the lines look like this:
    // new String("" + (char) 0x00), // 0x00
    // Transform to be '0x00,'. The value is hexadecimal, so the digits a-f
    // have to be accepted as well; otherwise 0x0a would be cut down to 0x0.
    $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9a-fA-F]+).*$|', '0x$1,', $line);

    // Strings are in double quotes, yet many have \' in them.
    $line = str_replace("\'", "'", $line);

    // Everything else should probably be OK -- the lines are like:
    // "Ie", // 0x00
    $save .= $line;
  }

  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  // On PHP 7 and later, invalid code makes eval() throw an Error (ParseError
  // for syntax problems) that the @ operator does not suppress, so it has to
  // be caught here for the diagnostic below to be reachable. On PHP 5 the
  // Error class does not exist, the catch never matches, and eval() returns
  // FALSE instead.
  $data = NULL;
  $error = NULL;
  try {
    $data = @eval($save);
  }
  catch (Error $e) {
    $error = $e->getMessage();
  }

  if (!is_array($data)) {
    // There was a problem, so report it and exit.
    print "Problem in evaluating $file\n";
    print $save;
    if (isset($error)) {
      print "\n" . $error . "\n";
    }
    else {
      // No exception was raised (PHP 5): evaluate again without error
      // suppression so that PHP displays the parse error.
      eval($save);
    }
    exit();
  }

  $data = array_map('_replace_hex_with_character', $data);
  $data = array_map('_replace_question_with_null', $data);

  return $data;
}
|
Chris@0
|
605
|
Chris@0
|
/**
 * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
 *
 * Only the exact strings '[?]' and '[?] ' are treated as the "unknown
 * character" placeholder. The comparison is strict, so that non-string values
 * (for example TRUE, or the integer 0 on PHP versions before 8) are passed
 * through unchanged instead of being loosely matched against the placeholder.
 *
 * @param mixed $data
 *   A single transliteration entry.
 *
 * @return mixed
 *   NULL if $data is the placeholder; otherwise $data, unchanged.
 */
function _replace_question_with_null($data) {
  return ($data === '[?]' || $data === '[?] ') ? NULL : $data;
}
|
Chris@0
|
612
|
Chris@0
|
/**
 * Callback for array_map(): Replaces '\xNN' with the actual character.
 *
 * Only string items that start with a literal backslash followed by 'x' are
 * changed; in those, every '\xN' or '\xNN' hexadecimal escape is replaced by
 * the byte it denotes. The escapes are decoded directly rather than through
 * eval(): a bare '\xNN' is not a valid PHP statement, so evaluating it fails
 * instead of producing the character.
 *
 * @param mixed $item
 *   A single transliteration entry.
 *
 * @return mixed
 *   The decoded string, or $item unchanged if it is not a string starting
 *   with '\x'.
 */
function _replace_hex_with_character($item) {
  if (is_string($item) && strpos($item, '\x') === 0) {
    $item = preg_replace_callback('/\\\\x([0-9A-Fa-f]{1,2})/', function ($matches) {
      return chr(hexdec($matches[1]));
    }, $item);
  }
  return $item;
}
|
Chris@0
|
622
|
Chris@0
|
623 /**
|
Chris@0
|
624 * Writes a data file out in the standard Drupal Core data format.
|
Chris@0
|
625 *
|
Chris@0
|
626 * @param array $data
|
Chris@0
|
627 * Array of data to write out.
|
Chris@0
|
628 * @param string $bank
|
Chris@0
|
629 * Bank of characters it belongs to.
|
Chris@0
|
630 * @param string $dir
|
Chris@0
|
631 * Output directory.
|
Chris@0
|
632 */
|
Chris@0
|
633 function write_data_file($data, $bank, $outdir) {
|
Chris@0
|
634 $dir = __DIR__ . '/' . $outdir;
|
Chris@0
|
635 $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
|
Chris@0
|
636
|
Chris@0
|
637 $out = '';
|
Chris@0
|
638 $out .= "<?php\n\n/**\n * @file\n * Generic transliteration data for the PhpTransliteration class.\n */\n\n\$base = array(\n";
|
Chris@0
|
639
|
Chris@0
|
640 // The 00 file skips the ASCII range
|
Chris@0
|
641 $start = 0;
|
Chris@0
|
642 if ($bank == 0) {
|
Chris@0
|
643 $start = 0x80;
|
Chris@0
|
644 $out .= " // Note: to save memory plain ASCII mappings have been left out.\n";
|
Chris@0
|
645 }
|
Chris@0
|
646
|
Chris@0
|
647 for ($line = $start; $line <= 0xf0; $line += 0x10) {
|
Chris@0
|
648 $out .= ' 0x' . sprintf('%02X', $line) . ' =>';
|
Chris@0
|
649 $elems = array_values(array_slice($data, $line, 16));
|
Chris@0
|
650 for ($i = 0; $i < 16; $i++ ) {
|
Chris@0
|
651 if (isset($elems[$i])) {
|
Chris@0
|
652 $out .= " '" . addcslashes($elems[$i], "'\\") . "',";
|
Chris@0
|
653 }
|
Chris@0
|
654 else {
|
Chris@0
|
655 $out .= ' NULL,';
|
Chris@0
|
656 }
|
Chris@0
|
657 }
|
Chris@0
|
658 $out .= "\n";
|
Chris@0
|
659 }
|
Chris@0
|
660
|
Chris@0
|
661 $out .= ");\n";
|
Chris@0
|
662
|
Chris@0
|
663 file_put_contents($file, $out);
|
Chris@0
|
664 }
|