Mercurial > hg > cmmr2012-drupal-site
comparison core/scripts/transliteration_data.php.txt @ 0:c75dbcec494b
Initial commit from drush-created site
author | Chris Cannam |
---|---|
date | Thu, 05 Jul 2018 14:24:15 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c75dbcec494b |
---|---|
1 <?php | |
2 | |
3 /** | |
4 * @file | |
5 * Unifies formats of transliteration data from various sources. | |
6 * | |
7 * A few notes about this script: | |
8 * - The functions in this file are NOT SECURE, because they use PHP functions | |
9 * like eval(). Absolutely do not run this script unless you trust the data | |
10 * files used for input. | |
11 * - You will need to change the name of this file to remove the .txt extension | |
12 * before running it (it has been given this name so that you cannot run it | |
13 * by mistake). When you do that, move it out of your web root as well so | |
14 * that it cannot be run via a URL, and run the script via the PHP command | |
15 * at a command prompt. | |
16 * - This script, depending on which portions of it you run, depends on having | |
17 * input data from various sources in sub-directories below where this file | |
18 * is located. The data inputs are as follows: | |
19 * - Existing Drupal Core transliteration data: Sub-directory 'data'; comes | |
20 * from core/lib/Drupal/Component/Transliteration/data | |
21 * - Midgardmvc data: Sub-directory 'utf8_to_ascii_db'; download from | |
22 * https://github.com/bergie/midgardmvc_helper_urlize/downloads | |
23 * - CPAN Text-Unidecode data: Sub-directory 'Unidecode'; download from | |
24 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm | |
25 * - Node.js project: Sub-directory 'unidecoder_data'; download from | |
26 * https://github.com/bitwalker/stringex/downloads | |
27 * - JUnidecode project: Sub-directory 'junidecode'; download source from | |
28 * http://www.ippatsuman.com/projects/junidecode/index.html | |
29 * - You will also need to make directory 'outdata' to hold output. | |
30 * - If you plan to use the 'intl' data, you will also need to have the PECL | |
31 * packages 'yaml' and 'intl' installed. See | |
32 * http://php.net/manual/install.pecl.downloads.php for generic PECL | |
33 * package installation instructions. The following commands on Ubuntu Linux | |
34 * will install yaml and intl packages: | |
35 * @code | |
36 * sudo apt-get install libyaml-dev | |
37 * sudo pecl install yaml | |
38 * sudo apt-get install php5-intl | |
39 * sudo apt-get install libicu-dev | |
40 * sudo pecl install intl | |
41 * @endcode | |
42 * After running these commands, you will need to make sure | |
43 * 'extension=intl.so' and 'extension=yaml.so' are added to the php.ini file | |
44 * that is in use for the PHP command-line command. | |
45 * - When you have collected all of the data and installed the required | |
46 * packages, you will need to find the specific commands below that you want | |
47 * to use and un-comment them. The preferred data source for Drupal Core is | |
48 * the PECL 'intl' package, and the line that needs to be un-commented in | |
49 * order to make a Drupal Core patch is: | |
50 * @code | |
51 * patch_drupal('outdata'); | |
52 * @endcode | |
53 * - The functions are documented in more detail in their headers where they | |
54 * are defined. Many have parameters that you can use to change the output. | |
55 */ | |
56 | |
// Commands to read various data sources. Un-comment the one you need; each
// read_*_data() function returns a nested array keyed first by bank (0-255)
// and then by character within the bank (0-255):
// $data = read_drupal_data();
// $data = read_midgard_data();
// $data = read_cpan_data();
// $data = read_nodejs_data();
// $data = read_intl_data();
// $data = read_junidecode_data();

// After running a read_*_data() function, you can print out the data
// (it will make a LOT of output):
// print_r($data);

// Command to read in all of the data sources and output in CSV format,
// explaining the differences:
// read_all_to_csv();

// Command to patch Drupal Core data, using the intl data set, and put the
// resulting changed data files in the 'outdata' directory. Note that this
// command is active (not commented out), so it runs as soon as the script is
// executed.
patch_drupal('outdata');
76 | |
/**
 * Reads in all transliteration data and outputs differences in CSV format.
 *
 * Each data set is compared to the Drupal Core reference data set, and the
 * differences are noted. The data must be in the locations noted in the
 * file header above. The CSV output has several columns. The first one is the
 * Unicode character code. The next columns contain the transliteration of
 * that character in each of the data sets. The last column tells what the
 * differences are between the Drupal Core reference set and the other data
 * sets (several reasons are joined with ':'):
 * - missing: The target set is missing data that the Drupal set has.
 * - provided: The target set has provided data that Drupal does not have.
 * - case: The target and Drupal set output differ only in upper/lower case.
 * - different: The target and Drupal set output differ in more than just case.
 *
 * @param bool $print_all
 *   TRUE to print all data; FALSE (default) to print just data where there
 *   are differences between the Drupal set and other data sources.
 * @param bool $print_missing
 *   TRUE to also print rows where the only difference is that one of the
 *   non-Drupal sets is missing information; FALSE (default) to omit these
 *   rows.
 */
function read_all_to_csv($print_all = FALSE, $print_missing = FALSE) {
  $data = array();
  $types = array('drupal', 'midgard', 'cpan', 'nodejs', 'junidecode', 'intl');

  // Alternatively, if you just want to compare a couple of data sets, you can
  // uncomment and edit the following line:
  // $types = array('drupal', 'intl');

  // Read in all the data.
  foreach ($types as $type) {
    $data[$type] = call_user_func('read_' . $type . '_data');
  }

  // Print CSV header row.
  print 'character,' . implode(',', $types) . ",why\n";

  // Go through all the banks of character data.
  for ($bank = 0; $bank < 256; $bank++) {

    // Go through characters in bank; skip pure ASCII characters.
    $start = ($bank == 0) ? 0x80 : 0;
    for ($chr = $start; $chr < 256; $chr++) {

      // Gather the data together for this character. Anything that is not a
      // string (missing, NULL, ...) is normalized to the empty string.
      $row = array();
      foreach ($types as $type) {
        $value = isset($data[$type][$bank][$chr]) ? $data[$type][$bank][$chr] : NULL;
        $row[$type] = is_string($value) ? $value : '';
      }

      // Only print if there are differences or we are printing all data.
      $print = $print_all;
      $ref = $row['drupal'];
      $why = array();
      foreach ($types as $type) {
        // Strict comparison: with loose comparison PHP treats numeric strings
        // such as '1' and '01' as equal, which would hide real differences.
        if ($row[$type] === $ref) {
          continue;
        }

        // Try to characterize what the differences are.
        if ($row[$type] === '') {
          $why['missing'] = 'missing';
          if ($print_missing) {
            $print = TRUE;
          }
        }
        elseif ($ref === '') {
          $why['provided'] = 'provided';
          $print = TRUE;
        }
        elseif (strcasecmp($row[$type], $ref) === 0) {
          // Covers mixed-case differences (e.g. 'Ie' vs. 'IE') as well as
          // all-lower and all-upper variants.
          $why['case'] = 'case';
          $print = TRUE;
        }
        else {
          $why['different'] = 'different';
          $print = TRUE;
        }
      }

      // Print the data line, using the same comma-only separator as the
      // header row and standard CSV quoting (embedded quotes are doubled).
      if ($print) {
        $fields = array(sprintf('0x%04x', 256 * $bank + $chr));
        foreach ($row as $out) {
          $fields[] = '"' . str_replace('"', '""', $out) . '"';
        }
        $fields[] = implode(':', $why);
        print implode(',', $fields) . "\n";
      }
    }
  }
}
170 | |
/**
 * Reads in 'intl' transliteration data and writes out changed Drupal files.
 *
 * Writes out the Drupal data files that would have to change to make our data
 * match the intl data set. A bank is only written if at least one character
 * in it has a non-empty intl transliteration that differs from the Drupal
 * one; all other characters in a written bank keep their Drupal value.
 *
 * @param string $outdir
 *   Directory to put the patched data files in (under where the script is
 *   being run).
 */
function patch_drupal($outdir) {
  $data = array();

  // Note that this is hard-wired below. Changing this line will have no
  // effect except to break this function.
  $types = array('drupal', 'intl');

  // Read in all the data.
  foreach ($types as $type) {
    $data[$type] = call_user_func('read_' . $type . '_data');
  }

  // Go through all the banks of character data.
  for ($bank = 0; $bank < 256; $bank++) {
    $print_bank = FALSE;

    // Go through characters in bank; skip pure ASCII characters.
    $start = ($bank == 0) ? 0x80 : 0;
    $newdata = array();
    for ($chr = 0; $chr < 256; $chr++) {
      // Fill up the start of the ASCII range.
      if ($chr < $start) {
        $newdata[$chr] = chr($chr);
        continue;
      }

      // Figure out what characters we actually have.
      $drupal = isset($data['drupal'][$bank][$chr]) ? $data['drupal'][$bank][$chr] : NULL;
      // Note that for intl, we only want to keep the transliteration if it
      // is a string with something other than '' in it.
      $intl = isset($data['intl'][$bank][$chr]) ? $data['intl'][$bank][$chr] : NULL;
      if (!is_string($intl) || $intl === '') {
        $intl = NULL;
      }
      // Make sure we have something in the Drupal data set, in case we need
      // to print.
      $newdata[$chr] = $drupal;

      if (!isset($intl)) {
        continue;
      }
      // Strict comparison: a loose comparison treats numeric strings such as
      // '1' and '01' as equal and would silently keep the old value. This
      // also covers the case where Drupal has no value (NULL) at all.
      if ($drupal !== $intl) {
        $print_bank = TRUE;
        $newdata[$chr] = $intl;
      }
    }

    // If we found a difference, output a data file.
    if ($print_bank) {
      write_data_file($newdata, $bank, $outdir);
    }
  }
}
231 | |
/**
 * Loads the generic transliteration data shipped with Drupal Core.
 *
 * Looks for files named xNN.php (NN being the bank number as two lower-case
 * hex digits) in the 'data' directory next to this script. Each file is
 * expected to define a $base array; banks without a file are left empty.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 *
 * @see PhpTransliteration::readGenericData()
 */
function read_drupal_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    // Start from an empty bank; an included data file overwrites $base.
    $base = array();
    $path = sprintf('%s/data/x%02x.php', __DIR__, $bank);
    if (is_file($path)) {
      include $path;
    }
    $banks[$bank] = $base;
  }

  return $banks;
}
261 | |
/**
 * Loads the MidgardMVC transliteration data.
 *
 * Looks for files named xNN.php in the 'utf8_to_ascii_db' directory next to
 * this script. The data can be downloaded from
 * https://github.com/bergie/midgardmvc_helper_urlize/downloads. Each file is
 * expected to fill in $UTF8_TO_ASCII[<bank>].
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_midgard_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    // Start from an empty bank; an included data file fills in this array.
    $UTF8_TO_ASCII = array($bank => array());
    $path = sprintf('%s/utf8_to_ascii_db/x%02x.php', __DIR__, $bank);
    if (is_file($path)) {
      include $path;
    }

    // These files mark unknown characters with '[?]'; turn those into NULL
    // so the result is compatible with the Drupal data.
    $banks[$bank] = array_map('_replace_question_with_null', $UTF8_TO_ASCII[$bank]);
  }

  return $banks;
}
295 | |
/**
 * Loads the CPAN Text::Unidecode data set.
 *
 * Looks for files named xNN.pm in the 'Unidecode' directory next to this
 * script. The data can be downloaded from
 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
 * Banks without a file are left empty.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_cpan_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    $path = sprintf('%s/Unidecode/x%02x.pm', __DIR__, $bank);
    $banks[$bank] = is_file($path) ? _cpan_read_file($path) : array();
  }

  return $banks;
}
324 | |
/**
 * Reads in the data in a single file from the Text::Unidecode CPAN project.
 *
 * The Perl source is rewritten line by line into something that is valid as
 * the body of a PHP array() expression, and then evaluated with eval(). This
 * is NOT SECURE: only run it on data files you trust.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file, with '[?]' placeholders replaced by NULL. If
 *   the rewritten text cannot be evaluated as an array, the problem is
 *   printed and the script exits instead of returning.
 *
 * @see read_cpan_data()
 */
function _cpan_read_file($file) {

  $contents = file($file);
  // Accumulates the rewritten lines that will be evaluated as PHP.
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with # or $. The first line seems to have a
    // comment starting with #, the second has a Perl line like
    // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
    if (preg_match('|^\s*[#\$]|', $line)) {
      continue;
    }

    // Discard lines ending with semi-colons, which we also don't want
    // (there seem to be two of these lines at the end of the files).
    if (preg_match('|;\s*$|', $line)) {
      continue;
    }

    // Replace '[?]' with nothing (that means "don't know how to
    // transliterate"). In some files, this is encoded as qq{[?]} or
    // qq{[?] } instead.
    $line = str_replace('qq{[?]}', 'NULL', $line);
    $line = str_replace('qq{[?] }', 'NULL', $line);
    $line = str_replace("'[?]'", 'NULL', $line);

    // Replace qq{} with either "" or '' or nothing, depending on what is
    // inside it. The order matters: the specific special cases (braces,
    // backslash, quote characters) are handled first, and the two generic
    // patterns at the end pick up whatever qq{...} constructs remain.
    $line = str_replace('qq{\{}', "'{'", $line);
    $line = str_replace('qq{\}}', "'}'", $line);
    $line = str_replace('qq{\} }', "'} '", $line);
    $line = str_replace("qq{\\\\}", '"\\\\"', $line);
    $line = str_replace("qq{\\", "qq{'", $line);
    $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
    // Contents without a single quote become a single-quoted PHP string...
    $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
    // ...and anything left (i.e. containing a single quote) is double-quoted.
    $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);

    $save .= $line;
  }

  // Now we should have a string that looks like:
  // 'a', 'b', ...
  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  // NOTE(review): this relies on eval() returning a non-array value when the
  // code cannot be parsed. On PHP 7 and later a parse error in eval() throws
  // a ParseError instead, which @ does not suppress -- confirm against the
  // PHP version in use.
  $data = @eval($save);
  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The code is evaluated
    // a second time without @ so that PHP's own error message is shown.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  // For unknown characters, these files may still have '[?]' in them. Replace
  // with NULL for compatibility with our data.
  $data = array_map('_replace_question_with_null', $data);

  return $data;
}
398 | |
/**
 * Reads in the Node.js transliteration data.
 *
 * The data is expected to be in files xNN.yml in directory unidecoder_data
 * under the directory where this file resides. It can be downloaded from
 * https://github.com/bitwalker/stringex/downloads. You also need the PECL
 * 'yaml' extension installed for this function to work.
 *
 * Banks whose file is missing, or whose file does not parse to an array, are
 * returned as empty arrays.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_nodejs_data() {
  $dir = __DIR__ . '/unidecoder_data';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
    if (is_file($file)) {
      // yaml_parse_file() returns FALSE on failure (and a YAML document need
      // not be a mapping/sequence), so only accept array results; passing
      // anything else to array_map() would fail.
      $parsed = yaml_parse_file($file);
      if (is_array($parsed)) {
        // For unknown characters, these files have '[?]' in them. Replace
        // with NULL for compatibility with our data.
        $base = array_map('_replace_question_with_null', $parsed);
      }
    }
    $out[$bank] = $base;
  }

  return $out;
}
431 | |
/**
 * Loads the PECL 'intl' Transliterator class's transliteration data.
 *
 * You need to have the PECL 'intl' package installed for this to work. Every
 * character in the Basic Multilingual Plane outside the surrogate and
 * private-use banks is run through a compound transliterator; a character
 * whose result is not pure ASCII is recorded as NULL.
 *
 * If the transliterator cannot be created, a message is printed and the
 * script exits.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations
 *   (NULL where no ASCII transliteration is available).
 */
function read_intl_data() {
  // In order to transliterate, you first have to create a transliterator
  // object. This needs a list of transliteration operations. You can get a
  // list of available operations with:
  //   print_r(Transliterator::listIDs()); exit();
  // And a few of these are documented on
  // http://userguide.icu-project.org/transforms/general and
  // http://www.unicode.org/reports/tr15/ (for normalizations).
  // There are also maps to the Unicode characters at:
  //   http://www.unicode.org/roadmaps/bmp/
  //   http://www.unicode.org/charts/nameslist/
  $ops = '';

  // The first step in any transform: separate out accents and remove them.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  // Then you need to do a bunch of language-specific or script-specific
  // transliterations. Here is hopefully a representative set. There are
  // quite a few scripts that don't appear to have rules currently, such
  // as Ethiopian.
  $ops .= 'Greek-Latin; ';
  $ops .= 'Cyrillic-Latin; ';
  $ops .= 'Armenian-Latin; ';
  $ops .= 'Hebrew-Latin; ';
  $ops .= 'Arabic-Latin; ';
  $ops .= 'Syriac-Latin; ';
  $ops .= 'Thaana-Latin; ';
  $ops .= 'Devanagari-Latin; ';
  $ops .= 'Bengali-Latin; ';
  $ops .= 'Gurmukhi-Latin; ';
  $ops .= 'Gujarati-Latin; ';
  $ops .= 'Oriya-Latin; ';
  $ops .= 'Tamil-Latin; ';
  $ops .= 'Telugu-Latin; ';
  $ops .= 'Kannada-Latin; ';
  $ops .= 'Malayalam-Latin; ';
  $ops .= 'Thai-Latin; ';
  $ops .= 'Georgian-Latin; ';
  $ops .= 'Hangul-Latin; ';
  $ops .= 'Mongolian-Latin/BGN; ';
  $ops .= 'Jamo-Latin; ';
  $ops .= 'Katakana-Latin; ';
  $ops .= 'Any-Latin; ';

  // Finally, after transforming to Latin, transform to ASCII.
  $ops .= 'Latin-ASCII; ';

  // Remove any remaining accents and recompose.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  $trans = Transliterator::create($ops);
  if (!$trans) {
    // Without a transliterator there is nothing useful we can do.
    print 'Could not create the intl transliterator: ' . intl_get_error_message() . "\n";
    exit(1);
  }
  $out = array();

  // Transliterate all possible characters.
  for ($bank = 0; $bank < 256; $bank++) {
    // Skip the UTF-16 surrogate range (U+D800 - U+DFFF, banks 0xd8 - 0xdf)
    // and the "private use" range (U+E000 - U+F8FF, banks 0xe0 - 0xf8)
    // completely. Note that bank 0xd8 itself is part of the surrogates.
    $skip = ($bank >= 0xd8 && $bank <= 0xf8);

    $data = array();
    for ($chr = 0; $chr < 256; $chr++) {
      $result = NULL;
      if (!$skip) {
        $character = mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE');
        $result = $trans->transliterate($character);

        // See if we have managed to transliterate this to ASCII or not. If
        // not (the call failed, or any byte is outside 0x00 - 0x7F), return
        // NULL instead of this character.
        if (!is_string($result) || preg_match('/[^\x00-\x7F]/', $result)) {
          $result = NULL;
        }
      }
      $data[$chr] = $result;
    }
    $out[$bank] = $data;
  }

  return $out;
}
520 | |
/**
 * Loads the JUnidecode data set.
 *
 * Looks for files named XNN.java in the 'junidecode' directory next to this
 * script. The source can be downloaded from
 * http://www.ippatsuman.com/projects/junidecode/index.html. Banks without a
 * file are left empty.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 */
function read_junidecode_data() {
  $banks = array();

  foreach (range(0, 255) as $bank) {
    $path = sprintf('%s/junidecode/X%02x.java', __DIR__, $bank);
    $banks[$bank] = is_file($path) ? _junidecode_read_file($path) : array();
  }

  return $banks;
}
549 | |
/**
 * Reads in the data in a single file from the JUnidecode project.
 *
 * The Java source is filtered and rewritten line by line into something that
 * is valid as the body of a PHP array() expression, and then evaluated with
 * eval(). This is NOT SECURE: only run it on data files you trust.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file, with '[?]' placeholders replaced by NULL. If
 *   the rewritten text cannot be evaluated as an array, the problem is
 *   printed and the script exits instead of returning.
 *
 * @see read_junidecode_data()
 */
function _junidecode_read_file($file) {
  $contents = file($file);
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with * or / or package or class or public or },
    // to get rid of comments and Java code.
    if (preg_match('|^\s*[\*/\}]|', $line)) {
      continue;
    }
    // The alternatives are grouped so that the start-of-line anchor applies
    // to all three keywords; ungrouped, any line merely containing "public"
    // or "class" somewhere would be thrown away.
    if (preg_match('/^\s*(?:package|public|class)\b/', $line)) {
      continue;
    }

    // Some of the lines look like this:
    //   new String("" + (char) 0x00), // 0x00
    // Transform to be '0x00,'. The character class must include the hex
    // digits a-f, otherwise a value like 0x0a is truncated to 0x0.
    $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9a-fA-F]+).*$|', '0x$1,', $line);

    // Strings are in double quotes, yet many have \' in them.
    $line = str_replace("\'", "'", $line);

    // Everything else should probably be OK -- the lines are like:
    //   "Ie", // 0x00
    $save .= $line;
  }

  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  $data = @eval($save);
  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
    $data = array_map('_replace_question_with_null', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The code is evaluated
    // a second time without @ so that PHP's own error message is shown.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  return $data;
}
605 | |
/**
 * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
 *
 * Only the exact strings '[?]' and '[?] ' are treated as the "unknown
 * character" marker. The comparison is strict so that non-string values are
 * passed through untouched; with a loose comparison TRUE (and, before PHP 8,
 * the integer 0) would compare equal to '[?]' and be turned into NULL.
 *
 * @param mixed $data
 *   A single transliteration value.
 *
 * @return mixed
 *   NULL if $data is the unknown-character marker, otherwise $data unchanged.
 */
function _replace_question_with_null($data) {
  return ($data === '[?]' || $data === '[?] ') ? NULL : $data;
}
612 | |
/**
 * Callback for array_map(): Replaces '\xNN' with the actual character.
 *
 * Only strings that start with a literal backslash followed by 'x' are
 * touched; in those, every '\xN' or '\xNN' escape is replaced by the byte
 * with that hexadecimal value. The conversion is done directly rather than
 * with eval(): a bare '\xNN' is not valid PHP code, so evaluating it can
 * never produce the character.
 *
 * @param mixed $item
 *   A single transliteration value.
 *
 * @return mixed
 *   The converted string, or $item unchanged if it is not a string starting
 *   with '\x'.
 */
function _replace_hex_with_character($item) {
  if (is_string($item) && strpos($item, '\x') === 0) {
    $item = preg_replace_callback('/\\\\x([0-9A-Fa-f]{1,2})/', function ($matches) {
      return chr(hexdec($matches[1]));
    }, $item);
  }
  return $item;
}
622 | |
/**
 * Writes a data file out in the standard Drupal Core data format.
 *
 * The file is named xNN.php, where NN is the bank as two lower-case hex
 * digits. It defines a $base array with one source line per group of 16
 * characters. For bank 0 the ASCII range (0x00 - 0x7F) is left out.
 *
 * If the output directory does not exist it is created. If the directory or
 * the file cannot be written, a message is printed and the function returns
 * without writing anything further.
 *
 * @param array $data
 *   Array of data to write out, keyed by character number within the bank
 *   (0 - 255). Missing or NULL entries are written as NULL.
 * @param int $bank
 *   Bank of characters it belongs to.
 * @param string $outdir
 *   Output directory, relative to the directory this script is in.
 */
function write_data_file($data, $bank, $outdir) {
  $dir = __DIR__ . '/' . $outdir;
  $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';

  $out = '';
  $out .= "<?php\n\n/**\n * @file\n * Generic transliteration data for the PhpTransliteration class.\n */\n\n\$base = array(\n";

  // The 00 file skips the ASCII range.
  $start = 0;
  if ($bank == 0) {
    $start = 0x80;
    $out .= " // Note: to save memory plain ASCII mappings have been left out.\n";
  }

  for ($line = $start; $line <= 0xf0; $line += 0x10) {
    // Each output line declares the key of its first element explicitly; the
    // following 15 elements take the subsequent keys implicitly.
    $out .= ' 0x' . sprintf('%02X', $line) . ' =>';
    for ($i = 0; $i < 16; $i++) {
      // Look values up by key rather than by position, so that the value
      // written always belongs to the key the line declares.
      $key = $line + $i;
      if (isset($data[$key])) {
        $out .= " '" . addcslashes($data[$key], "'\\") . "',";
      }
      else {
        $out .= ' NULL,';
      }
    }
    $out .= "\n";
  }

  $out .= ");\n";

  if (!is_dir($dir) && !mkdir($dir, 0777, TRUE)) {
    print "Could not create directory $dir\n";
    return;
  }
  if (file_put_contents($file, $out) === FALSE) {
    print "Could not write $file\n";
  }
}