array());
    // NOTE(review): the statements from the top of this chunk down to the
    // first closing brace are the tail of a function whose header lies before
    // the visible range. They are kept token-for-token; restore the missing
    // header from the upstream file rather than guessing at it.
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
    if (is_file($file)) {
      include($file);
    }
    $base = $UTF8_TO_ASCII[$bank];

    // For unknown characters, these files have '[?]' in them. Replace with
    // NULL for compatibility with our data.
    $base = array_map('_replace_question_with_null', $base);
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the CPAN Text::Unidecode data set.
 *
 * The data is expected to be in files xNN.pm in directory 'Unidecode' under
 * this file's directory. It can be downloaded from
 * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 *   Banks without a data file map to an empty array.
 */
function read_cpan_data() {
  $dir = __DIR__ . '/Unidecode';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.pm';
    if (is_file($file)) {
      $base = _cpan_read_file($file);
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the data in a single file from the Text::Unidecode CPAN project.
 *
 * The Perl source is rewritten line by line into a PHP array literal, which
 * is then evaluated. If evaluation does not produce an array, the generated
 * code is printed and the script exits.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file.
 *
 * @see read_cpan_data()
 */
function _cpan_read_file($file) {

  $contents = file($file);
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with # or $. The first line seems to have a
    // comment starting with #, the second has a Perl line like
    // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
    if (preg_match('|^\s*[#\$]|', $line)) {
      continue;
    }

    // Discard lines ending with semi-colons, which we also don't want
    // (there seem to be two of these lines at the end of the files).
    if (preg_match('|;\s*$|', $line)) {
      continue;
    }

    // Replace '[?]' with nothing (that means "don't know how to
    // transliterate"). In some files, this is encoded as qq{[?]} or
    // qq{[?] } instead.
    $line = str_replace('qq{[?]}', 'NULL', $line);
    $line = str_replace('qq{[?] }', 'NULL', $line);
    $line = str_replace("'[?]'", 'NULL', $line);

    // Replace qq{} with either "" or '' or nothing, depending on what is
    // inside it. The order matters: the special cases must be rewritten
    // before the two generic patterns at the end get a chance to match.
    $line = str_replace('qq{\{}', "'{'", $line);
    $line = str_replace('qq{\}}', "'}'", $line);
    $line = str_replace('qq{\} }', "'} '", $line);
    $line = str_replace("qq{\\\\}", '"\\\\"', $line);
    $line = str_replace("qq{\\", "qq{'", $line);
    $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
    $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
    $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);

    $save .= $line;
  }

  // Now we should have a string that looks like:
  // 'a', 'b', ...
  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  $data = @eval($save);
  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The second eval() is
    // deliberately unsuppressed so that PHP reports what went wrong.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  // For unknown characters, these files may still have '[?]' in them. Replace
  // with NULL for compatibility with our data.
  $data = array_map('_replace_question_with_null', $data);

  return $data;
}

/**
 * Reads in the Node.js transliteration data.
 *
 * The data is expected to be in files xNN.yml in directory unidecoder_data
 * under the directory where this file resides. It can be downloaded from
 * https://github.com/bitwalker/stringex/downloads. You also need the PECL
 * 'yaml' extension installed for this function to work.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 *   Banks whose file is missing or cannot be parsed map to an empty array.
 */
function read_nodejs_data() {
  $dir = __DIR__ . '/unidecoder_data';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
    if (is_file($file)) {
      $parsed = yaml_parse_file($file);
      // yaml_parse_file() returns FALSE on failure; only map real arrays so
      // a broken file does not blow up array_map().
      if (is_array($parsed)) {
        // For unknown characters, these files have '[?]' in them. Replace
        // with NULL for compatibility with our data.
        $base = array_map('_replace_question_with_null', $parsed);
      }
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Loads the PECL 'intl' Transliterator class's transliteration data.
 *
 * You need to have the PECL 'intl' package installed for this to work.
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 *   Characters that are skipped, or that do not come out as pure ASCII, map
 *   to NULL.
 *
 * @throws RuntimeException
 *   If the Transliterator object cannot be created.
 */
function read_intl_data() {
  // In order to transliterate, you first have to create a transliterator
  // object. This needs a list of transliteration operations. You can get a
  // list of available operations with:
  //   print_r(Transliterator::listIDs()); exit();
  // And a few of these are documented on
  // http://userguide.icu-project.org/transforms/general and
  // http://www.unicode.org/reports/tr15/ (for normalizations).
  // There are also maps to the Unicode characters at:
  //   http://www.unicode.org/roadmaps/bmp/
  //   http://www.unicode.org/charts/nameslist/
  $ops = '';

  // The first step in any transform: separate out accents and remove them.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  // Then you need to do a bunch of language-specific or script-specific
  // transliterations. Here is hopefully a representative set. There are
  // quite a few scripts that don't appear to have rules currently, such
  // as Ethiopian.
  $ops .= 'Greek-Latin; ';
  $ops .= 'Cyrillic-Latin; ';
  $ops .= 'Armenian-Latin; ';
  $ops .= 'Hebrew-Latin; ';
  $ops .= 'Arabic-Latin; ';
  $ops .= 'Syriac-Latin; ';
  $ops .= 'Thaana-Latin; ';
  $ops .= 'Devanagari-Latin; ';
  $ops .= 'Bengali-Latin; ';
  $ops .= 'Gurmukhi-Latin; ';
  $ops .= 'Gujarati-Latin; ';
  $ops .= 'Oriya-Latin; ';
  $ops .= 'Tamil-Latin; ';
  $ops .= 'Telugu-Latin; ';
  $ops .= 'Kannada-Latin; ';
  $ops .= 'Malayalam-Latin; ';
  $ops .= 'Thai-Latin; ';
  $ops .= 'Georgian-Latin; ';
  $ops .= 'Hangul-Latin; ';
  $ops .= 'Mongolian-Latin/BGN; ';
  $ops .= 'Jamo-Latin; ';
  $ops .= 'Katakana-Latin; ';
  $ops .= 'Any-Latin; ';

  // Finally, after transforming to Latin, transform to ASCII.
  $ops .= 'Latin-ASCII; ';

  // Remove any remaining accents and recompose.
  $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';

  $trans = Transliterator::create($ops);
  if (!$trans) {
    // create() returns NULL on failure; fail with a readable message instead
    // of a method call on NULL further down.
    throw new RuntimeException('Unable to create a Transliterator for: ' . $ops);
  }
  $out = array();

  // Transliterate all possible characters.
  for ($bank = 0; $bank < 256; $bank++) {
    // Skip the UTF-16 surrogate banks (0xd8-0xdf) and the "private use"
    // banks (0xe0-0xf8) completely. Bank 0xd8 is the first surrogate bank,
    // so the lower bound has to be exclusive.
    $OK = ($bank < 0xd8 || $bank > 0xf8);

    $data = array();
    for ($chr = 0; $chr < 256; $chr++) {
      $result = $OK ? $trans->transliterate(mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE')) : '';

      // See if we have managed to transliterate this to ASCII or not. Any
      // byte above 0x7f means a non-ASCII character is left; a non-string
      // result means transliteration itself failed. Either way, store NULL
      // instead of this character.
      $is_ascii = $OK && is_string($result) && !preg_match('/[\x80-\xff]/', $result);
      $data[$chr] = $is_ascii ? $result : NULL;
    }
    $out[$bank] = $data;
  }

  return $out;
}

/**
 * Reads in the JUnidecode data set.
 *
 * The data is expected to be in files XNN.java in directory 'junidecode' under
 * this file's directory. It can be downloaded from
 * http://www.ippatsuman.com/projects/junidecode/index.html
 *
 * @return array
 *   Nested array of transliteration data. Outer keys are the first two
 *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 *   level is the other two bytes, and the values are the transliterations.
 *   Banks without a data file map to an empty array.
 */
function read_junidecode_data() {
  $dir = __DIR__ . '/junidecode';
  $out = array();

  // Read data files.
  for ($bank = 0; $bank < 256; $bank++) {
    $base = array();
    $file = $dir . '/X' . sprintf('%02x', $bank) . '.java';
    if (is_file($file)) {
      $base = _junidecode_read_file($file);
    }
    $out[$bank] = $base;
  }

  return $out;
}

/**
 * Reads in the data in a single file from the JUnidecode project.
 *
 * The Java source is reduced to its list of string/character entries, which
 * is then evaluated as a PHP array literal. If evaluation does not produce an
 * array, the generated code is printed and the script exits.
 *
 * @param string $file
 *   File to read from.
 *
 * @return array
 *   Data read from the file.
 *
 * @see read_junidecode_data()
 */
function _junidecode_read_file($file) {
  $contents = file($file);
  $save = '';
  foreach ($contents as $line) {
    // Discard lines starting with * or / or package or class or public or },
    // to get rid of comments and Java code.
    if (preg_match('|^\s*[\*/\}]|', $line)) {
      continue;
    }
    // The alternation is grouped so that the start-of-line anchor applies to
    // all three keywords; otherwise any data line merely containing "public"
    // or "class" would be thrown away.
    if (preg_match('/^\s*(?:package|public|class)/', $line)) {
      continue;
    }

    // Some of the lines look like this:
    //   new String("" + (char) 0x00), // 0x00
    // Transform to be '0x00,'. The capture accepts hex digits so that values
    // such as 0x0a are not truncated to 0x0.
    $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9a-fA-F]+).*$|', '0x$1,', $line);

    // Strings are in double quotes, yet many have \' in them.
    $line = str_replace("\'", "'", $line);

    // Everything else should probably be OK -- the lines are like:
    //   "Ie", // 0x00
    $save .= $line;
  }

  // Evaluate as an array.
  $save = 'return array(' . $save . ');';

  $data = @eval($save);
  if (isset($data) && is_array($data)) {
    $data = array_map('_replace_hex_with_character', $data);
    $data = array_map('_replace_question_with_null', $data);
  }
  else {
    // There was a problem, so throw an error and exit. The second eval() is
    // deliberately unsuppressed so that PHP reports what went wrong.
    print "Problem in evaluating $file\n";
    print $save;
    eval($save);
    exit();
  }

  return $data;
}

/**
 * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
 *
 * The comparison is strict so that non-string values (integers, booleans)
 * are never mistaken for the '[?]' placeholder.
 */
function _replace_question_with_null($data) {
  return ($data === '[?]' || $data === '[?] ') ? NULL : $data;
}

/**
 * Callback for array_map(): Replaces '\xNN' with the actual character.
 *
 * Only strings that begin with a literal backslash-x are touched; every
 * '\xNN' escape in such a string is decoded to the corresponding byte. Other
 * values, including NULL and integers, are returned unchanged.
 */
function _replace_hex_with_character($item) {
  if (is_string($item) && strpos($item, '\x') === 0) {
    $item = preg_replace_callback('/\\\\x([0-9A-Fa-f]{1,2})/', function ($match) {
      return chr(hexdec($match[1]));
    }, $item);
  }
  return $item;
}

/**
 * Writes a data file out in the standard Drupal Core data format.
 *
 * @param array $data
 *   Array of data to write out.
 * @param string $bank
 *   Bank of characters it belongs to.
 * @param string $outdir
 *   Output directory, relative to the directory containing this file.
 */
function write_data_file($data, $bank, $outdir) {
  $dir = __DIR__ . '/' . $outdir;
  $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';

  $out = '';
  // NOTE(review): the statement below is corrupted in this copy of the file.
  // The text between the opening double quote and the closing single quote
  // (apparently the generated file's header and the header of the loop that
  // defines $line) is missing. The remaining tokens are kept verbatim;
  // restore the lost text from the upstream file rather than guessing at it.
  $out .= "';
    $elems = array_values(array_slice($data, $line, 16));
    for ($i = 0; $i < 16; $i++ ) {
      if (isset($elems[$i])) {
        $out .= " '" . addcslashes($elems[$i], "'\\") . "',";
      }
      else {
        $out .= ' NULL,';
      }
    }
    $out .= "\n";
  }

  $out .= ");\n";

  file_put_contents($file, $out);
}