/**
 * Transliterates text to US-ASCII using PHP data files.
 *
 * Characters are processed one at a time. ASCII characters pass through
 * unchanged; for any other character a language-specific override is looked
 * up first (see PhpTransliteration::readLanguageOverrides()), then the generic
 * tables (see PhpTransliteration::readGenericData()), and if neither has an
 * entry the caller-supplied substitute character is used.
 *
 * NOTE(review): the opening of the original class docblock was not visible
 * when this block was reviewed. Its surviving tail credits
 * http://www.mediawiki.org/ - restore the full attribution if it was lost.
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "banks": the Unicode character code shifted right by
   * 8 bits. Each value is an array keyed by the low 8 bits of the character
   * code, holding the transliteration of that character. This is set up as
   * needed in PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // With the /u modifier preg_split() returns FALSE when the subject is not
    // valid UTF-8. Nothing can be reliably identified as an accented letter in
    // that case, so hand the input back untouched instead of iterating over
    // FALSE (which raises a warning and discards the whole string).
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return $string;
    }

    // The two Unicode ranges tested below include the accented US-ASCII
    // letters, with a few characters that aren't accented letters mixed in.
    // These are the excluded characters for each range; they do not depend on
    // the character being examined, so they are built once.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';

    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a three-byte sentinel: a missing table entry can never
        // pass the single-byte check below, so the original character is
        // kept in that case. Multi-character transliterations are likewise
        // rejected.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    $hash = FALSE;

    // Replace question marks with a unique hash if necessary. This because
    // mb_convert_encoding() replaces all invalid characters with a question
    // mark, and genuine question marks must be told apart from those.
    if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
      $hash = hash('sha256', $string);
      $string = str_replace('?', $hash, $string);
    }

    // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
    // be replaced by a question mark.
    $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

    // Use the provided unknown character instead of a question mark.
    if ($unknown_character != '?') {
      $string = str_replace('?', $unknown_character, $string);
      // Restore original question marks if necessary.
      if ($hash !== FALSE) {
        $string = str_replace($hash, '?', $string);
      }
    }

    // Split into Unicode characters and transliterate each one.
    foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. The length is
      // counted in bytes of output, and a replacement that would overflow the
      // limit is dropped whole rather than truncated.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * Only the lead byte is inspected to decide the sequence length; the
   * continuation bytes are masked and combined without being validated.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found.
   */
  protected static function ordUTF8($character) {
    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character. The
    // override table for a language is loaded the first time it is needed.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // See if there is a generic mapping for this character. The table is split
    // into banks of 256 characters that are loaded on first use.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    $code = $code & 0xff;
    // isset() is FALSE for NULL entries too, so a NULL in the table yields
    // $unknown_character.
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not provide an array under the $langcode
   * key, an empty table is stored so the file is not consulted again.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // Sanitizing can make the file name differ from $langcode, in which case
    // the file that was read has no element for $langcode. Always store an
    // array: storing NULL would defeat the isset() guard in replace() and
    // cause the file to be included again for every character.
    $table = [];
    if (isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $table = $overrides[$langcode];
    }
    $this->languageOverrides[$langcode] = $table;
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low 8 bits of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII. If the file is missing or does not set up such an
   * array, an empty table is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}