Mercurial > hg > isophonics-drupal-site
diff core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 129ea1e6d783 |
line wrap: on
line diff
<?php

namespace Drupal\Component\Transliteration;

/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the Unicode character code shifted
   * right by 8 bits), and whose values are arrays of transliterations keyed by
   * the low 8 bits of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // Two Unicode ranges include the accented US-ASCII letters, with a few
    // characters that aren't accented letters mixed in. The excluded code
    // points are constant, so define them once rather than per character.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      // preg_split() fails on input that is not valid UTF-8. There is nothing
      // that can be iterated, so return an empty result explicitly instead of
      // handing FALSE to foreach (which only adds a PHP warning).
      return '';
    }

    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a multi-character sentinel: when no mapping exists it
        // fails the single-character test below and the original character
        // is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    // Split into Unicode characters and transliterate each one.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      // preg_split() fails on input that is not valid UTF-8. There is nothing
      // that can be iterated, so return an empty result explicitly instead of
      // handing FALSE to foreach (which only adds a PHP warning).
      return '';
    }

    $result = '';
    $length = 0;
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found.
   */
  protected static function ordUTF8($character) {
    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code, as returned by
   *   PhpTransliteration::ordUTF8().
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The bank is everything above the low 8 bits of the character code; each
    // bank's table is loaded on first use.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // The low 8 bits index into the bank's table.
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not define an array under
   * $overrides[$langcode], an empty override table is stored for the language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // The file name is built from the sanitized code, but the table is keyed
    // by the code as passed in, so the expected key may be absent even when
    // the file exists. Always store an array: replace() uses isset() on this
    // entry to decide whether to read the file again, and a NULL entry would
    // trigger a re-read for every character.
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = [];
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation) in PhpTransliteration::$dataDirectory. These files
   * should set up a variable $base containing an array whose numerical indices
   * are the low 8 bits of the character code, and whose values are the
   * transliterations of these characters into US-ASCII. If the file is missing
   * or does not define an array $base, an empty table is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}