isophonics-drupal-site: core/lib/Drupal/Component/Transliteration/PhpTransliteration.php annotate

annotate core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c

Initial import (non-working)

author	Chris Cannam
date	Wed, 29 Nov 2017 16:09:58 +0000
parents
children	129ea1e6d783

rev	line source
Chris@0	1 <?php
Chris@0	2
Chris@0	3 namespace Drupal\Component\Transliteration;
Chris@0	4
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If looking up the character in the
 * generic table results in a NULL value, or an illegal character is
 * encountered, then a substitute character is returned.
 *
 * Input that is not valid UTF-8 is still processed: well-formed characters are
 * handled normally and each stray byte is treated as an illegal character (see
 * PhpTransliteration::splitCharacters()).
 *
 * Some parts of this code were derived from the MediaWiki project's UtfNormal
 * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
 * http://www.mediawiki.org/
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = [];

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "banks": the Unicode character code shifted right by
   * eight bits. Each value is an array of transliterations keyed by the lowest
   * eight bits of the character code. Banks are loaded as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = [];

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    // These two Unicode ranges include the accented US-ASCII letters, with a
    // few characters that aren't accented letters mixed in. The excluded
    // characters are constant, so define them once outside the loop.
    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];

    $result = '';

    foreach (self::splitCharacters($string) as $character) {
      // Illegal bytes yield -1, which lies outside both ranges below, so they
      // are copied to the result unchanged.
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a deliberately multi-character fallback: when no mapping
        // exists the length check below fails and the character is kept.
        $to_add = $this->lookupReplacement($code, 'xyz');
        // Only single-character replacements are plain diacritic removals.
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;
    // Split into Unicode characters and transliterate each one.
    foreach (self::splitCharacters($string) as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        // Illegal byte or truncated sequence.
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space; a replacement is never added partially.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Splits a string into its individual characters.
   *
   * preg_split() with the 'u' modifier returns FALSE for input that is not
   * valid UTF-8. In that case the string is split byte-wise instead: every
   * well-formed UTF-8 sequence becomes one element, and every byte that is not
   * part of a well-formed sequence becomes an element of its own, for which
   * PhpTransliteration::ordUTF8() returns -1.
   *
   * @param string $string
   *   The string to split.
   *
   * @return string[]
   *   The characters of the string, in order. Empty for an empty string.
   */
  protected static function splitCharacters($string) {
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters !== FALSE) {
      return $characters;
    }

    // Byte-oriented pattern (no 'u' modifier): the alternatives list the
    // well-formed UTF-8 sequences; the final '.' consumes a single stray byte.
    $pattern = '/[\x00-\x7f]'
      . '|[\xc2-\xdf][\x80-\xbf]'
      . '|\xe0[\xa0-\xbf][\x80-\xbf]'
      . '|[\xe1-\xec\xee\xef][\x80-\xbf]{2}'
      . '|\xed[\x80-\x9f][\x80-\xbf]'
      . '|\xf0[\x90-\xbf][\x80-\xbf]{2}'
      . '|[\xf1-\xf3][\x80-\xbf]{3}'
      . '|\xf4[\x80-\x8f][\x80-\xbf]{2}'
      . '|./s';
    preg_match_all($pattern, $string, $matches);

    return isset($matches[0]) ? $matches[0] : [];
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, an invalid lead byte, and a sequence shorter than its lead byte
   *   announces are all treated as illegal.
   */
  protected static function ordUTF8($character) {
    $length = strlen($character);
    if ($length === 0) {
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($length < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($length < 3) {
        return -1;
      }
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($length < 4) {
        return -1;
      }
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0) {
      // Not a character code (for example the -1 illegal-character marker);
      // chr() would turn it into a non-ASCII byte.
      return $unknown_character;
    }
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Look up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters; load the bank
    // for this character the first time it is needed.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    // Within a bank, entries are keyed by the lowest eight bits of the code.
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The character codes can be for any valid
   * Unicode character, independent of the number of bytes.
   *
   * If the file is missing, or does not provide an array for $langcode, an
   * empty override table is stored for the language.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // Always store an array. A NULL entry would fail the isset() check in
    // replace() and cause the file to be re-read for every character.
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = [];
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the lowest eight
   * bits of the character code, and whose values are the transliterations of
   * these characters into US-ASCII.
   *
   * @param int $bank
   *   The character code shifted right by eight bits, or 0 for the ASCII
   *   range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      // Missing or malformed data file: remember an empty bank so the file is
      // not looked up again.
      $base = [];
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}

Mercurial > hg > isophonics-drupal-site

annotate core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c