isophonics-drupal-site: core/lib/Drupal/Component/Transliteration/PhpTransliteration.php comparison

comparison core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c

Initial import (non-working)

author	Chris Cannam
date	Wed, 29 Nov 2017 16:09:58 +0000
parents
children	129ea1e6d783

comparison

equal deleted inserted replaced

--1:000000000000
+:4c8ae668cc8c
+<?php
+namespace Drupal\Component\Transliteration;
+/**
+* Implements transliteration without using the PECL extensions.
+*
+* Transliterations are done character-by-character, by looking up non-US-ASCII
+* characters in a transliteration database.
+*
+* The database comes from two types of files, both of which are searched for in
+* the PhpTransliteration::$dataDirectory directory. First, language-specific
+* overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
+* there is no language-specific override for a character, the generic
+* transliteration character tables are searched (see
+* PhpTransliteration::readGenericData()). If looking up the character in the
+* generic table results in a NULL value, or an illegal character is
+* encountered, then a substitute character is returned.
+*
+* Some parts of this code were derived from the MediaWiki project's UtfNormal
+* class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
+* http://www.mediawiki.org/
+*/
+class PhpTransliteration implements TransliterationInterface {
+
+  /**
+   * Directory where data for transliteration resides.
+   *
+   * The constructor sets this (by default) to subdirectory 'data' underneath
+   * the directory where the class's PHP file resides.
+   *
+   * @var string
+   */
+  protected $dataDirectory;
+
+  /**
+   * Associative array of language-specific character transliteration tables.
+   *
+   * The outermost array keys are language codes. For each language code key,
+   * the value is an array whose keys are Unicode character codes, and whose
+   * values are the transliterations of those characters to US-ASCII. This is
+   * set up as needed in PhpTransliteration::replace() by calling
+   * PhpTransliteration::readLanguageOverrides().
+   *
+   * @var array
+   */
+  protected $languageOverrides = [];
+
+  /**
+   * Non-language-specific transliteration tables.
+   *
+   * Array whose keys are "bank" numbers (the character code shifted right by
+   * eight bits), and whose values are arrays of transliterations keyed by the
+   * low eight bits of the character code. This is set up as needed in
+   * PhpTransliteration::lookupReplacement() by calling
+   * PhpTransliteration::readGenericData().
+   *
+   * @var array
+   */
+  protected $genericMap = [];
+
+  /**
+   * Constructs a transliteration object.
+   *
+   * @param string $data_directory
+   *   (optional) The directory where data files reside. If omitted, defaults
+   *   to subdirectory 'data' underneath the directory where the class's PHP
+   *   file resides.
+   */
+  public function __construct($data_directory = NULL) {
+    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
+  }
+
+  /**
+   * {@inheritdoc}
+   */
+  public function removeDiacritics($string) {
+    // These two Unicode ranges include the accented US-ASCII letters, with a
+    // few characters that aren't accented letters mixed in. The exclusion
+    // lists are constant, so build them once instead of once per character.
+    $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
+    $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];
+
+    $result = '';
+    foreach (self::splitCharacters($string) as $character) {
+      // Malformed sequences yield -1, which lies outside both ranges, so they
+      // are copied to the result untouched.
+      $code = self::ordUTF8($character);
+      $range1 = $code > 0x00bf && $code < 0x017f;
+      $range2 = $code > 0x01cc && $code < 0x0250;
+
+      $replacement = $character;
+      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
+        // 'xyz' is a three-byte sentinel: a missing mapping can then never be
+        // mistaken for a one-character replacement by the check below.
+        $to_add = $this->lookupReplacement($code, 'xyz');
+        if (strlen($to_add) === 1) {
+          $replacement = $to_add;
+        }
+      }
+      $result .= $replacement;
+    }
+    return $result;
+  }
+
+  /**
+   * {@inheritdoc}
+   */
+  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
+    $result = '';
+    $length = 0;
+
+    // Split into Unicode characters and transliterate each one. Malformed
+    // byte sequences come back as separate chunks and are substituted.
+    foreach (self::splitCharacters($string) as $character) {
+      $code = self::ordUTF8($character);
+      if ($code === -1) {
+        $to_add = $unknown_character;
+      }
+      else {
+        $to_add = $this->replace($code, $langcode, $unknown_character);
+      }
+
+      // Check if this exceeds the maximum allowed length.
+      if (isset($max_length)) {
+        $length += strlen($to_add);
+        if ($length > $max_length) {
+          // There is no more space.
+          return $result;
+        }
+      }
+
+      $result .= $to_add;
+    }
+    return $result;
+  }
+
+  /**
+   * Splits a string into UTF-8 characters, tolerating malformed input.
+   *
+   * preg_split() with the 'u' modifier returns FALSE when the subject is not
+   * valid UTF-8. In that case the string is scanned byte-wise instead: a lead
+   * byte is grouped with its continuation bytes when all of them are present,
+   * and every other byte becomes a chunk of its own. Such chunks may still be
+   * illegal; PhpTransliteration::ordUTF8() reports -1 for those.
+   *
+   * @param string $string
+   *   The string to split.
+   *
+   * @return string[]
+   *   The characters (or malformed byte chunks) of $string, in order.
+   */
+  private static function splitCharacters($string) {
+    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
+    if ($characters !== FALSE) {
+      return $characters;
+    }
+
+    // No 'u' modifier here, so the pattern works on raw bytes and cannot fail
+    // on bad UTF-8. The 's' modifier lets the final '.' also match "\n".
+    $matches = [];
+    preg_match_all('/[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|./s', $string, $matches);
+    return isset($matches[0]) ? $matches[0] : [];
+  }
+
+  /**
+   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
+   *
+   * Only the bytes belonging to the first character of $character are read.
+   *
+   * @param string $character
+   *   A single UTF-8 character.
+   *
+   * @return int
+   *   The character code, or -1 if an illegal character is found. Illegal
+   *   means: empty input, an invalid lead byte, a truncated sequence, a
+   *   missing continuation byte, an overlong encoding, a surrogate code
+   *   point, or a value above U+10FFFF.
+   */
+  protected static function ordUTF8($character) {
+    $available = strlen($character);
+    if ($available === 0) {
+      return -1;
+    }
+
+    $first_byte = ord($character[0]);
+    if (($first_byte & 0x80) === 0) {
+      // Single-byte form: 0xxxxxxx.
+      return $first_byte;
+    }
+
+    if (($first_byte & 0xe0) === 0xc0) {
+      // Two-byte form: 110xxxxx 10xxxxxx.
+      $expected = 2;
+      $code = $first_byte & 0x1f;
+      $minimum = 0x80;
+    }
+    elseif (($first_byte & 0xf0) === 0xe0) {
+      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
+      $expected = 3;
+      $code = $first_byte & 0x0f;
+      $minimum = 0x800;
+    }
+    elseif (($first_byte & 0xf8) === 0xf0) {
+      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+      $expected = 4;
+      $code = $first_byte & 0x07;
+      $minimum = 0x10000;
+    }
+    else {
+      // Other forms are not legal.
+      return -1;
+    }
+
+    if ($available < $expected) {
+      // Truncated sequence.
+      return -1;
+    }
+
+    for ($i = 1; $i < $expected; $i++) {
+      $byte = ord($character[$i]);
+      if (($byte & 0xc0) !== 0x80) {
+        // Not a continuation byte (10xxxxxx).
+        return -1;
+      }
+      $code = ($code << 6) | ($byte & 0x3f);
+    }
+
+    // Reject overlong encodings, UTF-16 surrogates and out-of-range values.
+    if ($code < $minimum || $code > 0x10ffff || ($code >= 0xd800 && $code <= 0xdfff)) {
+      return -1;
+    }
+    return $code;
+  }
+
+  /**
+   * Replaces a single Unicode character using the transliteration database.
+   *
+   * @param int $code
+   *   The character code of a Unicode character.
+   * @param string $langcode
+   *   The language code of the language the character is in.
+   * @param string $unknown_character
+   *   The character to substitute for characters without transliterated
+   *   equivalents.
+   *
+   * @return string
+   *   US-ASCII replacement character. If it has a mapping, it is returned;
+   *   otherwise, $unknown_character is returned. The replacement can contain
+   *   multiple characters.
+   */
+  protected function replace($code, $langcode, $unknown_character) {
+    if ($code < 0x80) {
+      // Already lower ASCII.
+      return chr($code);
+    }
+
+    // See if there is a language-specific override for this character. The
+    // table is loaded on first use and cached for later characters.
+    if (!isset($this->languageOverrides[$langcode])) {
+      $this->readLanguageOverrides($langcode);
+    }
+    if (isset($this->languageOverrides[$langcode][$code])) {
+      return $this->languageOverrides[$langcode][$code];
+    }
+
+    return $this->lookupReplacement($code, $unknown_character);
+  }
+
+  /**
+   * Look up the generic replacement for a Unicode character code.
+   *
+   * @param int $code
+   *   The Unicode character code.
+   * @param string $unknown_character
+   *   (optional) The character to substitute for characters without entries in
+   *   the replacement tables.
+   *
+   * @return string
+   *   US-ASCII replacement characters. If it has a mapping, it is returned;
+   *   otherwise, $unknown_character is returned. The replacement can contain
+   *   multiple characters.
+   */
+  protected function lookupReplacement($code, $unknown_character = '?') {
+    // The generic tables are stored in banks of 256 characters: the bank is
+    // the code without its low byte, the index is the low byte.
+    $bank = $code >> 8;
+    if (!isset($this->genericMap[$bank])) {
+      $this->readGenericData($bank);
+    }
+    $index = $code & 0xff;
+    return isset($this->genericMap[$bank][$index]) ? $this->genericMap[$bank][$index] : $unknown_character;
+  }
+
+  /**
+   * Reads in language overrides for a language code.
+   *
+   * The data is read from files named "$langcode.php" in
+   * PhpTransliteration::$dataDirectory. These files should set up an array
+   * variable $overrides with an element whose key is $langcode and whose value
+   * is an array whose keys are character codes, and whose values are their
+   * transliterations in this language. The character codes can be for any valid
+   * Unicode character, independent of the number of bytes.
+   *
+   * If the file is missing, or does not provide an array under the $langcode
+   * key, an empty table is stored so the file is not read again.
+   *
+   * @param string $langcode
+   *   Code for the language to read.
+   */
+  protected function readLanguageOverrides($langcode) {
+    // Figure out the file name to use by sanitizing the language code,
+    // just in case.
+    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';
+
+    // Read in this file, which should set up a variable called $overrides,
+    // which will be local to this function.
+    if (is_file($file)) {
+      include $file;
+    }
+
+    // Always store an array: replace() uses isset() on this entry to decide
+    // whether the file still has to be loaded, and isset() is FALSE for NULL.
+    $has_table = isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode]);
+    $this->languageOverrides[$langcode] = $has_table ? $overrides[$langcode] : [];
+  }
+
+  /**
+   * Reads in generic transliteration data for a bank of characters.
+   *
+   * The data is read in from a file named "x$bank.php" (with $bank in
+   * hexadecimal notation, at least two digits) in
+   * PhpTransliteration::$dataDirectory. These files should set up a variable
+   * $base containing an array whose numerical indices are the low eight bits
+   * of the character code, and whose values are the transliterations of these
+   * characters into US-ASCII. If the file is missing or does not set up such
+   * an array, an empty table is stored for the bank.
+   *
+   * @param int $bank
+   *   The character code shifted right by eight bits, or 0 for the range
+   *   that contains ASCII.
+   */
+  protected function readGenericData($bank) {
+    // Figure out the file name.
+    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
+
+    // Read in this file, which should set up a variable called $base, which
+    // will be local to this function.
+    if (is_file($file)) {
+      include $file;
+    }
+    if (!isset($base) || !is_array($base)) {
+      $base = [];
+    }
+
+    // Save this data.
+    $this->genericMap[$bank] = $base;
+  }
+
+}

Mercurial > hg > isophonics-drupal-site

comparison core/lib/Drupal/Component/Transliteration/PhpTransliteration.php @ 0:4c8ae668cc8c