annotate core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents af1871eacc83
children
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Tests\Component\Transliteration;
Chris@0 4
Chris@0 5 use Drupal\Component\Transliteration\PhpTransliteration;
Chris@0 6 use Drupal\Component\Utility\Random;
Chris@0 7 use org\bovigo\vfs\vfsStream;
Chris@0 8 use PHPUnit\Framework\TestCase;
Chris@0 9
/**
 * Tests Transliteration component functionality.
 *
 * @group Transliteration
 *
 * @coversDefaultClass \Drupal\Component\Transliteration\PhpTransliteration
 */
class PhpTransliterationTest extends TestCase {

  /**
   * Tests the PhpTransliteration::removeDiacritics() function.
   *
   * @param string $original
   *   The original string, containing the characters to strip diacritics
   *   from.
   * @param string $expected
   *   The expected return from PhpTransliteration::removeDiacritics().
   *
   * @dataProvider providerTestPhpTransliterationRemoveDiacritics
   */
  public function testRemoveDiacritics($original, $expected) {
    $transliterator_class = new PhpTransliteration();
    $result = $transliterator_class->removeDiacritics($original);
    // Use a strict comparison, like the other tests in this class, so that a
    // non-string return value can never be loosely equal to the expectation.
    $this->assertSame($expected, $result);
  }

  /**
   * Provides data for self::testRemoveDiacritics().
   *
   * Every full row holds sixteen consecutive code points and the string
   * expected back for them. Characters that are expected to come back
   * unchanged (for example 'Æ', 'Ð', '×' or 'Ĳ') appear identically on both
   * sides of the row.
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testRemoveDiacritics().
   */
  public function providerTestPhpTransliterationRemoveDiacritics() {
    return [
      // Test all characters in the Unicode range 0x00C0 to 0x017E.
      ['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
      ['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
      ['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
      ['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
      ['ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
      ['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
      ['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
      ['İıĲĳĴĵĶķĸĹĺĻļĽľĿ', 'IiĲĳJjKkĸLlLlLlL'],
      ['ŀŁłŃńŅņŇňŉŊŋŌōŎŏ', 'lLlNnNnNnŉŊŋOoOo'],
      ['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
      ['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
      ['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],

      // Test all characters in the Unicode range 0x01CD to 0x024F.
      ['ǍǎǏ', 'AaI'],
      ['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
      ['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
      ['ǰǱǲǳǴǵǶǷǸǹǺǻǼǽǾǿ', 'jǱǲǳGgǶǷNnAaǼǽOo'],
      ['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
      ['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
      ['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
      ['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
      ['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
    ];
  }

  /**
   * Tests the PhpTransliteration class.
   *
   * @param string $langcode
   *   The language code to test.
   * @param string $original
   *   The original string.
   * @param string $expected
   *   The expected return from PhpTransliteration::transliterate().
   * @param string $unknown_character
   *   (optional) The character to substitute for characters in $string without
   *   transliterated equivalents. Defaults to '?'.
   * @param int $max_length
   *   (optional) If provided, return at most this many characters, ensuring
   *   that the transliteration does not split in the middle of an input
   *   character's transliteration.
   *
   * @dataProvider providerTestPhpTransliteration
   */
  public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
    $transliterator_class = new PhpTransliteration();
    $actual = $transliterator_class->transliterate($original, $langcode, $unknown_character, $max_length);
    $this->assertSame($expected, $actual);
  }

  /**
   * Provides data for self::testPhpTransliteration().
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testPhpTransliteration().
   */
  public function providerTestPhpTransliteration() {
    $random_generator = new Random();
    $random = $random_generator->string(10);
    // Make some strings with characters from progressively higher Unicode
    // ranges for testing.
    // NOTE(review): the *_byte variable names do not match the UTF-8 encoded
    // lengths (U+0446 encodes to two bytes, U+1411 to three and the Gothic
    // letters to four); they are kept as they are to keep this provider stable.
    // Note that the $three_byte character is overridden by the 'kg' language.
    $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
    // This is a Cyrillic character that looks something like a "u". See
    // http://www.unicode.org/charts/PDF/U0400.pdf
    $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8');
    // This is a Canadian Aboriginal character like a triangle. See
    // http://www.unicode.org/charts/PDF/U1400.pdf
    $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8');
    // These are two Gothic alphabet letters. See
    // http://wikipedia.org/wiki/Gothic_alphabet
    // They are not in our tables, but should at least give us '?' (unknown).
    $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');

    return [
      // Each test case is language code, input, output, unknown character, max
      // length.
      // Test ASCII in English.
      ['en', $random, $random],
      // Test ASCII in some other language with no overrides.
      ['fr', $random, $random],
      // Test the Cyrillic and Canadian Aboriginal characters in a language
      // without overrides.
      // Note: if the data tables change, these will need to change too! They
      // are set up to test that data table loading works, so values come
      // directly from the data files.
      ['fr', $three_byte, 'c'],
      ['fr', $four_byte, 'wii'],
      // Test the Gothic letters, which have no transliteration.
      ['en', $five_byte, '??'],
      // Test a language with no overrides.
      ['en', $two_byte, 'A O U A O aouaohello'],
      // Test language overrides provided by core.
      ['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
      ['de', $random, $random],
      ['da', $two_byte, 'A O U Aa Oe aouaaoehello'],
      ['da', $random, $random],
      ['kg', $three_byte, 'ts'],
      // Test strings in some other languages.
      // Turkish, provided by drupal.org user Kartagis.
      ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
      // Max length.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17],
      // Do not split up the transliteration of a single character: the next
      // character would become "ue", which does not fit in one more character.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18],
      // Illegal/unknown unicode.
      ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
      ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
      ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
      ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
      // Non default replacement.
      ['en', chr(0x80) . 'ello World', '_ello World', '_'],
      // Keep the original question marks.
      ['en', chr(0xF8) . '?' . chr(0x80), '???'],
      ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
      ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
      // Non-US-ASCII replacement.
      ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
      ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
      // Ensure question marks are replaced when max length used.
      ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
      // Empty replacement.
      ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''],
      // Not affecting spacing from the beginning and end of a string.
      // NOTE(review): the whitespace-focused cases below presumably contained
      // runs of several spaces that were collapsed in this copy of the file;
      // confirm against the canonical source before relying on them.
      ['en', ' Hello Abventor! ', ' Hello Abventor! '],
      ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15],
      // Keep many spaces between words.
      ['en', 'Too many spaces between words !', 'Too many spaces between words !'],
    ];
  }

  /**
   * Tests inclusion is safe.
   *
   * @covers ::readLanguageOverrides
   */
  public function testSafeInclude() {
    // The overrides in the transliteration data directory transliterates 0x82
    // into "safe" but the overrides one directory higher transliterates the
    // same character into "security hole". So by using "../index" as the
    // language code we can test the ../ is stripped from the langcode.
    vfsStream::setup('transliteration', NULL, [
      'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
      'dir' => [
        'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
      ],
    ]);
    $transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
    // chr(0xC2) . chr(0x82) is the UTF-8 encoding of code point 0x82.
    $transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
    $this->assertSame('safe', $transliterated);
  }

}