Chris@0
|
1 <?php
|
Chris@0
|
2
|
Chris@0
|
3 namespace Drupal\Tests\Component\Transliteration;
|
Chris@0
|
4
|
Chris@0
|
5 use Drupal\Component\Transliteration\PhpTransliteration;
|
Chris@0
|
6 use Drupal\Component\Utility\Random;
|
Chris@0
|
7 use org\bovigo\vfs\vfsStream;
|
Chris@0
|
8 use PHPUnit\Framework\TestCase;
|
Chris@0
|
9
|
Chris@0
|
/**
 * Tests Transliteration component functionality.
 *
 * Covers diacritic removal, full transliteration (including language
 * overrides, maximum length and unknown-character substitution) and the
 * sanitisation of the language code used to load override files.
 *
 * @group Transliteration
 *
 * @coversDefaultClass \Drupal\Component\Transliteration\PhpTransliteration
 */
class PhpTransliterationTest extends TestCase {

  /**
   * Tests the PhpTransliteration::removeDiacritics() function.
   *
   * @param string $original
   *   The string to remove diacritics from.
   * @param string $expected
   *   The expected return from PhpTransliteration::removeDiacritics().
   *
   * @dataProvider providerTestPhpTransliterationRemoveDiacritics
   */
  public function testRemoveDiacritics($original, $expected) {
    $transliterator = new PhpTransliteration();
    // Use a strict comparison, matching self::testPhpTransliteration(), so
    // that a non-string or loosely-equal return value cannot pass.
    $this->assertSame($expected, $transliterator->removeDiacritics($original));
  }

  /**
   * Provides data for self::testRemoveDiacritics().
   *
   * Each row pairs an input string with the string expected once diacritics
   * have been removed. Rows are runs of consecutive code points.
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testRemoveDiacritics().
   */
  public function providerTestPhpTransliterationRemoveDiacritics() {
    return [
      // Test all characters in the Unicode range 0x00bf to 0x017f.
      ['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
      // U+00D0 to U+00DF; U+00D7 (multiplication sign) is expected unchanged.
      ['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
      ['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
      ['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
      // U+0100 to U+010F.
      ['ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
      ['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
      ['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
      // NOTE(review): "IJij" here, "ʼn" in the next row and "DZDzdz" further
      // down look like compatibility expansions of single code points
      // (U+0132/U+0133, U+0149, U+01F1 to U+01F3). Input and expectation
      // agree, so the rows are kept as found; confirm against the canonical
      // fixture.
      ['İıIJijĴĵĶķĸĹĺĻļĽľĿ', 'IiIJijJjKkĸLlLlLlL'],
      ['ŀŁłŃńŅņŇňʼnŊŋŌōŎŏ', 'lLlNnNnNnʼnŊŋOoOo'],
      ['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
      ['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
      ['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],

      // Test all characters in the Unicode range 0x01CD to 0x024F.
      ['ǍǎǏ', 'AaI'],
      ['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
      ['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
      ['ǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿ', 'jDZDzdzGgǶǷNnAaǼǽOo'],
      ['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
      ['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
      ['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
      ['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
      ['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
    ];
  }

  /**
   * Tests the PhpTransliteration class.
   *
   * @param string $langcode
   *   The language code to test.
   * @param string $original
   *   The original string.
   * @param string $expected
   *   The expected return from PhpTransliteration::transliterate().
   * @param string $unknown_character
   *   (optional) The character to substitute for characters in $original
   *   without transliterated equivalents. Defaults to '?'.
   * @param int $max_length
   *   (optional) If provided, return at most this many characters, ensuring
   *   that the transliteration does not split in the middle of an input
   *   character's transliteration.
   *
   * @dataProvider providerTestPhpTransliteration
   */
  public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
    $transliterator = new PhpTransliteration();
    $actual = $transliterator->transliterate($original, $langcode, $unknown_character, $max_length);
    $this->assertSame($expected, $actual);
  }

  /**
   * Provides data for self::testPhpTransliteration().
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testPhpTransliteration(): language code, input, expected output
   *   and, optionally, the unknown-character replacement and maximum length.
   */
  public function providerTestPhpTransliteration() {
    $random_generator = new Random();
    $random = $random_generator->string(10);
    // Make some strings with characters of increasing encoded length for
    // testing. The variable names are historical labels: in UTF-8 the
    // Cyrillic letter below (U+0446) takes two bytes, the Canadian Aboriginal
    // syllabic (U+1411) three, and each Gothic letter (U+10330, U+10338) four.
    // Note that the 3-byte character is overridden by the 'kg' language.
    $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
    // NOTE(review): the three literals below contain no HTML entities, so
    // html_entity_decode() returns them unchanged. They may have been written
    // as numeric entities originally; confirm against the canonical fixture.
    // This is a Cyrillic character that looks something like a "u". See
    // http://www.unicode.org/charts/PDF/U0400.pdf
    $three_byte = html_entity_decode('ц', ENT_NOQUOTES, 'UTF-8');
    // This is a Canadian Aboriginal character like a triangle. See
    // http://www.unicode.org/charts/PDF/U1400.pdf
    $four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
    // These are two Gothic alphabet letters. See
    // http://wikipedia.org/wiki/Gothic_alphabet
    // They are not in our tables, but should at least give us '?' (unknown).
    $five_byte = html_entity_decode('𐌰𐌸', ENT_NOQUOTES, 'UTF-8');
    // A five-byte sequence that is not legal UTF-8: a 0xF8 lead byte followed
    // by four 0x80 bytes.
    $illegal_sequence = chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80);

    return [
      // Each test case is language code, input, output, unknown character, max
      // length.
      // Test ASCII in English.
      ['en', $random, $random],
      // Test ASCII in some other language with no overrides.
      ['fr', $random, $random],
      // Test $three_byte and $four_byte in a language without overrides.
      // Note: if the data tables change, these will need to change too! They
      // are set up to test that data table loading works, so values come
      // directly from the data files.
      ['fr', $three_byte, 'c'],
      ['fr', $four_byte, 'wii'],
      // Test characters outside the tables ($five_byte): one '?' per letter.
      ['en', $five_byte, '??'],
      // Test a language with no overrides.
      ['en', $two_byte, 'A O U A O aouaohello'],
      // Test language overrides provided by core.
      ['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
      ['de', $random, $random],
      ['da', $two_byte, 'A O U Aa Oe aouaaoehello'],
      ['da', $random, $random],
      ['kg', $three_byte, 'ts'],
      // Test strings in some other languages.
      // Turkish, provided by drupal.org user Kartagis.
      ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
      // Max length.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17],
      // Do not split up the transliteration of a single character: at 18 the
      // next two-letter replacement would not fit, so the result stays at 17
      // characters.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18],
      // Illegal/unknown unicode.
      ['en', $illegal_sequence, '?????'],
      ['en', $illegal_sequence, '-----', '-'],
      ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
      ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
      // Non default replacement.
      ['en', chr(0x80) . 'ello World', '_ello World', '_'],
      // Keep the original question marks.
      ['en', chr(0xF8) . '?' . chr(0x80), '???'],
      ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
      ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
      // Non-US-ASCII replacement.
      ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
      ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
      // Ensure question marks are replaced when max length used.
      ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
      // Empty replacement.
      ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''],
      // Not affecting spacing from the beginning and end of a string.
      ['en', ' Hello Abventor! ', ' Hello Abventor! '],
      ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15],
      // Keep many spaces between words.
      // NOTE(review): this fixture has only single spaces as seen here; runs
      // of spaces may have been collapsed before review. Confirm against the
      // canonical fixture.
      ['en', 'Too many spaces between words !', 'Too many spaces between words !'],
    ];
  }

  /**
   * Tests inclusion is safe.
   *
   * @covers ::readLanguageOverrides
   */
  public function testSafeInclude() {
    // The override file in the transliteration data directory transliterates
    // 0x82 into "safe", but the override file one directory higher
    // transliterates the same character into "security hole". So by using
    // "../index" as the language code we can test that the ../ is stripped
    // from the langcode.
    vfsStream::setup('transliteration', NULL, [
      'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
      'dir' => [
        'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
      ],
    ]);
    $transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
    // chr(0xC2) . chr(0x82) is the UTF-8 encoding of code point 0x82.
    $transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
    $this->assertSame('safe', $transliterated);
  }

}
|