Mercurial > hg > isophonics-drupal-site
comparison core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php @ 17:129ea1e6d783
Update, including to Drupal core 8.6.10
author | Chris Cannam |
---|---|
date | Thu, 28 Feb 2019 13:21:36 +0000 |
parents | 1fec387a4317 |
children | af1871eacc83 |
comparison
equal
deleted
inserted
replaced
16:c2387f117808 | 17:129ea1e6d783 |
---|---|
104 $random_generator = new Random(); | 104 $random_generator = new Random(); |
105 $random = $random_generator->string(10); | 105 $random = $random_generator->string(10); |
106 // Make some strings with two, three, and four-byte characters for testing. | 106 // Make some strings with two, three, and four-byte characters for testing. |
107 // Note that the 3-byte character is overridden by the 'kg' language. | 107 // Note that the 3-byte character is overridden by the 'kg' language. |
108 $two_byte = 'Ä Ö Ü Å Ø äöüåøhello'; | 108 $two_byte = 'Ä Ö Ü Å Ø äöüåøhello'; |
109 // This is a Cyrrillic character that looks something like a u. See | 109 // This is a Cyrillic character that looks something like a "u". See |
110 // http://www.unicode.org/charts/PDF/U0400.pdf | 110 // http://www.unicode.org/charts/PDF/U0400.pdf |
111 $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8'); | 111 $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8'); |
112 // This is a Canadian Aboriginal character like a triangle. See | 112 // This is a Canadian Aboriginal character like a triangle. See |
113 // http://www.unicode.org/charts/PDF/U1400.pdf | 113 // http://www.unicode.org/charts/PDF/U1400.pdf |
114 $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8'); | 114 $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8'); |
116 // http://wikipedia.org/wiki/Gothic_alphabet | 116 // http://wikipedia.org/wiki/Gothic_alphabet |
117 // They are not in our tables, but should at least give us '?' (unknown). | 117 // They are not in our tables, but should at least give us '?' (unknown). |
118 $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8'); | 118 $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8'); |
119 | 119 |
120 return [ | 120 return [ |
121 // Each test case is (language code, input, output). | 121 // Each test case is language code, input, output, unknown character, max |
122 // length. | |
122 // Test ASCII in English. | 123 // Test ASCII in English. |
123 ['en', $random, $random], | 124 ['en', $random, $random], |
124 // Test ASCII in some other language with no overrides. | 125 // Test ASCII in some other language with no overrides. |
125 ['fr', $random, $random], | 126 ['fr', $random, $random], |
126 // Test 3 and 4-byte characters in a language without overrides. | 127 // Test 3 and 4-byte characters in a language without overrides. |
140 ['dk', $random, $random], | 141 ['dk', $random, $random], |
141 ['kg', $three_byte, 'ts'], | 142 ['kg', $three_byte, 'ts'], |
142 // Test strings in some other languages. | 143 // Test strings in some other languages. |
143 // Turkish, provided by drupal.org user Kartagis. | 144 // Turkish, provided by drupal.org user Kartagis. |
144 ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'], | 145 ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'], |
146 // Max length. | |
147 ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17], | |
148 // Do not split up the transliteration of a single character. | |
149 ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18], | |
145 // Illegal/unknown unicode. | 150 // Illegal/unknown unicode. |
146 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?'], | 151 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'], |
147 // Max length. | 152 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'], |
148 ['de', $two_byte, 'Ae Oe', '?', 5], | 153 ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'], |
154 ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'], | |
155 // Non default replacement. | |
156 ['en', chr(0x80) . 'ello World', '_ello World', '_'], | |
157 // Keep the original question marks. | |
158 ['en', chr(0xF8) . '?' . chr(0x80), '???'], | |
159 ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'], | |
160 ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'], | |
161 // Non-US-ASCII replacement. | |
162 ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'], | |
163 ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'], | |
164 // Ensure question marks are replaced when max length used. | |
165 ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7], | |
166 // Empty replacement. | |
167 ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''], | |
168 // Not affecting spacing from the beginning and end of a string. | |
169 ['en', ' Hello Abventor! ', ' Hello Abventor! '], | |
170 ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15], | |
171 // Keep many spaces between words. | |
172 ['en', 'Too many spaces between words !', 'Too many spaces between words !'], | |
149 ]; | 173 ]; |
150 } | |
151 | |
152 /** | |
153 * Tests the transliteration with max length. | |
154 */ | |
155 public function testTransliterationWithMaxLength() { | |
156 $transliteration = new PhpTransliteration(); | |
157 | |
158 // Test with max length, using German. It should never split up the | |
159 // transliteration of a single character. | |
160 $input = 'Ä Ö Ü Å Ø äöüåøhello'; | |
161 $trunc_output = 'Ae Oe Ue A O aeoe'; | |
162 | |
163 $this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 17), 'Truncating to 17 characters works'); | |
164 $this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 18), 'Truncating to 18 characters works'); | |
165 } | 174 } |
166 | 175 |
167 /** | 176 /** |
168 * Tests inclusion is safe. | 177 * Tests inclusion is safe. |
169 * | 178 * |