comparison core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php @ 17:129ea1e6d783

Update, including to Drupal core 8.6.10
author Chris Cannam
date Thu, 28 Feb 2019 13:21:36 +0000
parents 1fec387a4317
children af1871eacc83
comparison
equal deleted inserted replaced
16:c2387f117808 17:129ea1e6d783
104 $random_generator = new Random(); 104 $random_generator = new Random();
105 $random = $random_generator->string(10); 105 $random = $random_generator->string(10);
106 // Make some strings with two, three, and four-byte characters for testing. 106 // Make some strings with two, three, and four-byte characters for testing.
107 // Note that the 3-byte character is overridden by the 'kg' language. 107 // Note that the 3-byte character is overridden by the 'kg' language.
108 $two_byte = 'Ä Ö Ü Å Ø äöüåøhello'; 108 $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
109 // This is a Cyrrillic character that looks something like a u. See 109 // This is a Cyrillic character that looks something like a "u". See
110 // http://www.unicode.org/charts/PDF/U0400.pdf 110 // http://www.unicode.org/charts/PDF/U0400.pdf
111 $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8'); 111 $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8');
112 // This is a Canadian Aboriginal character like a triangle. See 112 // This is a Canadian Aboriginal character like a triangle. See
113 // http://www.unicode.org/charts/PDF/U1400.pdf 113 // http://www.unicode.org/charts/PDF/U1400.pdf
114 $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8'); 114 $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8');
116 // http://wikipedia.org/wiki/Gothic_alphabet 116 // http://wikipedia.org/wiki/Gothic_alphabet
117 // They are not in our tables, but should at least give us '?' (unknown). 117 // They are not in our tables, but should at least give us '?' (unknown).
118 $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8'); 118 $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');
119 119
120 return [ 120 return [
121 // Each test case is (language code, input, output). 121 // Each test case is language code, input, output, unknown character, max
122 // length.
122 // Test ASCII in English. 123 // Test ASCII in English.
123 ['en', $random, $random], 124 ['en', $random, $random],
124 // Test ASCII in some other language with no overrides. 125 // Test ASCII in some other language with no overrides.
125 ['fr', $random, $random], 126 ['fr', $random, $random],
126 // Test 3 and 4-byte characters in a language without overrides. 127 // Test 3 and 4-byte characters in a language without overrides.
140 ['dk', $random, $random], 141 ['dk', $random, $random],
141 ['kg', $three_byte, 'ts'], 142 ['kg', $three_byte, 'ts'],
142 // Test strings in some other languages. 143 // Test strings in some other languages.
143 // Turkish, provided by drupal.org user Kartagis. 144 // Turkish, provided by drupal.org user Kartagis.
144 ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'], 145 ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
146 // Max length.
147 ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17],
148 // Do not split up the transliteration of a single character.
149 ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18],
145 // Illegal/unknown unicode. 150 // Illegal/unknown unicode.
146 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?'], 151 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
147 // Max length. 152 ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
148 ['de', $two_byte, 'Ae Oe', '?', 5], 153 ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
154 ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
155 // Non default replacement.
156 ['en', chr(0x80) . 'ello World', '_ello World', '_'],
157 // Keep the original question marks.
158 ['en', chr(0xF8) . '?' . chr(0x80), '???'],
159 ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
160 ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
161 // Non-US-ASCII replacement.
162 ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
163 ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
164 // Ensure question marks are replaced when max length used.
165 ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
166 // Empty replacement.
167 ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''],
168 // Not affecting spacing from the beginning and end of a string.
169 ['en', ' Hello Abventor! ', ' Hello Abventor! '],
170 ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15],
171 // Keep many spaces between words.
172 ['en', 'Too many spaces between words !', 'Too many spaces between words !'],
149 ]; 173 ];
150 }
151
152 /**
153 * Tests the transliteration with max length.
154 */
155 public function testTransliterationWithMaxLength() {
156 $transliteration = new PhpTransliteration();
157
158 // Test with max length, using German. It should never split up the
159 // transliteration of a single character.
160 $input = 'Ä Ö Ü Å Ø äöüåøhello';
161 $trunc_output = 'Ae Oe Ue A O aeoe';
162
163 $this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 17), 'Truncating to 17 characters works');
164 $this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 18), 'Truncating to 18 characters works');
165 } 174 }
166 175
167 /** 176 /**
168 * Tests inclusion is safe. 177 * Tests inclusion is safe.
169 * 178 *