Chris@0: <?php
Chris@0: 
Chris@0: namespace Drupal\Tests\Component\Transliteration;
Chris@0: 
Chris@0: use Drupal\Component\Transliteration\PhpTransliteration;
Chris@0: use Drupal\Component\Utility\Random;
Chris@0: use org\bovigo\vfs\vfsStream;
Chris@0: use PHPUnit\Framework\TestCase;
Chris@0: 
Chris@0: /**
Chris@0:  * Tests Transliteration component functionality.
Chris@0:  *
Chris@0:  * @group Transliteration
Chris@0:  *
Chris@0:  * @coversDefaultClass \Drupal\Component\Transliteration\PhpTransliteration
Chris@0:  */
Chris@0: class PhpTransliterationTest extends TestCase {
Chris@0: 
Chris@0:   /**
Chris@0:    * Tests the PhpTransliteration::removeDiacritics() function.
Chris@0:    *
Chris@0:    * @param string $original
Chris@0:    *   The original string, containing characters with diacritical marks.
Chris@0:    * @param string $expected
Chris@0:    *   The expected return from PhpTransliteration::removeDiacritics().
Chris@0:    *
Chris@0:    * @dataProvider providerTestPhpTransliterationRemoveDiacritics
Chris@0:    */
Chris@0:   public function testRemoveDiacritics($original, $expected) {
Chris@0:     $transliterator_class = new PhpTransliteration();
Chris@0:     $result = $transliterator_class->removeDiacritics($original);
Chris@0:     // Use a strict comparison, like the other tests in this class, so that
Chris@0:     // only an identical string is accepted.
Chris@0:     $this->assertSame($expected, $result);
Chris@0:   }
Chris@0: 
Chris@0:   /**
Chris@0:    * Provides data for self::testRemoveDiacritics().
Chris@0:    *
Chris@0:    * The provider is static because it does not need a test instance.
Chris@0:    *
Chris@0:    * @return array
Chris@0:    *   An array of arrays, each containing the parameters for
Chris@0:    *   self::testRemoveDiacritics(): the original string followed by the
Chris@0:    *   expected string.
Chris@0:    */
Chris@0:   public static function providerTestPhpTransliterationRemoveDiacritics() {
Chris@0:     // Note that some characters in the inputs below (for example Æ, Ð, ×, Þ
Chris@0:     // and ß) are expected to come back unchanged.
Chris@0:     return [
Chris@0:       // Test all characters in the Unicode range 0x00C0 to 0x017E.
Chris@0:       ['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
Chris@0:       ['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
Chris@0:       ['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
Chris@0:       ['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
Chris@0:       ['ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
Chris@0:       ['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
Chris@0:       ['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
Chris@0:       ['İıĲĳĴĵĶķĸĹĺĻļĽľĿ', 'IiĲĳJjKkĸLlLlLlL'],
Chris@0:       ['ŀŁłŃńŅņŇňŉŊŋŌōŎŏ', 'lLlNnNnNnŉŊŋOoOo'],
Chris@0:       ['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
Chris@0:       ['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
Chris@0:       ['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],
Chris@0: 
Chris@0:       // Test all characters in the Unicode range 0x01CD to 0x024F.
Chris@0:       ['ǍǎǏ', 'AaI'],
Chris@0:       ['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
Chris@0:       ['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
Chris@0:       ['ǰǱǲǳǴǵǶǷǸǹǺǻǼǽǾǿ', 'jǱǲǳGgǶǷNnAaǼǽOo'],
Chris@0:       ['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
Chris@0:       ['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
Chris@0:       ['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
Chris@0:       ['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
Chris@0:       ['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
Chris@0:     ];
Chris@0:   }
Chris@0: 
Chris@0:   /**
Chris@0:    * Tests the PhpTransliteration class.
Chris@0:    *
Chris@0:    * @param string $langcode
Chris@0:    *   The language code to test.
Chris@0:    * @param string $original
Chris@0:    *   The original string.
Chris@0:    * @param string $expected
Chris@0:    *   The expected return from PhpTransliteration::transliterate().
Chris@0:    * @param string $unknown_character
Chris@0:    *   (optional) The character to substitute for characters in $original
Chris@0:    *   without transliterated equivalents. Defaults to '?'.
Chris@0:    * @param int $max_length
Chris@0:    *   (optional) If provided, return at most this many characters, ensuring
Chris@0:    *   that the transliteration does not split in the middle of an input
Chris@0:    *   character's transliteration.
Chris@0:    *
Chris@0:    * @dataProvider providerTestPhpTransliteration
Chris@0:    */
Chris@0:   public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
Chris@0:     $transliterator_class = new PhpTransliteration();
Chris@0:     $actual = $transliterator_class->transliterate($original, $langcode, $unknown_character, $max_length);
Chris@0:     $this->assertSame($expected, $actual);
Chris@0:   }
Chris@0: 
Chris@0:   /**
Chris@0:    * Provides data for self::testPhpTransliteration().
Chris@0:    *
Chris@0:    * The provider is static because it does not need a test instance.
Chris@0:    *
Chris@0:    * @return array
Chris@0:    *   An array of arrays, each containing the parameters for
Chris@0:    *   self::testPhpTransliteration().
Chris@0:    */
Chris@0:   public static function providerTestPhpTransliteration() {
Chris@0:     $random_generator = new Random();
Chris@0:     $random = $random_generator->string(10);
Chris@0:     // Make some strings with multi-byte characters for testing. Note that the
Chris@0:     // variable names below do not match the actual UTF-8 encoded lengths:
Chris@0:     // $three_byte holds a 2-byte character, $four_byte a 3-byte character and
Chris@0:     // $five_byte two 4-byte characters.
Chris@0:     // Note that the $three_byte character is overridden by the 'kg' language.
Chris@0:     $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
Chris@17:     // This is U+0446, the Cyrillic small letter tse. See
Chris@0:     // http://www.unicode.org/charts/PDF/U0400.pdf
Chris@0:     $three_byte = html_entity_decode('&#x446;', ENT_NOQUOTES, 'UTF-8');
Chris@0:     // This is a Canadian Aboriginal character like a triangle. See
Chris@0:     // http://www.unicode.org/charts/PDF/U1400.pdf
Chris@0:     $four_byte = html_entity_decode('&#x1411;', ENT_NOQUOTES, 'UTF-8');
Chris@0:     // These are two Gothic alphabet letters. See
Chris@0:     // http://wikipedia.org/wiki/Gothic_alphabet
Chris@0:     // They are not in our tables, but should at least give us '?' (unknown).
Chris@0:     $five_byte = html_entity_decode('&#x10330;&#x10338;', ENT_NOQUOTES, 'UTF-8');
Chris@0: 
Chris@0:     return [
Chris@17:       // Each test case is language code, input, output, unknown character, max
Chris@17:       // length.
Chris@0:       // Test ASCII in English.
Chris@0:       ['en', $random, $random],
Chris@0:       // Test ASCII in some other language with no overrides.
Chris@0:       ['fr', $random, $random],
Chris@0:       // Test the $three_byte and $four_byte characters in a language without
Chris@0:       // overrides.
Chris@0:       // Note: if the data tables change, these will need to change too! They
Chris@0:       // are set up to test that data table loading works, so values come
Chris@0:       // directly from the data files.
Chris@0:       ['fr', $three_byte, 'c'],
Chris@0:       ['fr', $four_byte, 'wii'],
Chris@0:       // Test the $five_byte characters, which are not in the data tables.
Chris@0:       ['en', $five_byte, '??'],
Chris@0:       // Test a language with no overrides.
Chris@0:       ['en', $two_byte, 'A O U A O aouaohello'],
Chris@0:       // Test language overrides provided by core.
Chris@0:       ['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
Chris@0:       ['de', $random, $random],
Chris@18:       ['da', $two_byte, 'A O U Aa Oe aouaaoehello'],
Chris@18:       ['da', $random, $random],
Chris@0:       ['kg', $three_byte, 'ts'],
Chris@0:       // Test strings in some other languages.
Chris@0:       // Turkish, provided by drupal.org user Kartagis.
Chris@0:       ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
Chris@17:       // Max length.
Chris@17:       ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17],
Chris@17:       // Do not split up the transliteration of a single character: with a
Chris@17:       // limit of 18 only the first half of "ue" would fit, so it is dropped.
Chris@17:       ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18],
Chris@0:       // Illegal/unknown unicode.
Chris@17:       ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
Chris@17:       ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
Chris@17:       ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
Chris@17:       ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
Chris@17:       // Non default replacement.
Chris@17:       ['en', chr(0x80) . 'ello World', '_ello World', '_'],
Chris@17:       // Keep the original question marks.
Chris@17:       ['en', chr(0xF8) . '?' . chr(0x80), '???'],
Chris@17:       ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
Chris@17:       ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
Chris@17:       // Non-US-ASCII replacement: the replacement character is expected to be
Chris@17:       // transliterated as well.
Chris@17:       ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
Chris@17:       ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
Chris@17:       // Ensure unknown characters are replaced, and original question marks
Chris@17:       // are kept, when max length is used.
Chris@17:       ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
Chris@17:       // Empty replacement.
Chris@17:       ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''],
Chris@17:       // Not affecting spacing from the beginning and end of a string.
Chris@17:       ['en', ' Hello Abventor! ', ' Hello Abventor! '],
Chris@17:       ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15],
Chris@17:       // Keep many spaces between words.
Chris@17:       ['en', 'Too    many    spaces between words !', 'Too    many    spaces between words !'],
Chris@0:     ];
Chris@0:   }
Chris@0: 
Chris@0:   /**
Chris@0:    * Tests inclusion is safe.
Chris@0:    *
Chris@0:    * @covers ::readLanguageOverrides
Chris@0:    */
Chris@0:   public function testSafeInclude() {
Chris@0:     // The overrides in the transliteration data directory transliterates 0x82
Chris@0:     // into "safe" but the overrides one directory higher transliterates the
Chris@0:     // same character into "security hole". So by using "../index" as the
Chris@0:     // language code we can test the ../ is stripped from the langcode.
Chris@0:     vfsStream::setup('transliteration', NULL, [
Chris@0:       'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
Chris@0:       'dir' => [
Chris@0:         'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
Chris@0:       ],
Chris@0:     ]);
Chris@0:     $transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
Chris@0:     // The bytes 0xC2 0x82 are the UTF-8 encoding of the character 0x82.
Chris@0:     $transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
Chris@14:     $this->assertSame('safe', $transliterated);
Chris@0:   }
Chris@0: 
Chris@0: }