removeDiacritics($original);
    $this->assertEquals($expected, $result);
  }

  /**
   * Provides data for self::testRemoveDiacritics().
   *
   * Every row is an [input, expected output] pair. Input and expected strings
   * line up character by character: letters carrying a diacritic map to their
   * base letter, everything else is expected to come back unchanged.
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testRemoveDiacritics().
   */
  public function providerTestPhpTransliterationRemoveDiacritics() {
    // NOTE(review): 'IJij', 'ʼn' and 'DZDzdz' in the rows below appear to be
    // plain letter sequences rather than the single code points U+0132/U+0133,
    // U+0149 and U+01F1-U+01F3 that the stated ranges would contain. They are
    // kept as found; confirm against the upstream file.
    return [
      // Test all characters in the Unicode range 0x00bf to 0x017f.
      ['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
      // U+00D7 (×) is not a letter and must be left alone; U+00D8 (Ø) becomes
      // 'O'.
      ['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
      ['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
      ['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
      ['ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
      ['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
      ['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
      ['İıIJijĴĵĶķĸĹĺĻļĽľĿ', 'IiIJijJjKkĸLlLlLlL'],
      ['ŀŁłŃńŅņŇňʼnŊŋŌōŎŏ', 'lLlNnNnNnʼnŊŋOoOo'],
      ['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
      ['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
      ['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],

      // Test all characters in the Unicode range 0x01CD to 0x024F.
      ['ǍǎǏ', 'AaI'],
      ['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
      ['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
      ['ǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿ', 'jDZDzdzGgǶǷNnAaǼǽOo'],
      ['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
      ['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
      ['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
      ['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
      ['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
    ];
  }

  /**
   * Tests the PhpTransliteration class.
   *
   * @param string $langcode
   *   The language code to test.
   * @param string $original
   *   The original string.
   * @param string $expected
   *   The expected return from PhpTransliteration::transliterate().
   * @param string $unknown_character
   *   (optional) The character to substitute for characters in $original
   *   without transliterated equivalents. Defaults to '?'.
   * @param int $max_length
   *   (optional) If provided, return at most this many characters, ensuring
   *   that the transliteration does not split in the middle of an input
   *   character's transliteration.
   *
   * @dataProvider providerTestPhpTransliteration
   */
  public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
    $transliterator_class = new PhpTransliteration();
    $actual = $transliterator_class->transliterate($original, $langcode, $unknown_character, $max_length);
    // assertSame() so that the result must be a string identical to $expected.
    $this->assertSame($expected, $actual);
  }

  /**
   * Provides data for self::testPhpTransliteration().
   *
   * @return array
   *   An array of arrays, each containing the parameters for
   *   self::testPhpTransliteration().
   */
  public function providerTestPhpTransliteration() {
    $random_generator = new Random();
    $random = $random_generator->string(10);
    // Make some strings with two, three, and four-byte characters for testing.
    // Note that the 3-byte character is overridden by the 'kg' language.
    $two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
    // This is a Cyrillic character that looks something like a "u". See
    // http://www.unicode.org/charts/PDF/U0400.pdf
    $three_byte = html_entity_decode('ц', ENT_NOQUOTES, 'UTF-8');
    // This is a Canadian Aboriginal character like a triangle. See
    // http://www.unicode.org/charts/PDF/U1400.pdf
    $four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
    // These are two Gothic alphabet letters. See
    // http://wikipedia.org/wiki/Gothic_alphabet
    // They are not in our tables, but should at least give us '?' (unknown).
    $five_byte = html_entity_decode('𐌰𐌸', ENT_NOQUOTES, 'UTF-8');

    // NOTE(review): the whitespace fixtures near the end of this list contain
    // only single spaces even though their comments talk about "many spaces".
    // Runs of spaces may have been collapsed in this copy of the file; the
    // strings are reproduced exactly as found. Confirm against upstream.
    return [
      // Each test case is language code, input, output, unknown character, max
      // length.
      // Test ASCII in English.
      ['en', $random, $random],
      // Test ASCII in some other language with no overrides.
      ['fr', $random, $random],
      // Test 3 and 4-byte characters in a language without overrides.
      // Note: if the data tables change, these will need to change too! They
      // are set up to test that data table loading works, so values come
      // directly from the data files.
      ['fr', $three_byte, 'c'],
      ['fr', $four_byte, 'wii'],
      // Test 5-byte characters.
      ['en', $five_byte, '??'],
      // Test a language with no overrides.
      ['en', $two_byte, 'A O U A O aouaohello'],
      // Test language overrides provided by core.
      ['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
      ['de', $random, $random],
      ['da', $two_byte, 'A O U Aa Oe aouaaoehello'],
      ['da', $random, $random],
      ['kg', $three_byte, 'ts'],
      // Test strings in some other languages.
      // Turkish, provided by drupal.org user Kartagis.
      ['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
      // Max length.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 17],
      // Do not split up the transliteration of a single character.
      ['de', $two_byte, 'Ae Oe Ue A O aeoe', '?', 18],
      // Illegal/unknown unicode.
      ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
      ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
      ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
      ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
      // Non default replacement.
      ['en', chr(0x80) . 'ello World', '_ello World', '_'],
      // Keep the original question marks.
      ['en', chr(0xF8) . '?' . chr(0x80), '???'],
      ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
      ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
      // Non-US-ASCII replacement.
      ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
      ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
      // Ensure question marks are replaced when max length used.
      ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
      // Empty replacement.
      ['en', chr(0x80) . 'ello World' . chr(0xF8), 'ello World', ''],
      // Not affecting spacing from the beginning and end of a string.
      ['en', ' Hello Abventor! ', ' Hello Abventor! '],
      ['pl', ' Drupal Kraków Community', ' Drupal Krakow ', '?', 15],
      // Keep many spaces between words.
      ['en', 'Too many spaces between words !', 'Too many spaces between words !'],
    ];
  }

  /**
   * Tests inclusion is safe.
   *
   * @covers ::readLanguageOverrides
   */
  public function testSafeInclude() {
    // The overrides file in the transliteration data directory transliterates
    // 0x82 into "safe" but the overrides file one directory higher
    // transliterates the same character into "security hole". So by using
    // "../index" as the language code we can test the ../ is stripped from the
    // langcode.
    //
    // Each fixture must be a complete PHP file that defines $overrides keyed by
    // the requested language code; otherwise nothing is loaded and the
    // assertion below cannot distinguish the two files.
    vfsStream::setup('transliteration', NULL, [
      'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
      'dir' => [
        'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
      ],
    ]);
    $transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
    // chr(0xC2) . chr(0x82) is the UTF-8 encoding of code point 0x82.
    $transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
    $this->assertSame('safe', $transliterated);
  }

}