Mercurial > hg > cmmr2012-drupal-site
diff core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b
Initial commit from drush-created site
author | Chris Cannam |
---|---|
date | Thu, 05 Jul 2018 14:24:15 +0000 |
parents | |
children | a9cd425dd02b |
line wrap: on
line diff
<?php

namespace Drupal\Tests\search\Functional;

use Drupal\Component\Utility\Unicode;

/**
 * Tests that CJK tokenizer works as intended.
 *
 * @group search
 */
class SearchTokenizerTest extends SearchTestBase {

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // Inclusive [first, last] code points of the character ranges to sample.
    // Keeping both bounds in one entry means a range can never end up with a
    // start but no matching end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bopomofo' => [0x3100, 0x312f],
      'Bopomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Sample the first, middle, and last character of every range.
    $chars = [];
    foreach ($ranges as $range) {
      list($start, $end) = $range;
      // round() returns a float; cast it so that code2utf() receives a real
      // integer code point for its bitwise arithmetic.
      $mid = (int) round(0.5 * ($start + $end));

      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Feed the characters in as one unbroken string; the expectation is that
    // they come back lower-cased and separated by single spaces.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // Same argument order as in testTokenizer(): value under test first,
    // expected value second.
    $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
  }

  /**
   * Configures the search settings shared by the tests in this class.
   *
   * Sets the minimum word size to 1 (to split all CJK characters) and makes
   * sure CJK tokenizing is turned on.
   */
  protected function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only works for single-byte values up to character 255. This function
   * converts a number to the UTF-8 byte sequence of the corresponding unicode
   * character. Adapted from functions supplied in comments on several
   * functions on php.net.
   *
   * @param int $num
   *   The code point to encode. Integral floats (such as the result of
   *   round()) are accepted and converted to integers.
   *
   * @return string
   *   The one- to four-byte UTF-8 sequence for $num, or an empty string if
   *   $num is negative or too large to be encoded in four bytes.
   */
  public function code2utf($num) {
    $num = (int) $num;

    // Values that cannot be expressed in one to four bytes yield an empty
    // string. Negative values are rejected here as well, because chr() would
    // otherwise wrap them around into a single invalid byte.
    if ($num < 0 || $num >= 0x200000) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xC0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3F));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xE0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xF0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3F))
      . chr(0x80 | (($num >> 6) & 0x3F))
      . chr(0x80 | ($num & 0x3F));
  }

}