annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 19:fa3358dc1485 tip

Add ndrum files
author Chris Cannam
date Wed, 28 Aug 2019 13:14:47 +0100
parents 129ea1e6d783
children
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Tests\search\Functional;
Chris@0 4
Chris@17 5 use Drupal\Tests\BrowserTestBase;
Chris@0 6
Chris@0 7 /**
Chris@0 8 * Tests that CJK tokenizer works as intended.
Chris@0 9 *
Chris@0 10 * @group search
Chris@0 11 */
class SearchTokenizerTest extends BrowserTestBase {

  /**
   * {@inheritdoc}
   */
  protected static $modules = ['search'];

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // Inclusive [first, last] code points of the character ranges under test.
    // Keeping both bounds in one entry guarantees that every range has a
    // matching start and end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Sample the first, middle, and last character of every range.
    $chars = [];
    foreach ($ranges as list($start, $end)) {
      // round() returns a float; code2utf() does bit arithmetic, so hand it a
      // real integer.
      $mid = (int) round(0.5 * ($start + $end));
      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Feed the characters in as one unbroken string; the tokenizer is expected
    // to hand them back lower-cased and separated by single spaces.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = mb_strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertSame($expected, $out, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    $this->assertSame($letters, $out, 'Letters are not CJK tokenized');
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * Function chr() only works for ASCII characters up to character 255. This
   * function converts a number to the corresponding unicode character. Adapted
   * from functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   The Unicode code point to encode. Integral floats are accepted and
   *   converted to integers.
   *
   * @return string
   *   The UTF-8 encoding of the code point (one to four bytes), or an empty
   *   string if $num is not a Unicode scalar value, i.e. it is negative, a
   *   UTF-16 surrogate (U+D800 to U+DFFF), or greater than U+10FFFF.
   */
  public function code2utf($num) {
    // The shifts and masks below operate on integers.
    $num = (int) $num;

    // Values outside the Unicode scalar range have no valid UTF-8 encoding;
    // refuse them instead of emitting malformed byte sequences.
    if ($num < 0 || $num > 0x10ffff || ($num >= 0xd800 && $num <= 0xdfff)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xc0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3f));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xe0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3f))
        . chr(0x80 | ($num & 0x3f));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xf0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3f))
      . chr(0x80 | (($num >> 6) & 0x3f))
      . chr(0x80 | ($num & 0x3f));
  }

  /**
   * Configures the search module so that CJK text is split per character.
   *
   * Sets the minimum word size to 1 (to split all CJK characters) and makes
   * sure CJK tokenizing is turned on.
   */
  private function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

}