comparison core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b

Initial commit from drush-created site
author Chris Cannam
date Thu, 05 Jul 2018 14:24:15 +0000
parents
children a9cd425dd02b
comparison
equal deleted inserted replaced
-1:000000000000 0:c75dbcec494b
1 <?php
2
3 namespace Drupal\Tests\search\Functional;
4
5 use Drupal\Component\Utility\Unicode;
6
7 /**
8 * Tests that CJK tokenizer works as intended.
9 *
10 * @group search
11 */
12 class SearchTokenizerTest extends SearchTestBase {
13
/**
 * Verifies that strings of CJK characters are tokenized.
 *
 * The search_simplify() function does special things with numbers, symbols,
 * and punctuation. So we only test that CJK characters that are not in these
 * character classes are tokenized properly. See PREG_CLASS_CJK for more
 * information.
 *
 * For every character range listed below, the first, middle, and last code
 * points are concatenated into one string without separators. The test
 * passes when search_simplify() returns those same characters separated by
 * single spaces.
 */
public function testTokenizer() {
  // Set the minimum word size to 1 (to split all CJK characters) and make
  // sure CJK tokenizing is turned on.
  $this->config('search.settings')
    ->set('index.minimum_word_size', 1)
    ->set('index.overlap_cjk', TRUE)
    ->save();
  $this->refreshVariables();

  // Create a string of CJK characters from various character ranges in
  // the Unicode tables.

  // Beginnings of the character ranges.
  $starts = [
    'CJK unified' => 0x4e00,
    'CJK Ext A' => 0x3400,
    'CJK Compat' => 0xf900,
    'Hangul Jamo' => 0x1100,
    'Hangul Ext A' => 0xa960,
    'Hangul Ext B' => 0xd7b0,
    'Hangul Compat' => 0x3131,
    'Half non-punct 1' => 0xff21,
    'Half non-punct 2' => 0xff41,
    'Half non-punct 3' => 0xff66,
    'Hangul Syllables' => 0xac00,
    'Hiragana' => 0x3040,
    'Katakana' => 0x30a1,
    'Katakana Ext' => 0x31f0,
    'CJK Reserve 1' => 0x20000,
    'CJK Reserve 2' => 0x30000,
    'Bomofo' => 0x3100,
    'Bomofo Ext' => 0x31a0,
    'Lisu' => 0xa4d0,
    'Yi' => 0xa000,
  ];

  // Ends of the character ranges. The keys must match those of $starts.
  $ends = [
    'CJK unified' => 0x9fcf,
    'CJK Ext A' => 0x4dbf,
    'CJK Compat' => 0xfaff,
    'Hangul Jamo' => 0x11ff,
    'Hangul Ext A' => 0xa97f,
    'Hangul Ext B' => 0xd7ff,
    'Hangul Compat' => 0x318e,
    'Half non-punct 1' => 0xff3a,
    'Half non-punct 2' => 0xff5a,
    'Half non-punct 3' => 0xffdc,
    'Hangul Syllables' => 0xd7af,
    'Hiragana' => 0x309f,
    'Katakana' => 0x30ff,
    'Katakana Ext' => 0x31ff,
    'CJK Reserve 1' => 0x2fffd,
    'CJK Reserve 2' => 0x3fffd,
    'Bomofo' => 0x312f,
    'Bomofo Ext' => 0x31b7,
    'Lisu' => 0xa4fd,
    'Yi' => 0xa48f,
  ];

  // Generate characters consisting of starts, midpoints, and ends.
  $chars = [];
  foreach ($starts as $key => $start) {
    $end = $ends[$key];
    // round() returns a float; cast it so that code2utf() always receives
    // an integer code point for its bit shifts.
    $mid = (int) round(0.5 * ($start + $end));

    $chars[] = $this->code2utf($start);
    $chars[] = $this->code2utf($mid);
    $chars[] = $this->code2utf($end);
  }

  // Merge into a string and tokenize.
  $string = implode('', $chars);
  $out = trim(search_simplify($string));
  // The expected output has one space between every character and is
  // lower-cased before comparison.
  $expected = Unicode::strtolower(implode(' ', $chars));

  // Verify that the output matches what we expect.
  $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
}
103
/**
 * Verifies that strings of non-CJK characters are not tokenized.
 *
 * This is just a sanity check - it verifies that a run of lower-case Latin
 * letters comes back from search_simplify() unchanged (apart from
 * surrounding whitespace, which is trimmed) even though CJK tokenizing is
 * enabled.
 */
public function testNoTokenizer() {
  // Use the same settings as the CJK test (minimum word size of 1 and CJK
  // tokenizing turned on) so that the only difference is the input text.
  $this->config('search.settings')
    ->set('index.minimum_word_size', 1)
    ->set('index.overlap_cjk', TRUE)
    ->save();
  $this->refreshVariables();

  $letters = 'abcdefghijklmnopqrstuvwxyz';
  $out = trim(search_simplify($letters));

  // Pass the actual value first and the expected value second, matching the
  // argument order used by testTokenizer().
  $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
}
124
/**
 * Like PHP chr() function, but for unicode characters.
 *
 * chr() only works for single-byte characters up to character 255. This
 * function converts a Unicode code point to the corresponding UTF-8 encoded
 * character. Adapted from functions supplied in comments on several
 * functions on php.net.
 *
 * @param int|float $num
 *   The Unicode code point to encode. Floats (for example the result of
 *   round()) are truncated to an integer first.
 *
 * @return string
 *   The UTF-8 byte sequence (one to four bytes) for the code point, or an
 *   empty string if the value is not a valid Unicode scalar value, i.e. it
 *   is negative, greater than 0x10FFFF, or a UTF-16 surrogate
 *   (0xD800-0xDFFF).
 */
public function code2utf($num) {
  $num = (int) $num;

  // Reject values that have no valid UTF-8 encoding instead of emitting
  // malformed byte sequences.
  if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
    return '';
  }

  // One byte: 0xxxxxxx.
  if ($num < 0x80) {
    return chr($num);
  }

  // Two bytes: 110xxxxx 10xxxxxx.
  if ($num < 0x800) {
    return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
  }

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
  if ($num < 0x10000) {
    return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  }

  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
}
151
152 }