Chris@0
|
1 <?php
|
Chris@0
|
2
|
Chris@0
|
3 namespace Drupal\Tests\search\Functional;
|
Chris@0
|
4
|
Chris@0
|
5 use Drupal\Component\Utility\Unicode;
|
Chris@0
|
6
|
Chris@0
|
/**
 * Tests that CJK tokenizer works as intended.
 *
 * @group search
 */
class SearchTokenizerTest extends SearchTestBase {

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // Character ranges from various parts of the Unicode tables, each given
    // as [first code point, last code point]. Keeping both bounds in one
    // table guarantees that every range has a start and an end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Generate characters consisting of starts, midpoints, and ends.
    $chars = [];
    foreach ($ranges as $range) {
      $start = $range[0];
      $end = $range[1];
      // round() returns a float; cast it so that code2utf() receives an
      // integer code point. Halves are rounded up, as before.
      $mid = (int) round(($start + $end) / 2);

      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Merge into a string and tokenize.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    // Every character is expected to come back as its own space-separated,
    // lower-cased token.
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // Pass the actual output first, in the same order as testTokenizer().
    $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only works for ASCII characters up to character 255. This function
   * converts a number to the corresponding unicode character. Adapted from
   * functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   The Unicode code point to encode.
   *
   * @return string
   *   The UTF-8 byte sequence for the code point, or an empty string if the
   *   value is not an encodable Unicode scalar value (negative, a UTF-16
   *   surrogate, or above U+10FFFF).
   */
  public function code2utf($num) {
    // Values outside the Unicode scalar range have no valid UTF-8 form;
    // encoding them anyway would yield malformed byte sequences.
    if ($num < 0 || $num > 0x10ffff || ($num >= 0xd800 && $num <= 0xdfff)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) | 0xc0) . chr(($num & 0x3f) | 0x80);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) | 0xe0) . chr((($num >> 6) & 0x3f) | 0x80) . chr(($num & 0x3f) | 0x80);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(($num >> 18) | 0xf0) . chr((($num >> 12) & 0x3f) | 0x80) . chr((($num >> 6) & 0x3f) | 0x80) . chr(($num & 0x3f) | 0x80);
  }

  /**
   * Configures the search settings shared by the tests in this class.
   *
   * Sets the minimum word size to 1 (to split all CJK characters) and makes
   * sure CJK tokenizing is turned on.
   */
  protected function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

}
|