Mercurial > hg > cmmr2012-drupal-site
comparison core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b
Initial commit from drush-created site
author | Chris Cannam |
---|---|
date | Thu, 05 Jul 2018 14:24:15 +0000 |
parents | |
children | a9cd425dd02b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c75dbcec494b |
---|---|
1 <?php | |
2 | |
3 namespace Drupal\Tests\search\Functional; | |
4 | |
5 use Drupal\Component\Utility\Unicode; | |
6 | |
/**
 * Tests that CJK tokenizer works as intended.
 *
 * @group search
 */
class SearchTokenizerTest extends SearchTestBase {

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // First and last code point of each Unicode range under test. Keeping
    // both bounds in a single entry guarantees every range has a start and an
    // end, which two parallel arrays could not.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Generate the first, middle, and last character of every range.
    $chars = [];
    foreach ($ranges as $range) {
      list($start, $end) = $range;
      // round() returns a float; cast it so code2utf() receives an integer
      // code point for its bit arithmetic.
      $mid = (int) round(0.5 * ($start + $end));
      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Merge into a single unbroken string and tokenize it. With a minimum
    // word size of 1, every character is expected to come back as its own
    // space-separated token.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // Pass the tokenizer output first, matching the argument order used in
    // testTokenizer().
    $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
  }

  /**
   * Configures search so that every CJK character becomes its own token.
   *
   * Sets the minimum word size to 1 (to split all CJK characters) and makes
   * sure CJK tokenizing is turned on.
   */
  protected function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only works for ASCII characters up to character 255. This function
   * converts a number to the corresponding unicode character. Adapted from
   * functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   The Unicode code point to encode.
   *
   * @return string
   *   The UTF-8 byte sequence for the code point, or an empty string if $num
   *   is not a valid Unicode scalar value (negative, a UTF-16 surrogate, or
   *   above U+10FFFF).
   */
  public function code2utf($num) {
    // Negative values, surrogates (U+D800..U+DFFF), and anything beyond
    // U+10FFFF have no well-formed UTF-8 encoding; encoding them anyway would
    // yield byte sequences that are not valid UTF-8.
    if ($num < 0 || ($num >= 0xd800 && $num <= 0xdfff) || $num > 0x10ffff) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) + 0xc0) . chr(($num & 0x3f) + 0x80);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) + 0xe0) . chr((($num >> 6) & 0x3f) + 0x80) . chr(($num & 0x3f) + 0x80);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(($num >> 18) + 0xf0) . chr((($num >> 12) & 0x3f) + 0x80) . chr((($num >> 6) & 0x3f) + 0x80) . chr(($num & 0x3f) + 0x80);
  }

}