annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b

Initial commit from drush-created site
author Chris Cannam
date Thu, 05 Jul 2018 14:24:15 +0000
parents
children a9cd425dd02b
rev   line source
Chris@0 1 <?php
Chris@0 2
Chris@0 3 namespace Drupal\Tests\search\Functional;
Chris@0 4
Chris@0 5 use Drupal\Component\Utility\Unicode;
Chris@0 6
Chris@0 7 /**
Chris@0 8 * Tests that CJK tokenizer works as intended.
Chris@0 9 *
Chris@0 10 * @group search
Chris@0 11 */
class SearchTokenizerTest extends SearchTestBase {

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters) and make
    // sure CJK tokenizing is turned on.
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();

    // Character ranges from various parts of the Unicode tables, each given
    // as [first code point, last code point]. Keeping both bounds in a single
    // entry means a range can never be left without its matching end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Generate the start, midpoint, and end character of every range.
    $chars = [];
    foreach ($ranges as $range) {
      $start = $range[0];
      $end = $range[1];
      // round() returns a float; cast it so that code2utf() receives an
      // integer code point for its bit arithmetic.
      $mid = (int) round(0.5 * ($start + $end));

      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Merge into a string and tokenize.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    // The expectation built here is each character separated by a space and
    // lower-cased.
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters) and make
    // sure CJK tokenizing is turned on.
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only produces a single byte. This function converts a code point
   * number to the UTF-8 byte sequence of the corresponding unicode character.
   * Adapted from functions supplied in comments on several functions on
   * php.net.
   *
   * @param int $num
   *   The Unicode code point to encode.
   *
   * @return string
   *   The UTF-8 encoded character, or an empty string if $num is not a valid
   *   Unicode scalar value (negative, a UTF-16 surrogate, or above U+10FFFF).
   */
  public function code2utf($num) {
    // UTF-8 cannot represent negative values, the surrogate range
    // U+D800-U+DFFF, or anything past U+10FFFF; encoding those would yield
    // malformed byte sequences.
    if ($num < 0 || $num > 0x10ffff || ($num >= 0xd800 && $num <= 0xdfff)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  }

}