isophonics-drupal-site: core/modules/search/tests/src/Functional/SearchTokenizerTest.php annotate

annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 19:fa3358dc1485 tip

Add ndrum files

author	Chris Cannam
date	Wed, 28 Aug 2019 13:14:47 +0100
parents	129ea1e6d783
children

rev	line source
Chris@0	1 <?php
Chris@0	2
Chris@0	3 namespace Drupal\Tests\search\Functional;
Chris@0	4
Chris@17	5 use Drupal\Tests\BrowserTestBase;
Chris@0	6
/**
 * Tests that the CJK tokenizer works as intended.
 *
 * @group search
 */
class SearchTokenizerTest extends BrowserTestBase {

  /**
   * {@inheritdoc}
   */
  protected static $modules = ['search'];

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // Inclusive [first, last] code points of the Unicode ranges under test.
    // Keeping both bounds in one entry guarantees every range has an end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Generate the start, midpoint, and end character of every range.
    $chars = [];
    foreach ($ranges as list($first, $last)) {
      // round() returns a float; cast it so that the bit operations in
      // code2utf() work on a real integer code point.
      $middle = (int) round(($first + $last) / 2);
      foreach ([$first, $middle, $last] as $codepoint) {
        $chars[] = $this->code2utf($codepoint);
      }
    }

    // Merge into a string and tokenize. With overlapping CJK handling and a
    // minimum word size of 1, each character is expected to come back as its
    // own space-separated token.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = mb_strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertSame($expected, $out, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    $this->assertSame($letters, $out, 'Letters are not CJK tokenized');
  }

  /**
   * Turns on CJK tokenizing with a minimum word size of one character.
   *
   * Sets the minimum word size to 1 (to split all CJK characters), enables
   * CJK overlap handling, saves the search settings and refreshes the test's
   * view of the configuration.
   */
  protected function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * Function chr() only works for ASCII characters up to character 255. This
   * function converts a number to the corresponding unicode character. Adapted
   * from functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   The Unicode code point to encode.
   *
   * @return string
   *   The UTF-8 encoding of the code point (one to four bytes), or an empty
   *   string if $num is not a Unicode scalar value, i.e. it is negative, a
   *   UTF-16 surrogate (U+D800 to U+DFFF), or greater than U+10FFFF.
   */
  public function code2utf($num) {
    // Callers may pass an integral float (e.g. the result of round()); the
    // shifts and masks below need an integer.
    $num = (int) $num;

    // These values have no valid UTF-8 representation, so encoding them would
    // produce a malformed byte sequence.
    $is_surrogate = $num >= 0xD800 && $num <= 0xDFFF;
    if ($num < 0 || $num > 0x10FFFF || $is_surrogate) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xC0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3F));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xE0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xF0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3F))
      . chr(0x80 | (($num >> 6) & 0x3F))
      . chr(0x80 | ($num & 0x3F));
  }

}

Mercurial > hg > isophonics-drupal-site

annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 19:fa3358dc1485 tip