cmmr2012-drupal-site: core/modules/search/tests/src/Functional/SearchTokenizerTest.php annotate

annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b

Initial commit from drush-created site

author	Chris Cannam
date	Thu, 05 Jul 2018 14:24:15 +0000
parents
children	a9cd425dd02b

rev	line source
Chris@0	1 <?php
Chris@0	2
Chris@0	3 namespace Drupal\Tests\search\Functional;
Chris@0	4
Chris@0	5 use Drupal\Component\Utility\Unicode;
Chris@0	6
Chris@0	7 /**
Chris@0	8 * Tests that CJK tokenizer works as intended.
Chris@0	9 *
Chris@0	10 * @group search
Chris@0	11 */
class SearchTokenizerTest extends SearchTestBase {

  /**
   * Verifies that strings of CJK characters are tokenized.
   *
   * The search_simplify() function does special things with numbers, symbols,
   * and punctuation. So we only test that CJK characters that are not in these
   * character classes are tokenized properly. See PREG_CLASS_CJK for more
   * information.
   */
  public function testTokenizer() {
    $this->enableCjkTokenizing();

    // Character ranges from various parts of the Unicode tables, as
    // label => [first code point, last code point]. Keeping both bounds in one
    // entry guarantees that every range has a start and a matching end.
    $ranges = [
      'CJK unified' => [0x4e00, 0x9fcf],
      'CJK Ext A' => [0x3400, 0x4dbf],
      'CJK Compat' => [0xf900, 0xfaff],
      'Hangul Jamo' => [0x1100, 0x11ff],
      'Hangul Ext A' => [0xa960, 0xa97f],
      'Hangul Ext B' => [0xd7b0, 0xd7ff],
      'Hangul Compat' => [0x3131, 0x318e],
      'Half non-punct 1' => [0xff21, 0xff3a],
      'Half non-punct 2' => [0xff41, 0xff5a],
      'Half non-punct 3' => [0xff66, 0xffdc],
      'Hangul Syllables' => [0xac00, 0xd7af],
      'Hiragana' => [0x3040, 0x309f],
      'Katakana' => [0x30a1, 0x30ff],
      'Katakana Ext' => [0x31f0, 0x31ff],
      'CJK Reserve 1' => [0x20000, 0x2fffd],
      'CJK Reserve 2' => [0x30000, 0x3fffd],
      'Bomofo' => [0x3100, 0x312f],
      'Bomofo Ext' => [0x31a0, 0x31b7],
      'Lisu' => [0xa4d0, 0xa4fd],
      'Yi' => [0xa000, 0xa48f],
    ];

    // Generate characters consisting of starts, midpoints, and ends.
    $chars = [];
    foreach ($ranges as list($start, $end)) {
      // round() returns a float; cast it so code2utf() receives an integer
      // code point. Halves are rounded up, exactly as before.
      $mid = (int) round(0.5 * ($start + $end));
      $chars[] = $this->code2utf($start);
      $chars[] = $this->code2utf($mid);
      $chars[] = $this->code2utf($end);
    }

    // Merge into a string and tokenize. The expected output is the same
    // characters, lower-cased and separated by single spaces.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertSame($expected, $out, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * This is just a sanity check - it verifies that strings of letters are
   * not tokenized.
   */
  public function testNoTokenizer() {
    $this->enableCjkTokenizing();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // The input must come back unchanged: no spaces inserted between letters.
    $this->assertSame($letters, $out, 'Letters are not CJK tokenized');
  }

  /**
   * Configures the search settings shared by the tests in this class.
   *
   * Sets the minimum word size to 1 (to split all CJK characters) and makes
   * sure CJK tokenizing is turned on, then refreshes the test's variables.
   */
  protected function enableCjkTokenizing() {
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only works for ASCII characters up to character 255. This function
   * converts a number to the corresponding unicode character, encoded as
   * UTF-8. Adapted from functions supplied in comments on several functions
   * on php.net.
   *
   * @param int $num
   *   The Unicode code point to convert.
   *
   * @return string
   *   The UTF-8 encoding of the code point (one to four bytes), or an empty
   *   string if $num is not a valid Unicode scalar value, i.e. it is negative,
   *   larger than 0x10FFFF, or a UTF-16 surrogate (0xD800 - 0xDFFF).
   */
  public function code2utf($num) {
    // Accept integral floats (e.g. the result of round()) as well as integers.
    $num = (int) $num;

    // Reject values that have no valid UTF-8 encoding instead of emitting
    // malformed byte sequences.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
  }

}

Mercurial > hg > cmmr2012-drupal-site

annotate core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b