Chris@0: config('search.settings') Chris@0: ->set('index.minimum_word_size', 1) Chris@0: ->set('index.overlap_cjk', TRUE) Chris@0: ->save(); Chris@0: $this->refreshVariables(); Chris@0: Chris@0: // Create a string of CJK characters from various character ranges in Chris@0: // the Unicode tables. Chris@0: Chris@0: // Beginnings of the character ranges. Chris@0: $starts = [ Chris@0: 'CJK unified' => 0x4e00, Chris@0: 'CJK Ext A' => 0x3400, Chris@0: 'CJK Compat' => 0xf900, Chris@0: 'Hangul Jamo' => 0x1100, Chris@0: 'Hangul Ext A' => 0xa960, Chris@0: 'Hangul Ext B' => 0xd7b0, Chris@0: 'Hangul Compat' => 0x3131, Chris@0: 'Half non-punct 1' => 0xff21, Chris@0: 'Half non-punct 2' => 0xff41, Chris@0: 'Half non-punct 3' => 0xff66, Chris@0: 'Hangul Syllables' => 0xac00, Chris@0: 'Hiragana' => 0x3040, Chris@0: 'Katakana' => 0x30a1, Chris@0: 'Katakana Ext' => 0x31f0, Chris@0: 'CJK Reserve 1' => 0x20000, Chris@0: 'CJK Reserve 2' => 0x30000, Chris@0: 'Bomofo' => 0x3100, Chris@0: 'Bomofo Ext' => 0x31a0, Chris@0: 'Lisu' => 0xa4d0, Chris@0: 'Yi' => 0xa000, Chris@0: ]; Chris@0: Chris@0: // Ends of the character ranges. Chris@0: $ends = [ Chris@0: 'CJK unified' => 0x9fcf, Chris@0: 'CJK Ext A' => 0x4dbf, Chris@0: 'CJK Compat' => 0xfaff, Chris@0: 'Hangul Jamo' => 0x11ff, Chris@0: 'Hangul Ext A' => 0xa97f, Chris@0: 'Hangul Ext B' => 0xd7ff, Chris@0: 'Hangul Compat' => 0x318e, Chris@0: 'Half non-punct 1' => 0xff3a, Chris@0: 'Half non-punct 2' => 0xff5a, Chris@0: 'Half non-punct 3' => 0xffdc, Chris@0: 'Hangul Syllables' => 0xd7af, Chris@0: 'Hiragana' => 0x309f, Chris@0: 'Katakana' => 0x30ff, Chris@0: 'Katakana Ext' => 0x31ff, Chris@0: 'CJK Reserve 1' => 0x2fffd, Chris@0: 'CJK Reserve 2' => 0x3fffd, Chris@0: 'Bomofo' => 0x312f, Chris@0: 'Bomofo Ext' => 0x31b7, Chris@0: 'Lisu' => 0xa4fd, Chris@0: 'Yi' => 0xa48f, Chris@0: ]; Chris@0: Chris@0: // Generate characters consisting of starts, midpoints, and ends. 
$chars = [];
    $charcodes = [];
    foreach ($starts as $key => $value) {
      $charcodes[] = $starts[$key];
      $chars[] = $this->code2utf($starts[$key]);
      $mid = round(0.5 * ($starts[$key] + $ends[$key]));
      $charcodes[] = $mid;
      $chars[] = $this->code2utf($mid);
      $charcodes[] = $ends[$key];
      $chars[] = $this->code2utf($ends[$key]);
    }

    // Merge into a string and tokenize.
    $string = implode('', $chars);
    $out = trim(search_simplify($string));
    $expected = Unicode::strtolower(implode(' ', $chars));

    // Verify that the output matches what we expect.
    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Confirms that a run of plain Latin letters is left as a single word.
   *
   * Sanity check only: with CJK overlap tokenizing switched on, the lowercase
   * alphabet passed through search_simplify() must come back unchanged once
   * surrounding whitespace is trimmed.
   */
  public function testNoTokenizer() {
    // Same search settings as the CJK test: a minimum word size of 1 and CJK
    // overlap tokenizing enabled.
    $settings = $this->config('search.settings');
    $settings
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();

    // The letters a through z joined into one unbroken string.
    $alphabet = implode('', range('a', 'z'));
    $simplified = trim(search_simplify($alphabet));

    $this->assertEqual($alphabet, $simplified, 'Letters are not CJK tokenized');
  }

  /**
   * Like PHP chr() function, but for unicode characters.
   *
   * chr() only works for ASCII characters up to character 255. This function
   * converts a number to the corresponding unicode character. Adapted from
   * functions supplied in comments on several functions on php.net.
*
   * Values that have no well-formed UTF-8 encoding (negative numbers, the
   * UTF-16 surrogate range U+D800 to U+DFFF, and anything above U+10FFFF)
   * produce an empty string instead of an invalid byte sequence.
   *
   * @param int|float $num
   *   The Unicode code point to encode. Integral floats, such as the result of
   *   round(), are accepted and cast to an integer.
   *
   * @return string
   *   The UTF-8 encoded character, or an empty string if $num is not a valid
   *   Unicode scalar value.
   */
  public function code2utf($num) {
    // Normalize to a true integer so the comparisons and bit operations below
    // never operate on a float.
    $num = (int) $num;

    // Reject anything that is not a Unicode scalar value; encoding it would
    // yield ill-formed UTF-8.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(0xC0 | ($num >> 6))
        . chr(0x80 | ($num & 0x3F));
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(0xE0 | ($num >> 12))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    return chr(0xF0 | ($num >> 18))
      . chr(0x80 | (($num >> 12) & 0x3F))
      . chr(0x80 | (($num >> 6) & 0x3F))
      . chr(0x80 | ($num & 0x3F));
  }

}