cmmr2012-drupal-site: core/modules/search/tests/src/Functional/SearchTokenizerTest.php comparison

comparison core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b

Initial commit from drush-created site

author	Chris Cannam
date	Thu, 05 Jul 2018 14:24:15 +0000
parents
children	a9cd425dd02b

comparison

equal deleted inserted replaced

--1:000000000000
+:c75dbcec494b
+<?php
+namespace Drupal\Tests\search\Functional;
+use Drupal\Component\Utility\Unicode;
+/**
+ * Tests that the CJK tokenizer works as intended.
+ *
+ * @group search
+ */
+class SearchTokenizerTest extends SearchTestBase {
+
+  /**
+   * Verifies that strings of CJK characters are tokenized.
+   *
+   * The search_simplify() function does special things with numbers, symbols,
+   * and punctuation. So we only test that CJK characters that are not in these
+   * character classes are tokenized properly. See PREG_CLASS_CJK for more
+   * information.
+   */
+  public function testTokenizer() {
+    $this->enableCjkTokenizer();
+
+    // Inclusive [first, last] code points of the character ranges to sample.
+    // Keeping both bounds in one entry guarantees every range has a start and
+    // an end; the labels are descriptive only.
+    $ranges = [
+      'CJK unified' => [0x4e00, 0x9fcf],
+      'CJK Ext A' => [0x3400, 0x4dbf],
+      'CJK Compat' => [0xf900, 0xfaff],
+      'Hangul Jamo' => [0x1100, 0x11ff],
+      'Hangul Ext A' => [0xa960, 0xa97f],
+      'Hangul Ext B' => [0xd7b0, 0xd7ff],
+      'Hangul Compat' => [0x3131, 0x318e],
+      'Half non-punct 1' => [0xff21, 0xff3a],
+      'Half non-punct 2' => [0xff41, 0xff5a],
+      'Half non-punct 3' => [0xff66, 0xffdc],
+      'Hangul Syllables' => [0xac00, 0xd7af],
+      'Hiragana' => [0x3040, 0x309f],
+      'Katakana' => [0x30a1, 0x30ff],
+      'Katakana Ext' => [0x31f0, 0x31ff],
+      'CJK Reserve 1' => [0x20000, 0x2fffd],
+      'CJK Reserve 2' => [0x30000, 0x3fffd],
+      'Bomofo' => [0x3100, 0x312f],
+      'Bomofo Ext' => [0x31a0, 0x31b7],
+      'Lisu' => [0xa4d0, 0xa4fd],
+      'Yi' => [0xa000, 0xa48f],
+    ];
+
+    // Sample the first, middle, and last character of every range.
+    $chars = [];
+    foreach ($ranges as list($first, $last)) {
+      // round() returns a float; cast it so code2utf() receives an integer
+      // code point for its bit operations and chr() calls.
+      $middle = (int) round(0.5 * ($first + $last));
+      $chars[] = $this->code2utf($first);
+      $chars[] = $this->code2utf($middle);
+      $chars[] = $this->code2utf($last);
+    }
+
+    // Feed the characters in as one unbroken string; the expected result is
+    // the same characters separated by single spaces (and lower-cased).
+    $string = implode('', $chars);
+    $out = trim(search_simplify($string));
+    $expected = Unicode::strtolower(implode(' ', $chars));
+
+    // Verify that the output matches what we expect.
+    $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
+  }
+
+  /**
+   * Verifies that strings of non-CJK characters are not tokenized.
+   *
+   * This is just a sanity check - it verifies that strings of letters are
+   * not tokenized.
+   */
+  public function testNoTokenizer() {
+    $this->enableCjkTokenizer();
+
+    $letters = 'abcdefghijklmnopqrstuvwxyz';
+    $out = trim(search_simplify($letters));
+    $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
+  }
+
+  /**
+   * Configures search indexing for the tokenizer tests.
+   *
+   * Sets the minimum word size to 1 (to split all CJK characters), makes sure
+   * CJK tokenizing is turned on, saves the configuration and then refreshes
+   * the variables held by the test.
+   */
+  private function enableCjkTokenizer() {
+    $this->config('search.settings')
+      ->set('index.minimum_word_size', 1)
+      ->set('index.overlap_cjk', TRUE)
+      ->save();
+    $this->refreshVariables();
+  }
+
+  /**
+   * Like PHP chr() function, but for unicode characters.
+   *
+   * chr() only produces a single byte (values up to 255). This function
+   * converts a code point number to the corresponding character, encoded as a
+   * one- to four-byte UTF-8 sequence. Adapted from functions supplied in
+   * comments on several functions on php.net.
+   *
+   * @param int $num
+   *   The code point to convert.
+   *
+   * @return string
+   *   The UTF-8 encoded character, or an empty string if $num is 0x200000
+   *   (2097152) or greater and therefore does not fit in four bytes.
+   */
+  public function code2utf($num) {
+    // One byte: 0xxxxxxx.
+    if ($num < 0x80) {
+      return chr($num);
+    }
+    // Two bytes: 110xxxxx 10xxxxxx.
+    if ($num < 0x800) {
+      return chr(0xC0 | ($num >> 6))
+        . chr(0x80 | ($num & 0x3F));
+    }
+    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+    if ($num < 0x10000) {
+      return chr(0xE0 | ($num >> 12))
+        . chr(0x80 | (($num >> 6) & 0x3F))
+        . chr(0x80 | ($num & 0x3F));
+    }
+    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    if ($num < 0x200000) {
+      return chr(0xF0 | ($num >> 18))
+        . chr(0x80 | (($num >> 12) & 0x3F))
+        . chr(0x80 | (($num >> 6) & 0x3F))
+        . chr(0x80 | ($num & 0x3F));
+    }
+    return '';
+  }
+
+}

Mercurial > hg > cmmr2012-drupal-site

comparison core/modules/search/tests/src/Functional/SearchTokenizerTest.php @ 0:c75dbcec494b