annotate forum/Sources/SearchAPI-Fulltext.php @ 85:6d7b61434be7 website

Add a copy of this here, just in case!
author Chris Cannam
date Mon, 20 Jan 2014 11:02:36 +0000
parents e3e11437ecea
children
rev   line source
Chris@76 1 <?php
Chris@76 2
Chris@76 3 /**
Chris@76 4 * Simple Machines Forum (SMF)
Chris@76 5 *
Chris@76 6 * @package SMF
Chris@76 7 * @author Simple Machines http://www.simplemachines.org
Chris@76 8 * @copyright 2011 Simple Machines
Chris@76 9 * @license http://www.simplemachines.org/about/smf/license.php BSD
Chris@76 10 *
Chris@76 11 * @version 2.0
Chris@76 12 */
Chris@76 13
// Refuse to run when this file is loaded without the SMF constant defined.
if (defined('SMF') === false)
{
	die('Hacking attempt...');
}
Chris@76 16
Chris@76 17 /*
Chris@76 18 int searchSort(string $wordA, string $wordB)
Chris@76 19 - callback function for usort used to sort the fulltext results.
Chris@76 20 - the order of sorting is: large words, small words, large words that
Chris@76 21 are excluded from the search, small words that are excluded.
Chris@76 22 */
Chris@76 23
/**
 * Search API that builds its queries around a MySQL FULLTEXT index on the
 * message body (MATCH ... AGAINST), falling back to LIKE / RLIKE clauses for
 * the words that are handed to it as "regular" words.
 */
class fulltext_search
{
	// This is the last version of SMF that this was tested on, to protect against API changes.
	public $version_compatible = 'SMF 2.0';
	// This won't work with versions of SMF less than this.
	public $min_smf_version = 'SMF 2.0 Beta 2';
	// Is it supported?
	public $is_supported = true;

	// Can we do a boolean search - tested on construct.
	protected $canDoBooleanSearch = false;
	// What words are banned?
	protected $bannedWords = array();
	// What is the minimum word length?
	protected $min_word_length = 4;
	// What databases support the fulltext index?
	protected $supported_databases = array('mysql');

	/**
	 * Works out whether this API can be used with the current database and,
	 * if so, loads the banned word list and the minimum indexed word length.
	 *
	 * Reads the globals $db_type, $db_connection, $modSettings and $smcFunc.
	 * When $db_type is not listed in $supported_databases the object is
	 * flagged as unsupported and nothing else is initialised.
	 */
	public function __construct()
	{
		global $smcFunc, $db_connection, $modSettings, $db_type;

		// Is this database supported?
		if (!in_array($db_type, $this->supported_databases))
		{
			$this->is_supported = false;
			return;
		}

		// Boolean mode is only enabled for servers reporting version 4.0.1 or newer.
		$this->canDoBooleanSearch = version_compare($smcFunc['db_server_info']($db_connection), '4.0.1', '>=');

		$this->bannedWords = empty($modSettings['search_banned_words']) ? array() : explode(',', $modSettings['search_banned_words']);
		$this->min_word_length = $this->_getMinWordLength();
	}

	/**
	 * Check whether the method can be performed by this API.
	 *
	 * @param string $methodName Name of the search API method being asked about.
	 * @param mixed $query_params Not used by this implementation.
	 * @return bool True for searchSort, prepareIndexes and indexedWordQuery; false otherwise.
	 */
	public function supportsMethod($methodName, $query_params = null)
	{
		return in_array($methodName, array('searchSort', 'prepareIndexes', 'indexedWordQuery'), true);
	}

	/**
	 * What is the minimum word length full text supports?
	 *
	 * Asks the server for its ft_min_word_len variable. Falls back to 4 when
	 * the query fails or does not return exactly one row.
	 *
	 * @return int The minimum word length.
	 */
	protected function _getMinWordLength()
	{
		global $smcFunc;

		// 4 is the MySQL default...
		$min_word_length = 4;

		// Try to determine the minimum number of letters for a fulltext search.
		$request = $smcFunc['db_search_query']('max_fulltext_length', '
			SHOW VARIABLES
			LIKE {string:fulltext_minimum_word_length}',
			array(
				'fulltext_minimum_word_length' => 'ft_min_word_len',
			)
		);
		if ($request !== false)
		{
			if ($smcFunc['db_num_rows']($request) == 1)
			{
				// The row is (variable name, value); only the value matters.
				list (, $min_word_length) = $smcFunc['db_fetch_row']($request);
				$min_word_length = (int) $min_word_length;
			}

			// Always release the result, even when the row count was unexpected.
			$smcFunc['db_free_result']($request);
		}

		return $min_word_length;
	}

	/**
	 * Comparison callback that orders words by descending length, with every
	 * word found in the global $excludedWords list placed after all of the
	 * words that are not in it (excluded words are penalised by 1000).
	 *
	 * @param string $a First word.
	 * @param string $b Second word.
	 * @return int -1 when $a sorts first, 1 when $b sorts first, 0 when they tie.
	 */
	public function searchSort($a, $b)
	{
		global $excludedWords;

		// The global may not have been populated; treat that as "nothing excluded".
		$excluded = is_array($excludedWords) ? $excludedWords : array();

		$x = strlen($a) - (in_array($a, $excluded) ? 1000 : 0);
		$y = strlen($b) - (in_array($b, $excluded) ? 1000 : 0);

		return $x < $y ? 1 : ($x > $y ? -1 : 0);
	}

	/**
	 * Do we have to do some work with the words we are searching for to prepare them?
	 *
	 * Sorts a search term into $wordsSearch['indexed_words'] (terms that
	 * indexedWordQuery() feeds to MATCH ... AGAINST) and/or
	 * $wordsSearch['words'] (terms that indexedWordQuery() matches with
	 * LIKE / RLIKE), and records excluded index terms in $wordsExclude.
	 *
	 * @param string $word The search term; may contain several words.
	 * @param array &$wordsSearch Receives entries under 'words' and 'indexed_words'.
	 * @param array &$wordsExclude Receives the index terms that are excluded.
	 * @param bool $isExcluded Whether $word is an excluded term.
	 */
	public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
	{
		// $smcFunc is needed for the multibyte-aware strlen used below.
		global $modSettings, $smcFunc;

		$subwords = text2words($word, null, false);

		if (!$this->canDoBooleanSearch && count($subwords) > 1 && empty($modSettings['search_force_index']))
			$wordsSearch['words'][] = $word;

		if ($this->canDoBooleanSearch)
		{
			// Anything that is not exactly one word is passed on as a quoted phrase.
			$fulltextWord = count($subwords) === 1 ? $word : '"' . $word . '"';
			$wordsSearch['indexed_words'][] = $fulltextWord;
			if ($isExcluded)
				$wordsExclude[] = $fulltextWord;
		}
		// Excluded phrases don't benefit from being split into subwords.
		elseif (count($subwords) > 1 && $isExcluded)
			return;
		else
		{
			$relyOnIndex = true;
			foreach ($subwords as $subword)
			{
				$isBanned = in_array($subword, $this->bannedWords);

				if (!$isBanned && $smcFunc['strlen']($subword) >= $this->min_word_length)
				{
					$wordsSearch['indexed_words'][] = $subword;
					if ($isExcluded)
						$wordsExclude[] = $subword;
				}
				// Too short for the index (and not banned): the index alone can't answer this.
				elseif (!$isBanned)
					$relyOnIndex = false;
			}

			// NOTE(review): this branch is only reached when canDoBooleanSearch is
			// false, so the condition below can never be true. Kept unchanged
			// because the intended condition is not evident - confirm upstream.
			if ($this->canDoBooleanSearch && !$relyOnIndex && empty($modSettings['search_force_index']))
				$wordsSearch['words'][] = $word;
		}
	}

	/**
	 * Builds the SQL operator and the bound value used to match a term
	 * against a text column.
	 *
	 * @param string $term The word or phrase to look for.
	 * @param bool $useRegexp True for a whole-word RLIKE match, false for a LIKE substring match.
	 * @return array Two elements: the operator (padded with spaces) and the parameter value.
	 */
	protected function _buildTextMatch($term, $useRegexp)
	{
		if (!$useRegexp)
			return array(' LIKE ', '%' . strtr($term, array('_' => '\\_', '%' => '\\%')) . '%');

		// Wrap regex metacharacters in brackets so they match literally, then
		// anchor the pattern on word boundaries.
		$pattern = preg_replace(array('/([\[\]$.+*?|{}()])/'), array('[$1]'), $term);

		return array(' RLIKE ', '[[:<:]]' . addcslashes($pattern, '\\\'') . '[[:>:]]');
	}

	/**
	 * Search for indexed words.
	 *
	 * Assembles one (INSERT IGNORE ...) SELECT over {db_prefix}messages and
	 * runs it through $smcFunc['db_search_query'].
	 *
	 * @param array $words Uses 'indexed_words' (fulltext terms) and 'words' (LIKE / RLIKE terms).
	 * @param array $search_data Uses 'params' (query parameters and filters),
	 *        'insert_into', 'max_results' and 'indexed_results'.
	 * @return mixed Whatever $smcFunc['db_search_query'] returns.
	 */
	public function indexedWordQuery($words, $search_data)
	{
		global $modSettings, $smcFunc;

		$query_select = array(
			'id_msg' => 'm.id_msg',
		);
		$query_where = array();
		$query_params = $search_data['params'];

		// The original code also consulted a $no_regexp variable that was never
		// defined in this scope (so it always evaluated to false); whole-word
		// matching is therefore driven by the setting alone.
		$useRegexp = !empty($modSettings['search_match_words']);
		$forceIndex = !empty($modSettings['search_force_index']);
		$simpleFulltext = !empty($modSettings['search_simple_fulltext']);

		$excludedWords = empty($query_params['excluded_words']) ? array() : $query_params['excluded_words'];
		$excludedIndexWords = empty($query_params['excluded_index_words']) ? array() : $query_params['excluded_index_words'];
		$indexedWords = empty($words['indexed_words']) ? array() : $words['indexed_words'];

		if (!empty($query_params['id_search']))
			$query_select['id_search'] = '{int:id_search}';

		// Words that have to be matched against the body directly.
		$count = 0;
		if (!$simpleFulltext && !empty($words['words']))
			foreach ($words['words'] as $regularWord)
			{
				list ($operator, $value) = $this->_buildTextMatch($regularWord, $useRegexp);

				$query_where[] = 'm.body' . (in_array($regularWord, $excludedWords) ? ' NOT' : '') . $operator . '{string:complex_body_' . $count . '}';
				$query_params['complex_body_' . $count++] = $value;
			}

		// Optional filters supplied by the caller.
		if (!empty($query_params['user_query']))
			$query_where[] = '{raw:user_query}';
		if (!empty($query_params['board_query']))
			$query_where[] = 'm.id_board {raw:board_query}';

		if (!empty($query_params['topic']))
			$query_where[] = 'm.id_topic = {int:topic}';
		if (!empty($query_params['min_msg_id']))
			$query_where[] = 'm.id_msg >= {int:min_msg_id}';
		if (!empty($query_params['max_msg_id']))
			$query_where[] = 'm.id_msg <= {int:max_msg_id}';

		// Phrases that must not appear in the subject.
		$count = 0;
		if (!empty($query_params['excluded_phrases']) && !$forceIndex)
			foreach ($query_params['excluded_phrases'] as $phrase)
			{
				list ($operator, $value) = $this->_buildTextMatch($phrase, $useRegexp);

				$query_where[] = 'subject NOT' . $operator . '{string:exclude_subject_phrase_' . $count . '}';
				$query_params['exclude_subject_phrase_' . $count++] = $value;
			}

		// Single words that must not appear in the subject.
		$count = 0;
		if (!empty($query_params['excluded_subject_words']) && !$forceIndex)
			foreach ($query_params['excluded_subject_words'] as $excludedWord)
			{
				list ($operator, $value) = $this->_buildTextMatch($excludedWord, $useRegexp);

				$query_where[] = 'subject NOT' . $operator . '{string:exclude_subject_words_' . $count . '}';
				$query_params['exclude_subject_words_' . $count++] = $value;
			}

		if ($simpleFulltext)
		{
			// One natural-language match over every non-excluded index word.
			$query_where[] = 'MATCH (body) AGAINST ({string:body_match})';
			$query_params['body_match'] = implode(' ', array_diff($indexedWords, $excludedIndexWords));
		}
		elseif ($this->canDoBooleanSearch)
		{
			// Boolean mode: prefix required terms with "+" and excluded ones with "-".
			$booleanTerms = array();
			foreach ($indexedWords as $fulltextWord)
				$booleanTerms[] = (in_array($fulltextWord, $excludedIndexWords) ? '-' : '+') . $fulltextWord;
			$query_params['boolean_match'] = implode(' ', $booleanTerms);

			$query_where[] = 'MATCH (body) AGAINST ({string:boolean_match} IN BOOLEAN MODE)';
		}
		else
		{
			// No boolean mode: one MATCH clause per word, negated for excluded words.
			$count = 0;
			foreach ($indexedWords as $fulltextWord)
			{
				$query_where[] = (in_array($fulltextWord, $excludedIndexWords) ? 'NOT ' : '') . 'MATCH (body) AGAINST ({string:fulltext_match_' . $count . '})';
				$query_params['fulltext_match_' . $count++] = $fulltextWord;
			}
		}

		$ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', ($smcFunc['db_support_ignore'] ? ( '
			INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
				(' . implode(', ', array_keys($query_select)) . ')') : '') . '
			SELECT ' . implode(', ', $query_select) . '
			FROM {db_prefix}messages AS m
			WHERE ' . implode('
				AND ', $query_where) . (empty($search_data['max_results']) ? '' : '
			LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results'])),
			$query_params
		);

		return $ignoreRequest;
	}
}
Chris@76 240
Chris@76 241 ?>