Chris@16
|
1 /*
|
Chris@16
|
2 Copyright (c) Marshall Clow 2010-2012.
|
Chris@16
|
3
|
Chris@16
|
4 Distributed under the Boost Software License, Version 1.0. (See accompanying
|
Chris@16
|
5 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
Chris@16
|
6
|
Chris@16
|
7 For more information, see http://www.boost.org
|
Chris@16
|
8 */
|
Chris@16
|
9
|
Chris@16
|
10 #ifndef BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
|
Chris@16
|
11 #define BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
|
Chris@16
|
12
|
Chris@16
|
13 #include <iterator> // for std::iterator_traits
|
Chris@16
|
14
|
Chris@16
|
15 #include <boost/assert.hpp>
|
Chris@16
|
16 #include <boost/static_assert.hpp>
|
Chris@16
|
17
|
Chris@16
|
18 #include <boost/range/begin.hpp>
|
Chris@16
|
19 #include <boost/range/end.hpp>
|
Chris@16
|
20
|
Chris@16
|
21 #include <boost/utility/enable_if.hpp>
|
Chris@16
|
22 #include <boost/type_traits/is_same.hpp>
|
Chris@16
|
23
|
Chris@16
|
24 #include <boost/algorithm/searching/detail/bm_traits.hpp>
|
Chris@16
|
25 #include <boost/algorithm/searching/detail/debugging.hpp>
|
Chris@16
|
26
|
Chris@16
|
27 namespace boost { namespace algorithm {
|
Chris@16
|
28
|
Chris@16
|
29 /*
|
Chris@16
|
30 A templated version of the boyer-moore searching algorithm.
|
Chris@16
|
31
|
Chris@16
|
32 References:
|
Chris@16
|
33 http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
|
Chris@16
|
34 http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
|
Chris@16
|
35
|
Chris@16
|
36 Explanations:
|
Chris@16
|
37 http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
|
Chris@16
|
38 http://www.movsd.com/bm.htm
|
Chris@16
|
39 http://www.cs.ucdavis.edu/~gusfield/cs224f09/bnotes.pdf
|
Chris@16
|
40
|
Chris@16
|
41 The Boyer-Moore search algorithm uses two tables, a "bad character" table
|
Chris@16
|
42 to tell how far to skip ahead when it hits a character that is not in the pattern,
|
Chris@16
|
43 and a "good character" table to tell how far to skip ahead when it hits a
|
Chris@16
|
44 mismatch on a character that _is_ in the pattern.
|
Chris@16
|
45
|
Chris@16
|
46 Requirements:
|
Chris@16
|
47 * Random access iterators
|
Chris@16
|
48 * The two iterator types (patIter and corpusIter) must
|
Chris@16
|
49 "point to" the same underlying type and be comparable.
|
Chris@16
|
50 * Additional requirements may be imposed but the skip table, such as:
|
Chris@16
|
51 ** Numeric type (array-based skip table)
|
Chris@16
|
52 ** Hashable type (map-based skip table)
|
Chris@16
|
53 */
|
Chris@16
|
54
|
Chris@16
|
55 template <typename patIter, typename traits = detail::BM_traits<patIter> >
|
Chris@16
|
56 class boyer_moore {
|
Chris@16
|
57 typedef typename std::iterator_traits<patIter>::difference_type difference_type;
|
Chris@16
|
58 public:
|
Chris@16
|
59 boyer_moore ( patIter first, patIter last )
|
Chris@16
|
60 : pat_first ( first ), pat_last ( last ),
|
Chris@16
|
61 k_pattern_length ( std::distance ( pat_first, pat_last )),
|
Chris@16
|
62 skip_ ( k_pattern_length, -1 ),
|
Chris@16
|
63 suffix_ ( k_pattern_length + 1 )
|
Chris@16
|
64 {
|
Chris@16
|
65 this->build_skip_table ( first, last );
|
Chris@16
|
66 this->build_suffix_table ( first, last );
|
Chris@16
|
67 }
|
Chris@16
|
68
|
Chris@16
|
69 ~boyer_moore () {}
|
Chris@16
|
70
|
Chris@16
|
71 /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last )
|
Chris@16
|
72 /// \brief Searches the corpus for the pattern that was passed into the constructor
|
Chris@16
|
73 ///
|
Chris@16
|
74 /// \param corpus_first The start of the data to search (Random Access Iterator)
|
Chris@16
|
75 /// \param corpus_last One past the end of the data to search
|
Chris@16
|
76 ///
|
Chris@16
|
77 template <typename corpusIter>
|
Chris@16
|
78 corpusIter operator () ( corpusIter corpus_first, corpusIter corpus_last ) const {
|
Chris@16
|
79 BOOST_STATIC_ASSERT (( boost::is_same<
|
Chris@16
|
80 typename std::iterator_traits<patIter>::value_type,
|
Chris@16
|
81 typename std::iterator_traits<corpusIter>::value_type>::value ));
|
Chris@16
|
82
|
Chris@16
|
83 if ( corpus_first == corpus_last ) return corpus_last; // if nothing to search, we didn't find it!
|
Chris@16
|
84 if ( pat_first == pat_last ) return corpus_first; // empty pattern matches at start
|
Chris@16
|
85
|
Chris@16
|
86 const difference_type k_corpus_length = std::distance ( corpus_first, corpus_last );
|
Chris@16
|
87 // If the pattern is larger than the corpus, we can't find it!
|
Chris@16
|
88 if ( k_corpus_length < k_pattern_length )
|
Chris@16
|
89 return corpus_last;
|
Chris@16
|
90
|
Chris@16
|
91 // Do the search
|
Chris@16
|
92 return this->do_search ( corpus_first, corpus_last );
|
Chris@16
|
93 }
|
Chris@16
|
94
|
Chris@16
|
95 template <typename Range>
|
Chris@16
|
96 typename boost::range_iterator<Range>::type operator () ( Range &r ) const {
|
Chris@16
|
97 return (*this) (boost::begin(r), boost::end(r));
|
Chris@16
|
98 }
|
Chris@16
|
99
|
Chris@16
|
100 private:
|
Chris@16
|
101 /// \cond DOXYGEN_HIDE
|
Chris@16
|
102 patIter pat_first, pat_last;
|
Chris@16
|
103 const difference_type k_pattern_length;
|
Chris@16
|
104 typename traits::skip_table_t skip_;
|
Chris@16
|
105 std::vector <difference_type> suffix_;
|
Chris@16
|
106
|
Chris@16
|
107 /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
|
Chris@16
|
108 /// \brief Searches the corpus for the pattern that was passed into the constructor
|
Chris@16
|
109 ///
|
Chris@16
|
110 /// \param corpus_first The start of the data to search (Random Access Iterator)
|
Chris@16
|
111 /// \param corpus_last One past the end of the data to search
|
Chris@16
|
112 /// \param p A predicate used for the search comparisons.
|
Chris@16
|
113 ///
|
Chris@16
|
114 template <typename corpusIter>
|
Chris@16
|
115 corpusIter do_search ( corpusIter corpus_first, corpusIter corpus_last ) const {
|
Chris@16
|
116 /* ---- Do the matching ---- */
|
Chris@16
|
117 corpusIter curPos = corpus_first;
|
Chris@16
|
118 const corpusIter lastPos = corpus_last - k_pattern_length;
|
Chris@16
|
119 difference_type j, k, m;
|
Chris@16
|
120
|
Chris@16
|
121 while ( curPos <= lastPos ) {
|
Chris@16
|
122 /* while ( std::distance ( curPos, corpus_last ) >= k_pattern_length ) { */
|
Chris@16
|
123 // Do we match right where we are?
|
Chris@16
|
124 j = k_pattern_length;
|
Chris@16
|
125 while ( pat_first [j-1] == curPos [j-1] ) {
|
Chris@16
|
126 j--;
|
Chris@16
|
127 // We matched - we're done!
|
Chris@16
|
128 if ( j == 0 )
|
Chris@16
|
129 return curPos;
|
Chris@16
|
130 }
|
Chris@16
|
131
|
Chris@16
|
132 // Since we didn't match, figure out how far to skip forward
|
Chris@16
|
133 k = skip_ [ curPos [ j - 1 ]];
|
Chris@16
|
134 m = j - k - 1;
|
Chris@16
|
135 if ( k < j && m > suffix_ [ j ] )
|
Chris@16
|
136 curPos += m;
|
Chris@16
|
137 else
|
Chris@16
|
138 curPos += suffix_ [ j ];
|
Chris@16
|
139 }
|
Chris@16
|
140
|
Chris@16
|
141 return corpus_last; // We didn't find anything
|
Chris@16
|
142 }
|
Chris@16
|
143
|
Chris@16
|
144
|
Chris@16
|
145 void build_skip_table ( patIter first, patIter last ) {
|
Chris@16
|
146 for ( std::size_t i = 0; first != last; ++first, ++i )
|
Chris@16
|
147 skip_.insert ( *first, i );
|
Chris@16
|
148 }
|
Chris@16
|
149
|
Chris@16
|
150
|
Chris@16
|
151 template<typename Iter, typename Container>
|
Chris@16
|
152 void compute_bm_prefix ( Iter pat_first, Iter pat_last, Container &prefix ) {
|
Chris@16
|
153 const std::size_t count = std::distance ( pat_first, pat_last );
|
Chris@16
|
154 BOOST_ASSERT ( count > 0 );
|
Chris@16
|
155 BOOST_ASSERT ( prefix.size () == count );
|
Chris@16
|
156
|
Chris@16
|
157 prefix[0] = 0;
|
Chris@16
|
158 std::size_t k = 0;
|
Chris@16
|
159 for ( std::size_t i = 1; i < count; ++i ) {
|
Chris@16
|
160 BOOST_ASSERT ( k < count );
|
Chris@16
|
161 while ( k > 0 && ( pat_first[k] != pat_first[i] )) {
|
Chris@16
|
162 BOOST_ASSERT ( k < count );
|
Chris@16
|
163 k = prefix [ k - 1 ];
|
Chris@16
|
164 }
|
Chris@16
|
165
|
Chris@16
|
166 if ( pat_first[k] == pat_first[i] )
|
Chris@16
|
167 k++;
|
Chris@16
|
168 prefix [ i ] = k;
|
Chris@16
|
169 }
|
Chris@16
|
170 }
|
Chris@16
|
171
|
Chris@16
|
172 void build_suffix_table ( patIter pat_first, patIter pat_last ) {
|
Chris@16
|
173 const std::size_t count = (std::size_t) std::distance ( pat_first, pat_last );
|
Chris@16
|
174
|
Chris@16
|
175 if ( count > 0 ) { // empty pattern
|
Chris@16
|
176 std::vector<typename std::iterator_traits<patIter>::value_type> reversed(count);
|
Chris@16
|
177 (void) std::reverse_copy ( pat_first, pat_last, reversed.begin ());
|
Chris@16
|
178
|
Chris@16
|
179 std::vector<difference_type> prefix (count);
|
Chris@16
|
180 compute_bm_prefix ( pat_first, pat_last, prefix );
|
Chris@16
|
181
|
Chris@16
|
182 std::vector<difference_type> prefix_reversed (count);
|
Chris@16
|
183 compute_bm_prefix ( reversed.begin (), reversed.end (), prefix_reversed );
|
Chris@16
|
184
|
Chris@16
|
185 for ( std::size_t i = 0; i <= count; i++ )
|
Chris@16
|
186 suffix_[i] = count - prefix [count-1];
|
Chris@16
|
187
|
Chris@16
|
188 for ( std::size_t i = 0; i < count; i++ ) {
|
Chris@16
|
189 const std::size_t j = count - prefix_reversed[i];
|
Chris@16
|
190 const difference_type k = i - prefix_reversed[i] + 1;
|
Chris@16
|
191
|
Chris@16
|
192 if (suffix_[j] > k)
|
Chris@16
|
193 suffix_[j] = k;
|
Chris@16
|
194 }
|
Chris@16
|
195 }
|
Chris@16
|
196 }
|
Chris@16
|
197 /// \endcond
|
Chris@16
|
198 };
|
Chris@16
|
199
|
Chris@16
|
200
|
Chris@16
|
201 /* Two ranges as inputs gives us four possibilities; with 2,3,3,4 parameters
|
Chris@16
|
202 Use a bit of TMP to disambiguate the 3-argument templates */
|
Chris@16
|
203
|
Chris@16
|
204 /// \fn boyer_moore_search ( corpusIter corpus_first, corpusIter corpus_last,
|
Chris@16
|
205 /// patIter pat_first, patIter pat_last )
|
Chris@16
|
206 /// \brief Searches the corpus for the pattern.
|
Chris@16
|
207 ///
|
Chris@16
|
208 /// \param corpus_first The start of the data to search (Random Access Iterator)
|
Chris@16
|
209 /// \param corpus_last One past the end of the data to search
|
Chris@16
|
210 /// \param pat_first The start of the pattern to search for (Random Access Iterator)
|
Chris@16
|
211 /// \param pat_last One past the end of the data to search for
|
Chris@16
|
212 ///
|
Chris@16
|
213 template <typename patIter, typename corpusIter>
|
Chris@16
|
214 corpusIter boyer_moore_search (
|
Chris@16
|
215 corpusIter corpus_first, corpusIter corpus_last,
|
Chris@16
|
216 patIter pat_first, patIter pat_last )
|
Chris@16
|
217 {
|
Chris@16
|
218 boyer_moore<patIter> bm ( pat_first, pat_last );
|
Chris@16
|
219 return bm ( corpus_first, corpus_last );
|
Chris@16
|
220 }
|
Chris@16
|
221
|
Chris@16
|
222 template <typename PatternRange, typename corpusIter>
|
Chris@16
|
223 corpusIter boyer_moore_search (
|
Chris@16
|
224 corpusIter corpus_first, corpusIter corpus_last, const PatternRange &pattern )
|
Chris@16
|
225 {
|
Chris@16
|
226 typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
|
Chris@16
|
227 boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
|
Chris@16
|
228 return bm ( corpus_first, corpus_last );
|
Chris@16
|
229 }
|
Chris@16
|
230
|
Chris@16
|
231 template <typename patIter, typename CorpusRange>
|
Chris@16
|
232 typename boost::lazy_disable_if_c<
|
Chris@16
|
233 boost::is_same<CorpusRange, patIter>::value, typename boost::range_iterator<CorpusRange> >
|
Chris@16
|
234 ::type
|
Chris@16
|
235 boyer_moore_search ( CorpusRange &corpus, patIter pat_first, patIter pat_last )
|
Chris@16
|
236 {
|
Chris@16
|
237 boyer_moore<patIter> bm ( pat_first, pat_last );
|
Chris@16
|
238 return bm (boost::begin (corpus), boost::end (corpus));
|
Chris@16
|
239 }
|
Chris@16
|
240
|
Chris@16
|
241 template <typename PatternRange, typename CorpusRange>
|
Chris@16
|
242 typename boost::range_iterator<CorpusRange>::type
|
Chris@16
|
243 boyer_moore_search ( CorpusRange &corpus, const PatternRange &pattern )
|
Chris@16
|
244 {
|
Chris@16
|
245 typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
|
Chris@16
|
246 boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
|
Chris@16
|
247 return bm (boost::begin (corpus), boost::end (corpus));
|
Chris@16
|
248 }
|
Chris@16
|
249
|
Chris@16
|
250
|
Chris@16
|
251 // Creator functions -- take a pattern range, return an object
|
Chris@16
|
252 template <typename Range>
|
Chris@16
|
253 boost::algorithm::boyer_moore<typename boost::range_iterator<const Range>::type>
|
Chris@16
|
254 make_boyer_moore ( const Range &r ) {
|
Chris@16
|
255 return boost::algorithm::boyer_moore
|
Chris@16
|
256 <typename boost::range_iterator<const Range>::type> (boost::begin(r), boost::end(r));
|
Chris@16
|
257 }
|
Chris@16
|
258
|
Chris@16
|
259 template <typename Range>
|
Chris@16
|
260 boost::algorithm::boyer_moore<typename boost::range_iterator<Range>::type>
|
Chris@16
|
261 make_boyer_moore ( Range &r ) {
|
Chris@16
|
262 return boost::algorithm::boyer_moore
|
Chris@16
|
263 <typename boost::range_iterator<Range>::type> (boost::begin(r), boost::end(r));
|
Chris@16
|
264 }
|
Chris@16
|
265
|
Chris@16
|
266 }}
|
Chris@16
|
267
|
Chris@16
|
268 #endif // BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
|