Chris@16
|
1 /*
|
Chris@16
|
2 *
|
Chris@16
|
3 * Copyright (c) 1998-2002
|
Chris@16
|
4 * John Maddock
|
Chris@16
|
5 *
|
Chris@16
|
6 * Use, modification and distribution are subject to the
|
Chris@16
|
7 * Boost Software License, Version 1.0. (See accompanying file
|
Chris@16
|
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
Chris@16
|
9 *
|
Chris@16
|
10 */
|
Chris@16
|
11
|
Chris@16
|
12 /*
|
Chris@16
|
13 * LOCATION: see http://www.boost.org for most recent version.
|
Chris@16
|
14 * FILE states.hpp
|
Chris@16
|
15 * VERSION see <boost/version.hpp>
|
Chris@16
|
16 * DESCRIPTION: Declares internal state machine structures.
|
Chris@16
|
17 */
|
Chris@16
|
18
|
Chris@16
|
19 #ifndef BOOST_REGEX_V4_STATES_HPP
|
Chris@16
|
20 #define BOOST_REGEX_V4_STATES_HPP
|
Chris@16
|
21
|
Chris@16
|
22 #ifdef BOOST_MSVC
|
Chris@16
|
23 #pragma warning(push)
|
Chris@16
|
24 #pragma warning(disable: 4103)
|
Chris@16
|
25 #endif
|
Chris@16
|
26 #ifdef BOOST_HAS_ABI_HEADERS
|
Chris@16
|
27 # include BOOST_ABI_PREFIX
|
Chris@16
|
28 #endif
|
Chris@16
|
29 #ifdef BOOST_MSVC
|
Chris@16
|
30 #pragma warning(pop)
|
Chris@16
|
31 #endif
|
Chris@16
|
32
|
Chris@16
|
33 namespace boost{
|
Chris@16
|
34 namespace re_detail{
|
Chris@16
|
35
|
Chris@16
|
/*** mask_type *********************************************************
Wherever the machine has to choose between two alternatives, an array of
bytes indexed by input character records which of them may be taken.
mask_take set: the next state may be taken.
mask_skip set: the alternative may be taken.
***********************************************************************/
enum mask_type
{
   mask_take = 1 << 0,                 // 1: the next state is viable
   mask_skip = 1 << 1,                 // 2: the alternative is viable
   mask_init = 1 << 2,                 // 4: further flag bit; its use is not visible in this header
   mask_any  = mask_take | mask_skip,  // 3: either branch is viable
   mask_all  = mask_any                // same value as mask_any
};
|
Chris@16
|
50
|
Chris@16
|
/*** helpers ***********************************************************
Tag types plus a trait that maps a character type onto one of them, so
that function overload resolution can tell narrow character strings
from wide ones.
***********************************************************************/
struct _narrow_type{};
struct _wide_type{};

// Primary template: any character type without a specialisation below
// is classified as wide.
template <class charT>
struct is_byte
{
   typedef _wide_type width_type;
};

// The three one-byte character types are classified as narrow.
template<>
struct is_byte<char>
{
   typedef _narrow_type width_type;
};
template<>
struct is_byte<unsigned char>
{
   typedef _narrow_type width_type;
};
template<>
struct is_byte<signed char>
{
   typedef _narrow_type width_type;
};
|
Chris@16
|
62
|
Chris@16
|
/*** enum syntax_element_type ******************************************
Every record in the state machine is of exactly one of these kinds.
The enumerators are numbered consecutively from zero in the order
listed, so the order must not be changed.
***********************************************************************/
enum syntax_element_type
{
   syntax_element_startmark = 0,     // start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark,           // end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_literal,           // any sequence of literal characters
   syntax_element_start_line,        // start of line assertion: ^
   syntax_element_end_line,          // end of line assertion: $
   syntax_element_wild,              // match any character: .
   syntax_element_match,             // end of expression: we have a match when we get here
   syntax_element_word_boundary,     // perl style word boundary: \b
   syntax_element_within_word,       // perl style within word boundary: \B
   syntax_element_word_start,        // start of word assertion: \<
   syntax_element_word_end,          // end of word assertion: \>
   syntax_element_buffer_start,      // start of buffer assertion: \`
   syntax_element_buffer_end,        // end of buffer assertion: \'
   syntax_element_backref,           // backreference to previously matched sub-expression
   syntax_element_long_set,          // either a wide character set [..] or one with multicharacter collating elements
   syntax_element_set,               // narrow character set: [...]
   syntax_element_jump,              // jump to a new state in the machine
   syntax_element_alt,               // choose between two production states
   syntax_element_rep,               // a repeat
   syntax_element_combining,         // match a combining character sequence
   syntax_element_soft_buffer_end,   // perl style soft buffer end: \z
   syntax_element_restart_continue,  // perl style continuation: \G
   // single character repeats:
   syntax_element_dot_rep,
   syntax_element_char_rep,
   syntax_element_short_set_rep,
   syntax_element_long_set_rep,
   syntax_element_backstep,          // a backstep for lookbehind repeats
   syntax_element_assert_backref,    // an assertion that a mark was matched
   syntax_element_toggle_case,
   syntax_element_recurse            // a recursive expression
};
|
Chris@16
|
125
|
Chris@16
|
#ifdef BOOST_REGEX_DEBUG
// Debug builds only.  Declared (dwa 09/26/00) to suppress warnings about
// an ambiguous conversion when a syntax_element_type is streamed.
std::ostream& operator<<(std::ostream& os, syntax_element_type type);
#endif
|
Chris@16
|
130
|
Chris@16
|
// Declared here so that offset_type can hold a pointer to a state.
struct re_syntax_base;

/*** union offset_type *************************************************
Refers to another state in the machine.  While the machine is being
constructed the reference is an integral offset (i); before the machine
is executed these are converted to pointers (p).
***********************************************************************/
union offset_type
{
   re_syntax_base* p;   // pointer form
   std::ptrdiff_t  i;   // integral-offset form
};
|
Chris@16
|
143
|
Chris@16
|
144 /*** struct re_syntax_base ********************************************
|
Chris@16
|
145 Base class for all states in the machine.
|
Chris@16
|
146 ***********************************************************************/
|
Chris@16
|
147 struct re_syntax_base
|
Chris@16
|
148 {
|
Chris@16
|
149 syntax_element_type type; // what kind of state this is
|
Chris@16
|
150 offset_type next; // next state in the machine
|
Chris@16
|
151 };
|
Chris@16
|
152
|
Chris@16
|
153 /*** struct re_brace **************************************************
|
Chris@16
|
154 A marked parenthesis.
|
Chris@16
|
155 ***********************************************************************/
|
Chris@16
|
156 struct re_brace : public re_syntax_base
|
Chris@16
|
157 {
|
Chris@16
|
158 // The index to match, can be zero (don't mark the sub-expression)
|
Chris@16
|
159 // or negative (for perl style (?...) extentions):
|
Chris@16
|
160 int index;
|
Chris@16
|
161 bool icase;
|
Chris@16
|
162 };
|
Chris@16
|
163
|
Chris@16
|
/*** struct re_dot *****************************************************
Match anything.
The unnamed enumeration below is declared alongside re_dot; presumably
its values are what re_dot::mask is set to or tested against (confirm
against the matcher).  Note that force_newline and test_not_newline
share the value 2.
***********************************************************************/
enum
{
   force_not_newline = 0,
   dont_care         = 1,
   force_newline     = 2,

   test_not_newline  = 2,
   test_newline      = 3
};
|
Chris@16
|
176 struct re_dot : public re_syntax_base
|
Chris@16
|
177 {
|
Chris@16
|
178 unsigned char mask;
|
Chris@16
|
179 };
|
Chris@16
|
180
|
Chris@16
|
181 /*** struct re_literal ************************************************
|
Chris@16
|
182 A string of literals, following this structure will be an
|
Chris@16
|
183 array of characters: charT[length]
|
Chris@16
|
184 ***********************************************************************/
|
Chris@16
|
185 struct re_literal : public re_syntax_base
|
Chris@16
|
186 {
|
Chris@16
|
187 unsigned int length;
|
Chris@16
|
188 };
|
Chris@16
|
189
|
Chris@16
|
190 /*** struct re_case ************************************************
|
Chris@16
|
191 Indicates whether we are moving to a case insensive block or not
|
Chris@16
|
192 ***********************************************************************/
|
Chris@16
|
193 struct re_case : public re_syntax_base
|
Chris@16
|
194 {
|
Chris@16
|
195 bool icase;
|
Chris@16
|
196 };
|
Chris@16
|
197
|
Chris@16
|
198 /*** struct re_set_long ***********************************************
|
Chris@16
|
199 A wide character set of characters, following this structure will be
|
Chris@16
|
200 an array of type charT:
|
Chris@16
|
201 First csingles null-terminated strings
|
Chris@16
|
202 Then 2 * cranges NULL terminated strings
|
Chris@16
|
203 Then cequivalents NULL terminated strings
|
Chris@16
|
204 ***********************************************************************/
|
Chris@16
|
205 template <class mask_type>
|
Chris@16
|
206 struct re_set_long : public re_syntax_base
|
Chris@16
|
207 {
|
Chris@16
|
208 unsigned int csingles, cranges, cequivalents;
|
Chris@16
|
209 mask_type cclasses;
|
Chris@16
|
210 mask_type cnclasses;
|
Chris@16
|
211 bool isnot;
|
Chris@16
|
212 bool singleton;
|
Chris@16
|
213 };
|
Chris@16
|
214
|
Chris@16
|
215 /*** struct re_set ****************************************************
|
Chris@16
|
216 A set of narrow-characters, matches any of _map which is none-zero
|
Chris@16
|
217 ***********************************************************************/
|
Chris@16
|
218 struct re_set : public re_syntax_base
|
Chris@16
|
219 {
|
Chris@16
|
220 unsigned char _map[1 << CHAR_BIT];
|
Chris@16
|
221 };
|
Chris@16
|
222
|
Chris@16
|
223 /*** struct re_jump ***************************************************
|
Chris@16
|
224 Jump to a new location in the machine (not next).
|
Chris@16
|
225 ***********************************************************************/
|
Chris@16
|
226 struct re_jump : public re_syntax_base
|
Chris@16
|
227 {
|
Chris@16
|
228 offset_type alt; // location to jump to
|
Chris@16
|
229 };
|
Chris@16
|
230
|
Chris@16
|
231 /*** struct re_alt ***************************************************
|
Chris@16
|
232 Jump to a new location in the machine (possibly next).
|
Chris@16
|
233 ***********************************************************************/
|
Chris@16
|
234 struct re_alt : public re_jump
|
Chris@16
|
235 {
|
Chris@16
|
236 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
|
Chris@16
|
237 unsigned int can_be_null; // true if we match a NULL string
|
Chris@16
|
238 };
|
Chris@16
|
239
|
Chris@16
|
240 /*** struct re_repeat *************************************************
|
Chris@16
|
241 Repeat a section of the machine
|
Chris@16
|
242 ***********************************************************************/
|
Chris@16
|
243 struct re_repeat : public re_alt
|
Chris@16
|
244 {
|
Chris@16
|
245 std::size_t min, max; // min and max allowable repeats
|
Chris@16
|
246 int state_id; // Unique identifier for this repeat
|
Chris@16
|
247 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
|
Chris@16
|
248 bool greedy; // True if this is a greedy repeat
|
Chris@16
|
249 };
|
Chris@16
|
250
|
Chris@16
|
251 /*** struct re_recurse ************************************************
|
Chris@16
|
252 Recurse to a particular subexpression.
|
Chris@16
|
253 **********************************************************************/
|
Chris@16
|
254 struct re_recurse : public re_jump
|
Chris@16
|
255 {
|
Chris@16
|
256 int state_id; // identifier of first nested repeat within the recursion.
|
Chris@16
|
257 };
|
Chris@16
|
258
|
Chris@16
|
259 /*** enum re_jump_size_type *******************************************
|
Chris@16
|
260 Provides compiled size of re_jump structure (allowing for trailing alignment).
|
Chris@16
|
261 We provide this so we know how manybytes to insert when constructing the machine
|
Chris@16
|
262 (The value of padding_mask is defined in regex_raw_buffer.hpp).
|
Chris@16
|
263 ***********************************************************************/
|
Chris@16
|
264 enum re_jump_size_type
|
Chris@16
|
265 {
|
Chris@16
|
266 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
|
Chris@16
|
267 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
|
Chris@16
|
268 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
|
Chris@16
|
269 };
|
Chris@16
|
270
|
Chris@16
|
271 /*** proc re_is_set_member *********************************************
|
Chris@16
|
272 Forward declaration: we'll need this one later...
|
Chris@16
|
273 ***********************************************************************/
|
Chris@16
|
274
|
Chris@16
|
275 template<class charT, class traits>
|
Chris@16
|
276 struct regex_data;
|
Chris@16
|
277
|
Chris@16
|
278 template <class iterator, class charT, class traits_type, class char_classT>
|
Chris@16
|
279 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
|
Chris@16
|
280 iterator last,
|
Chris@16
|
281 const re_set_long<char_classT>* set_,
|
Chris@16
|
282 const regex_data<charT, traits_type>& e, bool icase);
|
Chris@16
|
283
|
Chris@16
|
284 } // namespace re_detail
|
Chris@16
|
285
|
Chris@16
|
286 } // namespace boost
|
Chris@16
|
287
|
Chris@16
|
288 #ifdef BOOST_MSVC
|
Chris@16
|
289 #pragma warning(push)
|
Chris@16
|
290 #pragma warning(disable: 4103)
|
Chris@16
|
291 #endif
|
Chris@16
|
292 #ifdef BOOST_HAS_ABI_HEADERS
|
Chris@16
|
293 # include BOOST_ABI_SUFFIX
|
Chris@16
|
294 #endif
|
Chris@16
|
295 #ifdef BOOST_MSVC
|
Chris@16
|
296 #pragma warning(pop)
|
Chris@16
|
297 #endif
|
Chris@16
|
298
|
Chris@16
|
299 #endif
|
Chris@16
|
300
|
Chris@16
|
301
|