Chris@16
|
1 ///////////////////////////////////////////////////////////////////////////////
|
Chris@16
|
2 // parse_charset.hpp
|
Chris@16
|
3 //
|
Chris@16
|
4 // Copyright 2008 Eric Niebler. Distributed under the Boost
|
Chris@16
|
5 // Software License, Version 1.0. (See accompanying file
|
Chris@16
|
6 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
Chris@16
|
7
|
Chris@16
|
8 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
|
Chris@16
|
9 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
|
Chris@16
|
10
|
Chris@16
|
11 // MS compatible compilers support #pragma once
|
Chris@101
|
12 #if defined(_MSC_VER)
|
Chris@16
|
13 # pragma once
|
Chris@16
|
14 #endif
|
Chris@16
|
15
|
Chris@16
|
16 #include <boost/config.hpp>
|
Chris@16
|
17 #include <boost/integer.hpp>
|
Chris@16
|
18 #include <boost/mpl/bool.hpp>
|
Chris@16
|
19 #include <boost/throw_exception.hpp>
|
Chris@16
|
20 #include <boost/numeric/conversion/converter.hpp>
|
Chris@16
|
21 #include <boost/xpressive/detail/detail_fwd.hpp>
|
Chris@16
|
22 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
|
Chris@16
|
23 #include <boost/xpressive/detail/utility/literals.hpp>
|
Chris@16
|
24 #include <boost/xpressive/detail/utility/chset/chset.hpp>
|
Chris@16
|
25 #include <boost/xpressive/regex_constants.hpp>
|
Chris@16
|
26
|
Chris@16
|
27 namespace boost { namespace xpressive { namespace detail
|
Chris@16
|
28 {
|
Chris@16
|
29
|
Chris@16
|
// escape_type
//   Discriminator stored in escape_value::type_ telling which member of the
//   escape_value carries the parsed result.
enum escape_type
{
    // the escape denotes one literal character (escape_value::ch_)
    escape_char = 0,
    // presumably a sub-match/back-reference number (escape_value::mark_nbr_);
    // NOTE(review): not produced by parse_escape in this header -- confirm usage
    escape_mark = 1,
    // the escape denotes a character class (escape_value::class_)
    escape_class = 2
};
|
Chris@16
|
36
|
Chris@16
|
37 ///////////////////////////////////////////////////////////////////////////////
|
Chris@16
|
38 // escape_value
|
Chris@16
|
39 //
|
Chris@16
|
40 template<typename Char, typename Class>
|
Chris@16
|
41 struct escape_value
|
Chris@16
|
42 {
|
Chris@16
|
43 Char ch_;
|
Chris@16
|
44 int mark_nbr_;
|
Chris@16
|
45 Class class_;
|
Chris@16
|
46 escape_type type_;
|
Chris@16
|
47 };
|
Chris@16
|
48
|
Chris@16
|
49 ///////////////////////////////////////////////////////////////////////////////
|
Chris@16
|
50 // char_overflow_handler
|
Chris@16
|
51 //
|
Chris@16
|
52 struct char_overflow_handler
|
Chris@16
|
53 {
|
Chris@16
|
54 void operator ()(numeric::range_check_result result) const // throw(regex_error)
|
Chris@16
|
55 {
|
Chris@16
|
56 if(numeric::cInRange != result)
|
Chris@16
|
57 {
|
Chris@16
|
58 BOOST_THROW_EXCEPTION(
|
Chris@16
|
59 regex_error(
|
Chris@16
|
60 regex_constants::error_escape
|
Chris@16
|
61 , "character escape too large to fit in target character type"
|
Chris@16
|
62 )
|
Chris@16
|
63 );
|
Chris@16
|
64 }
|
Chris@16
|
65 }
|
Chris@16
|
66 };
|
Chris@16
|
67
|
Chris@16
|
68 ///////////////////////////////////////////////////////////////////////////////
|
Chris@16
|
69 // parse_escape
|
Chris@16
|
70 //
|
Chris@16
|
71 template<typename FwdIter, typename CompilerTraits>
|
Chris@16
|
72 escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
|
Chris@16
|
73 parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
|
Chris@16
|
74 {
|
Chris@16
|
75 using namespace regex_constants;
|
Chris@16
|
76 typedef typename iterator_value<FwdIter>::type char_type;
|
Chris@16
|
77 typedef typename CompilerTraits::regex_traits regex_traits;
|
Chris@16
|
78 typedef typename regex_traits::char_class_type char_class_type;
|
Chris@16
|
79
|
Chris@16
|
80 // define an unsigned type the same size as char_type
|
Chris@16
|
81 typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
|
Chris@16
|
82 BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
|
Chris@16
|
83 typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
|
Chris@16
|
84
|
Chris@16
|
85 BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
|
Chris@16
|
86 numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
|
Chris@16
|
87 escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
|
Chris@16
|
88 bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
|
Chris@16
|
89 regex_traits const &rxtraits = tr.traits();
|
Chris@16
|
90 FwdIter tmp;
|
Chris@16
|
91
|
Chris@16
|
92 esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
|
Chris@16
|
93 if(0 != esc.class_)
|
Chris@16
|
94 {
|
Chris@16
|
95 esc.type_ = escape_class;
|
Chris@16
|
96 return esc;
|
Chris@16
|
97 }
|
Chris@16
|
98
|
Chris@16
|
99 if(-1 != rxtraits.value(*begin, 8))
|
Chris@16
|
100 {
|
Chris@16
|
101 esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
|
Chris@16
|
102 return esc;
|
Chris@16
|
103 }
|
Chris@16
|
104
|
Chris@16
|
105 switch(*begin)
|
Chris@16
|
106 {
|
Chris@16
|
107 // bell character
|
Chris@16
|
108 case BOOST_XPR_CHAR_(char_type, 'a'):
|
Chris@16
|
109 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
|
Chris@16
|
110 ++begin;
|
Chris@16
|
111 break;
|
Chris@16
|
112 // escape character
|
Chris@16
|
113 case BOOST_XPR_CHAR_(char_type, 'e'):
|
Chris@16
|
114 esc.ch_ = converter(27);
|
Chris@16
|
115 ++begin;
|
Chris@16
|
116 break;
|
Chris@16
|
117 // control character
|
Chris@16
|
118 case BOOST_XPR_CHAR_(char_type, 'c'):
|
Chris@16
|
119 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
|
Chris@16
|
120 BOOST_XPR_ENSURE_
|
Chris@16
|
121 (
|
Chris@16
|
122 rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
|
Chris@16
|
123 || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
|
Chris@16
|
124 , error_escape
|
Chris@16
|
125 , "invalid escape control letter; must be one of a-z or A-Z"
|
Chris@16
|
126 );
|
Chris@16
|
127 // Convert to character according to ECMA-262, section 15.10.2.10:
|
Chris@16
|
128 esc.ch_ = converter(*begin % 32);
|
Chris@16
|
129 ++begin;
|
Chris@16
|
130 break;
|
Chris@16
|
131 // formfeed character
|
Chris@16
|
132 case BOOST_XPR_CHAR_(char_type, 'f'):
|
Chris@16
|
133 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
|
Chris@16
|
134 ++begin;
|
Chris@16
|
135 break;
|
Chris@16
|
136 // newline
|
Chris@16
|
137 case BOOST_XPR_CHAR_(char_type, 'n'):
|
Chris@16
|
138 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
|
Chris@16
|
139 ++begin;
|
Chris@16
|
140 break;
|
Chris@16
|
141 // return
|
Chris@16
|
142 case BOOST_XPR_CHAR_(char_type, 'r'):
|
Chris@16
|
143 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
|
Chris@16
|
144 ++begin;
|
Chris@16
|
145 break;
|
Chris@16
|
146 // horizontal tab
|
Chris@16
|
147 case BOOST_XPR_CHAR_(char_type, 't'):
|
Chris@16
|
148 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
|
Chris@16
|
149 ++begin;
|
Chris@16
|
150 break;
|
Chris@16
|
151 // vertical tab
|
Chris@16
|
152 case BOOST_XPR_CHAR_(char_type, 'v'):
|
Chris@16
|
153 esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
|
Chris@16
|
154 ++begin;
|
Chris@16
|
155 break;
|
Chris@16
|
156 // hex escape sequence
|
Chris@16
|
157 case BOOST_XPR_CHAR_(char_type, 'x'):
|
Chris@16
|
158 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
|
Chris@16
|
159 tmp = begin;
|
Chris@16
|
160 esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
|
Chris@16
|
161 BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
|
Chris@16
|
162 "must be \\x HexDigit HexDigit");
|
Chris@16
|
163 break;
|
Chris@16
|
164 // Unicode escape sequence
|
Chris@16
|
165 case BOOST_XPR_CHAR_(char_type, 'u'):
|
Chris@16
|
166 BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
|
Chris@16
|
167 tmp = begin;
|
Chris@16
|
168 esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
|
Chris@16
|
169 BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
|
Chris@16
|
170 "must be \\u HexDigit HexDigit HexDigit HexDigit");
|
Chris@16
|
171 break;
|
Chris@16
|
172 // backslash
|
Chris@16
|
173 case BOOST_XPR_CHAR_(char_type, '\\'):
|
Chris@16
|
174 //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
|
Chris@16
|
175 //++begin;
|
Chris@16
|
176 //break;
|
Chris@16
|
177 // all other escaped characters represent themselves
|
Chris@16
|
178 default:
|
Chris@16
|
179 esc.ch_ = *begin;
|
Chris@16
|
180 ++begin;
|
Chris@16
|
181 break;
|
Chris@16
|
182 }
|
Chris@16
|
183
|
Chris@16
|
184 return esc;
|
Chris@16
|
185 }
|
Chris@16
|
186
|
Chris@16
|
187 //////////////////////////////////////////////////////////////////////////
|
Chris@16
|
188 // parse_charset
|
Chris@16
|
189 //
|
Chris@16
|
190 template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
|
Chris@16
|
191 inline void parse_charset
|
Chris@16
|
192 (
|
Chris@16
|
193 FwdIter &begin
|
Chris@16
|
194 , FwdIter end
|
Chris@16
|
195 , compound_charset<RegexTraits> &chset
|
Chris@16
|
196 , CompilerTraits &tr
|
Chris@16
|
197 )
|
Chris@16
|
198 {
|
Chris@16
|
199 using namespace regex_constants;
|
Chris@16
|
200 typedef typename RegexTraits::char_type char_type;
|
Chris@16
|
201 typedef typename RegexTraits::char_class_type char_class_type;
|
Chris@16
|
202 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
|
Chris@16
|
203 RegexTraits const &rxtraits = tr.traits();
|
Chris@16
|
204 bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
|
Chris@16
|
205 FwdIter iprev = FwdIter();
|
Chris@16
|
206 escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
|
Chris@16
|
207 bool invert = false;
|
Chris@16
|
208
|
Chris@16
|
209 // check to see if we have an inverse charset
|
Chris@16
|
210 if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
|
Chris@16
|
211 {
|
Chris@16
|
212 begin = iprev;
|
Chris@16
|
213 invert = true;
|
Chris@16
|
214 }
|
Chris@16
|
215
|
Chris@16
|
216 // skip the end token if-and-only-if it is the first token in the charset
|
Chris@16
|
217 if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
|
Chris@16
|
218 {
|
Chris@16
|
219 for(; begin != iprev; ++begin)
|
Chris@16
|
220 {
|
Chris@16
|
221 chset.set_char(*begin, rxtraits, icase);
|
Chris@16
|
222 }
|
Chris@16
|
223 }
|
Chris@16
|
224
|
Chris@16
|
225 compiler_token_type tok;
|
Chris@16
|
226 char_type ch_prev = char_type(), ch_next = char_type();
|
Chris@16
|
227 bool have_prev = false;
|
Chris@16
|
228
|
Chris@16
|
229 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
|
Chris@16
|
230
|
Chris@16
|
231 // remember the current position and grab the next token
|
Chris@16
|
232 iprev = begin;
|
Chris@16
|
233 tok = tr.get_charset_token(begin, end);
|
Chris@16
|
234 do
|
Chris@16
|
235 {
|
Chris@16
|
236 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
|
Chris@16
|
237
|
Chris@16
|
238 if(token_charset_hyphen == tok && have_prev)
|
Chris@16
|
239 {
|
Chris@16
|
240 // remember the current position
|
Chris@16
|
241 FwdIter iprev2 = begin;
|
Chris@16
|
242 have_prev = false;
|
Chris@16
|
243
|
Chris@16
|
244 // ch_prev is lower bound of a range
|
Chris@16
|
245 switch(tr.get_charset_token(begin, end))
|
Chris@16
|
246 {
|
Chris@16
|
247 case token_charset_hyphen:
|
Chris@16
|
248 case token_charset_invert:
|
Chris@16
|
249 begin = iprev2; // un-get these tokens and fall through
|
Chris@16
|
250 BOOST_FALLTHROUGH;
|
Chris@16
|
251 case token_literal:
|
Chris@16
|
252 ch_next = *begin++;
|
Chris@16
|
253 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
|
Chris@16
|
254 chset.set_range(ch_prev, ch_next, rxtraits, icase);
|
Chris@16
|
255 continue;
|
Chris@16
|
256 case token_charset_backspace:
|
Chris@16
|
257 ch_next = char_type(8); // backspace
|
Chris@16
|
258 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
|
Chris@16
|
259 chset.set_range(ch_prev, ch_next, rxtraits, icase);
|
Chris@16
|
260 continue;
|
Chris@16
|
261 case token_escape:
|
Chris@16
|
262 esc = parse_escape(begin, end, tr);
|
Chris@16
|
263 if(escape_char == esc.type_)
|
Chris@16
|
264 {
|
Chris@16
|
265 BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
|
Chris@16
|
266 chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
|
Chris@16
|
267 continue;
|
Chris@16
|
268 }
|
Chris@16
|
269 BOOST_FALLTHROUGH;
|
Chris@16
|
270 case token_charset_end:
|
Chris@16
|
271 default: // not a range.
|
Chris@16
|
272 begin = iprev; // backup to hyphen token
|
Chris@16
|
273 chset.set_char(ch_prev, rxtraits, icase);
|
Chris@16
|
274 chset.set_char(*begin++, rxtraits, icase);
|
Chris@16
|
275 continue;
|
Chris@16
|
276 }
|
Chris@16
|
277 }
|
Chris@16
|
278
|
Chris@16
|
279 if(have_prev)
|
Chris@16
|
280 {
|
Chris@16
|
281 chset.set_char(ch_prev, rxtraits, icase);
|
Chris@16
|
282 have_prev = false;
|
Chris@16
|
283 }
|
Chris@16
|
284
|
Chris@16
|
285 switch(tok)
|
Chris@16
|
286 {
|
Chris@16
|
287 case token_charset_hyphen:
|
Chris@16
|
288 case token_charset_invert:
|
Chris@16
|
289 case token_charset_end:
|
Chris@16
|
290 case token_posix_charset_end:
|
Chris@16
|
291 begin = iprev; // un-get these tokens
|
Chris@16
|
292 ch_prev = *begin++;
|
Chris@16
|
293 have_prev = true;
|
Chris@16
|
294 continue;
|
Chris@16
|
295
|
Chris@16
|
296 case token_charset_backspace:
|
Chris@16
|
297 ch_prev = char_type(8); // backspace
|
Chris@16
|
298 have_prev = true;
|
Chris@16
|
299 continue;
|
Chris@16
|
300
|
Chris@16
|
301 case token_posix_charset_begin:
|
Chris@16
|
302 {
|
Chris@16
|
303 FwdIter tmp = begin, start = begin;
|
Chris@16
|
304 bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
|
Chris@16
|
305 if(invert)
|
Chris@16
|
306 {
|
Chris@16
|
307 begin = start = tmp;
|
Chris@16
|
308 }
|
Chris@16
|
309 while(token_literal == (tok = tr.get_charset_token(begin, end)))
|
Chris@16
|
310 {
|
Chris@16
|
311 tmp = ++begin;
|
Chris@16
|
312 BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
|
Chris@16
|
313 }
|
Chris@16
|
314 if(token_posix_charset_end == tok)
|
Chris@16
|
315 {
|
Chris@16
|
316 char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
|
Chris@16
|
317 BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
|
Chris@16
|
318 chset.set_class(chclass, invert);
|
Chris@16
|
319 continue;
|
Chris@16
|
320 }
|
Chris@16
|
321 begin = iprev; // un-get this token
|
Chris@16
|
322 ch_prev = *begin++;
|
Chris@16
|
323 have_prev = true;
|
Chris@16
|
324 }
|
Chris@16
|
325 continue;
|
Chris@16
|
326
|
Chris@16
|
327 case token_escape:
|
Chris@16
|
328 esc = parse_escape(begin, end, tr);
|
Chris@16
|
329 if(escape_char == esc.type_)
|
Chris@16
|
330 {
|
Chris@16
|
331 ch_prev = esc.ch_;
|
Chris@16
|
332 have_prev = true;
|
Chris@16
|
333 }
|
Chris@16
|
334 else if(escape_class == esc.type_)
|
Chris@16
|
335 {
|
Chris@16
|
336 char_class_type upper_ = lookup_classname(rxtraits, "upper");
|
Chris@16
|
337 BOOST_ASSERT(0 != upper_);
|
Chris@16
|
338 chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
|
Chris@16
|
339 }
|
Chris@16
|
340 else
|
Chris@16
|
341 {
|
Chris@16
|
342 BOOST_ASSERT(false);
|
Chris@16
|
343 }
|
Chris@16
|
344 continue;
|
Chris@16
|
345
|
Chris@16
|
346 default:
|
Chris@16
|
347 ch_prev = *begin++;
|
Chris@16
|
348 have_prev = true;
|
Chris@16
|
349 continue;
|
Chris@16
|
350 }
|
Chris@16
|
351 }
|
Chris@16
|
352 while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
|
Chris@16
|
353 token_charset_end != (tok = tr.get_charset_token(begin, end)));
|
Chris@16
|
354
|
Chris@16
|
355 if(have_prev)
|
Chris@16
|
356 {
|
Chris@16
|
357 chset.set_char(ch_prev, rxtraits, icase);
|
Chris@16
|
358 }
|
Chris@16
|
359
|
Chris@16
|
360 if(invert)
|
Chris@16
|
361 {
|
Chris@16
|
362 chset.inverse();
|
Chris@16
|
363 }
|
Chris@16
|
364 }
|
Chris@16
|
365
|
Chris@16
|
366 }}} // namespace boost::xpressive::detail
|
Chris@16
|
367
|
Chris@16
|
368 #endif
|