Chris@16: /////////////////////////////////////////////////////////////////////////////// Chris@16: // parse_charset.hpp Chris@16: // Chris@16: // Copyright 2008 Eric Niebler. Distributed under the Boost Chris@16: // Software License, Version 1.0. (See accompanying file Chris@16: // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) Chris@16: Chris@16: #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 Chris@16: #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 Chris@16: Chris@16: // MS compatible compilers support #pragma once Chris@101: #if defined(_MSC_VER) Chris@16: # pragma once Chris@16: #endif Chris@16: Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: Chris@16: namespace boost { namespace xpressive { namespace detail Chris@16: { Chris@16: Chris@16: enum escape_type Chris@16: { Chris@16: escape_char Chris@16: , escape_mark Chris@16: , escape_class Chris@16: }; Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////////// Chris@16: // escape_value Chris@16: // Chris@16: template Chris@16: struct escape_value Chris@16: { Chris@16: Char ch_; Chris@16: int mark_nbr_; Chris@16: Class class_; Chris@16: escape_type type_; Chris@16: }; Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////////// Chris@16: // char_overflow_handler Chris@16: // Chris@16: struct char_overflow_handler Chris@16: { Chris@16: void operator ()(numeric::range_check_result result) const // throw(regex_error) Chris@16: { Chris@16: if(numeric::cInRange != result) Chris@16: { Chris@16: BOOST_THROW_EXCEPTION( Chris@16: regex_error( Chris@16: regex_constants::error_escape Chris@16: , "character escape too large to fit in target character type" Chris@16: ) Chris@16: ); Chris@16: } Chris@16: } Chris@16: }; Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////////// Chris@16: // parse_escape Chris@16: // Chris@16: template Chris@16: escape_value::type, typename CompilerTraits::regex_traits::char_class_type> Chris@16: parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr) Chris@16: { Chris@16: using namespace regex_constants; Chris@16: typedef typename iterator_value::type char_type; Chris@16: typedef typename CompilerTraits::regex_traits regex_traits; Chris@16: typedef typename regex_traits::char_class_type char_class_type; Chris@16: Chris@16: // define an unsigned type the same size as char_type Chris@16: typedef typename boost::uint_t::least uchar_t; Chris@16: BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type)); Chris@16: typedef numeric::conversion_traits converstion_traits; Chris@16: Chris@16: BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found"); Chris@16: numeric::converter converter; Chris@16: escape_value esc = { 0, 0, 0, escape_char }; Chris@16: bool const icase = (0 != (regex_constants::icase_ & tr.flags())); Chris@16: regex_traits const &rxtraits = tr.traits(); Chris@16: FwdIter tmp; Chris@16: Chris@16: esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase); Chris@16: if(0 != esc.class_) Chris@16: { Chris@16: esc.type_ = escape_class; Chris@16: return esc; Chris@16: } Chris@16: Chris@16: if(-1 != rxtraits.value(*begin, 8)) Chris@16: { Chris@16: esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777)); Chris@16: return esc; Chris@16: } Chris@16: Chris@16: switch(*begin) Chris@16: { Chris@16: // bell character Chris@16: case BOOST_XPR_CHAR_(char_type, 'a'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a'); Chris@16: ++begin; Chris@16: break; Chris@16: // escape character Chris@16: case BOOST_XPR_CHAR_(char_type, 'e'): Chris@16: esc.ch_ = converter(27); Chris@16: ++begin; Chris@16: break; Chris@16: // control character Chris@16: case BOOST_XPR_CHAR_(char_type, 'c'): Chris@16: BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); Chris@16: BOOST_XPR_ENSURE_ Chris@16: ( Chris@16: rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin) Chris@16: || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin) Chris@16: , error_escape Chris@16: , "invalid escape control letter; must be one of a-z or A-Z" Chris@16: ); Chris@16: // Convert to character according to ECMA-262, section 15.10.2.10: Chris@16: esc.ch_ = converter(*begin % 32); Chris@16: ++begin; Chris@16: break; Chris@16: // formfeed character Chris@16: case BOOST_XPR_CHAR_(char_type, 'f'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f'); Chris@16: ++begin; Chris@16: break; Chris@16: // newline Chris@16: case BOOST_XPR_CHAR_(char_type, 'n'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n'); Chris@16: ++begin; Chris@16: break; Chris@16: // return Chris@16: case BOOST_XPR_CHAR_(char_type, 'r'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r'); Chris@16: ++begin; Chris@16: break; Chris@16: // horizontal tab Chris@16: case BOOST_XPR_CHAR_(char_type, 't'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t'); Chris@16: ++begin; Chris@16: break; Chris@16: // vertical tab Chris@16: case BOOST_XPR_CHAR_(char_type, 'v'): Chris@16: esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v'); Chris@16: ++begin; Chris@16: break; Chris@16: // hex escape sequence Chris@16: case BOOST_XPR_CHAR_(char_type, 'x'): Chris@16: BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); Chris@16: tmp = begin; Chris@16: esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff)); Chris@16: BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : " Chris@16: "must be \\x HexDigit HexDigit"); Chris@16: break; Chris@16: // Unicode escape sequence Chris@16: case BOOST_XPR_CHAR_(char_type, 'u'): Chris@16: BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); Chris@16: tmp = begin; Chris@16: esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff)); Chris@16: BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : " Chris@16: "must be \\u HexDigit HexDigit HexDigit HexDigit"); Chris@16: break; Chris@16: // backslash Chris@16: case BOOST_XPR_CHAR_(char_type, '\\'): Chris@16: //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\'); Chris@16: //++begin; Chris@16: //break; Chris@16: // all other escaped characters represent themselves Chris@16: default: Chris@16: esc.ch_ = *begin; Chris@16: ++begin; Chris@16: break; Chris@16: } Chris@16: Chris@16: return esc; Chris@16: } Chris@16: Chris@16: ////////////////////////////////////////////////////////////////////////// Chris@16: // parse_charset Chris@16: // Chris@16: template Chris@16: inline void parse_charset Chris@16: ( Chris@16: FwdIter &begin Chris@16: , FwdIter end Chris@16: , compound_charset &chset Chris@16: , CompilerTraits &tr Chris@16: ) Chris@16: { Chris@16: using namespace regex_constants; Chris@16: typedef typename RegexTraits::char_type char_type; Chris@16: typedef typename RegexTraits::char_class_type char_class_type; Chris@16: BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); Chris@16: RegexTraits const &rxtraits = tr.traits(); Chris@16: bool const icase = (0 != (regex_constants::icase_ & tr.flags())); Chris@16: FwdIter iprev = FwdIter(); Chris@16: escape_value esc = {0, 0, 0, escape_char}; Chris@16: bool invert = false; Chris@16: Chris@16: // check to see if we have an inverse charset Chris@16: if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end)) Chris@16: { Chris@16: begin = iprev; Chris@16: invert = true; Chris@16: } Chris@16: Chris@16: // skip the end token if-and-only-if it is the first token in the charset Chris@16: if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end)) Chris@16: { Chris@16: for(; begin != iprev; ++begin) Chris@16: { Chris@16: chset.set_char(*begin, rxtraits, icase); Chris@16: } Chris@16: } Chris@16: Chris@16: compiler_token_type tok; Chris@16: char_type ch_prev = char_type(), ch_next = char_type(); Chris@16: bool have_prev = false; Chris@16: Chris@16: BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); Chris@16: Chris@16: // remember the current position and grab the next token Chris@16: iprev = begin; Chris@16: tok = tr.get_charset_token(begin, end); Chris@16: do Chris@16: { Chris@16: BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); Chris@16: Chris@16: if(token_charset_hyphen == tok && have_prev) Chris@16: { Chris@16: // remember the current position Chris@16: FwdIter iprev2 = begin; Chris@16: have_prev = false; Chris@16: Chris@16: // ch_prev is lower bound of a range Chris@16: switch(tr.get_charset_token(begin, end)) Chris@16: { Chris@16: case token_charset_hyphen: Chris@16: case token_charset_invert: Chris@16: begin = iprev2; // un-get these tokens and fall through Chris@16: BOOST_FALLTHROUGH; Chris@16: case token_literal: Chris@16: ch_next = *begin++; Chris@16: BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); Chris@16: chset.set_range(ch_prev, ch_next, rxtraits, icase); Chris@16: continue; Chris@16: case token_charset_backspace: Chris@16: ch_next = char_type(8); // backspace Chris@16: BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); Chris@16: chset.set_range(ch_prev, ch_next, rxtraits, icase); Chris@16: continue; Chris@16: case token_escape: Chris@16: esc = parse_escape(begin, end, tr); Chris@16: if(escape_char == esc.type_) Chris@16: { Chris@16: BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range"); Chris@16: chset.set_range(ch_prev, esc.ch_, rxtraits, icase); Chris@16: continue; Chris@16: } Chris@16: BOOST_FALLTHROUGH; Chris@16: case token_charset_end: Chris@16: default: // not a range. Chris@16: begin = iprev; // backup to hyphen token Chris@16: chset.set_char(ch_prev, rxtraits, icase); Chris@16: chset.set_char(*begin++, rxtraits, icase); Chris@16: continue; Chris@16: } Chris@16: } Chris@16: Chris@16: if(have_prev) Chris@16: { Chris@16: chset.set_char(ch_prev, rxtraits, icase); Chris@16: have_prev = false; Chris@16: } Chris@16: Chris@16: switch(tok) Chris@16: { Chris@16: case token_charset_hyphen: Chris@16: case token_charset_invert: Chris@16: case token_charset_end: Chris@16: case token_posix_charset_end: Chris@16: begin = iprev; // un-get these tokens Chris@16: ch_prev = *begin++; Chris@16: have_prev = true; Chris@16: continue; Chris@16: Chris@16: case token_charset_backspace: Chris@16: ch_prev = char_type(8); // backspace Chris@16: have_prev = true; Chris@16: continue; Chris@16: Chris@16: case token_posix_charset_begin: Chris@16: { Chris@16: FwdIter tmp = begin, start = begin; Chris@16: bool invert = (token_charset_invert == tr.get_charset_token(tmp, end)); Chris@16: if(invert) Chris@16: { Chris@16: begin = start = tmp; Chris@16: } Chris@16: while(token_literal == (tok = tr.get_charset_token(begin, end))) Chris@16: { Chris@16: tmp = ++begin; Chris@16: BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); Chris@16: } Chris@16: if(token_posix_charset_end == tok) Chris@16: { Chris@16: char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase); Chris@16: BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name"); Chris@16: chset.set_class(chclass, invert); Chris@16: continue; Chris@16: } Chris@16: begin = iprev; // un-get this token Chris@16: ch_prev = *begin++; Chris@16: have_prev = true; Chris@16: } Chris@16: continue; Chris@16: Chris@16: case token_escape: Chris@16: esc = parse_escape(begin, end, tr); Chris@16: if(escape_char == esc.type_) Chris@16: { Chris@16: ch_prev = esc.ch_; Chris@16: have_prev = true; Chris@16: } Chris@16: else if(escape_class == esc.type_) Chris@16: { Chris@16: char_class_type upper_ = lookup_classname(rxtraits, "upper"); Chris@16: BOOST_ASSERT(0 != upper_); Chris@16: chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_)); Chris@16: } Chris@16: else Chris@16: { Chris@16: BOOST_ASSERT(false); Chris@16: } Chris@16: continue; Chris@16: Chris@16: default: Chris@16: ch_prev = *begin++; Chris@16: have_prev = true; Chris@16: continue; Chris@16: } Chris@16: } Chris@16: while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"), Chris@16: token_charset_end != (tok = tr.get_charset_token(begin, end))); Chris@16: Chris@16: if(have_prev) Chris@16: { Chris@16: chset.set_char(ch_prev, rxtraits, icase); Chris@16: } Chris@16: Chris@16: if(invert) Chris@16: { Chris@16: chset.inverse(); Chris@16: } Chris@16: } Chris@16: Chris@16: }}} // namespace boost::xpressive::detail Chris@16: Chris@16: #endif