Chris@16: /* Chris@16: * Chris@16: * Copyright (c) 1998-2002 Chris@16: * John Maddock Chris@16: * Chris@16: * Use, modification and distribution are subject to the Chris@16: * Boost Software License, Version 1.0. (See accompanying file Chris@16: * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) Chris@16: * Chris@16: */ Chris@16: Chris@16: /* Chris@16: * LOCATION: see http://www.boost.org for most recent version. Chris@16: * FILE states.cpp Chris@16: * VERSION see Chris@16: * DESCRIPTION: Declares internal state machine structures. Chris@16: */ Chris@16: Chris@16: #ifndef BOOST_REGEX_V4_STATES_HPP Chris@16: #define BOOST_REGEX_V4_STATES_HPP Chris@16: Chris@16: #ifdef BOOST_MSVC Chris@16: #pragma warning(push) Chris@16: #pragma warning(disable: 4103) Chris@16: #endif Chris@16: #ifdef BOOST_HAS_ABI_HEADERS Chris@16: # include BOOST_ABI_PREFIX Chris@16: #endif Chris@16: #ifdef BOOST_MSVC Chris@16: #pragma warning(pop) Chris@16: #endif Chris@16: Chris@16: namespace boost{ Chris@16: namespace re_detail{ Chris@16: Chris@16: /*** mask_type ******************************************************* Chris@16: Whenever we have a choice of two alternatives, we use an array of bytes Chris@16: to indicate which of the two alternatives it is possible to take for any Chris@16: given input character. If mask_take is set, then we can take the next Chris@16: state, and if mask_skip is set then we can take the alternative. Chris@16: ***********************************************************************/ Chris@16: enum mask_type Chris@16: { Chris@16: mask_take = 1, Chris@16: mask_skip = 2, Chris@16: mask_init = 4, Chris@16: mask_any = mask_skip | mask_take, Chris@16: mask_all = mask_any Chris@16: }; Chris@16: Chris@16: /*** helpers ********************************************************** Chris@16: These helpers let us use function overload resolution to detect whether Chris@16: we have narrow or wide character strings: Chris@16: ***********************************************************************/ Chris@16: struct _narrow_type{}; Chris@16: struct _wide_type{}; Chris@16: template struct is_byte; Chris@16: template<> struct is_byte { typedef _narrow_type width_type; }; Chris@16: template<> struct is_byte{ typedef _narrow_type width_type; }; Chris@16: template<> struct is_byte { typedef _narrow_type width_type; }; Chris@16: template struct is_byte { typedef _wide_type width_type; }; Chris@16: Chris@16: /*** enum syntax_element_type ****************************************** Chris@16: Every record in the state machine falls into one of the following types: Chris@16: ***********************************************************************/ Chris@16: enum syntax_element_type Chris@16: { Chris@16: // start of a marked sub-expression, or perl-style (?...) extension Chris@16: syntax_element_startmark = 0, Chris@16: // end of a marked sub-expression, or perl-style (?...) extension Chris@16: syntax_element_endmark = syntax_element_startmark + 1, Chris@16: // any sequence of literal characters Chris@16: syntax_element_literal = syntax_element_endmark + 1, Chris@16: // start of line assertion: ^ Chris@16: syntax_element_start_line = syntax_element_literal + 1, Chris@16: // end of line assertion $ Chris@16: syntax_element_end_line = syntax_element_start_line + 1, Chris@16: // match any character: . Chris@16: syntax_element_wild = syntax_element_end_line + 1, Chris@16: // end of expression: we have a match when we get here Chris@16: syntax_element_match = syntax_element_wild + 1, Chris@16: // perl style word boundary: \b Chris@16: syntax_element_word_boundary = syntax_element_match + 1, Chris@16: // perl style within word boundary: \B Chris@16: syntax_element_within_word = syntax_element_word_boundary + 1, Chris@16: // start of word assertion: \< Chris@16: syntax_element_word_start = syntax_element_within_word + 1, Chris@16: // end of word assertion: \> Chris@16: syntax_element_word_end = syntax_element_word_start + 1, Chris@16: // start of buffer assertion: \` Chris@16: syntax_element_buffer_start = syntax_element_word_end + 1, Chris@16: // end of buffer assertion: \' Chris@16: syntax_element_buffer_end = syntax_element_buffer_start + 1, Chris@16: // backreference to previously matched sub-expression Chris@16: syntax_element_backref = syntax_element_buffer_end + 1, Chris@16: // either a wide character set [..] or one with multicharacter collating elements: Chris@16: syntax_element_long_set = syntax_element_backref + 1, Chris@16: // narrow character set: [...] Chris@16: syntax_element_set = syntax_element_long_set + 1, Chris@16: // jump to a new state in the machine: Chris@16: syntax_element_jump = syntax_element_set + 1, Chris@16: // choose between two production states: Chris@16: syntax_element_alt = syntax_element_jump + 1, Chris@16: // a repeat Chris@16: syntax_element_rep = syntax_element_alt + 1, Chris@16: // match a combining character sequence Chris@16: syntax_element_combining = syntax_element_rep + 1, Chris@16: // perl style soft buffer end: \z Chris@16: syntax_element_soft_buffer_end = syntax_element_combining + 1, Chris@16: // perl style continuation: \G Chris@16: syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, Chris@16: // single character repeats: Chris@16: syntax_element_dot_rep = syntax_element_restart_continue + 1, Chris@16: syntax_element_char_rep = syntax_element_dot_rep + 1, Chris@16: syntax_element_short_set_rep = syntax_element_char_rep + 1, Chris@16: syntax_element_long_set_rep = syntax_element_short_set_rep + 1, Chris@16: // a backstep for lookbehind repeats: Chris@16: syntax_element_backstep = syntax_element_long_set_rep + 1, Chris@16: // an assertion that a mark was matched: Chris@16: syntax_element_assert_backref = syntax_element_backstep + 1, Chris@16: syntax_element_toggle_case = syntax_element_assert_backref + 1, Chris@16: // a recursive expression: Chris@16: syntax_element_recurse = syntax_element_toggle_case + 1 Chris@16: }; Chris@16: Chris@16: #ifdef BOOST_REGEX_DEBUG Chris@16: // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion Chris@16: std::ostream& operator<<(std::ostream&, syntax_element_type); Chris@16: #endif Chris@16: Chris@16: struct re_syntax_base; Chris@16: Chris@16: /*** union offset_type ************************************************ Chris@16: Points to another state in the machine. During machine construction Chris@16: we use integral offsets, but these are converted to pointers before Chris@16: execution of the machine. Chris@16: ***********************************************************************/ Chris@16: union offset_type Chris@16: { Chris@16: re_syntax_base* p; Chris@16: std::ptrdiff_t i; Chris@16: }; Chris@16: Chris@16: /*** struct re_syntax_base ******************************************** Chris@16: Base class for all states in the machine. Chris@16: ***********************************************************************/ Chris@16: struct re_syntax_base Chris@16: { Chris@16: syntax_element_type type; // what kind of state this is Chris@16: offset_type next; // next state in the machine Chris@16: }; Chris@16: Chris@16: /*** struct re_brace ************************************************** Chris@16: A marked parenthesis. Chris@16: ***********************************************************************/ Chris@16: struct re_brace : public re_syntax_base Chris@16: { Chris@16: // The index to match, can be zero (don't mark the sub-expression) Chris@16: // or negative (for perl style (?...) extentions): Chris@16: int index; Chris@16: bool icase; Chris@16: }; Chris@16: Chris@16: /*** struct re_dot ************************************************** Chris@16: Match anything. Chris@16: ***********************************************************************/ Chris@16: enum Chris@16: { Chris@16: dont_care = 1, Chris@16: force_not_newline = 0, Chris@16: force_newline = 2, Chris@16: Chris@16: test_not_newline = 2, Chris@16: test_newline = 3 Chris@16: }; Chris@16: struct re_dot : public re_syntax_base Chris@16: { Chris@16: unsigned char mask; Chris@16: }; Chris@16: Chris@16: /*** struct re_literal ************************************************ Chris@16: A string of literals, following this structure will be an Chris@16: array of characters: charT[length] Chris@16: ***********************************************************************/ Chris@16: struct re_literal : public re_syntax_base Chris@16: { Chris@16: unsigned int length; Chris@16: }; Chris@16: Chris@16: /*** struct re_case ************************************************ Chris@16: Indicates whether we are moving to a case insensive block or not Chris@16: ***********************************************************************/ Chris@16: struct re_case : public re_syntax_base Chris@16: { Chris@16: bool icase; Chris@16: }; Chris@16: Chris@16: /*** struct re_set_long *********************************************** Chris@16: A wide character set of characters, following this structure will be Chris@16: an array of type charT: Chris@16: First csingles null-terminated strings Chris@16: Then 2 * cranges NULL terminated strings Chris@16: Then cequivalents NULL terminated strings Chris@16: ***********************************************************************/ Chris@16: template Chris@16: struct re_set_long : public re_syntax_base Chris@16: { Chris@16: unsigned int csingles, cranges, cequivalents; Chris@16: mask_type cclasses; Chris@16: mask_type cnclasses; Chris@16: bool isnot; Chris@16: bool singleton; Chris@16: }; Chris@16: Chris@16: /*** struct re_set **************************************************** Chris@16: A set of narrow-characters, matches any of _map which is none-zero Chris@16: ***********************************************************************/ Chris@16: struct re_set : public re_syntax_base Chris@16: { Chris@16: unsigned char _map[1 << CHAR_BIT]; Chris@16: }; Chris@16: Chris@16: /*** struct re_jump *************************************************** Chris@16: Jump to a new location in the machine (not next). Chris@16: ***********************************************************************/ Chris@16: struct re_jump : public re_syntax_base Chris@16: { Chris@16: offset_type alt; // location to jump to Chris@16: }; Chris@16: Chris@16: /*** struct re_alt *************************************************** Chris@16: Jump to a new location in the machine (possibly next). Chris@16: ***********************************************************************/ Chris@16: struct re_alt : public re_jump Chris@16: { Chris@16: unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump Chris@16: unsigned int can_be_null; // true if we match a NULL string Chris@16: }; Chris@16: Chris@16: /*** struct re_repeat ************************************************* Chris@16: Repeat a section of the machine Chris@16: ***********************************************************************/ Chris@16: struct re_repeat : public re_alt Chris@16: { Chris@16: std::size_t min, max; // min and max allowable repeats Chris@16: int state_id; // Unique identifier for this repeat Chris@16: bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) Chris@16: bool greedy; // True if this is a greedy repeat Chris@16: }; Chris@16: Chris@16: /*** struct re_recurse ************************************************ Chris@16: Recurse to a particular subexpression. Chris@16: **********************************************************************/ Chris@16: struct re_recurse : public re_jump Chris@16: { Chris@16: int state_id; // identifier of first nested repeat within the recursion. Chris@16: }; Chris@16: Chris@16: /*** enum re_jump_size_type ******************************************* Chris@16: Provides compiled size of re_jump structure (allowing for trailing alignment). Chris@16: We provide this so we know how manybytes to insert when constructing the machine Chris@16: (The value of padding_mask is defined in regex_raw_buffer.hpp). Chris@16: ***********************************************************************/ Chris@16: enum re_jump_size_type Chris@16: { Chris@16: re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), Chris@16: re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), Chris@16: re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) Chris@16: }; Chris@16: Chris@16: /*** proc re_is_set_member ********************************************* Chris@16: Forward declaration: we'll need this one later... Chris@16: ***********************************************************************/ Chris@16: Chris@16: template Chris@16: struct regex_data; Chris@16: Chris@16: template Chris@16: iterator BOOST_REGEX_CALL re_is_set_member(iterator next, Chris@16: iterator last, Chris@16: const re_set_long* set_, Chris@16: const regex_data& e, bool icase); Chris@16: Chris@16: } // namespace re_detail Chris@16: Chris@16: } // namespace boost Chris@16: Chris@16: #ifdef BOOST_MSVC Chris@16: #pragma warning(push) Chris@16: #pragma warning(disable: 4103) Chris@16: #endif Chris@16: #ifdef BOOST_HAS_ABI_HEADERS Chris@16: # include BOOST_ABI_SUFFIX Chris@16: #endif Chris@16: #ifdef BOOST_MSVC Chris@16: #pragma warning(pop) Chris@16: #endif Chris@16: Chris@16: #endif Chris@16: Chris@16: