Chris@16: // Copyright (c) 2001-2011 Hartmut Kaiser Chris@16: // Chris@16: // Distributed under the Boost Software License, Version 1.0. (See accompanying Chris@16: // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) Chris@16: Chris@16: #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM) Chris@16: #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM Chris@16: Chris@16: #if defined(_MSC_VER) Chris@16: #pragma once Chris@16: #endif Chris@16: Chris@16: #include Chris@16: Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) Chris@16: #include Chris@16: #endif Chris@16: Chris@16: #include Chris@16: Chris@16: namespace boost { namespace spirit { namespace lex { namespace lexertl Chris@16: { Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: namespace detail Chris@16: { Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: // The must_escape function checks if the given character value needs Chris@16: // to be preceded by a backslash character to disable its special Chris@16: // meaning in the context of a regular expression Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: template Chris@16: inline bool must_escape(Char c) Chris@16: { Chris@16: // FIXME: more needed? Chris@16: switch (c) { Chris@16: case '+': case '/': case '*': case '?': Chris@16: case '|': Chris@16: case '(': case ')': Chris@16: case '[': case ']': Chris@16: case '{': case '}': Chris@16: case '.': Chris@16: case '^': case '$': Chris@16: case '\\': Chris@16: case '"': Chris@16: return true; Chris@16: Chris@16: default: Chris@16: break; Chris@16: } Chris@16: return false; Chris@16: } Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: // The escape function returns the string representation of the given Chris@16: // character value, possibly escaped with a backslash character, to Chris@16: // allow it being safely used in a regular expression definition. Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: template Chris@16: inline std::basic_string escape(Char ch) Chris@16: { Chris@16: std::basic_string result(1, ch); Chris@16: if (detail::must_escape(ch)) Chris@16: { Chris@16: typedef typename std::basic_string::size_type size_type; Chris@16: result.insert((size_type)0, 1, '\\'); Chris@16: } Chris@16: return result; Chris@16: } Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: // Chris@16: /////////////////////////////////////////////////////////////////////// Chris@16: inline boost::lexer::regex_flags map_flags(unsigned int flags) Chris@16: { Chris@16: unsigned int retval = boost::lexer::none; Chris@16: if (flags & match_flags::match_not_dot_newline) Chris@16: retval |= boost::lexer::dot_not_newline; Chris@16: if (flags & match_flags::match_icase) Chris@16: retval |= boost::lexer::icase; Chris@16: Chris@16: return boost::lexer::regex_flags(retval); Chris@16: } Chris@16: } Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: template Chris@16: bool generate_static(Lexer const& Chris@16: , std::basic_ostream& Chris@16: , typename Lexer::char_type const*, F); Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: // Chris@16: // Every lexer type to be used as a lexer for Spirit has to conform to Chris@16: // the following public interface: Chris@16: // Chris@16: // typedefs: Chris@16: // iterator_type The type of the iterator exposed by this lexer. Chris@16: // token_type The type of the tokens returned from the exposed Chris@16: // iterators. Chris@16: // Chris@16: // functions: Chris@16: // default constructor Chris@16: // Since lexers are instantiated as base classes Chris@16: // only it might be a good idea to make this Chris@16: // constructor protected. Chris@16: // begin, end Return a pair of iterators, when dereferenced Chris@16: // returning the sequence of tokens recognized in Chris@16: // the input stream given as the parameters to the Chris@16: // begin() function. Chris@16: // add_token Should add the definition of a token to be Chris@16: // recognized by this lexer. Chris@16: // clear Should delete all current token definitions Chris@16: // associated with the given state of this lexer Chris@16: // object. Chris@16: // Chris@16: // template parameters: Chris@16: // Iterator The type of the iterator used to access the Chris@16: // underlying character stream. Chris@16: // Token The type of the tokens to be returned from the Chris@16: // exposed token iterator. Chris@16: // Functor The type of the InputPolicy to use to instantiate Chris@16: // the multi_pass iterator type to be used as the Chris@16: // token iterator (returned from begin()/end()). Chris@16: // Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: // Chris@16: // The lexer class is a implementation of a Spirit.Lex lexer on Chris@16: // top of Ben Hanson's lexertl library as outlined above (For more Chris@16: // information about lexertl go here: http://www.benhanson.net/lexertl.html). Chris@16: // Chris@16: // This class is supposed to be used as the first and only template Chris@16: // parameter while instantiating instances of a lex::lexer class. Chris@16: // Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: template Chris@16: , typename Iterator = typename Token::iterator_type Chris@16: , typename Functor = functor > Chris@16: class lexer Chris@16: { Chris@16: private: Chris@16: struct dummy { void true_() {} }; Chris@16: typedef void (dummy::*safe_bool)(); Chris@16: Chris@16: static std::size_t const all_states_id = static_cast(-2); Chris@16: Chris@16: public: Chris@16: operator safe_bool() const Chris@16: { return initialized_dfa_ ? &dummy::true_ : 0; } Chris@16: Chris@16: typedef typename boost::detail::iterator_traits::value_type Chris@16: char_type; Chris@16: typedef std::basic_string string_type; Chris@16: Chris@16: typedef boost::lexer::basic_rules basic_rules_type; Chris@16: Chris@16: // Every lexer type to be used as a lexer for Spirit has to conform to Chris@16: // a public interface . Chris@16: typedef Token token_type; Chris@16: typedef typename Token::id_type id_type; Chris@16: typedef iterator iterator_type; Chris@16: Chris@16: private: Chris@16: // this type is purely used for the iterator_type construction below Chris@16: struct iterator_data_type Chris@16: { Chris@16: typedef typename Functor::semantic_actions_type semantic_actions_type; Chris@16: Chris@16: iterator_data_type( Chris@16: boost::lexer::basic_state_machine const& sm Chris@16: , boost::lexer::basic_rules const& rules Chris@16: , semantic_actions_type const& actions) Chris@16: : state_machine_(sm), rules_(rules), actions_(actions) Chris@16: {} Chris@16: Chris@16: boost::lexer::basic_state_machine const& state_machine_; Chris@16: boost::lexer::basic_rules const& rules_; Chris@16: semantic_actions_type const& actions_; Chris@16: Chris@16: private: Chris@16: // silence MSVC warning C4512: assignment operator could not be generated Chris@16: iterator_data_type& operator= (iterator_data_type const&); Chris@16: }; Chris@16: Chris@16: public: Chris@16: // Return the start iterator usable for iterating over the generated Chris@16: // tokens. Chris@16: iterator_type begin(Iterator& first, Iterator const& last Chris@16: , char_type const* initial_state = 0) const Chris@16: { Chris@16: if (!init_dfa()) // never minimize DFA for dynamic lexers Chris@16: return iterator_type(); Chris@16: Chris@16: iterator_data_type iterator_data(state_machine_, rules_, actions_); Chris@16: return iterator_type(iterator_data, first, last, initial_state); Chris@16: } Chris@16: Chris@16: // Return the end iterator usable to stop iterating over the generated Chris@16: // tokens. Chris@16: iterator_type end() const Chris@16: { Chris@16: return iterator_type(); Chris@16: } Chris@16: Chris@16: protected: Chris@16: // Lexer instances can be created by means of a derived class only. Chris@16: lexer(unsigned int flags) Chris@16: : flags_(detail::map_flags(flags)) Chris@16: , rules_(flags_) Chris@16: , initialized_dfa_(false) Chris@16: {} Chris@16: Chris@16: public: Chris@16: // interface for token definition management Chris@16: std::size_t add_token(char_type const* state, char_type tokendef, Chris@16: std::size_t token_id, char_type const* targetstate) Chris@16: { Chris@16: add_state(state); Chris@16: initialized_dfa_ = false; Chris@16: if (state == all_states()) Chris@16: return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot()); Chris@16: Chris@16: if (0 == targetstate) Chris@16: targetstate = state; Chris@16: else Chris@16: add_state(targetstate); Chris@16: return rules_.add(state, detail::escape(tokendef), token_id, targetstate); Chris@16: } Chris@16: std::size_t add_token(char_type const* state, string_type const& tokendef, Chris@16: std::size_t token_id, char_type const* targetstate) Chris@16: { Chris@16: add_state(state); Chris@16: initialized_dfa_ = false; Chris@16: if (state == all_states()) Chris@16: return rules_.add(state, tokendef, token_id, rules_.dot()); Chris@16: Chris@16: if (0 == targetstate) Chris@16: targetstate = state; Chris@16: else Chris@16: add_state(targetstate); Chris@16: return rules_.add(state, tokendef, token_id, targetstate); Chris@16: } Chris@16: Chris@16: // interface for pattern definition management Chris@16: void add_pattern (char_type const* state, string_type const& name, Chris@16: string_type const& patterndef) Chris@16: { Chris@16: add_state(state); Chris@16: rules_.add_macro(name.c_str(), patterndef); Chris@16: initialized_dfa_ = false; Chris@16: } Chris@16: Chris@16: boost::lexer::rules const& get_rules() const { return rules_; } Chris@16: Chris@16: void clear(char_type const* state) Chris@16: { Chris@16: std::size_t s = rules_.state(state); Chris@16: if (boost::lexer::npos != s) Chris@16: rules_.clear(state); Chris@16: initialized_dfa_ = false; Chris@16: } Chris@16: std::size_t add_state(char_type const* state) Chris@16: { Chris@16: if (state == all_states()) Chris@16: return all_states_id; Chris@16: Chris@16: std::size_t stateid = rules_.state(state); Chris@16: if (boost::lexer::npos == stateid) { Chris@16: stateid = rules_.add_state(state); Chris@16: initialized_dfa_ = false; Chris@16: } Chris@16: return stateid; Chris@16: } Chris@16: string_type initial_state() const Chris@16: { Chris@16: return string_type(rules_.initial()); Chris@16: } Chris@16: string_type all_states() const Chris@16: { Chris@16: return string_type(rules_.all_states()); Chris@16: } Chris@16: Chris@16: // Register a semantic action with the given id Chris@16: template Chris@16: void add_action(std::size_t unique_id, std::size_t state, F act) Chris@16: { Chris@16: // If you see an error here stating add_action is not a member of Chris@16: // fusion::unused_type then you are probably having semantic actions Chris@16: // attached to at least one token in the lexer definition without Chris@16: // using the lex::lexertl::actor_lexer<> as its base class. Chris@16: typedef typename Functor::wrap_action_type wrapper_type; Chris@16: if (state == all_states_id) { Chris@16: // add the action to all known states Chris@16: typedef typename Chris@16: basic_rules_type::string_size_t_map::value_type Chris@16: state_type; Chris@16: Chris@16: std::size_t states = rules_.statemap().size(); Chris@16: BOOST_FOREACH(state_type const& s, rules_.statemap()) { Chris@16: for (std::size_t j = 0; j < states; ++j) Chris@16: actions_.add_action(unique_id + j, s.second, wrapper_type::call(act)); Chris@16: } Chris@16: } Chris@16: else { Chris@16: actions_.add_action(unique_id, state, wrapper_type::call(act)); Chris@16: } Chris@16: } Chris@16: // template Chris@16: // void add_action(std::size_t unique_id, char_type const* state, F act) Chris@16: // { Chris@16: // typedef typename Functor::wrap_action_type wrapper_type; Chris@16: // actions_.add_action(unique_id, add_state(state), wrapper_type::call(act)); Chris@16: // } Chris@16: Chris@16: // We do not minimize the state machine by default anymore because Chris@16: // Ben said: "If you can afford to generate a lexer at runtime, there Chris@16: // is little point in calling minimise." Chris@16: // Go figure. Chris@16: bool init_dfa(bool minimize = false) const Chris@16: { Chris@16: if (!initialized_dfa_) { Chris@16: state_machine_.clear(); Chris@16: typedef boost::lexer::basic_generator generator; Chris@16: generator::build (rules_, state_machine_); Chris@16: if (minimize) Chris@16: generator::minimise (state_machine_); Chris@16: Chris@16: #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) Chris@16: boost::lexer::debug::dump(state_machine_, std::cerr); Chris@16: #endif Chris@16: initialized_dfa_ = true; Chris@16: Chris@16: // // release memory held by rules description Chris@16: // basic_rules_type rules; Chris@16: // rules.init_state_info(rules_); // preserve states Chris@16: // std::swap(rules, rules_); Chris@16: } Chris@16: return true; Chris@16: } Chris@16: Chris@16: private: Chris@16: // lexertl specific data Chris@16: mutable boost::lexer::basic_state_machine state_machine_; Chris@16: boost::lexer::regex_flags flags_; Chris@16: /*mutable*/ basic_rules_type rules_; Chris@16: Chris@16: typename Functor::semantic_actions_type actions_; Chris@16: mutable bool initialized_dfa_; Chris@16: Chris@16: // generator functions must be able to access members directly Chris@16: template Chris@16: friend bool generate_static(Lexer const& Chris@16: , std::basic_ostream& Chris@16: , typename Lexer::char_type const*, F); Chris@16: }; Chris@16: Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: // Chris@16: // The actor_lexer class is another implementation of a Spirit.Lex Chris@16: // lexer on top of Ben Hanson's lexertl library as outlined above (For Chris@16: // more information about lexertl go here: Chris@16: // http://www.benhanson.net/lexertl.html). Chris@16: // Chris@16: // The only difference to the lexer class above is that token_def Chris@16: // definitions may have semantic (lexer) actions attached while being Chris@16: // defined: Chris@16: // Chris@16: // int w; Chris@16: // token_def word = "[^ \t\n]+"; Chris@16: // self = word[++ref(w)]; // see example: word_count_lexer Chris@16: // Chris@16: // This class is supposed to be used as the first and only template Chris@16: // parameter while instantiating instances of a lex::lexer class. Chris@16: // Chris@16: /////////////////////////////////////////////////////////////////////////// Chris@16: template Chris@16: , typename Iterator = typename Token::iterator_type Chris@16: , typename Functor = functor > Chris@16: class actor_lexer : public lexer Chris@16: { Chris@16: protected: Chris@16: // Lexer instances can be created by means of a derived class only. Chris@16: actor_lexer(unsigned int flags) Chris@16: : lexer(flags) {} Chris@16: }; Chris@16: Chris@16: }}}} Chris@16: Chris@16: #endif