Chris@16: // Chris@16: // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) Chris@16: // Chris@16: // Distributed under the Boost Software License, Version 1.0. (See Chris@16: // accompanying file LICENSE_1_0.txt or copy at Chris@16: // http://www.boost.org/LICENSE_1_0.txt) Chris@16: // Chris@16: #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED Chris@16: #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED Chris@16: Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #ifdef BOOST_MSVC Chris@16: # pragma warning(push) Chris@16: # pragma warning(disable : 4275 4251 4231 4660) Chris@16: #endif Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: #include Chris@16: Chris@16: #include Chris@16: Chris@16: namespace boost { Chris@16: Chris@16: namespace locale { Chris@16: Chris@16: namespace boundary { Chris@16: /// Chris@16: /// \defgroup boundary Boundary Analysis Chris@16: /// Chris@16: /// This module contains all operations required for %boundary analysis of text: character, word, like and sentence boundaries Chris@16: /// Chris@16: /// @{ Chris@16: /// Chris@16: Chris@16: /// \cond INTERNAL Chris@16: Chris@16: namespace details { Chris@16: Chris@16: template::iterator_category> Chris@16: struct mapping_traits { Chris@16: typedef typename std::iterator_traits::value_type char_type; Chris@16: static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) Chris@16: { Chris@16: std::basic_string str(b,e); Chris@16: return std::use_facet >(l).map(t,str.c_str(),str.c_str()+str.size()); Chris@16: } Chris@16: }; Chris@16: Chris@16: template Chris@16: struct linear_iterator_traits { Chris@16: static const bool is_linear = Chris@16: is_same::value Chris@16: || is_same::value Chris@16: || is_same::iterator>::value Chris@16: || is_same::const_iterator>::value 
Chris@16: || is_same::iterator>::value Chris@16: || is_same::const_iterator>::value Chris@16: ; Chris@16: }; Chris@16: Chris@16: Chris@16: Chris@16: template Chris@16: struct mapping_traits { Chris@16: Chris@16: typedef typename std::iterator_traits::value_type char_type; Chris@16: Chris@16: Chris@16: Chris@16: static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) Chris@16: { Chris@16: index_type result; Chris@16: Chris@16: // Chris@16: // Optimize for most common cases Chris@16: // Chris@16: // C++0x requires that string is continious in memory and all known Chris@16: // string implementations Chris@16: // do this because of c_str() support. Chris@16: // Chris@16: Chris@16: if(linear_iterator_traits::is_linear && b!=e) Chris@16: { Chris@16: char_type const *begin = &*b; Chris@16: char_type const *end = begin + (e-b); Chris@16: index_type tmp=std::use_facet >(l).map(t,begin,end); Chris@16: result.swap(tmp); Chris@16: } Chris@16: else { Chris@16: std::basic_string str(b,e); Chris@16: index_type tmp = std::use_facet >(l).map(t,str.c_str(),str.c_str()+str.size()); Chris@16: result.swap(tmp); Chris@16: } Chris@16: return result; Chris@16: } Chris@16: }; Chris@16: Chris@16: template Chris@16: class mapping { Chris@16: public: Chris@16: typedef BaseIterator base_iterator; Chris@16: typedef typename std::iterator_traits::value_type char_type; Chris@16: Chris@16: Chris@16: mapping(boundary_type type, Chris@16: base_iterator begin, Chris@16: base_iterator end, Chris@16: std::locale const &loc) Chris@16: : Chris@16: index_(new index_type()), Chris@16: begin_(begin), Chris@16: end_(end) Chris@16: { Chris@16: index_type idx=details::mapping_traits::map(type,begin,end,loc); Chris@16: index_->swap(idx); Chris@16: } Chris@16: Chris@16: mapping() Chris@16: { Chris@16: } Chris@16: Chris@16: index_type const &index() const Chris@16: { Chris@16: return *index_; Chris@16: } Chris@16: Chris@16: base_iterator begin() const Chris@16: { Chris@16: return 
begin_; Chris@16: } Chris@16: Chris@16: base_iterator end() const Chris@16: { Chris@16: return end_; Chris@16: } Chris@16: Chris@16: private: Chris@16: boost::shared_ptr index_; Chris@16: base_iterator begin_,end_; Chris@16: }; Chris@16: Chris@16: template Chris@16: class segment_index_iterator : Chris@16: public boost::iterator_facade< Chris@16: segment_index_iterator, Chris@16: segment, Chris@16: boost::bidirectional_traversal_tag, Chris@16: segment const & Chris@16: > Chris@16: { Chris@16: public: Chris@16: typedef BaseIterator base_iterator; Chris@16: typedef mapping mapping_type; Chris@16: typedef segment segment_type; Chris@16: Chris@16: segment_index_iterator() : current_(0,0),map_(0) Chris@16: { Chris@16: } Chris@16: Chris@16: segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) : Chris@16: map_(map), Chris@16: mask_(mask), Chris@16: full_select_(full_select) Chris@16: { Chris@16: set(p); Chris@16: } Chris@16: segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) : Chris@16: map_(map), Chris@16: mask_(mask), Chris@16: full_select_(full_select) Chris@16: { Chris@16: if(is_begin) Chris@16: set_begin(); Chris@16: else Chris@16: set_end(); Chris@16: } Chris@16: Chris@16: segment_type const &dereference() const Chris@16: { Chris@16: return value_; Chris@16: } Chris@16: Chris@16: bool equal(segment_index_iterator const &other) const Chris@16: { Chris@16: return map_ == other.map_ && current_.second == other.current_.second; Chris@16: } Chris@16: Chris@16: void increment() Chris@16: { Chris@16: std::pair next = current_; Chris@16: if(full_select_) { Chris@16: next.first = next.second; Chris@16: while(next.second < size()) { Chris@16: next.second++; Chris@16: if(valid_offset(next.second)) Chris@16: break; Chris@16: } Chris@16: if(next.second == size()) Chris@16: next.first = next.second - 1; Chris@16: } Chris@16: else { Chris@16: while(next.second < size()) { Chris@16: next.first = 
next.second; Chris@16: next.second++; Chris@16: if(valid_offset(next.second)) Chris@16: break; Chris@16: } Chris@16: } Chris@16: update_current(next); Chris@16: } Chris@16: Chris@16: void decrement() Chris@16: { Chris@16: std::pair next = current_; Chris@16: if(full_select_) { Chris@16: while(next.second >1) { Chris@16: next.second--; Chris@16: if(valid_offset(next.second)) Chris@16: break; Chris@16: } Chris@16: next.first = next.second; Chris@16: while(next.first >0) { Chris@16: next.first--; Chris@16: if(valid_offset(next.first)) Chris@16: break; Chris@16: } Chris@16: } Chris@16: else { Chris@16: while(next.second >1) { Chris@16: next.second--; Chris@16: if(valid_offset(next.second)) Chris@16: break; Chris@16: } Chris@16: next.first = next.second - 1; Chris@16: } Chris@16: update_current(next); Chris@16: } Chris@16: Chris@16: private: Chris@16: Chris@16: void set_end() Chris@16: { Chris@16: current_.first = size() - 1; Chris@16: current_.second = size(); Chris@16: value_ = segment_type(map_->end(),map_->end(),0); Chris@16: } Chris@16: void set_begin() Chris@16: { Chris@16: current_.first = current_.second = 0; Chris@16: value_ = segment_type(map_->begin(),map_->begin(),0); Chris@16: increment(); Chris@16: } Chris@16: Chris@16: void set(base_iterator p) Chris@16: { Chris@16: size_t dist=std::distance(map_->begin(),p); Chris@16: index_type::const_iterator b=map_->index().begin(),e=map_->index().end(); Chris@16: index_type::const_iterator Chris@16: boundary_point=std::upper_bound(b,e,break_info(dist)); Chris@16: while(boundary_point != e && (boundary_point->rule & mask_)==0) Chris@16: boundary_point++; Chris@16: Chris@16: current_.first = current_.second = boundary_point - b; Chris@16: Chris@16: if(full_select_) { Chris@16: while(current_.first > 0) { Chris@16: current_.first --; Chris@16: if(valid_offset(current_.first)) Chris@16: break; Chris@16: } Chris@16: } Chris@16: else { Chris@16: if(current_.first > 0) Chris@16: current_.first --; Chris@16: } Chris@16: 
value_.first = map_->begin(); Chris@16: std::advance(value_.first,get_offset(current_.first)); Chris@16: value_.second = value_.first; Chris@16: std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first)); Chris@16: Chris@16: update_rule(); Chris@16: } Chris@16: Chris@16: void update_current(std::pair pos) Chris@16: { Chris@16: std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first); Chris@16: std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second); Chris@16: std::advance(value_.first,first_diff); Chris@16: std::advance(value_.second,second_diff); Chris@16: current_ = pos; Chris@16: update_rule(); Chris@16: } Chris@16: Chris@16: void update_rule() Chris@16: { Chris@16: if(current_.second != size()) { Chris@16: value_.rule(index()[current_.second].rule); Chris@16: } Chris@16: } Chris@16: size_t get_offset(size_t ind) const Chris@16: { Chris@16: if(ind == size()) Chris@16: return index().back().offset; Chris@16: return index()[ind].offset; Chris@16: } Chris@16: Chris@16: bool valid_offset(size_t offset) const Chris@16: { Chris@16: return offset == 0 Chris@16: || offset == size() // make sure we not acess index[size] Chris@16: || (index()[offset].rule & mask_)!=0; Chris@16: } Chris@16: Chris@16: size_t size() const Chris@16: { Chris@16: return index().size(); Chris@16: } Chris@16: Chris@16: index_type const &index() const Chris@16: { Chris@16: return map_->index(); Chris@16: } Chris@16: Chris@16: Chris@16: segment_type value_; Chris@16: std::pair current_; Chris@16: mapping_type const *map_; Chris@16: rule_type mask_; Chris@16: bool full_select_; Chris@16: }; Chris@16: Chris@16: template Chris@16: class boundary_point_index_iterator : Chris@16: public boost::iterator_facade< Chris@16: boundary_point_index_iterator, Chris@16: boundary_point, Chris@16: boost::bidirectional_traversal_tag, Chris@16: boundary_point const & Chris@16: > Chris@16: { Chris@16: public: Chris@16: typedef BaseIterator 
base_iterator; Chris@16: typedef mapping mapping_type; Chris@16: typedef boundary_point boundary_point_type; Chris@16: Chris@16: boundary_point_index_iterator() : current_(0),map_(0) Chris@16: { Chris@16: } Chris@16: Chris@16: boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) : Chris@16: map_(map), Chris@16: mask_(mask) Chris@16: { Chris@16: if(is_begin) Chris@16: set_begin(); Chris@16: else Chris@16: set_end(); Chris@16: } Chris@16: boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) : Chris@16: map_(map), Chris@16: mask_(mask) Chris@16: { Chris@16: set(p); Chris@16: } Chris@16: Chris@16: boundary_point_type const &dereference() const Chris@16: { Chris@16: return value_; Chris@16: } Chris@16: Chris@16: bool equal(boundary_point_index_iterator const &other) const Chris@16: { Chris@16: return map_ == other.map_ && current_ == other.current_; Chris@16: } Chris@16: Chris@16: void increment() Chris@16: { Chris@16: size_t next = current_; Chris@16: while(next < size()) { Chris@16: next++; Chris@16: if(valid_offset(next)) Chris@16: break; Chris@16: } Chris@16: update_current(next); Chris@16: } Chris@16: Chris@16: void decrement() Chris@16: { Chris@16: size_t next = current_; Chris@16: while(next>0) { Chris@16: next--; Chris@16: if(valid_offset(next)) Chris@16: break; Chris@16: } Chris@16: update_current(next); Chris@16: } Chris@16: Chris@16: private: Chris@16: void set_end() Chris@16: { Chris@16: current_ = size(); Chris@16: value_ = boundary_point_type(map_->end(),0); Chris@16: } Chris@16: void set_begin() Chris@16: { Chris@16: current_ = 0; Chris@16: value_ = boundary_point_type(map_->begin(),0); Chris@16: } Chris@16: Chris@16: void set(base_iterator p) Chris@16: { Chris@16: size_t dist = std::distance(map_->begin(),p); Chris@16: Chris@16: index_type::const_iterator b=index().begin(); Chris@16: index_type::const_iterator e=index().end(); Chris@16: index_type::const_iterator ptr = 
std::lower_bound(b,e,break_info(dist)); Chris@16: Chris@16: if(ptr==index().end()) Chris@16: current_=size()-1; Chris@16: else Chris@16: current_=ptr - index().begin(); Chris@16: Chris@16: while(!valid_offset(current_)) Chris@16: current_ ++; Chris@16: Chris@16: std::ptrdiff_t diff = get_offset(current_) - dist; Chris@16: std::advance(p,diff); Chris@16: value_.iterator(p); Chris@16: update_rule(); Chris@16: } Chris@16: Chris@16: void update_current(size_t pos) Chris@16: { Chris@16: std::ptrdiff_t diff = get_offset(pos) - get_offset(current_); Chris@16: base_iterator i=value_.iterator(); Chris@16: std::advance(i,diff); Chris@16: current_ = pos; Chris@16: value_.iterator(i); Chris@16: update_rule(); Chris@16: } Chris@16: Chris@16: void update_rule() Chris@16: { Chris@16: if(current_ != size()) { Chris@16: value_.rule(index()[current_].rule); Chris@16: } Chris@16: } Chris@16: size_t get_offset(size_t ind) const Chris@16: { Chris@16: if(ind == size()) Chris@16: return index().back().offset; Chris@16: return index()[ind].offset; Chris@16: } Chris@16: Chris@16: bool valid_offset(size_t offset) const Chris@16: { Chris@16: return offset == 0 Chris@16: || offset + 1 >= size() // last and first are always valid regardless of mark Chris@16: || (index()[offset].rule & mask_)!=0; Chris@16: } Chris@16: Chris@16: size_t size() const Chris@16: { Chris@16: return index().size(); Chris@16: } Chris@16: Chris@16: index_type const &index() const Chris@16: { Chris@16: return map_->index(); Chris@16: } Chris@16: Chris@16: Chris@16: boundary_point_type value_; Chris@16: size_t current_; Chris@16: mapping_type const *map_; Chris@16: rule_type mask_; Chris@16: }; Chris@16: Chris@16: Chris@16: } // details Chris@16: Chris@16: /// \endcond Chris@16: Chris@16: template Chris@16: class segment_index; Chris@16: Chris@16: template Chris@16: class boundary_point_index; Chris@16: Chris@16: Chris@16: /// Chris@16: /// \brief This class holds an index of segments in the text range and allows to 
iterate over them Chris@16: /// Chris@16: /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators Chris@16: /// to the \ref segment objects. Chris@16: /// Chris@16: /// It provides two options for selecting segments: Chris@16: /// Chris@16: /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to Chris@16: /// various masks such %as \ref word_any. Chris@16: /// \n Chris@16: /// The default is to select any types of boundaries. Chris@16: /// \n Chris@16: /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators Chris@16: /// would iterate only over the words containing Kana letters and \ref word_any would select all types of Chris@16: /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text Chris@16: /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead Chris@16: /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". Chris@16: /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous Chris@16: /// %boundary point does not fit the selected rule. Chris@16: /// \n Chris@16: /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". Chris@16: /// \n Chris@16: /// This text contains three %boundary points separating it into sentences by different rules: Chris@16: /// - The exclamation mark "!" ends the sentence "Hello!" Chris@16: /// - The line feed that splits the sentence "How\nare you?" into two parts. Chris@16: /// - The question mark that ends the second sentence. Chris@16: /// \n Chris@16: /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would Chris@16: /// provide two sentences "Hello!" and "are you?" 
%as only them actually terminated with required Chris@16: /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include Chris@16: /// all the text up to previous valid %boundary point and would return two expected sentences: Chris@16: /// "Hello!" and "How\nare you?". Chris@16: /// Chris@16: /// This class allows to find a segment according to the given iterator in range using \ref find() member Chris@16: /// function. Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text Chris@16: /// invalidates existing iterators and they can't be used any more. Chris@16: /// - segment_index can be created from boundary_point_index or other segment_index that was created with Chris@16: /// same \ref boundary_type. This is very fast operation %as they shared same index Chris@16: /// and it does not require its regeneration. Chris@16: /// Chris@16: /// \see Chris@16: /// Chris@16: /// - \ref boundary_point_index Chris@16: /// - \ref segment Chris@16: /// - \ref boundary_point Chris@16: /// Chris@16: Chris@16: template Chris@16: class segment_index { Chris@16: public: Chris@16: Chris@16: /// Chris@16: /// The type of the iterator used to iterate over the original text Chris@16: /// Chris@16: typedef BaseIterator base_iterator; Chris@16: #ifdef BOOST_LOCALE_DOXYGEN Chris@16: /// Chris@16: /// The bidirectional iterator that iterates over \ref value_type objects. Chris@16: /// Chris@16: /// - The iterators may be invalidated by use of any non-const member function Chris@16: /// including but not limited to \ref rule(rule_type) and \ref full_select(bool). Chris@16: /// - The returned value_type object is valid %as long %as iterator points to it. 
Chris@16: /// So this following code is wrong %as t used after p was updated: Chris@16: /// \code Chris@16: /// segment_index::iterator p=index.begin(); Chris@16: /// segment &t = *p; Chris@16: /// ++p; Chris@16: /// cout << t.str() << endl; Chris@16: /// \endcode Chris@16: /// Chris@16: typedef unspecified_iterator_type iterator; Chris@16: /// Chris@16: /// \copydoc iterator Chris@16: /// Chris@16: typedef unspecified_iterator_type const_iterator; Chris@16: #else Chris@16: typedef details::segment_index_iterator iterator; Chris@16: typedef details::segment_index_iterator const_iterator; Chris@16: #endif Chris@16: /// Chris@16: /// The type dereferenced by the \ref iterator and \ref const_iterator. It is Chris@16: /// an object that represents selected segment. Chris@16: /// Chris@16: typedef segment value_type; Chris@16: Chris@16: /// Chris@16: /// Default constructor. Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// When this object is constructed by default it does not include a valid index, thus Chris@16: /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined Chris@16: /// behavior Chris@16: /// Chris@16: segment_index() : mask_(0xFFFFFFFFu),full_select_(false) Chris@16: { Chris@16: } Chris@16: /// Chris@16: /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) using a rule \a mask for locale \a loc. Chris@16: /// Chris@16: segment_index(boundary_type type, Chris@16: base_iterator begin, Chris@16: base_iterator end, Chris@16: rule_type mask, Chris@16: std::locale const &loc=std::locale()) Chris@16: : Chris@16: map_(type,begin,end,loc), Chris@16: mask_(mask), Chris@16: full_select_(false) Chris@16: { Chris@16: } Chris@16: /// Chris@16: /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. 
Chris@16: /// Chris@16: segment_index(boundary_type type, Chris@16: base_iterator begin, Chris@16: base_iterator end, Chris@16: std::locale const &loc=std::locale()) Chris@16: : Chris@16: map_(type,begin,end,loc), Chris@16: mask_(0xFFFFFFFFu), Chris@16: full_select_(false) Chris@16: { Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information Chris@16: /// and uses the default rule (all possible segments) Chris@16: /// Chris@16: /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text Chris@16: /// range it is much better to create one from another rather than indexing the same Chris@16: /// range twice. Chris@16: /// Chris@16: /// \note \ref rule() flags are not copied Chris@16: /// Chris@16: segment_index(boundary_point_index const &); Chris@16: /// Chris@16: /// Copy an index from a \ref boundary_point_index. It copies all indexing information Chris@16: /// and keeps the current \ref rule() and \ref full_select() unchanged Chris@16: /// Chris@16: /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text Chris@16: /// range it is much better to create one from another rather than indexing the same Chris@16: /// range twice. Chris@16: /// Chris@16: /// \note \ref rule() flags are not copied Chris@16: /// Chris@16: segment_index const &operator = (boundary_point_index const &); Chris@16: Chris@16: Chris@16: /// Chris@16: /// Create a new index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) for locale \a loc. Chris@16: /// Chris@16: /// \note \ref rule() and \ref full_select() remain unchanged. 
Chris@16: /// Chris@16: void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) Chris@16: { Chris@16: map_ = mapping_type(type,begin,end,loc); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the \ref iterator on the beginning of the segments range. Chris@16: /// Chris@16: /// Preconditions: the segment_index should have a mapping Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member functions of this object Chris@16: /// Chris@16: iterator begin() const Chris@16: { Chris@16: return iterator(true,&map_,mask_,full_select_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the \ref iterator on the ending of the segments range. Chris@16: /// Chris@16: /// Preconditions: the segment_index should have a mapping Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member functions of this object Chris@16: /// Chris@16: iterator end() const Chris@16: { Chris@16: return iterator(false,&map_,mask_,full_select_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Find a first valid segment following a position \a p. Chris@16: /// Chris@16: /// If \a p is inside a valid segment this segment is selected: Chris@16: /// Chris@16: /// For example: For \ref word %boundary analysis with \ref word_any rule(): Chris@16: /// Chris@16: /// - "to| be or ", would point to "be", Chris@16: /// - "t|o be or ", would point to "to", Chris@16: /// - "to be or| ", would point to end. Chris@16: /// Chris@16: /// Chris@16: /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator Chris@16: /// to the text in the mapped range. 
Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member functions of this object Chris@16: /// Chris@16: iterator find(base_iterator p) const Chris@16: { Chris@16: return iterator(p,&map_,mask_,full_select_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the mask of rules that are used Chris@16: /// Chris@16: rule_type rule() const Chris@16: { Chris@16: return mask_; Chris@16: } Chris@16: /// Chris@16: /// Set the mask of rules that are used Chris@16: /// Chris@16: void rule(rule_type v) Chris@16: { Chris@16: mask_ = v; Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the full_select property value - should segment include in the range Chris@16: /// values that do not belong to the specific \ref rule() or not. Chris@16: /// Chris@16: /// The default value is false. Chris@16: /// Chris@16: /// For example for \ref sentence %boundary with rule \ref sentence_term the segments Chris@16: /// of text "Hello! How\nare you?" are "Hello! ", "are you?" when full_select() is false Chris@16: /// because "How\n" is selected %as sentence by a rule that splits the text by line feed. If full_select() Chris@16: /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the Chris@16: /// following part "are you?" Chris@16: /// Chris@16: Chris@16: bool full_select() const Chris@16: { Chris@16: return full_select_; Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Set the full_select property value - should segment include in the range Chris@16: /// values that do not belong to the specific \ref rule() or not. Chris@16: /// Chris@16: /// The default value is false. Chris@16: /// Chris@16: /// For example for \ref sentence %boundary with rule \ref sentence_term the segments Chris@16: /// of text "Hello! How\nare you?" are "Hello! ", "are you?" when full_select() is false Chris@16: /// because "How\n" is selected %as sentence by a rule that splits the text by line feed. 
If full_select() Chris@16: /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the Chris@16: /// following part "are you?" Chris@16: /// Chris@16: Chris@16: void full_select(bool v) Chris@16: { Chris@16: full_select_ = v; Chris@16: } Chris@16: Chris@16: private: Chris@16: friend class boundary_point_index; Chris@16: typedef details::mapping mapping_type; Chris@16: mapping_type map_; Chris@16: rule_type mask_; Chris@16: bool full_select_; Chris@16: }; Chris@16: Chris@16: /// Chris@16: /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating Chris@16: /// over them. Chris@16: /// Chris@16: /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators Chris@16: /// to the \ref boundary_point objects. Chris@16: /// Chris@16: /// It provides an option that affects selecting %boundary points according to different rules: Chris@16: /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific Chris@16: /// types of %boundary points like \ref sentence_term. Chris@16: /// Chris@16: /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default Chris@16: /// rule is used the %boundary points would be: Chris@16: /// Chris@16: /// - "|Hello! How\nare you?" Chris@16: /// - "Hello! |How\nare you?" Chris@16: /// - "Hello! How\n|are you?" Chris@16: /// - "Hello! How\nare you?|" Chris@16: /// Chris@16: /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: Chris@16: /// Chris@16: /// - "|Hello! How\nare you?" Chris@16: /// - "Hello! |How\nare you?" Chris@16: /// - "Hello! How\nare you?|" Chris@16: /// Chris@16: /// Such that a %boundary point defined by a line feed character would be ignored. 
Chris@16: /// Chris@16: /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member Chris@16: /// function. Chris@16: /// Chris@16: /// \note Chris@16: /// - Even an empty text range [x,x) is considered to have one %boundary point x. Chris@16: /// - \a a and \a b points of the range [a,b) are always considered %boundary points Chris@16: /// regardless of the rules used. Chris@16: /// - Changing the \ref rule() option or, of course, re-indexing the text Chris@16: /// invalidates existing iterators and they can't be used any more. Chris@16: /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with Chris@16: /// same \ref boundary_type. This is a very fast operation %as they share the same index Chris@16: /// and it does not require its regeneration. Chris@16: /// Chris@16: /// \see Chris@16: /// Chris@16: /// - \ref segment_index Chris@16: /// - \ref boundary_point Chris@16: /// - \ref segment Chris@16: /// Chris@16: Chris@16: Chris@16: template Chris@16: class boundary_point_index { Chris@16: public: Chris@16: /// Chris@16: /// The type of the iterator used to iterate over the original text Chris@16: /// Chris@16: typedef BaseIterator base_iterator; Chris@16: #ifdef BOOST_LOCALE_DOXYGEN Chris@16: /// Chris@16: /// The bidirectional iterator that iterates over \ref value_type objects. Chris@16: /// Chris@16: /// - The iterators may be invalidated by use of any non-const member function Chris@16: /// including but not limited to \ref rule(rule_type) member function. Chris@16: /// - The returned value_type object is valid %as long %as iterator points to it. 
Chris@16: /// So the following code is wrong %as t is used after p was updated: Chris@16: /// \code Chris@16: /// boundary_point_index::iterator p=index.begin(); Chris@16: /// boundary_point &t = *p; Chris@16: /// ++p; Chris@16: /// rule_type r = t.rule(); Chris@16: /// \endcode Chris@16: /// Chris@16: typedef unspecified_iterator_type iterator; Chris@16: /// Chris@16: /// \copydoc iterator Chris@16: /// Chris@16: typedef unspecified_iterator_type const_iterator; Chris@16: #else Chris@16: typedef details::boundary_point_index_iterator iterator; Chris@16: typedef details::boundary_point_index_iterator const_iterator; Chris@16: #endif Chris@16: /// Chris@16: /// The type dereferenced by the \ref iterator and \ref const_iterator. It is Chris@16: /// an object that represents the selected \ref boundary_point "boundary point". Chris@16: /// Chris@16: typedef boundary_point value_type; Chris@16: Chris@16: /// Chris@16: /// Default constructor. Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// When this object is constructed by default it does not include a valid index, thus Chris@16: /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined Chris@16: /// behavior Chris@16: /// Chris@16: boundary_point_index() : mask_(0xFFFFFFFFu) Chris@16: { Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) using a rule \a mask for locale \a loc. 
Chris@16: /// Chris@16: boundary_point_index(boundary_type type, Chris@16: base_iterator begin, Chris@16: base_iterator end, Chris@16: rule_type mask, Chris@16: std::locale const &loc=std::locale()) Chris@16: : Chris@16: map_(type,begin,end,loc), Chris@16: mask_(mask) Chris@16: { Chris@16: } Chris@16: /// Chris@16: /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. Chris@16: /// Chris@16: boundary_point_index(boundary_type type, Chris@16: base_iterator begin, Chris@16: base_iterator end, Chris@16: std::locale const &loc=std::locale()) Chris@16: : Chris@16: map_(type,begin,end,loc), Chris@16: mask_(0xFFFFFFFFu) Chris@16: { Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information Chris@16: /// and uses the default rule (all possible %boundary points) Chris@16: /// Chris@16: /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text Chris@16: /// range it is much better to create one from another rather than indexing the same Chris@16: /// range twice. Chris@16: /// Chris@16: /// \note \ref rule() flags are not copied Chris@16: /// Chris@16: boundary_point_index(segment_index const &other); Chris@16: /// Chris@16: /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information Chris@16: /// and keeps the current \ref rule() unchanged Chris@16: /// Chris@16: /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text Chris@16: /// range it is much better to create one from another rather than indexing the same Chris@16: /// range twice. 
Chris@16: /// Chris@16: /// \note \ref rule() flags are not copied Chris@16: /// Chris@16: boundary_point_index const &operator=(segment_index const &other); Chris@16: Chris@16: /// Chris@16: /// Create a new index for %boundary analysis \ref boundary_type "type" of the text Chris@16: /// in range [begin,end) for locale \a loc. Chris@16: /// Chris@16: /// \note \ref rule() remains unchanged. Chris@16: /// Chris@16: void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) Chris@16: { Chris@16: map_ = mapping_type(type,begin,end,loc); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the \ref iterator on the beginning of the %boundary points range. Chris@16: /// Chris@16: /// Preconditions: this boundary_point_index should have a mapping Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member functions of this object Chris@16: /// Chris@16: iterator begin() const Chris@16: { Chris@16: return iterator(true,&map_,mask_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the \ref iterator on the ending of the %boundary points range. Chris@16: /// Chris@16: /// Preconditions: this boundary_point_index should have a mapping Chris@16: /// Chris@16: /// \note Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member functions of this object Chris@16: /// Chris@16: iterator end() const Chris@16: { Chris@16: return iterator(false,&map_,mask_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Find a first valid %boundary point on a position \a p or following it. 
Chris@16: /// Chris@16: /// For example: For \ref word %boundary analysis of the text "to be or" Chris@16: /// Chris@16: /// - "|to be", would return %boundary point at "|to be", Chris@16: /// - "t|o be", would point to "to| be" Chris@16: /// Chris@16: /// Preconditions: the boundary_point_index should have a mapping and \a p should be a valid iterator Chris@16: /// to the text in the mapped range. Chris@16: /// Chris@16: /// The returned iterator is invalidated by access to any non-const member function of this object Chris@16: /// Chris@16: iterator find(base_iterator p) const Chris@16: { Chris@16: return iterator(p,&map_,mask_); Chris@16: } Chris@16: Chris@16: /// Chris@16: /// Get the mask of rules currently in use (the value handed to the iterators this index creates) Chris@16: /// Chris@16: rule_type rule() const Chris@16: { Chris@16: return mask_; Chris@16: } Chris@16: /// Chris@16: /// Set the mask of rules that are used; the existing mapping is left untouched Chris@16: /// Chris@16: void rule(rule_type v) Chris@16: { Chris@16: mask_ = v; Chris@16: } Chris@16: Chris@16: private: Chris@16: Chris@16: friend class segment_index; Chris@16: typedef details::mapping mapping_type; Chris@16: mapping_type map_; Chris@16: rule_type mask_; Chris@16: }; Chris@16: Chris@16: /// \cond INTERNAL Chris@16: template Chris@16: segment_index::segment_index(boundary_point_index const &other) : Chris@16: map_(other.map_), Chris@16: mask_(0xFFFFFFFFu), Chris@16: full_select_(false) Chris@16: { Chris@16: } Chris@16: Chris@16: template Chris@16: boundary_point_index::boundary_point_index(segment_index const &other) : Chris@16: map_(other.map_), Chris@16: mask_(0xFFFFFFFFu) Chris@16: { Chris@16: } Chris@16: Chris@16: template Chris@16: segment_index const &segment_index::operator=(boundary_point_index const &other) Chris@16: { Chris@16: map_ = other.map_; Chris@16: return *this; Chris@16: } Chris@16: Chris@16: template Chris@16: boundary_point_index const &boundary_point_index::operator=(segment_index const &other) Chris@16: { Chris@16: map_ = other.map_; Chris@16: return *this; 
Chris@16: } Chris@16: /// \endcond Chris@16: Chris@16: typedef segment_index ssegment_index; ///< convenience typedef Chris@16: typedef segment_index wssegment_index; ///< convenience typedef Chris@16: #ifdef BOOST_HAS_CHAR16_T Chris@16: typedef segment_index u16ssegment_index;///< convenience typedef Chris@16: #endif Chris@16: #ifdef BOOST_HAS_CHAR32_T Chris@16: typedef segment_index u32ssegment_index;///< convenience typedef Chris@16: #endif Chris@16: Chris@16: typedef segment_index csegment_index; ///< convenience typedef Chris@16: typedef segment_index wcsegment_index; ///< convenience typedef Chris@16: #ifdef BOOST_HAS_CHAR16_T Chris@16: typedef segment_index u16csegment_index; ///< convenience typedef Chris@16: #endif Chris@16: #ifdef BOOST_HAS_CHAR32_T Chris@16: typedef segment_index u32csegment_index; ///< convenience typedef Chris@16: #endif Chris@16: Chris@16: typedef boundary_point_index sboundary_point_index;///< convenience typedef Chris@16: typedef boundary_point_index wsboundary_point_index;///< convenience typedef Chris@16: #ifdef BOOST_HAS_CHAR16_T Chris@16: typedef boundary_point_index u16sboundary_point_index;///< convenience typedef Chris@16: #endif Chris@16: #ifdef BOOST_HAS_CHAR32_T Chris@16: typedef boundary_point_index u32sboundary_point_index;///< convenience typedef Chris@16: #endif Chris@16: Chris@16: typedef boundary_point_index cboundary_point_index; ///< convenience typedef Chris@16: typedef boundary_point_index wcboundary_point_index; ///< convenience typedef Chris@16: #ifdef BOOST_HAS_CHAR16_T Chris@16: typedef boundary_point_index u16cboundary_point_index;///< convenience typedef Chris@16: #endif Chris@16: #ifdef BOOST_HAS_CHAR32_T Chris@16: typedef boundary_point_index u32cboundary_point_index;///< convenience typedef Chris@16: #endif Chris@16: Chris@16: Chris@16: Chris@16: } // boundary Chris@16: Chris@16: } // locale Chris@16: } // boost Chris@16: Chris@16: /// Chris@16: /// \example boundary.cpp Chris@16: /// Example of using 
segment_index Chris@16: /// \example wboundary.cpp Chris@16: /// Example of using segment_index over wide strings Chris@16: /// Chris@16: Chris@16: #ifdef BOOST_MSVC Chris@16: #pragma warning(pop) Chris@16: #endif Chris@16: Chris@16: #endif Chris@16: // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4