//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
#define BOOST_LOCALE_UTF_HPP_INCLUDED

// Fixed-width integer types (uint16_t, uint32_t) used below.
// NOTE(review): the include target was missing in the reviewed copy of this
// file; <stdint.h> supplies the needed names in the global namespace -
// confirm against the project's preferred header (e.g. <boost/cstdint.hpp>).
#include <stdint.h>

namespace boost {
namespace locale {
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// All functions defined in this namespace do not require linking with Boost.Locale library
///
namespace utf {
    /// \cond INTERNAL
    #ifdef __GNUC__
    #   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
    #   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
    #else
    #   define BOOST_LOCALE_LIKELY(x)   (x)
    #   define BOOST_LOCALE_UNLIKELY(x) (x)
    #endif
    /// \endcond

    ///
    /// \brief The integral type that can hold a Unicode code point
    ///
    typedef uint32_t code_point;

    ///
    /// \brief Special constant that defines illegal code point
    ///
    static const code_point illegal = 0xFFFFFFFFu;

    ///
    /// \brief Special constant that defines incomplete code point
    ///
    static const code_point incomplete = 0xFFFFFFFEu;

    ///
    /// \brief the function checks if \a v is a valid code point
    ///
    /// Returns false for values above 0x10FFFF and for the surrogate
    /// range [0xD800,0xDFFF], true otherwise.
    ///
    inline bool is_valid_codepoint(code_point v)
    {
        if(v>0x10FFFF)
            return false;
        if(0xD800 <=v && v<= 0xDFFF) // surrogates
            return false;
        return true;
    }

    #ifdef BOOST_LOCALE_DOXYGEN
    ///
    /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
    ///
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits {
        ///
        /// The type of the character
        ///
        typedef CharType char_type;
        ///
        /// Read one code point from the range [p,e) and return it.
        ///
        /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
        /// - If illegal sequence detected returns \ref illegal
        ///
        /// Requirements
        ///
        /// - Iterator is valid input iterator
        ///
        /// Postconditions
        ///
        /// - p points one past the last consumed code unit
        ///
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e);

        ///
        /// Maximal width of valid sequence in the code units:
        ///
        /// - UTF-8  - 4
        /// - UTF-16 - 2
        /// - UTF-32 - 1
        ///
        static const int max_width;
        ///
        /// The width of specific code point in the code units.
        ///
        /// Requirement: value is a valid Unicode code point
        /// Returns value in range [1..max_width]
        ///
        static int width(code_point value);

        ///
        /// Get the size of the trail part of variable length encoded sequence.
        ///
        /// Returns -1 if \a c is not valid lead character
        ///
        static int trail_length(char_type c);
        ///
        /// Returns true if c is trail code unit, always false for UTF-32
        ///
        static bool is_trail(char_type c);
        ///
        /// Returns true if c is lead code unit, always true for UTF-32
        ///
        static bool is_lead(char_type c);

        ///
        /// Convert valid Unicode code point \a value to the UTF sequence.
        ///
        /// Requirements:
        ///
        /// - \a value is valid code point
        /// - \a out is an output iterator should be able to accept at least width(value) units
        ///
        /// Returns the iterator past the last written code unit.
        ///
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out);
        ///
        /// Decodes valid UTF sequence that is pointed by p into code point.
        ///
        /// If the sequence is invalid or points to end the behavior is undefined
        ///
        template<typename Iterator>
        static code_point decode_valid(Iterator &p);
    };

    #else

    // Primary template: selected by the size of the code unit type.
    // Only the 1, 2 and 4 byte specializations below are defined.
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits;

    // UTF-8: code units are one byte wide.
    template<typename CharType>
    struct utf_traits<CharType,1> {

        typedef CharType char_type;

        // Number of trail bytes announced by lead byte ci:
        // 0 for ASCII, 1..3 for multi-byte leads, -1 when ci cannot
        // start a sequence (trail bytes, 0xC0/0xC1, and bytes above 0xF4).
        static int trail_length(char_type ci)
        {
            unsigned char c = ci;
            if(c < 128)
                return 0;
            if(BOOST_LOCALE_UNLIKELY(c < 194))
                return -1;
            if(c < 224)
                return 1;
            if(c < 240)
                return 2;
            if(BOOST_LOCALE_LIKELY(c <=244))
                return 3;
            return -1;
        }

        static const int max_width = 4;

        // Number of bytes needed to encode value (1..4).
        static int width(code_point value)
        {
            if(value <=0x7F) {
                return 1;
            }
            else if(value <=0x7FF) {
                return 2;
            }
            else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
                return 3;
            }
            else {
                return 4;
            }
        }

        // True when ci has the bit pattern 10xxxxxx.
        static bool is_trail(char_type ci)
        {
            unsigned char c=ci;
            return (c & 0xC0)==0x80;
        }

        // True for every byte that is not a trail byte.
        static bool is_lead(char_type ci)
        {
            return !is_trail(ci);
        }

        // Reads one code point from [p,e), advancing p past every byte
        // it consumed. Returns incomplete when the range ends in the
        // middle of a sequence and illegal on malformed input.
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e)
        {
            if(BOOST_LOCALE_UNLIKELY(p==e))
                return incomplete;

            unsigned char lead = *p++;

            // First byte is fully validated here
            int trail_size = trail_length(lead);

            if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
                return illegal;

            //
            // Ok as only ASCII may be of size = 0
            // also optimize for ASCII text
            //
            if(trail_size == 0)
                return lead;

            // Keep only the payload bits of the lead byte
            code_point c = lead & ((1<<(6-trail_size))-1);

            // Read the rest; each case deliberately continues into the
            // next one so that exactly trail_size bytes are consumed
            unsigned char tmp;
            switch(trail_size) {
            case 3:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 2:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 1:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
            }

            // Check code point validity: no surrogates and
            // valid range
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return illegal;

            // make sure it is the most compact representation
            if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
                return illegal;

            return c;

        }

        // Decodes one code point without any validation or bounds
        // checks; the caller must guarantee a complete valid sequence.
        template<typename Iterator>
        static code_point decode_valid(Iterator &p)
        {
            unsigned char lead = *p++;
            if(lead < 192)
                return lead;

            int trail_size;

            if(lead < 224)
                trail_size = 1;
            else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
                trail_size = 2;
            else
                trail_size = 3;

            code_point c = lead & ((1<<(6-trail_size))-1);

            // Cases deliberately continue into each other, consuming
            // exactly trail_size bytes
            switch(trail_size) {
            case 3:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 2:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 1:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
            }

            return c;
        }

        // Writes value as 1..4 bytes to out and returns the advanced
        // iterator. value is not validated here.
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out)
        {
            if(value <= 0x7F) {
                *out++ = static_cast<char_type>(value);
            }
            else if(value <= 0x7FF) {
                *out++ = static_cast<char_type>((value >> 6) | 0xC0);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
                *out++ = static_cast<char_type>((value >> 12) | 0xE0);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else {
                *out++ = static_cast<char_type>((value >> 18) | 0xF0);
                *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            return out;
        }
    }; // utf8

    // UTF-16: code units are two bytes wide.
    template<typename CharType>
    struct utf_traits<CharType,2> {
        typedef CharType char_type;

        // See RFC 2781
        static bool is_first_surrogate(uint16_t x)
        {
            return 0xD800 <=x && x<= 0xDBFF;
        }
        static bool is_second_surrogate(uint16_t x)
        {
            return 0xDC00 <=x && x<= 0xDFFF;
        }
        // Builds the code point from a first (w1) and second (w2) surrogate.
        static code_point combine_surrogate(uint16_t w1,uint16_t w2)
        {
            return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
        }
        // 1 for a first surrogate, -1 for a second surrogate (it cannot
        // start a sequence), 0 for any other unit.
        static int trail_length(char_type c)
        {
            if(is_first_surrogate(c))
                return 1;
            if(is_second_surrogate(c))
                return -1;
            return 0;
        }
        ///
        /// Returns true if c is trail code unit, i.e. a second surrogate
        ///
        static bool is_trail(char_type c)
        {
            return is_second_surrogate(c);
        }
        ///
        /// Returns true if c is lead code unit, i.e. anything but a second surrogate
        ///
        static bool is_lead(char_type c)
        {
            return !is_second_surrogate(c);
        }

        // Reads one code point from [current,last), advancing current
        // past every unit it consumed. Returns incomplete when the range
        // ends after a first surrogate and illegal on unpaired surrogates.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return incomplete;
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            if(w1 > 0xDBFF)
                return illegal;
            if(current==last)
                return incomplete;
            uint16_t w2=*current++;
            if(w2 < 0xDC00 || 0xDFFF < w2)
                return illegal;
            return combine_surrogate(w1,w2);
        }
        // Decodes one code point without validation or bounds checks.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            uint16_t w2=*current++;
            return combine_surrogate(w1,w2);
        }

        static const int max_width = 2;
        // 2 units for code points at or above 0x10000, otherwise 1.
        static int width(code_point u)
        {
            return u>=0x10000 ? 2 : 1;
        }
        // Writes u as one unit or as a surrogate pair and returns the
        // advanced iterator. u is not validated here.
        template<typename It>
        static It encode(code_point u,It out)
        {
            if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
                *out++ = static_cast<char_type>(u);
            }
            else {
                u -= 0x10000;
                *out++ = static_cast<char_type>(0xD800 | (u>>10));
                *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
            }
            return out;
        }
    }; // utf16;


    // UTF-32: every code point is exactly one four byte unit.
    template<typename CharType>
    struct utf_traits<CharType,4> {
        typedef CharType char_type;
        // 0 when c is a valid code point, -1 otherwise.
        static int trail_length(char_type c)
        {
            if(is_valid_codepoint(c))
                return 0;
            return -1;
        }
        static bool is_trail(char_type /*c*/)
        {
            return false;
        }
        static bool is_lead(char_type /*c*/)
        {
            return true;
        }

        // Returns the unit at current unchecked and advances past it.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            return *current++;
        }

        // Reads one unit from [current,last): incomplete on an empty
        // range, illegal when the unit is not a valid code point.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return boost::locale::utf::incomplete;
            code_point c=*current++;
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return boost::locale::utf::illegal;
            return c;
        }
        static const int max_width = 1;
        static int width(code_point /*u*/)
        {
            return 1;
        }
        // Writes u as a single unit and returns the advanced iterator.
        template<typename It>
        static It encode(code_point u,It out)
        {
            *out++ = static_cast<char_type>(u);
            return out;
        }

    }; // utf32

    #endif


} // utf
} // locale
} // boost


#endif

// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4