Chris@16: /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 Chris@16: // utf8_codecvt_facet.ipp Chris@16: Chris@16: // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) Chris@16: // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). Chris@16: // Use, modification and distribution is subject to the Boost Software Chris@16: // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at Chris@16: // http://www.boost.org/LICENSE_1_0.txt) Chris@16: Chris@16: // Please see the comments in to Chris@16: // learn how this file should be used. Chris@16: Chris@16: #include Chris@16: Chris@16: #include // for multi-byte converson routines Chris@16: #include Chris@16: Chris@16: #include Chris@16: #include Chris@16: Chris@16: // If we don't have wstring, then Unicode support Chris@16: // is not available anyway, so we don't need to even Chris@16: // compiler this file. This also fixes the problem Chris@16: // with mingw, which can compile this file, but will Chris@16: // generate link error when building DLL. Chris@16: #ifndef BOOST_NO_STD_WSTRING Chris@16: Chris@16: BOOST_UTF8_BEGIN_NAMESPACE Chris@16: Chris@16: /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 Chris@16: // implementation for wchar_t Chris@16: Chris@16: // Translate incoming UTF-8 into UCS-4 Chris@16: std::codecvt_base::result utf8_codecvt_facet::do_in( Chris@16: std::mbstate_t& /*state*/, Chris@16: const char * from, Chris@16: const char * from_end, Chris@16: const char * & from_next, Chris@16: wchar_t * to, Chris@16: wchar_t * to_end, Chris@16: wchar_t * & to_next Chris@16: ) const { Chris@16: // Basic algorithm: The first octet determines how many Chris@16: // octets total make up the UCS-4 character. The remaining Chris@16: // "continuing octets" all begin with "10". To convert, subtract Chris@16: // the amount that specifies the number of octets from the first Chris@16: // octet. Subtract 0x80 (1000 0000) from each continuing octet, Chris@16: // then mash the whole lot together. Note that each continuing Chris@16: // octet only uses 6 bits as unique values, so only shift by Chris@16: // multiples of 6 to combine. Chris@16: while (from != from_end && to != to_end) { Chris@16: Chris@16: // Error checking on the first octet Chris@16: if (invalid_leading_octet(*from)){ Chris@16: from_next = from; Chris@16: to_next = to; Chris@16: return std::codecvt_base::error; Chris@16: } Chris@16: Chris@16: // The first octet is adjusted by a value dependent upon Chris@16: // the number of "continuing octets" encoding the character Chris@16: const int cont_octet_count = get_cont_octet_count(*from); Chris@16: const wchar_t octet1_modifier_table[] = { Chris@16: 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc Chris@16: }; Chris@16: Chris@16: // The unsigned char conversion is necessary in case char is Chris@16: // signed (I learned this the hard way) Chris@16: wchar_t ucs_result = Chris@16: (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; Chris@16: Chris@16: // Invariants : Chris@16: // 1) At the start of the loop, 'i' continuing characters have been Chris@16: // processed Chris@16: // 2) *from points to the next continuing character to be processed. Chris@16: int i = 0; Chris@16: while(i != cont_octet_count && from != from_end) { Chris@16: Chris@16: // Error checking on continuing characters Chris@16: if (invalid_continuing_octet(*from)) { Chris@16: from_next = from; Chris@16: to_next = to; Chris@16: return std::codecvt_base::error; Chris@16: } Chris@16: Chris@16: ucs_result *= (1 << 6); Chris@16: Chris@16: // each continuing character has an extra (10xxxxxx)b attached to Chris@16: // it that must be removed. Chris@16: ucs_result += (unsigned char)(*from++) - 0x80; Chris@16: ++i; Chris@16: } Chris@16: Chris@16: // If the buffer ends with an incomplete unicode character... Chris@16: if (from == from_end && i != cont_octet_count) { Chris@16: // rewind "from" to before the current character translation Chris@16: from_next = from - (i+1); Chris@16: to_next = to; Chris@16: return std::codecvt_base::partial; Chris@16: } Chris@16: *to++ = ucs_result; Chris@16: } Chris@16: from_next = from; Chris@16: to_next = to; Chris@16: Chris@16: // Were we done converting or did we run out of destination space? Chris@16: if(from == from_end) return std::codecvt_base::ok; Chris@16: else return std::codecvt_base::partial; Chris@16: } Chris@16: Chris@16: std::codecvt_base::result utf8_codecvt_facet::do_out( Chris@16: std::mbstate_t& /*state*/, Chris@16: const wchar_t * from, Chris@16: const wchar_t * from_end, Chris@16: const wchar_t * & from_next, Chris@16: char * to, Chris@16: char * to_end, Chris@16: char * & to_next Chris@16: ) const Chris@16: { Chris@16: // RG - consider merging this table with the other one Chris@16: const wchar_t octet1_modifier_table[] = { Chris@16: 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc Chris@16: }; Chris@16: Chris@16: wchar_t max_wchar = (std::numeric_limits::max)(); Chris@16: while (from != from_end && to != to_end) { Chris@16: Chris@16: // Check for invalid UCS-4 character Chris@16: if (*from > max_wchar) { Chris@16: from_next = from; Chris@16: to_next = to; Chris@16: return std::codecvt_base::error; Chris@16: } Chris@16: Chris@16: int cont_octet_count = get_cont_octet_out_count(*from); Chris@16: Chris@16: // RG - comment this formula better Chris@16: int shift_exponent = (cont_octet_count) * 6; Chris@16: Chris@16: // Process the first character Chris@16: *to++ = static_cast(octet1_modifier_table[cont_octet_count] + Chris@16: (unsigned char)(*from / (1 << shift_exponent))); Chris@16: Chris@16: // Process the continuation characters Chris@16: // Invariants: At the start of the loop: Chris@16: // 1) 'i' continuing octets have been generated Chris@16: // 2) '*to' points to the next location to place an octet Chris@16: // 3) shift_exponent is 6 more than needed for the next octet Chris@16: int i = 0; Chris@16: while (i != cont_octet_count && to != to_end) { Chris@16: shift_exponent -= 6; Chris@16: *to++ = static_cast(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); Chris@16: ++i; Chris@16: } Chris@16: // If we filled up the out buffer before encoding the character Chris@16: if(to == to_end && i != cont_octet_count) { Chris@16: from_next = from; Chris@16: to_next = to - (i+1); Chris@16: return std::codecvt_base::partial; Chris@16: } Chris@16: ++from; Chris@16: } Chris@16: from_next = from; Chris@16: to_next = to; Chris@16: // Were we done or did we run out of destination space Chris@16: if(from == from_end) return std::codecvt_base::ok; Chris@16: else return std::codecvt_base::partial; Chris@16: } Chris@16: Chris@16: // How many char objects can I process to get <= max_limit Chris@16: // wchar_t objects? Chris@16: int utf8_codecvt_facet::do_length( Chris@101: const std::mbstate_t &, Chris@16: const char * from, Chris@16: const char * from_end, Chris@16: std::size_t max_limit Chris@101: ) const Chris@16: #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) Chris@101: throw() Chris@16: #endif Chris@16: { Chris@16: // RG - this code is confusing! I need a better way to express it. Chris@16: // and test cases. Chris@16: Chris@16: // Invariants: Chris@16: // 1) last_octet_count has the size of the last measured character Chris@16: // 2) char_count holds the number of characters shown to fit Chris@16: // within the bounds so far (no greater than max_limit) Chris@16: // 3) from_next points to the octet 'last_octet_count' before the Chris@16: // last measured character. Chris@16: int last_octet_count=0; Chris@16: std::size_t char_count = 0; Chris@16: const char* from_next = from; Chris@16: // Use "<" because the buffer may represent incomplete characters Chris@16: while (from_next+last_octet_count <= from_end && char_count <= max_limit) { Chris@16: from_next += last_octet_count; Chris@16: last_octet_count = (get_octet_count(*from_next)); Chris@16: ++char_count; Chris@16: } Chris@16: return static_cast(from_next-from_end); Chris@16: } Chris@16: Chris@16: unsigned int utf8_codecvt_facet::get_octet_count( Chris@16: unsigned char lead_octet Chris@16: ){ Chris@16: // if the 0-bit (MSB) is 0, then 1 character Chris@16: if (lead_octet <= 0x7f) return 1; Chris@16: Chris@16: // Otherwise the count number of consecutive 1 bits starting at MSB Chris@16: // assert(0xc0 <= lead_octet && lead_octet <= 0xfd); Chris@16: Chris@16: if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; Chris@16: else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; Chris@16: else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; Chris@16: else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; Chris@16: else return 6; Chris@16: } Chris@16: Chris@101: namespace detail { Chris@101: Chris@16: template Chris@16: int get_cont_octet_out_count_impl(wchar_t word){ Chris@16: if (word < 0x80) { Chris@16: return 0; Chris@16: } Chris@16: if (word < 0x800) { Chris@16: return 1; Chris@16: } Chris@16: return 2; Chris@16: } Chris@16: Chris@16: template<> Chris@16: int get_cont_octet_out_count_impl<4>(wchar_t word){ Chris@16: if (word < 0x80) { Chris@16: return 0; Chris@16: } Chris@16: if (word < 0x800) { Chris@16: return 1; Chris@16: } Chris@16: Chris@16: // Note that the following code will generate warnings on some platforms Chris@16: // where wchar_t is defined as UCS2. The warnings are superfluous as the Chris@16: // specialization is never instantitiated with such compilers, but this Chris@16: // can cause problems if warnings are being treated as errors, so we guard Chris@16: // against that. Including as we do Chris@16: // should be enough to get WCHAR_MAX defined. Chris@16: #if !defined(WCHAR_MAX) Chris@16: # error WCHAR_MAX not defined! Chris@16: #endif Chris@16: // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX Chris@16: #if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier Chris@16: return 2; Chris@16: #elif WCHAR_MAX > 0x10000 Chris@16: Chris@16: if (word < 0x10000) { Chris@16: return 2; Chris@16: } Chris@16: if (word < 0x200000) { Chris@16: return 3; Chris@16: } Chris@16: if (word < 0x4000000) { Chris@16: return 4; Chris@16: } Chris@16: return 5; Chris@16: Chris@16: #else Chris@16: return 2; Chris@16: #endif Chris@16: } Chris@16: Chris@101: } // namespace detail Chris@16: Chris@16: // How many "continuing octets" will be needed for this word Chris@16: // == total octets - 1. Chris@16: int utf8_codecvt_facet::get_cont_octet_out_count( Chris@16: wchar_t word Chris@16: ) const { Chris@101: return detail::get_cont_octet_out_count_impl(word); Chris@16: } Chris@16: BOOST_UTF8_END_NAMESPACE Chris@16: Chris@16: #endif