Chris@16: // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) Chris@16: // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). Chris@16: // Distributed under the Boost Software License, Version 1.0. (See accompany- Chris@16: // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) Chris@16: Chris@16: #ifndef BOOST_UTF8_CODECVT_FACET_HPP Chris@16: #define BOOST_UTF8_CODECVT_FACET_HPP Chris@16: Chris@16: // MS compatible compilers support #pragma once Chris@16: #if defined(_MSC_VER) && (_MSC_VER >= 1020) Chris@16: # pragma once Chris@16: #endif Chris@16: Chris@16: /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 Chris@16: // utf8_codecvt_facet.hpp Chris@16: Chris@101: // This header defines class utf8_codecvt_facet, derived from Chris@16: // std::codecvt, which can be used to convert utf8 data in Chris@16: // files into wchar_t strings in the application. Chris@16: // Chris@16: // The header is NOT STANDALONE, and is not to be included by the USER. Chris@16: // There are at least two libraries which want to use this functionality, and Chris@16: // we want to avoid code duplication. It would be possible to create a utf8 Chris@16: // library, but: Chris@16: // - this requires a review process first Chris@16: // - in that case, when linking a library which uses utf8 Chris@16: // (say 'program_options'), the user should also link to the utf8 library. Chris@16: // This seems inconvenient, and asking a user to link to an unreviewed Chris@16: // library is strange. 
Chris@16: // Until the above points are fixed, a library which wants to use utf8 must: Chris@101: // - include this header in one of its headers or sources Chris@101: // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one Chris@101: // of its sources Chris@16: // - before including either file, the library must define Chris@16: // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used Chris@16: // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace Chris@101: // declaration. Chris@16: // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable' Chris@16: // symbols. Chris@16: // Chris@16: // For example, program_options library might contain: Chris@16: // #define BOOST_UTF8_BEGIN_NAMESPACE Chris@16: // namespace boost { namespace program_options { Chris@16: // #define BOOST_UTF8_END_NAMESPACE }} Chris@16: // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL Chris@101: // #include <boost/detail/utf8_codecvt_facet.hpp> Chris@16: // Chris@16: // Essentially, each library will have its own copy of utf8 code, in Chris@16: // different namespaces. Chris@16: Chris@16: // Note:(Robert Ramey). I have made the following alterations in the original Chris@16: // code. Chris@16: // a) Rendered utf8_codecvt with using templates Chris@16: // b) Move longer functions outside class definition to prevent inlining Chris@16: // and make code smaller Chris@16: // c) added on a derived class to permit translation to/from current Chris@16: // locale to utf8 Chris@16: Chris@16: // See http://www.boost.org for updates, documentation, and revision history. Chris@16: Chris@16: // archives stored as text - note these are templated on the basic Chris@16: // stream templates to accommodate wide (and other?) kinds of characters Chris@16: // Chris@16: // note the fact that on libraries without wide characters, ostream Chris@16: // is not a specialization of basic_ostream which in fact is not defined Chris@16: // in such cases. 
So we can't use basic_ostream but rather Chris@16: // use two template parameters Chris@16: // Chris@16: // utf8_codecvt_facet Chris@16: // This is an implementation of a std::codecvt facet for translating Chris@16: // from UTF-8 externally to UCS-4. Note that this is not tied to Chris@16: // any specific types in order to allow customization on platforms Chris@16: // where wchar_t is not big enough. Chris@16: // Chris@16: // NOTES: The current implementation jumps through some unpleasant hoops in Chris@16: // order to deal with signed character types. As a std::codecvt_base::result, Chris@16: // it is necessary for the ExternType to be convertible to unsigned char. Chris@16: // I chose not to tie the extern_type explicitly to char. But if any combination Chris@16: // of types other than is used, then std::codecvt must be Chris@16: // specialized on those types for this to work. Chris@16: Chris@16: #include Chris@16: #include // for mbstate_t Chris@16: #include // for std::size_t Chris@16: Chris@16: #include Chris@16: #include Chris@16: Chris@16: #if defined(BOOST_NO_STDC_NAMESPACE) Chris@16: namespace std { Chris@16: using ::mbstate_t; Chris@16: using ::size_t; Chris@16: } Chris@16: #endif Chris@16: Chris@16: // maximum lenght of a multibyte string Chris@16: #define MB_LENGTH_MAX 8 Chris@16: Chris@16: BOOST_UTF8_BEGIN_NAMESPACE Chris@16: Chris@101: //----------------------------------------------------------------------------// Chris@101: // // Chris@101: // utf8_codecvt_facet // Chris@101: // // Chris@101: // See utf8_codecvt_facet.ipp for the implementation. 
// Chris@101: //----------------------------------------------------------------------------// Chris@101: Chris@101: Chris@16: struct BOOST_UTF8_DECL utf8_codecvt_facet : Chris@16: public std::codecvt Chris@16: { Chris@16: public: Chris@16: explicit utf8_codecvt_facet(std::size_t no_locale_manage=0) Chris@16: : std::codecvt(no_locale_manage) Chris@16: {} Chris@16: protected: Chris@16: virtual std::codecvt_base::result do_in( Chris@16: std::mbstate_t& state, Chris@16: const char * from, Chris@16: const char * from_end, Chris@16: const char * & from_next, Chris@16: wchar_t * to, Chris@16: wchar_t * to_end, Chris@16: wchar_t*& to_next Chris@16: ) const; Chris@16: Chris@16: virtual std::codecvt_base::result do_out( Chris@101: std::mbstate_t & state, Chris@101: const wchar_t * from, Chris@101: const wchar_t * from_end, Chris@101: const wchar_t* & from_next, Chris@101: char * to, Chris@101: char * to_end, Chris@101: char * & to_next Chris@16: ) const; Chris@16: Chris@16: bool invalid_continuing_octet(unsigned char octet_1) const { Chris@16: return (octet_1 < 0x80|| 0xbf< octet_1); Chris@16: } Chris@16: Chris@16: bool invalid_leading_octet(unsigned char octet_1) const { Chris@16: return (0x7f < octet_1 && octet_1 < 0xc0) || Chris@16: (octet_1 > 0xfd); Chris@16: } Chris@16: Chris@16: // continuing octets = octets except for the leading octet Chris@101: static unsigned int get_cont_octet_count(unsigned char lead_octet) { Chris@16: return get_octet_count(lead_octet) - 1; Chris@16: } Chris@16: Chris@101: static unsigned int get_octet_count(unsigned char lead_octet); Chris@16: Chris@16: // How many "continuing octets" will be needed for this word Chris@16: // == total octets - 1. 
Chris@16: int get_cont_octet_out_count(wchar_t word) const ; Chris@16: Chris@101: virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW { Chris@101: return false; Chris@101: } Chris@16: Chris@16: // UTF-8 isn't really stateful since we rewind on partial conversions Chris@16: virtual std::codecvt_base::result do_unshift( Chris@16: std::mbstate_t&, Chris@16: char * from, Chris@16: char * /*to*/, Chris@16: char * & next Chris@101: ) const { Chris@16: next = from; Chris@16: return ok; Chris@16: } Chris@16: Chris@101: virtual int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW { Chris@16: const int variable_byte_external_encoding=0; Chris@16: return variable_byte_external_encoding; Chris@16: } Chris@16: Chris@16: // How many char objects can I process to get <= max_limit Chris@16: // wchar_t objects? Chris@16: virtual int do_length( Chris@101: const std::mbstate_t &, Chris@16: const char * from, Chris@16: const char * from_end, Chris@16: std::size_t max_limit Chris@101: ) const Chris@16: #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) Chris@101: throw() Chris@16: #endif Chris@101: ; Chris@101: virtual int do_length( Chris@101: std::mbstate_t & s, Chris@101: const char * from, Chris@101: const char * from_end, Chris@101: std::size_t max_limit Chris@101: ) const Chris@101: #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) Chris@101: throw() Chris@101: #endif Chris@101: { Chris@101: return do_length( Chris@101: const_cast(s), Chris@101: from, Chris@101: from_end, Chris@101: max_limit Chris@101: ); Chris@101: } Chris@16: // Largest possible value do_length(state,from,from_end,1) could return. Chris@101: virtual int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW { Chris@16: return 6; // largest UTF-8 encoding of a UCS-4 character Chris@16: } Chris@16: }; Chris@16: Chris@16: BOOST_UTF8_END_NAMESPACE Chris@16: Chris@16: #endif // BOOST_UTF8_CODECVT_FACET_HPP