Mercurial > hg > vamp-build-and-test
comparison DEPENDENCIES/generic/include/boost/detail/utf8_codecvt_facet.ipp @ 16:2665513ce2d3
Add boost headers
author | Chris Cannam |
---|---|
date | Tue, 05 Aug 2014 11:11:38 +0100 |
parents | |
children | c530137014c0 |
comparison
equal
deleted
inserted
replaced
15:663ca0da4350 | 16:2665513ce2d3 |
---|---|
1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
2 // utf8_codecvt_facet.ipp | |
3 | |
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) | |
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). | |
6 // Use, modification and distribution is subject to the Boost Software | |
7 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at | |
8 // http://www.boost.org/LICENSE_1_0.txt) | |
9 | |
10 // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to | |
11 // learn how this file should be used. | |
12 | |
13 #include <boost/detail/utf8_codecvt_facet.hpp> | |
14 | |
15 #include <cstdlib> // for multi-byte conversion routines | |
16 #include <cassert> | |
17 | |
18 #include <boost/limits.hpp> | |
19 #include <boost/config.hpp> | |
20 | |
21 // If we don't have wstring, then Unicode support | |
22 // is not available anyway, so we don't even need to | |
23 // compile this file. This also fixes the problem | |
24 // with mingw, which can compile this file, but will | |
25 // generate a link error when building a DLL. | |
26 #ifndef BOOST_NO_STD_WSTRING | |
27 | |
28 BOOST_UTF8_BEGIN_NAMESPACE | |
29 | |
30 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
31 // implementation for wchar_t | |
32 | |
33 // Translate incoming UTF-8 into UCS-4 | |
34 std::codecvt_base::result utf8_codecvt_facet::do_in( | |
35 std::mbstate_t& /*state*/, | |
36 const char * from, | |
37 const char * from_end, | |
38 const char * & from_next, | |
39 wchar_t * to, | |
40 wchar_t * to_end, | |
41 wchar_t * & to_next | |
42 ) const { | |
43 // Basic algorithm: The first octet determines how many | |
44 // octets total make up the UCS-4 character. The remaining | |
45 // "continuing octets" all begin with "10". To convert, subtract | |
46 // the amount that specifies the number of octets from the first | |
47 // octet. Subtract 0x80 (1000 0000) from each continuing octet, | |
48 // then mash the whole lot together. Note that each continuing | |
49 // octet only uses 6 bits as unique values, so only shift by | |
50 // multiples of 6 to combine. | |
51 while (from != from_end && to != to_end) { | |
52 | |
53 // Error checking on the first octet | |
54 if (invalid_leading_octet(*from)){ | |
55 from_next = from; | |
56 to_next = to; | |
57 return std::codecvt_base::error; | |
58 } | |
59 | |
60 // The first octet is adjusted by a value dependent upon | |
61 // the number of "continuing octets" encoding the character | |
62 const int cont_octet_count = get_cont_octet_count(*from); | |
63 const wchar_t octet1_modifier_table[] = { | |
64 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc | |
65 }; | |
66 | |
67 // The unsigned char conversion is necessary in case char is | |
68 // signed (I learned this the hard way) | |
69 wchar_t ucs_result = | |
70 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; | |
71 | |
72 // Invariants : | |
73 // 1) At the start of the loop, 'i' continuing characters have been | |
74 // processed | |
75 // 2) *from points to the next continuing character to be processed. | |
76 int i = 0; | |
77 while(i != cont_octet_count && from != from_end) { | |
78 | |
79 // Error checking on continuing characters | |
80 if (invalid_continuing_octet(*from)) { | |
81 from_next = from; | |
82 to_next = to; | |
83 return std::codecvt_base::error; | |
84 } | |
85 | |
86 ucs_result *= (1 << 6); | |
87 | |
88 // each continuing character has an extra (10xxxxxx)b attached to | |
89 // it that must be removed. | |
90 ucs_result += (unsigned char)(*from++) - 0x80; | |
91 ++i; | |
92 } | |
93 | |
94 // If the buffer ends with an incomplete unicode character... | |
95 if (from == from_end && i != cont_octet_count) { | |
96 // rewind "from" to before the current character translation | |
97 from_next = from - (i+1); | |
98 to_next = to; | |
99 return std::codecvt_base::partial; | |
100 } | |
101 *to++ = ucs_result; | |
102 } | |
103 from_next = from; | |
104 to_next = to; | |
105 | |
106 // Were we done converting or did we run out of destination space? | |
107 if(from == from_end) return std::codecvt_base::ok; | |
108 else return std::codecvt_base::partial; | |
109 } | |
110 | |
111 std::codecvt_base::result utf8_codecvt_facet::do_out( | |
112 std::mbstate_t& /*state*/, | |
113 const wchar_t * from, | |
114 const wchar_t * from_end, | |
115 const wchar_t * & from_next, | |
116 char * to, | |
117 char * to_end, | |
118 char * & to_next | |
119 ) const | |
120 { | |
121 // RG - consider merging this table with the other one | |
122 const wchar_t octet1_modifier_table[] = { | |
123 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc | |
124 }; | |
125 | |
126 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); | |
127 while (from != from_end && to != to_end) { | |
128 | |
129 // Check for invalid UCS-4 character | |
130 if (*from > max_wchar) { | |
131 from_next = from; | |
132 to_next = to; | |
133 return std::codecvt_base::error; | |
134 } | |
135 | |
136 int cont_octet_count = get_cont_octet_out_count(*from); | |
137 | |
138 // RG - comment this formula better | |
139 int shift_exponent = (cont_octet_count) * 6; | |
140 | |
141 // Process the first character | |
142 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + | |
143 (unsigned char)(*from / (1 << shift_exponent))); | |
144 | |
145 // Process the continuation characters | |
146 // Invariants: At the start of the loop: | |
147 // 1) 'i' continuing octets have been generated | |
148 // 2) '*to' points to the next location to place an octet | |
149 // 3) shift_exponent is 6 more than needed for the next octet | |
150 int i = 0; | |
151 while (i != cont_octet_count && to != to_end) { | |
152 shift_exponent -= 6; | |
153 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); | |
154 ++i; | |
155 } | |
156 // If we filled up the out buffer before encoding the character | |
157 if(to == to_end && i != cont_octet_count) { | |
158 from_next = from; | |
159 to_next = to - (i+1); | |
160 return std::codecvt_base::partial; | |
161 } | |
162 ++from; | |
163 } | |
164 from_next = from; | |
165 to_next = to; | |
166 // Were we done or did we run out of destination space | |
167 if(from == from_end) return std::codecvt_base::ok; | |
168 else return std::codecvt_base::partial; | |
169 } | |
170 | |
171 // How many char objects can I process to get <= max_limit | |
172 // wchar_t objects? | |
173 int utf8_codecvt_facet::do_length( | |
174 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, | |
175 const char * from, | |
176 const char * from_end, | |
177 std::size_t max_limit | |
178 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) | |
179 ) const throw() | |
180 #else | |
181 ) const | |
182 #endif | |
183 { | |
184 // RG - this code is confusing! I need a better way to express it. | |
185 // and test cases. | |
186 | |
187 // Invariants: | |
188 // 1) last_octet_count has the size of the last measured character | |
189 // 2) char_count holds the number of characters shown to fit | |
190 // within the bounds so far (no greater than max_limit) | |
191 // 3) from_next points to the octet 'last_octet_count' before the | |
192 // last measured character. | |
193 int last_octet_count=0; | |
194 std::size_t char_count = 0; | |
195 const char* from_next = from; | |
196 // Use "<" because the buffer may represent incomplete characters | |
197 while (from_next+last_octet_count <= from_end && char_count <= max_limit) { | |
198 from_next += last_octet_count; | |
199 last_octet_count = (get_octet_count(*from_next)); | |
200 ++char_count; | |
201 } | |
202 return static_cast<int>(from_next-from_end); | |
203 } | |
204 | |
205 unsigned int utf8_codecvt_facet::get_octet_count( | |
206 unsigned char lead_octet | |
207 ){ | |
208 // if the 0-bit (MSB) is 0, then 1 character | |
209 if (lead_octet <= 0x7f) return 1; | |
210 | |
211 // Otherwise the count number of consecutive 1 bits starting at MSB | |
212 // assert(0xc0 <= lead_octet && lead_octet <= 0xfd); | |
213 | |
214 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; | |
215 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; | |
216 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; | |
217 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; | |
218 else return 6; | |
219 } | |
220 BOOST_UTF8_END_NAMESPACE | |
221 | |
namespace {

// Number of continuation octets (total octets - 1) needed to encode
// 'word' in UTF-8. The template parameter is the size of wchar_t in
// bytes; this primary template serves every size other than 4 and never
// reports more than 2 continuation octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    return (word < 0x80)  ? 0
         : (word < 0x800) ? 1
         : 2;
}

// Specialization for a 4-byte wchar_t, which can hold values that need up
// to 5 continuation octets.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word){
    if (word < 0x800) {
        return (word < 0x80) ? 0 : 1;
    }

    // The wide-range comparisons below trigger warnings on platforms whose
    // wchar_t is UCS2. Those warnings are harmless, because this
    // specialization is never instantiated there, but they break builds
    // that treat warnings as errors, so the comparisons are compiled only
    // when WCHAR_MAX says they can matter. Including
    // <boost/detail/utf8_codecvt_facet.hpp> should be enough to get
    // WCHAR_MAX defined.
#if !defined(WCHAR_MAX)
#   error WCHAR_MAX not defined!
#endif
    // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
    return 2;
#elif WCHAR_MAX > 0x10000

    // Test the largest range first and work downwards.
    if (word >= 0x4000000) {
        return 5;
    }
    if (word >= 0x200000) {
        return 4;
    }
    if (word >= 0x10000) {
        return 3;
    }
    return 2;

#else
    return 2;
#endif
}

} // namespace anonymous
274 | |
275 BOOST_UTF8_BEGIN_NAMESPACE | |
276 // How many "continuing octets" will be needed for this word | |
277 // == total octets - 1. | |
278 int utf8_codecvt_facet::get_cont_octet_out_count( | |
279 wchar_t word | |
280 ) const { | |
281 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); | |
282 } | |
283 BOOST_UTF8_END_NAMESPACE | |
284 | |
285 #endif |