Chris@16
|
1 //
|
Chris@16
|
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
|
Chris@16
|
3 //
|
Chris@16
|
4 // Distributed under the Boost Software License, Version 1.0. (See
|
Chris@16
|
5 // accompanying file LICENSE_1_0.txt or copy at
|
Chris@16
|
6 // http://www.boost.org/LICENSE_1_0.txt)
|
Chris@16
|
7 //
|
Chris@16
|
8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
|
Chris@16
|
9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
|
Chris@16
|
10
|
Chris@16
|
11 #include <boost/cstdint.hpp>
|
Chris@16
|
12
|
Chris@16
|
13 namespace boost {
|
Chris@16
|
14 namespace locale {
|
Chris@16
|
15 ///
|
Chris@16
|
16 /// \brief Namespace that holds basic operations on UTF encoded sequences
|
Chris@16
|
17 ///
|
Chris@16
|
18 /// All functions defined in this namespace do not require linking with Boost.Locale library
|
Chris@16
|
19 ///
|
Chris@16
|
20 namespace utf {
|
Chris@16
|
/// \cond INTERNAL
//
// Branch prediction hints for the decoders and encoders in this header.
//
// The argument is normalised with !!(x) so that:
//  - any non-zero (truthy) value matches the expectation given to
//    __builtin_expect, which compares the expression with the constant as
//    integers rather than as booleans;
//  - both the GCC-compatible branch and the fallback evaluate to 0 or 1.
//
#ifdef __GNUC__
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect(!!(x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect(!!(x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (!!(x))
#   define BOOST_LOCALE_UNLIKELY(x) (!!(x))
#endif
/// \endcond
|
Chris@16
|
30
|
Chris@16
|
///
/// \brief The integral type that can hold a Unicode code point
///
/// A 32 bit unsigned integer. Its two largest values are used as the
/// \ref illegal and \ref incomplete markers declared right after it.
///
typedef uint32_t code_point;

///
/// \brief Special constant that defines illegal code point
///
/// Returned by the decode functions of this header when the input is
/// malformed. The value is above 0x10FFFF, so is_valid_codepoint() never
/// accepts it.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that defines incomplete code point
///
/// Returned by the decode functions of this header when the input range ends
/// before a whole sequence was read. The value is above 0x10FFFF, so
/// is_valid_codepoint() never accepts it.
///
static const code_point incomplete = 0xFFFFFFFEu;
|
Chris@16
|
45
|
Chris@16
|
46 ///
|
Chris@16
|
47 /// \brief the function checks if \a v is a valid code point
|
Chris@16
|
48 ///
|
Chris@16
|
49 inline bool is_valid_codepoint(code_point v)
|
Chris@16
|
50 {
|
Chris@16
|
51 if(v>0x10FFFF)
|
Chris@16
|
52 return false;
|
Chris@16
|
53 if(0xD800 <=v && v<= 0xDFFF) // surragates
|
Chris@16
|
54 return false;
|
Chris@16
|
55 return true;
|
Chris@16
|
56 }
|
Chris@16
|
57
|
Chris@16
|
58 #ifdef BOOST_LOCALE_DOXYGEN
|
Chris@16
|
59 ///
|
Chris@16
|
60 /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
|
Chris@16
|
61 ///
|
Chris@16
|
62 template<typename CharType,int size=sizeof(CharType)>
|
Chris@16
|
63 struct utf_traits {
|
Chris@16
|
64 ///
|
Chris@16
|
65 /// The type of the character
|
Chris@16
|
66 ///
|
Chris@16
|
67 typedef CharType char_type;
|
Chris@16
|
68 ///
|
Chris@16
|
69 /// Read one code point from the range [p,e) and return it.
|
Chris@16
|
70 ///
|
Chris@16
|
71 /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
|
Chris@16
|
72 /// - If illegal sequence detected returns \ref illegal
|
Chris@16
|
73 ///
|
Chris@16
|
74 /// Requirements
|
Chris@16
|
75 ///
|
Chris@16
|
76 /// - Iterator is valid input iterator
|
Chris@16
|
77 ///
|
Chris@16
|
78 /// Postconditions
|
Chris@16
|
79 ///
|
Chris@16
|
80 /// - p points to the last consumed character
|
Chris@16
|
81 ///
|
Chris@16
|
82 template<typename Iterator>
|
Chris@16
|
83 static code_point decode(Iterator &p,Iterator e);
|
Chris@16
|
84
|
Chris@16
|
85 ///
|
Chris@16
|
86 /// Maximal width of valid sequence in the code units:
|
Chris@16
|
87 ///
|
Chris@16
|
88 /// - UTF-8 - 4
|
Chris@16
|
89 /// - UTF-16 - 2
|
Chris@16
|
90 /// - UTF-32 - 1
|
Chris@16
|
91 ///
|
Chris@16
|
92 static const int max_width;
|
Chris@16
|
93 ///
|
Chris@16
|
94 /// The width of specific code point in the code units.
|
Chris@16
|
95 ///
|
Chris@16
|
96 /// Requirement: value is a valid Unicode code point
|
Chris@16
|
97 /// Returns value in range [1..max_width]
|
Chris@16
|
98 ///
|
Chris@16
|
99 static int width(code_point value);
|
Chris@16
|
100
|
Chris@16
|
101 ///
|
Chris@16
|
102 /// Get the size of the trail part of variable length encoded sequence.
|
Chris@16
|
103 ///
|
Chris@16
|
104 /// Returns -1 if C is not valid lead character
|
Chris@16
|
105 ///
|
Chris@16
|
106 static int trail_length(char_type c);
|
Chris@16
|
107 ///
|
Chris@16
|
108 /// Returns true if c is trail code unit, always false for UTF-32
|
Chris@16
|
109 ///
|
Chris@16
|
110 static bool is_trail(char_type c);
|
Chris@16
|
111 ///
|
Chris@16
|
112 /// Returns true if c is lead code unit, always true of UTF-32
|
Chris@16
|
113 ///
|
Chris@16
|
114 static bool is_lead(char_type c);
|
Chris@16
|
115
|
Chris@16
|
116 ///
|
Chris@16
|
117 /// Convert valid Unicode code point \a value to the UTF sequence.
|
Chris@16
|
118 ///
|
Chris@16
|
119 /// Requirements:
|
Chris@16
|
120 ///
|
Chris@16
|
121 /// - \a value is valid code point
|
Chris@16
|
122 /// - \a out is an output iterator should be able to accept at least width(value) units
|
Chris@16
|
123 ///
|
Chris@16
|
124 /// Returns the iterator past the last written code unit.
|
Chris@16
|
125 ///
|
Chris@16
|
126 template<typename Iterator>
|
Chris@16
|
127 static Iterator encode(code_point value,Iterator out);
|
Chris@16
|
128 ///
|
Chris@16
|
129 /// Decodes valid UTF sequence that is pointed by p into code point.
|
Chris@16
|
130 ///
|
Chris@16
|
131 /// If the sequence is invalid or points to end the behavior is undefined
|
Chris@16
|
132 ///
|
Chris@16
|
133 template<typename Iterator>
|
Chris@16
|
134 static code_point decode_valid(Iterator &p);
|
Chris@16
|
135 };
|
Chris@16
|
136
|
Chris@16
|
137 #else
|
Chris@16
|
138
|
Chris@16
|
//
// Primary template: declared only. The second parameter defaults to
// sizeof(CharType) and is what the partial specializations that follow
// (1, 2 and 4) are selected by.
//
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
|
Chris@16
|
141
|
Chris@16
|
142 template<typename CharType>
|
Chris@16
|
143 struct utf_traits<CharType,1> {
|
Chris@16
|
144
|
Chris@16
|
145 typedef CharType char_type;
|
Chris@16
|
146
|
Chris@16
|
147 static int trail_length(char_type ci)
|
Chris@16
|
148 {
|
Chris@16
|
149 unsigned char c = ci;
|
Chris@16
|
150 if(c < 128)
|
Chris@16
|
151 return 0;
|
Chris@16
|
152 if(BOOST_LOCALE_UNLIKELY(c < 194))
|
Chris@16
|
153 return -1;
|
Chris@16
|
154 if(c < 224)
|
Chris@16
|
155 return 1;
|
Chris@16
|
156 if(c < 240)
|
Chris@16
|
157 return 2;
|
Chris@16
|
158 if(BOOST_LOCALE_LIKELY(c <=244))
|
Chris@16
|
159 return 3;
|
Chris@16
|
160 return -1;
|
Chris@16
|
161 }
|
Chris@16
|
162
|
Chris@16
|
163 static const int max_width = 4;
|
Chris@16
|
164
|
Chris@16
|
165 static int width(code_point value)
|
Chris@16
|
166 {
|
Chris@16
|
167 if(value <=0x7F) {
|
Chris@16
|
168 return 1;
|
Chris@16
|
169 }
|
Chris@16
|
170 else if(value <=0x7FF) {
|
Chris@16
|
171 return 2;
|
Chris@16
|
172 }
|
Chris@16
|
173 else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
|
Chris@16
|
174 return 3;
|
Chris@16
|
175 }
|
Chris@16
|
176 else {
|
Chris@16
|
177 return 4;
|
Chris@16
|
178 }
|
Chris@16
|
179 }
|
Chris@16
|
180
|
Chris@16
|
181 static bool is_trail(char_type ci)
|
Chris@16
|
182 {
|
Chris@16
|
183 unsigned char c=ci;
|
Chris@16
|
184 return (c & 0xC0)==0x80;
|
Chris@16
|
185 }
|
Chris@16
|
186
|
Chris@16
|
187 static bool is_lead(char_type ci)
|
Chris@16
|
188 {
|
Chris@16
|
189 return !is_trail(ci);
|
Chris@16
|
190 }
|
Chris@16
|
191
|
Chris@16
|
192 template<typename Iterator>
|
Chris@16
|
193 static code_point decode(Iterator &p,Iterator e)
|
Chris@16
|
194 {
|
Chris@16
|
195 if(BOOST_LOCALE_UNLIKELY(p==e))
|
Chris@16
|
196 return incomplete;
|
Chris@16
|
197
|
Chris@16
|
198 unsigned char lead = *p++;
|
Chris@16
|
199
|
Chris@16
|
200 // First byte is fully validated here
|
Chris@16
|
201 int trail_size = trail_length(lead);
|
Chris@16
|
202
|
Chris@16
|
203 if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
|
Chris@16
|
204 return illegal;
|
Chris@16
|
205
|
Chris@16
|
206 //
|
Chris@16
|
207 // Ok as only ASCII may be of size = 0
|
Chris@16
|
208 // also optimize for ASCII text
|
Chris@16
|
209 //
|
Chris@16
|
210 if(trail_size == 0)
|
Chris@16
|
211 return lead;
|
Chris@16
|
212
|
Chris@16
|
213 code_point c = lead & ((1<<(6-trail_size))-1);
|
Chris@16
|
214
|
Chris@16
|
215 // Read the rest
|
Chris@16
|
216 unsigned char tmp;
|
Chris@16
|
217 switch(trail_size) {
|
Chris@16
|
218 case 3:
|
Chris@16
|
219 if(BOOST_LOCALE_UNLIKELY(p==e))
|
Chris@16
|
220 return incomplete;
|
Chris@16
|
221 tmp = *p++;
|
Chris@16
|
222 if (!is_trail(tmp))
|
Chris@16
|
223 return illegal;
|
Chris@16
|
224 c = (c << 6) | ( tmp & 0x3F);
|
Chris@16
|
225 case 2:
|
Chris@16
|
226 if(BOOST_LOCALE_UNLIKELY(p==e))
|
Chris@16
|
227 return incomplete;
|
Chris@16
|
228 tmp = *p++;
|
Chris@16
|
229 if (!is_trail(tmp))
|
Chris@16
|
230 return illegal;
|
Chris@16
|
231 c = (c << 6) | ( tmp & 0x3F);
|
Chris@16
|
232 case 1:
|
Chris@16
|
233 if(BOOST_LOCALE_UNLIKELY(p==e))
|
Chris@16
|
234 return incomplete;
|
Chris@16
|
235 tmp = *p++;
|
Chris@16
|
236 if (!is_trail(tmp))
|
Chris@16
|
237 return illegal;
|
Chris@16
|
238 c = (c << 6) | ( tmp & 0x3F);
|
Chris@16
|
239 }
|
Chris@16
|
240
|
Chris@16
|
241 // Check code point validity: no surrogates and
|
Chris@16
|
242 // valid range
|
Chris@16
|
243 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
|
Chris@16
|
244 return illegal;
|
Chris@16
|
245
|
Chris@16
|
246 // make sure it is the most compact representation
|
Chris@16
|
247 if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
|
Chris@16
|
248 return illegal;
|
Chris@16
|
249
|
Chris@16
|
250 return c;
|
Chris@16
|
251
|
Chris@16
|
252 }
|
Chris@16
|
253
|
Chris@16
|
254 template<typename Iterator>
|
Chris@16
|
255 static code_point decode_valid(Iterator &p)
|
Chris@16
|
256 {
|
Chris@16
|
257 unsigned char lead = *p++;
|
Chris@16
|
258 if(lead < 192)
|
Chris@16
|
259 return lead;
|
Chris@16
|
260
|
Chris@16
|
261 int trail_size;
|
Chris@16
|
262
|
Chris@16
|
263 if(lead < 224)
|
Chris@16
|
264 trail_size = 1;
|
Chris@16
|
265 else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
|
Chris@16
|
266 trail_size = 2;
|
Chris@16
|
267 else
|
Chris@16
|
268 trail_size = 3;
|
Chris@16
|
269
|
Chris@16
|
270 code_point c = lead & ((1<<(6-trail_size))-1);
|
Chris@16
|
271
|
Chris@16
|
272 switch(trail_size) {
|
Chris@16
|
273 case 3:
|
Chris@16
|
274 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
Chris@16
|
275 case 2:
|
Chris@16
|
276 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
Chris@16
|
277 case 1:
|
Chris@16
|
278 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
|
Chris@16
|
279 }
|
Chris@16
|
280
|
Chris@16
|
281 return c;
|
Chris@16
|
282 }
|
Chris@16
|
283
|
Chris@16
|
284
|
Chris@16
|
285
|
Chris@16
|
286 template<typename Iterator>
|
Chris@16
|
287 static Iterator encode(code_point value,Iterator out)
|
Chris@16
|
288 {
|
Chris@16
|
289 if(value <= 0x7F) {
|
Chris@16
|
290 *out++ = static_cast<char_type>(value);
|
Chris@16
|
291 }
|
Chris@16
|
292 else if(value <= 0x7FF) {
|
Chris@16
|
293 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
|
Chris@16
|
294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
Chris@16
|
295 }
|
Chris@16
|
296 else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
|
Chris@16
|
297 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
|
Chris@16
|
298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
|
Chris@16
|
299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
Chris@16
|
300 }
|
Chris@16
|
301 else {
|
Chris@16
|
302 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
|
Chris@16
|
303 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
|
Chris@16
|
304 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
|
Chris@16
|
305 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
|
Chris@16
|
306 }
|
Chris@16
|
307 return out;
|
Chris@16
|
308 }
|
Chris@16
|
309 }; // utf8
|
Chris@16
|
310
|
Chris@16
|
311 template<typename CharType>
|
Chris@16
|
312 struct utf_traits<CharType,2> {
|
Chris@16
|
313 typedef CharType char_type;
|
Chris@16
|
314
|
Chris@16
|
315 // See RFC 2781
|
Chris@16
|
316 static bool is_first_surrogate(uint16_t x)
|
Chris@16
|
317 {
|
Chris@16
|
318 return 0xD800 <=x && x<= 0xDBFF;
|
Chris@16
|
319 }
|
Chris@16
|
320 static bool is_second_surrogate(uint16_t x)
|
Chris@16
|
321 {
|
Chris@16
|
322 return 0xDC00 <=x && x<= 0xDFFF;
|
Chris@16
|
323 }
|
Chris@16
|
324 static code_point combine_surrogate(uint16_t w1,uint16_t w2)
|
Chris@16
|
325 {
|
Chris@16
|
326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
|
Chris@16
|
327 }
|
Chris@16
|
328 static int trail_length(char_type c)
|
Chris@16
|
329 {
|
Chris@16
|
330 if(is_first_surrogate(c))
|
Chris@16
|
331 return 1;
|
Chris@16
|
332 if(is_second_surrogate(c))
|
Chris@16
|
333 return -1;
|
Chris@16
|
334 return 0;
|
Chris@16
|
335 }
|
Chris@16
|
336 ///
|
Chris@16
|
337 /// Returns true if c is trail code unit, always false for UTF-32
|
Chris@16
|
338 ///
|
Chris@16
|
339 static bool is_trail(char_type c)
|
Chris@16
|
340 {
|
Chris@16
|
341 return is_second_surrogate(c);
|
Chris@16
|
342 }
|
Chris@16
|
343 ///
|
Chris@16
|
344 /// Returns true if c is lead code unit, always true of UTF-32
|
Chris@16
|
345 ///
|
Chris@16
|
346 static bool is_lead(char_type c)
|
Chris@16
|
347 {
|
Chris@16
|
348 return !is_second_surrogate(c);
|
Chris@16
|
349 }
|
Chris@16
|
350
|
Chris@16
|
351 template<typename It>
|
Chris@16
|
352 static code_point decode(It ¤t,It last)
|
Chris@16
|
353 {
|
Chris@16
|
354 if(BOOST_LOCALE_UNLIKELY(current == last))
|
Chris@16
|
355 return incomplete;
|
Chris@16
|
356 uint16_t w1=*current++;
|
Chris@16
|
357 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
|
Chris@16
|
358 return w1;
|
Chris@16
|
359 }
|
Chris@16
|
360 if(w1 > 0xDBFF)
|
Chris@16
|
361 return illegal;
|
Chris@16
|
362 if(current==last)
|
Chris@16
|
363 return incomplete;
|
Chris@16
|
364 uint16_t w2=*current++;
|
Chris@16
|
365 if(w2 < 0xDC00 || 0xDFFF < w2)
|
Chris@16
|
366 return illegal;
|
Chris@16
|
367 return combine_surrogate(w1,w2);
|
Chris@16
|
368 }
|
Chris@16
|
369 template<typename It>
|
Chris@16
|
370 static code_point decode_valid(It ¤t)
|
Chris@16
|
371 {
|
Chris@16
|
372 uint16_t w1=*current++;
|
Chris@16
|
373 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
|
Chris@16
|
374 return w1;
|
Chris@16
|
375 }
|
Chris@16
|
376 uint16_t w2=*current++;
|
Chris@16
|
377 return combine_surrogate(w1,w2);
|
Chris@16
|
378 }
|
Chris@16
|
379
|
Chris@16
|
380 static const int max_width = 2;
|
Chris@16
|
381 static int width(code_point u)
|
Chris@16
|
382 {
|
Chris@16
|
383 return u>=0x10000 ? 2 : 1;
|
Chris@16
|
384 }
|
Chris@16
|
385 template<typename It>
|
Chris@16
|
386 static It encode(code_point u,It out)
|
Chris@16
|
387 {
|
Chris@16
|
388 if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
|
Chris@16
|
389 *out++ = static_cast<char_type>(u);
|
Chris@16
|
390 }
|
Chris@16
|
391 else {
|
Chris@16
|
392 u -= 0x10000;
|
Chris@16
|
393 *out++ = static_cast<char_type>(0xD800 | (u>>10));
|
Chris@16
|
394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
|
Chris@16
|
395 }
|
Chris@16
|
396 return out;
|
Chris@16
|
397 }
|
Chris@16
|
398 }; // utf16;
|
Chris@16
|
399
|
Chris@16
|
400
|
Chris@16
|
401 template<typename CharType>
|
Chris@16
|
402 struct utf_traits<CharType,4> {
|
Chris@16
|
403 typedef CharType char_type;
|
Chris@16
|
404 static int trail_length(char_type c)
|
Chris@16
|
405 {
|
Chris@16
|
406 if(is_valid_codepoint(c))
|
Chris@16
|
407 return 0;
|
Chris@16
|
408 return -1;
|
Chris@16
|
409 }
|
Chris@16
|
410 static bool is_trail(char_type /*c*/)
|
Chris@16
|
411 {
|
Chris@16
|
412 return false;
|
Chris@16
|
413 }
|
Chris@16
|
414 static bool is_lead(char_type /*c*/)
|
Chris@16
|
415 {
|
Chris@16
|
416 return true;
|
Chris@16
|
417 }
|
Chris@16
|
418
|
Chris@16
|
419 template<typename It>
|
Chris@16
|
420 static code_point decode_valid(It ¤t)
|
Chris@16
|
421 {
|
Chris@16
|
422 return *current++;
|
Chris@16
|
423 }
|
Chris@16
|
424
|
Chris@16
|
425 template<typename It>
|
Chris@16
|
426 static code_point decode(It ¤t,It last)
|
Chris@16
|
427 {
|
Chris@16
|
428 if(BOOST_LOCALE_UNLIKELY(current == last))
|
Chris@16
|
429 return boost::locale::utf::incomplete;
|
Chris@16
|
430 code_point c=*current++;
|
Chris@16
|
431 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
|
Chris@16
|
432 return boost::locale::utf::illegal;
|
Chris@16
|
433 return c;
|
Chris@16
|
434 }
|
Chris@16
|
435 static const int max_width = 1;
|
Chris@16
|
436 static int width(code_point /*u*/)
|
Chris@16
|
437 {
|
Chris@16
|
438 return 1;
|
Chris@16
|
439 }
|
Chris@16
|
440 template<typename It>
|
Chris@16
|
441 static It encode(code_point u,It out)
|
Chris@16
|
442 {
|
Chris@16
|
443 *out++ = static_cast<char_type>(u);
|
Chris@16
|
444 return out;
|
Chris@16
|
445 }
|
Chris@16
|
446
|
Chris@16
|
447 }; // utf32
|
Chris@16
|
448
|
Chris@16
|
449 #endif
|
Chris@16
|
450
|
Chris@16
|
451
|
Chris@16
|
452 } // utf
|
Chris@16
|
453 } // locale
|
Chris@16
|
454 } // boost
|
Chris@16
|
455
|
Chris@16
|
456
|
Chris@16
|
457 #endif
|
Chris@16
|
458
|
Chris@16
|
459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
|
Chris@16
|
460
|