annotate DEPENDENCIES/generic/include/boost/locale/utf.hpp @ 133:4acb5d8d80b6 tip

Don't fail environmental check if README.md exists (but .txt and no-suffix don't)
author Chris Cannam
date Tue, 30 Jul 2019 12:25:44 +0100
parents 2665513ce2d3
children
rev   line source
Chris@16 1 //
Chris@16 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
Chris@16 3 //
Chris@16 4 // Distributed under the Boost Software License, Version 1.0. (See
Chris@16 5 // accompanying file LICENSE_1_0.txt or copy at
Chris@16 6 // http://www.boost.org/LICENSE_1_0.txt)
Chris@16 7 //
Chris@16 8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
Chris@16 9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
Chris@16 10
Chris@16 11 #include <boost/cstdint.hpp>
Chris@16 12
Chris@16 13 namespace boost {
Chris@16 14 namespace locale {
Chris@16 15 ///
Chris@16 16 /// \brief Namespace that holds basic operations on UTF encoded sequences
Chris@16 17 ///
Chris@16 18 /// All functions defined in this namespace do not require linking with Boost.Locale library
Chris@16 19 ///
Chris@16 20 namespace utf {
Chris@16 21 /// \cond INTERNAL
// Branch hints used by the decoders: where __GNUC__ is defined they expand to
// __builtin_expect, everywhere else they expand to the bare expression.
#if defined(__GNUC__)
#  define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#  define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#  define BOOST_LOCALE_LIKELY(x)   (x)
#  define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
Chris@16 29 /// \endcond
Chris@16 30
///
/// \brief Integral type wide enough to hold any Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Sentinel value reported by the decoders for an invalid sequence
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Sentinel value reported by the decoders when the input ends before
/// the sequence is complete
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief Checks whether \a v is a valid code point
///
/// A value is accepted when it does not exceed 0x10FFFF and does not fall
/// in the surrogate range [0xD800, 0xDFFF].
///
inline bool is_valid_codepoint(code_point v)
{
    const bool within_unicode_range = v <= 0x10FFFF;
    const bool is_surrogate = v >= 0xD800 && v <= 0xDFFF;
    return within_unicode_range && !is_surrogate;
}
Chris@16 57
Chris@16 58 #ifdef BOOST_LOCALE_DOXYGEN
///
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
///
/// This declaration is only visible to Doxygen; the real implementations are
/// the specializations selected by the size of \a CharType.
///
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits {
    ///
    /// The type of the character
    ///
    typedef CharType char_type;
    ///
    /// Read one code point from the range [p,e) and return it.
    ///
    /// - If the sequence that was read is incomplete, returns \ref incomplete,
    /// - If an illegal sequence is detected, returns \ref illegal
    ///
    /// Requirements
    ///
    /// - Iterator is a valid input iterator
    ///
    /// Postconditions
    ///
    /// - p is advanced past the last code unit that was read
    ///
    template<typename Iterator>
    static code_point decode(Iterator &p,Iterator e);

    ///
    /// Maximal width of a valid sequence in code units:
    ///
    /// - UTF-8  - 4
    /// - UTF-16 - 2
    /// - UTF-32 - 1
    ///
    static const int max_width;
    ///
    /// The width of a specific code point in code units.
    ///
    /// Requirement: value is a valid Unicode code point
    /// Returns a value in the range [1..max_width]
    ///
    static int width(code_point value);

    ///
    /// Get the size of the trail part of a variable length encoded sequence.
    ///
    /// Returns -1 if \a c is not a valid lead character
    ///
    static int trail_length(char_type c);
    ///
    /// Returns true if c is a trail code unit, always false for UTF-32
    ///
    static bool is_trail(char_type c);
    ///
    /// Returns true if c is a lead code unit, always true for UTF-32
    ///
    static bool is_lead(char_type c);

    ///
    /// Convert a valid Unicode code point \a value to a UTF sequence.
    ///
    /// Requirements:
    ///
    /// - \a value is a valid code point
    /// - \a out is an output iterator that is able to accept at least width(value) units
    ///
    /// Returns the iterator past the last written code unit.
    ///
    template<typename Iterator>
    static Iterator encode(code_point value,Iterator out);
    ///
    /// Decodes a valid UTF sequence that is pointed to by p into a code point.
    ///
    /// If the sequence is invalid or p points to the end, the behavior is undefined
    ///
    template<typename Iterator>
    static code_point decode_valid(Iterator &p);
};
Chris@16 136
Chris@16 137 #else
Chris@16 138
// Primary template: declared but never defined. The second parameter defaults
// to sizeof(CharType), so the partial specializations for sizes 1, 2 and 4
// are picked according to the width of the character type.
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
Chris@16 141
Chris@16 142 template<typename CharType>
Chris@16 143 struct utf_traits<CharType,1> {
Chris@16 144
Chris@16 145 typedef CharType char_type;
Chris@16 146
Chris@16 147 static int trail_length(char_type ci)
Chris@16 148 {
Chris@16 149 unsigned char c = ci;
Chris@16 150 if(c < 128)
Chris@16 151 return 0;
Chris@16 152 if(BOOST_LOCALE_UNLIKELY(c < 194))
Chris@16 153 return -1;
Chris@16 154 if(c < 224)
Chris@16 155 return 1;
Chris@16 156 if(c < 240)
Chris@16 157 return 2;
Chris@16 158 if(BOOST_LOCALE_LIKELY(c <=244))
Chris@16 159 return 3;
Chris@16 160 return -1;
Chris@16 161 }
Chris@16 162
Chris@16 163 static const int max_width = 4;
Chris@16 164
Chris@16 165 static int width(code_point value)
Chris@16 166 {
Chris@16 167 if(value <=0x7F) {
Chris@16 168 return 1;
Chris@16 169 }
Chris@16 170 else if(value <=0x7FF) {
Chris@16 171 return 2;
Chris@16 172 }
Chris@16 173 else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
Chris@16 174 return 3;
Chris@16 175 }
Chris@16 176 else {
Chris@16 177 return 4;
Chris@16 178 }
Chris@16 179 }
Chris@16 180
Chris@16 181 static bool is_trail(char_type ci)
Chris@16 182 {
Chris@16 183 unsigned char c=ci;
Chris@16 184 return (c & 0xC0)==0x80;
Chris@16 185 }
Chris@16 186
Chris@16 187 static bool is_lead(char_type ci)
Chris@16 188 {
Chris@16 189 return !is_trail(ci);
Chris@16 190 }
Chris@16 191
Chris@16 192 template<typename Iterator>
Chris@16 193 static code_point decode(Iterator &p,Iterator e)
Chris@16 194 {
Chris@16 195 if(BOOST_LOCALE_UNLIKELY(p==e))
Chris@16 196 return incomplete;
Chris@16 197
Chris@16 198 unsigned char lead = *p++;
Chris@16 199
Chris@16 200 // First byte is fully validated here
Chris@16 201 int trail_size = trail_length(lead);
Chris@16 202
Chris@16 203 if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
Chris@16 204 return illegal;
Chris@16 205
Chris@16 206 //
Chris@16 207 // Ok as only ASCII may be of size = 0
Chris@16 208 // also optimize for ASCII text
Chris@16 209 //
Chris@16 210 if(trail_size == 0)
Chris@16 211 return lead;
Chris@16 212
Chris@16 213 code_point c = lead & ((1<<(6-trail_size))-1);
Chris@16 214
Chris@16 215 // Read the rest
Chris@16 216 unsigned char tmp;
Chris@16 217 switch(trail_size) {
Chris@16 218 case 3:
Chris@16 219 if(BOOST_LOCALE_UNLIKELY(p==e))
Chris@16 220 return incomplete;
Chris@16 221 tmp = *p++;
Chris@16 222 if (!is_trail(tmp))
Chris@16 223 return illegal;
Chris@16 224 c = (c << 6) | ( tmp & 0x3F);
Chris@16 225 case 2:
Chris@16 226 if(BOOST_LOCALE_UNLIKELY(p==e))
Chris@16 227 return incomplete;
Chris@16 228 tmp = *p++;
Chris@16 229 if (!is_trail(tmp))
Chris@16 230 return illegal;
Chris@16 231 c = (c << 6) | ( tmp & 0x3F);
Chris@16 232 case 1:
Chris@16 233 if(BOOST_LOCALE_UNLIKELY(p==e))
Chris@16 234 return incomplete;
Chris@16 235 tmp = *p++;
Chris@16 236 if (!is_trail(tmp))
Chris@16 237 return illegal;
Chris@16 238 c = (c << 6) | ( tmp & 0x3F);
Chris@16 239 }
Chris@16 240
Chris@16 241 // Check code point validity: no surrogates and
Chris@16 242 // valid range
Chris@16 243 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
Chris@16 244 return illegal;
Chris@16 245
Chris@16 246 // make sure it is the most compact representation
Chris@16 247 if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
Chris@16 248 return illegal;
Chris@16 249
Chris@16 250 return c;
Chris@16 251
Chris@16 252 }
Chris@16 253
Chris@16 254 template<typename Iterator>
Chris@16 255 static code_point decode_valid(Iterator &p)
Chris@16 256 {
Chris@16 257 unsigned char lead = *p++;
Chris@16 258 if(lead < 192)
Chris@16 259 return lead;
Chris@16 260
Chris@16 261 int trail_size;
Chris@16 262
Chris@16 263 if(lead < 224)
Chris@16 264 trail_size = 1;
Chris@16 265 else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
Chris@16 266 trail_size = 2;
Chris@16 267 else
Chris@16 268 trail_size = 3;
Chris@16 269
Chris@16 270 code_point c = lead & ((1<<(6-trail_size))-1);
Chris@16 271
Chris@16 272 switch(trail_size) {
Chris@16 273 case 3:
Chris@16 274 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
Chris@16 275 case 2:
Chris@16 276 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
Chris@16 277 case 1:
Chris@16 278 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
Chris@16 279 }
Chris@16 280
Chris@16 281 return c;
Chris@16 282 }
Chris@16 283
Chris@16 284
Chris@16 285
Chris@16 286 template<typename Iterator>
Chris@16 287 static Iterator encode(code_point value,Iterator out)
Chris@16 288 {
Chris@16 289 if(value <= 0x7F) {
Chris@16 290 *out++ = static_cast<char_type>(value);
Chris@16 291 }
Chris@16 292 else if(value <= 0x7FF) {
Chris@16 293 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
Chris@16 294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
Chris@16 295 }
Chris@16 296 else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
Chris@16 297 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
Chris@16 298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
Chris@16 299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
Chris@16 300 }
Chris@16 301 else {
Chris@16 302 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
Chris@16 303 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
Chris@16 304 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
Chris@16 305 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
Chris@16 306 }
Chris@16 307 return out;
Chris@16 308 }
Chris@16 309 }; // utf8
Chris@16 310
Chris@16 311 template<typename CharType>
Chris@16 312 struct utf_traits<CharType,2> {
Chris@16 313 typedef CharType char_type;
Chris@16 314
Chris@16 315 // See RFC 2781
Chris@16 316 static bool is_first_surrogate(uint16_t x)
Chris@16 317 {
Chris@16 318 return 0xD800 <=x && x<= 0xDBFF;
Chris@16 319 }
Chris@16 320 static bool is_second_surrogate(uint16_t x)
Chris@16 321 {
Chris@16 322 return 0xDC00 <=x && x<= 0xDFFF;
Chris@16 323 }
Chris@16 324 static code_point combine_surrogate(uint16_t w1,uint16_t w2)
Chris@16 325 {
Chris@16 326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
Chris@16 327 }
Chris@16 328 static int trail_length(char_type c)
Chris@16 329 {
Chris@16 330 if(is_first_surrogate(c))
Chris@16 331 return 1;
Chris@16 332 if(is_second_surrogate(c))
Chris@16 333 return -1;
Chris@16 334 return 0;
Chris@16 335 }
Chris@16 336 ///
Chris@16 337 /// Returns true if c is trail code unit, always false for UTF-32
Chris@16 338 ///
Chris@16 339 static bool is_trail(char_type c)
Chris@16 340 {
Chris@16 341 return is_second_surrogate(c);
Chris@16 342 }
Chris@16 343 ///
Chris@16 344 /// Returns true if c is lead code unit, always true of UTF-32
Chris@16 345 ///
Chris@16 346 static bool is_lead(char_type c)
Chris@16 347 {
Chris@16 348 return !is_second_surrogate(c);
Chris@16 349 }
Chris@16 350
Chris@16 351 template<typename It>
Chris@16 352 static code_point decode(It &current,It last)
Chris@16 353 {
Chris@16 354 if(BOOST_LOCALE_UNLIKELY(current == last))
Chris@16 355 return incomplete;
Chris@16 356 uint16_t w1=*current++;
Chris@16 357 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
Chris@16 358 return w1;
Chris@16 359 }
Chris@16 360 if(w1 > 0xDBFF)
Chris@16 361 return illegal;
Chris@16 362 if(current==last)
Chris@16 363 return incomplete;
Chris@16 364 uint16_t w2=*current++;
Chris@16 365 if(w2 < 0xDC00 || 0xDFFF < w2)
Chris@16 366 return illegal;
Chris@16 367 return combine_surrogate(w1,w2);
Chris@16 368 }
Chris@16 369 template<typename It>
Chris@16 370 static code_point decode_valid(It &current)
Chris@16 371 {
Chris@16 372 uint16_t w1=*current++;
Chris@16 373 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
Chris@16 374 return w1;
Chris@16 375 }
Chris@16 376 uint16_t w2=*current++;
Chris@16 377 return combine_surrogate(w1,w2);
Chris@16 378 }
Chris@16 379
Chris@16 380 static const int max_width = 2;
Chris@16 381 static int width(code_point u)
Chris@16 382 {
Chris@16 383 return u>=0x10000 ? 2 : 1;
Chris@16 384 }
Chris@16 385 template<typename It>
Chris@16 386 static It encode(code_point u,It out)
Chris@16 387 {
Chris@16 388 if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
Chris@16 389 *out++ = static_cast<char_type>(u);
Chris@16 390 }
Chris@16 391 else {
Chris@16 392 u -= 0x10000;
Chris@16 393 *out++ = static_cast<char_type>(0xD800 | (u>>10));
Chris@16 394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
Chris@16 395 }
Chris@16 396 return out;
Chris@16 397 }
Chris@16 398 }; // utf16;
Chris@16 399
Chris@16 400
Chris@16 401 template<typename CharType>
Chris@16 402 struct utf_traits<CharType,4> {
Chris@16 403 typedef CharType char_type;
Chris@16 404 static int trail_length(char_type c)
Chris@16 405 {
Chris@16 406 if(is_valid_codepoint(c))
Chris@16 407 return 0;
Chris@16 408 return -1;
Chris@16 409 }
Chris@16 410 static bool is_trail(char_type /*c*/)
Chris@16 411 {
Chris@16 412 return false;
Chris@16 413 }
Chris@16 414 static bool is_lead(char_type /*c*/)
Chris@16 415 {
Chris@16 416 return true;
Chris@16 417 }
Chris@16 418
Chris@16 419 template<typename It>
Chris@16 420 static code_point decode_valid(It &current)
Chris@16 421 {
Chris@16 422 return *current++;
Chris@16 423 }
Chris@16 424
Chris@16 425 template<typename It>
Chris@16 426 static code_point decode(It &current,It last)
Chris@16 427 {
Chris@16 428 if(BOOST_LOCALE_UNLIKELY(current == last))
Chris@16 429 return boost::locale::utf::incomplete;
Chris@16 430 code_point c=*current++;
Chris@16 431 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
Chris@16 432 return boost::locale::utf::illegal;
Chris@16 433 return c;
Chris@16 434 }
Chris@16 435 static const int max_width = 1;
Chris@16 436 static int width(code_point /*u*/)
Chris@16 437 {
Chris@16 438 return 1;
Chris@16 439 }
Chris@16 440 template<typename It>
Chris@16 441 static It encode(code_point u,It out)
Chris@16 442 {
Chris@16 443 *out++ = static_cast<char_type>(u);
Chris@16 444 return out;
Chris@16 445 }
Chris@16 446
Chris@16 447 }; // utf32
Chris@16 448
Chris@16 449 #endif
Chris@16 450
Chris@16 451
Chris@16 452 } // utf
Chris@16 453 } // locale
Chris@16 454 } // boost
Chris@16 455
Chris@16 456
Chris@16 457 #endif
Chris@16 458
Chris@16 459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
Chris@16 460