annotate DEPENDENCIES/generic/include/boost/regex/pending/unicode_iterator.hpp @ 125:34e428693f5d vext

Vext -> Repoint
author Chris Cannam
date Thu, 14 Jun 2018 11:15:39 +0100
parents c530137014c0
children
rev   line source
Chris@16 1 /*
Chris@16 2 *
Chris@16 3 * Copyright (c) 2004
Chris@16 4 * John Maddock
Chris@16 5 *
Chris@16 6 * Use, modification and distribution are subject to the
Chris@16 7 * Boost Software License, Version 1.0. (See accompanying file
Chris@16 8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
Chris@16 9 *
Chris@16 10 */
Chris@16 11
Chris@16 12 /*
Chris@16 13 * LOCATION: see http://www.boost.org for most recent version.
Chris@16 14 * FILE unicode_iterator.hpp
Chris@16 15 * VERSION see <boost/version.hpp>
Chris@16 16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
Chris@16 17 */
Chris@16 18
Chris@16 19 /****************************************************************************
Chris@16 20
Chris@16 21 Contents:
Chris@16 22 ~~~~~~~~~
Chris@16 23
Chris@16 24 1) Read Only, Input Adapters:
Chris@16 25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Chris@16 26
Chris@16 27 template <class BaseIterator, class U8Type = ::boost::uint8_t>
Chris@16 28 class u32_to_u8_iterator;
Chris@16 29
Chris@16 30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
Chris@16 31
Chris@16 32 template <class BaseIterator, class U32Type = ::boost::uint32_t>
Chris@16 33 class u8_to_u32_iterator;
Chris@16 34
Chris@16 35 Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
Chris@16 36
Chris@16 37 template <class BaseIterator, class U16Type = ::boost::uint16_t>
Chris@16 38 class u32_to_u16_iterator;
Chris@16 39
Chris@16 40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
Chris@16 41
Chris@16 42 template <class BaseIterator, class U32Type = ::boost::uint32_t>
Chris@16 43 class u16_to_u32_iterator;
Chris@16 44
Chris@16 45 Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
Chris@16 46
Chris@16 47 2) Single pass output iterator adapters:
Chris@16 48
Chris@16 49 template <class BaseIterator>
Chris@16 50 class utf8_output_iterator;
Chris@16 51
Chris@16 52 Accepts UTF-32 code points and forwards them on as UTF-8 code points.
Chris@16 53
Chris@16 54 template <class BaseIterator>
Chris@16 55 class utf16_output_iterator;
Chris@16 56
Chris@16 57 Accepts UTF-32 code points and forwards them on as UTF-16 code points.
Chris@16 58
Chris@16 59 ****************************************************************************/
Chris@16 60
Chris@16 61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
Chris@16 62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
Chris@16 63 #include <boost/cstdint.hpp>
Chris@16 64 #include <boost/assert.hpp>
Chris@16 65 #include <boost/iterator/iterator_facade.hpp>
Chris@16 66 #include <boost/static_assert.hpp>
Chris@16 67 #include <boost/throw_exception.hpp>
Chris@16 68 #include <stdexcept>
Chris@16 69 #ifndef BOOST_NO_STD_LOCALE
Chris@16 70 #include <sstream>
Chris@16 71 #include <ios>
Chris@16 72 #endif
Chris@16 73 #include <limits.h> // CHAR_BIT
Chris@16 74
Chris@16 75 namespace boost{
Chris@16 76
Chris@16 77 namespace detail{
Chris@16 78
Chris@16 79 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
Chris@16 80 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
Chris@16 81 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
Chris@16 82
Chris@16 83 inline bool is_high_surrogate(::boost::uint16_t v)
Chris@16 84 {
Chris@16 85 return (v & 0xFFFFFC00u) == 0xd800u;
Chris@16 86 }
Chris@16 87 inline bool is_low_surrogate(::boost::uint16_t v)
Chris@16 88 {
Chris@16 89 return (v & 0xFFFFFC00u) == 0xdc00u;
Chris@16 90 }
// Returns true when v is any surrogate (high or low): all bits above the
// low eleven must match the pattern 0xD800.  The mask form is kept (rather
// than a range comparison) because T is whatever integer type the caller
// supplies.
template <class T>
inline bool is_surrogate(T v)
{
   const bool in_surrogate_block = ((v & 0xFFFFF800u) == 0xd800);
   return in_surrogate_block;
}
Chris@16 96
Chris@16 97 inline unsigned utf8_byte_count(boost::uint8_t c)
Chris@16 98 {
Chris@16 99 // if the most significant bit with a zero in it is in position
Chris@16 100 // 8-N then there are N bytes in this UTF-8 sequence:
Chris@16 101 boost::uint8_t mask = 0x80u;
Chris@16 102 unsigned result = 0;
Chris@16 103 while(c & mask)
Chris@16 104 {
Chris@16 105 ++result;
Chris@16 106 mask >>= 1;
Chris@16 107 }
Chris@16 108 return (result == 0) ? 1 : ((result > 4) ? 4 : result);
Chris@16 109 }
Chris@16 110
Chris@16 111 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
Chris@16 112 {
Chris@16 113 return utf8_byte_count(c) - 1;
Chris@16 114 }
Chris@16 115
Chris@16 116 #ifdef BOOST_MSVC
Chris@16 117 #pragma warning(push)
Chris@16 118 #pragma warning(disable:4100)
Chris@16 119 #endif
Chris@16 120 inline void invalid_utf32_code_point(::boost::uint32_t val)
Chris@16 121 {
Chris@16 122 #ifndef BOOST_NO_STD_LOCALE
Chris@16 123 std::stringstream ss;
Chris@16 124 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
Chris@16 125 std::out_of_range e(ss.str());
Chris@16 126 #else
Chris@16 127 std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
Chris@16 128 #endif
Chris@16 129 boost::throw_exception(e);
Chris@16 130 }
Chris@16 131 #ifdef BOOST_MSVC
Chris@16 132 #pragma warning(pop)
Chris@16 133 #endif
Chris@16 134
Chris@16 135
Chris@16 136 } // namespace detail
Chris@16 137
Chris@16 138 template <class BaseIterator, class U16Type = ::boost::uint16_t>
Chris@16 139 class u32_to_u16_iterator
Chris@16 140 : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
Chris@16 141 {
Chris@16 142 typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
Chris@16 143
Chris@101 144 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
Chris@16 145 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
Chris@16 146
Chris@16 147 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
Chris@16 148 BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
Chris@16 149 #endif
Chris@16 150
Chris@16 151 public:
Chris@16 152 typename base_type::reference
Chris@16 153 dereference()const
Chris@16 154 {
Chris@16 155 if(m_current == 2)
Chris@16 156 extract_current();
Chris@16 157 return m_values[m_current];
Chris@16 158 }
Chris@16 159 bool equal(const u32_to_u16_iterator& that)const
Chris@16 160 {
Chris@16 161 if(m_position == that.m_position)
Chris@16 162 {
Chris@16 163 // Both m_currents must be equal, or both even
Chris@16 164 // this is the same as saying their sum must be even:
Chris@16 165 return (m_current + that.m_current) & 1u ? false : true;
Chris@16 166 }
Chris@16 167 return false;
Chris@16 168 }
Chris@16 169 void increment()
Chris@16 170 {
Chris@16 171 // if we have a pending read then read now, so that we know whether
Chris@16 172 // to skip a position, or move to a low-surrogate:
Chris@16 173 if(m_current == 2)
Chris@16 174 {
Chris@16 175 // pending read:
Chris@16 176 extract_current();
Chris@16 177 }
Chris@16 178 // move to the next surrogate position:
Chris@16 179 ++m_current;
Chris@16 180 // if we've reached the end skip a position:
Chris@16 181 if(m_values[m_current] == 0)
Chris@16 182 {
Chris@16 183 m_current = 2;
Chris@16 184 ++m_position;
Chris@16 185 }
Chris@16 186 }
Chris@16 187 void decrement()
Chris@16 188 {
Chris@16 189 if(m_current != 1)
Chris@16 190 {
Chris@16 191 // decrementing an iterator always leads to a valid position:
Chris@16 192 --m_position;
Chris@16 193 extract_current();
Chris@16 194 m_current = m_values[1] ? 1 : 0;
Chris@16 195 }
Chris@16 196 else
Chris@16 197 {
Chris@16 198 m_current = 0;
Chris@16 199 }
Chris@16 200 }
Chris@16 201 BaseIterator base()const
Chris@16 202 {
Chris@16 203 return m_position;
Chris@16 204 }
Chris@16 205 // construct:
Chris@16 206 u32_to_u16_iterator() : m_position(), m_current(0)
Chris@16 207 {
Chris@16 208 m_values[0] = 0;
Chris@16 209 m_values[1] = 0;
Chris@16 210 m_values[2] = 0;
Chris@16 211 }
Chris@16 212 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
Chris@16 213 {
Chris@16 214 m_values[0] = 0;
Chris@16 215 m_values[1] = 0;
Chris@16 216 m_values[2] = 0;
Chris@16 217 }
Chris@16 218 private:
Chris@16 219
Chris@16 220 void extract_current()const
Chris@16 221 {
Chris@16 222 // begin by checking for a code point out of range:
Chris@16 223 ::boost::uint32_t v = *m_position;
Chris@16 224 if(v >= 0x10000u)
Chris@16 225 {
Chris@16 226 if(v > 0x10FFFFu)
Chris@16 227 detail::invalid_utf32_code_point(*m_position);
Chris@16 228 // split into two surrogates:
Chris@16 229 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
Chris@16 230 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
Chris@16 231 m_current = 0;
Chris@16 232 BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
Chris@16 233 BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
Chris@16 234 }
Chris@16 235 else
Chris@16 236 {
Chris@16 237 // 16-bit code point:
Chris@16 238 m_values[0] = static_cast<U16Type>(*m_position);
Chris@16 239 m_values[1] = 0;
Chris@16 240 m_current = 0;
Chris@16 241 // value must not be a surrogate:
Chris@16 242 if(detail::is_surrogate(m_values[0]))
Chris@16 243 detail::invalid_utf32_code_point(*m_position);
Chris@16 244 }
Chris@16 245 }
Chris@16 246 BaseIterator m_position;
Chris@16 247 mutable U16Type m_values[3];
Chris@16 248 mutable unsigned m_current;
Chris@16 249 };
Chris@16 250
Chris@16 251 template <class BaseIterator, class U32Type = ::boost::uint32_t>
Chris@16 252 class u16_to_u32_iterator
Chris@16 253 : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
Chris@16 254 {
Chris@16 255 typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
Chris@16 256 // special values for pending iterator reads:
Chris@16 257 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
Chris@16 258
Chris@101 259 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
Chris@16 260 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
Chris@16 261
Chris@16 262 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
Chris@16 263 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
Chris@16 264 #endif
Chris@16 265
Chris@16 266 public:
Chris@16 267 typename base_type::reference
Chris@16 268 dereference()const
Chris@16 269 {
Chris@16 270 if(m_value == pending_read)
Chris@16 271 extract_current();
Chris@16 272 return m_value;
Chris@16 273 }
Chris@16 274 bool equal(const u16_to_u32_iterator& that)const
Chris@16 275 {
Chris@16 276 return m_position == that.m_position;
Chris@16 277 }
Chris@16 278 void increment()
Chris@16 279 {
Chris@16 280 // skip high surrogate first if there is one:
Chris@16 281 if(detail::is_high_surrogate(*m_position)) ++m_position;
Chris@16 282 ++m_position;
Chris@16 283 m_value = pending_read;
Chris@16 284 }
Chris@16 285 void decrement()
Chris@16 286 {
Chris@16 287 --m_position;
Chris@16 288 // if we have a low surrogate then go back one more:
Chris@16 289 if(detail::is_low_surrogate(*m_position))
Chris@16 290 --m_position;
Chris@16 291 m_value = pending_read;
Chris@16 292 }
Chris@16 293 BaseIterator base()const
Chris@16 294 {
Chris@16 295 return m_position;
Chris@16 296 }
Chris@16 297 // construct:
Chris@16 298 u16_to_u32_iterator() : m_position()
Chris@16 299 {
Chris@16 300 m_value = pending_read;
Chris@16 301 }
Chris@16 302 u16_to_u32_iterator(BaseIterator b) : m_position(b)
Chris@16 303 {
Chris@16 304 m_value = pending_read;
Chris@16 305 }
Chris@16 306 //
Chris@16 307 // Range checked version:
Chris@16 308 //
Chris@16 309 u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
Chris@16 310 {
Chris@16 311 m_value = pending_read;
Chris@16 312 //
Chris@16 313 // The range must not start with a low surrogate, or end in a high surrogate,
Chris@16 314 // otherwise we run the risk of running outside the underlying input range.
Chris@16 315 // Likewise b must not be located at a low surrogate.
Chris@16 316 //
Chris@16 317 boost::uint16_t val;
Chris@16 318 if(start != end)
Chris@16 319 {
Chris@16 320 if((b != start) && (b != end))
Chris@16 321 {
Chris@16 322 val = *b;
Chris@16 323 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
Chris@16 324 invalid_code_point(val);
Chris@16 325 }
Chris@16 326 val = *start;
Chris@16 327 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
Chris@16 328 invalid_code_point(val);
Chris@16 329 val = *--end;
Chris@16 330 if(detail::is_high_surrogate(val))
Chris@16 331 invalid_code_point(val);
Chris@16 332 }
Chris@16 333 }
Chris@16 334 private:
Chris@16 335 static void invalid_code_point(::boost::uint16_t val)
Chris@16 336 {
Chris@16 337 #ifndef BOOST_NO_STD_LOCALE
Chris@16 338 std::stringstream ss;
Chris@16 339 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
Chris@16 340 std::out_of_range e(ss.str());
Chris@16 341 #else
Chris@16 342 std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
Chris@16 343 #endif
Chris@16 344 boost::throw_exception(e);
Chris@16 345 }
Chris@16 346 void extract_current()const
Chris@16 347 {
Chris@16 348 m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
Chris@16 349 // if the last value is a high surrogate then adjust m_position and m_value as needed:
Chris@16 350 if(detail::is_high_surrogate(*m_position))
Chris@16 351 {
Chris@16 352 // precondition; next value must have be a low-surrogate:
Chris@16 353 BaseIterator next(m_position);
Chris@16 354 ::boost::uint16_t t = *++next;
Chris@16 355 if((t & 0xFC00u) != 0xDC00u)
Chris@16 356 invalid_code_point(t);
Chris@16 357 m_value = (m_value - detail::high_surrogate_base) << 10;
Chris@16 358 m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
Chris@16 359 }
Chris@16 360 // postcondition; result must not be a surrogate:
Chris@16 361 if(detail::is_surrogate(m_value))
Chris@16 362 invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
Chris@16 363 }
Chris@16 364 BaseIterator m_position;
Chris@16 365 mutable U32Type m_value;
Chris@16 366 };
Chris@16 367
Chris@16 368 template <class BaseIterator, class U8Type = ::boost::uint8_t>
Chris@16 369 class u32_to_u8_iterator
Chris@16 370 : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
Chris@16 371 {
Chris@16 372 typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
Chris@16 373
Chris@101 374 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
Chris@16 375 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
Chris@16 376
Chris@16 377 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
Chris@16 378 BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
Chris@16 379 #endif
Chris@16 380
Chris@16 381 public:
Chris@16 382 typename base_type::reference
Chris@16 383 dereference()const
Chris@16 384 {
Chris@16 385 if(m_current == 4)
Chris@16 386 extract_current();
Chris@16 387 return m_values[m_current];
Chris@16 388 }
Chris@16 389 bool equal(const u32_to_u8_iterator& that)const
Chris@16 390 {
Chris@16 391 if(m_position == that.m_position)
Chris@16 392 {
Chris@16 393 // either the m_current's must be equal, or one must be 0 and
Chris@16 394 // the other 4: which means neither must have bits 1 or 2 set:
Chris@16 395 return (m_current == that.m_current)
Chris@16 396 || (((m_current | that.m_current) & 3) == 0);
Chris@16 397 }
Chris@16 398 return false;
Chris@16 399 }
Chris@16 400 void increment()
Chris@16 401 {
Chris@16 402 // if we have a pending read then read now, so that we know whether
Chris@16 403 // to skip a position, or move to a low-surrogate:
Chris@16 404 if(m_current == 4)
Chris@16 405 {
Chris@16 406 // pending read:
Chris@16 407 extract_current();
Chris@16 408 }
Chris@16 409 // move to the next surrogate position:
Chris@16 410 ++m_current;
Chris@16 411 // if we've reached the end skip a position:
Chris@16 412 if(m_values[m_current] == 0)
Chris@16 413 {
Chris@16 414 m_current = 4;
Chris@16 415 ++m_position;
Chris@16 416 }
Chris@16 417 }
Chris@16 418 void decrement()
Chris@16 419 {
Chris@16 420 if((m_current & 3) == 0)
Chris@16 421 {
Chris@16 422 --m_position;
Chris@16 423 extract_current();
Chris@16 424 m_current = 3;
Chris@16 425 while(m_current && (m_values[m_current] == 0))
Chris@16 426 --m_current;
Chris@16 427 }
Chris@16 428 else
Chris@16 429 --m_current;
Chris@16 430 }
Chris@16 431 BaseIterator base()const
Chris@16 432 {
Chris@16 433 return m_position;
Chris@16 434 }
Chris@16 435 // construct:
Chris@16 436 u32_to_u8_iterator() : m_position(), m_current(0)
Chris@16 437 {
Chris@16 438 m_values[0] = 0;
Chris@16 439 m_values[1] = 0;
Chris@16 440 m_values[2] = 0;
Chris@16 441 m_values[3] = 0;
Chris@16 442 m_values[4] = 0;
Chris@16 443 }
Chris@16 444 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
Chris@16 445 {
Chris@16 446 m_values[0] = 0;
Chris@16 447 m_values[1] = 0;
Chris@16 448 m_values[2] = 0;
Chris@16 449 m_values[3] = 0;
Chris@16 450 m_values[4] = 0;
Chris@16 451 }
Chris@16 452 private:
Chris@16 453
Chris@16 454 void extract_current()const
Chris@16 455 {
Chris@16 456 boost::uint32_t c = *m_position;
Chris@16 457 if(c > 0x10FFFFu)
Chris@16 458 detail::invalid_utf32_code_point(c);
Chris@16 459 if(c < 0x80u)
Chris@16 460 {
Chris@16 461 m_values[0] = static_cast<unsigned char>(c);
Chris@16 462 m_values[1] = static_cast<unsigned char>(0u);
Chris@16 463 m_values[2] = static_cast<unsigned char>(0u);
Chris@16 464 m_values[3] = static_cast<unsigned char>(0u);
Chris@16 465 }
Chris@16 466 else if(c < 0x800u)
Chris@16 467 {
Chris@16 468 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
Chris@16 469 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 470 m_values[2] = static_cast<unsigned char>(0u);
Chris@16 471 m_values[3] = static_cast<unsigned char>(0u);
Chris@16 472 }
Chris@16 473 else if(c < 0x10000u)
Chris@16 474 {
Chris@16 475 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
Chris@16 476 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
Chris@16 477 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 478 m_values[3] = static_cast<unsigned char>(0u);
Chris@16 479 }
Chris@16 480 else
Chris@16 481 {
Chris@16 482 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
Chris@16 483 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
Chris@16 484 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
Chris@16 485 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 486 }
Chris@16 487 m_current= 0;
Chris@16 488 }
Chris@16 489 BaseIterator m_position;
Chris@16 490 mutable U8Type m_values[5];
Chris@16 491 mutable unsigned m_current;
Chris@16 492 };
Chris@16 493
Chris@16 494 template <class BaseIterator, class U32Type = ::boost::uint32_t>
Chris@16 495 class u8_to_u32_iterator
Chris@16 496 : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
Chris@16 497 {
Chris@16 498 typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
Chris@16 499 // special values for pending iterator reads:
Chris@16 500 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
Chris@16 501
Chris@101 502 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
Chris@16 503 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
Chris@16 504
Chris@16 505 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
Chris@16 506 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
Chris@16 507 #endif
Chris@16 508
Chris@16 509 public:
Chris@16 510 typename base_type::reference
Chris@16 511 dereference()const
Chris@16 512 {
Chris@16 513 if(m_value == pending_read)
Chris@16 514 extract_current();
Chris@16 515 return m_value;
Chris@16 516 }
Chris@16 517 bool equal(const u8_to_u32_iterator& that)const
Chris@16 518 {
Chris@16 519 return m_position == that.m_position;
Chris@16 520 }
Chris@16 521 void increment()
Chris@16 522 {
Chris@16 523 // We must not start with a continuation character:
Chris@16 524 if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
Chris@16 525 invalid_sequence();
Chris@16 526 // skip high surrogate first if there is one:
Chris@16 527 unsigned c = detail::utf8_byte_count(*m_position);
Chris@16 528 if(m_value == pending_read)
Chris@16 529 {
Chris@16 530 // Since we haven't read in a value, we need to validate the code points:
Chris@16 531 for(unsigned i = 0; i < c; ++i)
Chris@16 532 {
Chris@16 533 ++m_position;
Chris@16 534 // We must have a continuation byte:
Chris@16 535 if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
Chris@16 536 invalid_sequence();
Chris@16 537 }
Chris@16 538 }
Chris@16 539 else
Chris@16 540 {
Chris@16 541 std::advance(m_position, c);
Chris@16 542 }
Chris@16 543 m_value = pending_read;
Chris@16 544 }
Chris@16 545 void decrement()
Chris@16 546 {
Chris@16 547 // Keep backtracking until we don't have a trailing character:
Chris@16 548 unsigned count = 0;
Chris@16 549 while((*--m_position & 0xC0u) == 0x80u) ++count;
Chris@16 550 // now check that the sequence was valid:
Chris@16 551 if(count != detail::utf8_trailing_byte_count(*m_position))
Chris@16 552 invalid_sequence();
Chris@16 553 m_value = pending_read;
Chris@16 554 }
Chris@16 555 BaseIterator base()const
Chris@16 556 {
Chris@16 557 return m_position;
Chris@16 558 }
Chris@16 559 // construct:
Chris@16 560 u8_to_u32_iterator() : m_position()
Chris@16 561 {
Chris@16 562 m_value = pending_read;
Chris@16 563 }
Chris@16 564 u8_to_u32_iterator(BaseIterator b) : m_position(b)
Chris@16 565 {
Chris@16 566 m_value = pending_read;
Chris@16 567 }
Chris@16 568 //
Chris@16 569 // Checked constructor:
Chris@16 570 //
Chris@16 571 u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
Chris@16 572 {
Chris@16 573 m_value = pending_read;
Chris@16 574 //
Chris@16 575 // We must not start with a continuation character, or end with a
Chris@16 576 // truncated UTF-8 sequence otherwise we run the risk of going past
Chris@16 577 // the start/end of the underlying sequence:
Chris@16 578 //
Chris@16 579 if(start != end)
Chris@16 580 {
Chris@16 581 unsigned char v = *start;
Chris@16 582 if((v & 0xC0u) == 0x80u)
Chris@16 583 invalid_sequence();
Chris@16 584 if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
Chris@16 585 invalid_sequence();
Chris@16 586 BaseIterator pos = end;
Chris@16 587 do
Chris@16 588 {
Chris@16 589 v = *--pos;
Chris@16 590 }
Chris@16 591 while((start != pos) && ((v & 0xC0u) == 0x80u));
Chris@16 592 std::ptrdiff_t extra = detail::utf8_byte_count(v);
Chris@16 593 if(std::distance(pos, end) < extra)
Chris@16 594 invalid_sequence();
Chris@16 595 }
Chris@16 596 }
Chris@16 597 private:
Chris@16 598 static void invalid_sequence()
Chris@16 599 {
Chris@16 600 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
Chris@16 601 boost::throw_exception(e);
Chris@16 602 }
Chris@16 603 void extract_current()const
Chris@16 604 {
Chris@16 605 m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
Chris@16 606 // we must not have a continuation character:
Chris@16 607 if((m_value & 0xC0u) == 0x80u)
Chris@16 608 invalid_sequence();
Chris@16 609 // see how many extra bytes we have:
Chris@16 610 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
Chris@16 611 // extract the extra bits, 6 from each extra byte:
Chris@16 612 BaseIterator next(m_position);
Chris@16 613 for(unsigned c = 0; c < extra; ++c)
Chris@16 614 {
Chris@16 615 ++next;
Chris@16 616 m_value <<= 6;
Chris@16 617 // We must have a continuation byte:
Chris@16 618 if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
Chris@16 619 invalid_sequence();
Chris@16 620 m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
Chris@16 621 }
Chris@16 622 // we now need to remove a few of the leftmost bits, but how many depends
Chris@16 623 // upon how many extra bytes we've extracted:
Chris@16 624 static const boost::uint32_t masks[4] =
Chris@16 625 {
Chris@16 626 0x7Fu,
Chris@16 627 0x7FFu,
Chris@16 628 0xFFFFu,
Chris@16 629 0x1FFFFFu,
Chris@16 630 };
Chris@16 631 m_value &= masks[extra];
Chris@101 632 // check the result is in range:
Chris@16 633 if(m_value > static_cast<U32Type>(0x10FFFFu))
Chris@16 634 invalid_sequence();
Chris@101 635 // The result must not be a surrogate:
Chris@101 636 if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
Chris@101 637 invalid_sequence();
Chris@101 638 // We should not have had an invalidly encoded UTF8 sequence:
Chris@101 639 if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
Chris@101 640 invalid_sequence();
Chris@16 641 }
Chris@16 642 BaseIterator m_position;
Chris@16 643 mutable U32Type m_value;
Chris@16 644 };
Chris@16 645
Chris@16 646 template <class BaseIterator>
Chris@16 647 class utf16_output_iterator
Chris@16 648 {
Chris@16 649 public:
Chris@16 650 typedef void difference_type;
Chris@16 651 typedef void value_type;
Chris@16 652 typedef boost::uint32_t* pointer;
Chris@16 653 typedef boost::uint32_t& reference;
Chris@16 654 typedef std::output_iterator_tag iterator_category;
Chris@16 655
Chris@16 656 utf16_output_iterator(const BaseIterator& b)
Chris@16 657 : m_position(b){}
Chris@16 658 utf16_output_iterator(const utf16_output_iterator& that)
Chris@16 659 : m_position(that.m_position){}
Chris@16 660 utf16_output_iterator& operator=(const utf16_output_iterator& that)
Chris@16 661 {
Chris@16 662 m_position = that.m_position;
Chris@16 663 return *this;
Chris@16 664 }
Chris@16 665 const utf16_output_iterator& operator*()const
Chris@16 666 {
Chris@16 667 return *this;
Chris@16 668 }
Chris@16 669 void operator=(boost::uint32_t val)const
Chris@16 670 {
Chris@16 671 push(val);
Chris@16 672 }
Chris@16 673 utf16_output_iterator& operator++()
Chris@16 674 {
Chris@16 675 return *this;
Chris@16 676 }
Chris@16 677 utf16_output_iterator& operator++(int)
Chris@16 678 {
Chris@16 679 return *this;
Chris@16 680 }
Chris@16 681 BaseIterator base()const
Chris@16 682 {
Chris@16 683 return m_position;
Chris@16 684 }
Chris@16 685 private:
Chris@16 686 void push(boost::uint32_t v)const
Chris@16 687 {
Chris@16 688 if(v >= 0x10000u)
Chris@16 689 {
Chris@16 690 // begin by checking for a code point out of range:
Chris@16 691 if(v > 0x10FFFFu)
Chris@16 692 detail::invalid_utf32_code_point(v);
Chris@16 693 // split into two surrogates:
Chris@16 694 *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
Chris@16 695 *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
Chris@16 696 }
Chris@16 697 else
Chris@16 698 {
Chris@16 699 // 16-bit code point:
Chris@16 700 // value must not be a surrogate:
Chris@16 701 if(detail::is_surrogate(v))
Chris@16 702 detail::invalid_utf32_code_point(v);
Chris@16 703 *m_position++ = static_cast<boost::uint16_t>(v);
Chris@16 704 }
Chris@16 705 }
Chris@16 706 mutable BaseIterator m_position;
Chris@16 707 };
Chris@16 708
Chris@16 709 template <class BaseIterator>
Chris@16 710 class utf8_output_iterator
Chris@16 711 {
Chris@16 712 public:
Chris@16 713 typedef void difference_type;
Chris@16 714 typedef void value_type;
Chris@16 715 typedef boost::uint32_t* pointer;
Chris@16 716 typedef boost::uint32_t& reference;
Chris@16 717 typedef std::output_iterator_tag iterator_category;
Chris@16 718
Chris@16 719 utf8_output_iterator(const BaseIterator& b)
Chris@16 720 : m_position(b){}
Chris@16 721 utf8_output_iterator(const utf8_output_iterator& that)
Chris@16 722 : m_position(that.m_position){}
Chris@16 723 utf8_output_iterator& operator=(const utf8_output_iterator& that)
Chris@16 724 {
Chris@16 725 m_position = that.m_position;
Chris@16 726 return *this;
Chris@16 727 }
Chris@16 728 const utf8_output_iterator& operator*()const
Chris@16 729 {
Chris@16 730 return *this;
Chris@16 731 }
Chris@16 732 void operator=(boost::uint32_t val)const
Chris@16 733 {
Chris@16 734 push(val);
Chris@16 735 }
Chris@16 736 utf8_output_iterator& operator++()
Chris@16 737 {
Chris@16 738 return *this;
Chris@16 739 }
Chris@16 740 utf8_output_iterator& operator++(int)
Chris@16 741 {
Chris@16 742 return *this;
Chris@16 743 }
Chris@16 744 BaseIterator base()const
Chris@16 745 {
Chris@16 746 return m_position;
Chris@16 747 }
Chris@16 748 private:
Chris@16 749 void push(boost::uint32_t c)const
Chris@16 750 {
Chris@16 751 if(c > 0x10FFFFu)
Chris@16 752 detail::invalid_utf32_code_point(c);
Chris@16 753 if(c < 0x80u)
Chris@16 754 {
Chris@16 755 *m_position++ = static_cast<unsigned char>(c);
Chris@16 756 }
Chris@16 757 else if(c < 0x800u)
Chris@16 758 {
Chris@16 759 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
Chris@16 760 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 761 }
Chris@16 762 else if(c < 0x10000u)
Chris@16 763 {
Chris@16 764 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
Chris@16 765 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
Chris@16 766 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 767 }
Chris@16 768 else
Chris@16 769 {
Chris@16 770 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
Chris@16 771 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
Chris@16 772 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
Chris@16 773 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
Chris@16 774 }
Chris@16 775 }
Chris@16 776 mutable BaseIterator m_position;
Chris@16 777 };
Chris@16 778
Chris@16 779 } // namespace boost
Chris@16 780
Chris@16 781 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
Chris@16 782