Chris@16
|
1 /*
|
Chris@16
|
2 *
|
Chris@16
|
3 * Copyright (c) 2004
|
Chris@16
|
4 * John Maddock
|
Chris@16
|
5 *
|
Chris@16
|
6 * Use, modification and distribution are subject to the
|
Chris@16
|
7 * Boost Software License, Version 1.0. (See accompanying file
|
Chris@16
|
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
Chris@16
|
9 *
|
Chris@16
|
10 */
|
Chris@16
|
11
|
Chris@16
|
12 /*
|
Chris@16
|
13 * LOCATION: see http://www.boost.org for most recent version.
|
Chris@16
|
14 * FILE unicode_iterator.hpp
|
Chris@16
|
15 * VERSION see <boost/version.hpp>
|
Chris@16
|
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
|
Chris@16
|
17 */
|
Chris@16
|
18
|
Chris@16
|
19 /****************************************************************************
|
Chris@16
|
20
|
Chris@16
|
21 Contents:
|
Chris@16
|
22 ~~~~~~~~~
|
Chris@16
|
23
|
Chris@16
|
24 1) Read Only, Input Adapters:
|
Chris@16
|
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
Chris@16
|
26
|
Chris@16
|
27 template <class BaseIterator, class U8Type = ::boost::uint8_t>
|
Chris@16
|
28 class u32_to_u8_iterator;
|
Chris@16
|
29
|
Chris@16
|
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
|
Chris@16
|
31
|
Chris@16
|
32 template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
Chris@16
|
33 class u8_to_u32_iterator;
|
Chris@16
|
34
|
Chris@16
|
35 Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
|
Chris@16
|
36
|
Chris@16
|
37 template <class BaseIterator, class U16Type = ::boost::uint16_t>
|
Chris@16
|
38 class u32_to_u16_iterator;
|
Chris@16
|
39
|
Chris@16
|
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
|
Chris@16
|
41
|
Chris@16
|
42 template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
Chris@16
|
43 class u16_to_u32_iterator;
|
Chris@16
|
44
|
Chris@16
|
45 Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
|
Chris@16
|
46
|
Chris@16
|
47 2) Single pass output iterator adapters:
|
Chris@16
|
48
|
Chris@16
|
49 template <class BaseIterator>
|
Chris@16
|
50 class utf8_output_iterator;
|
Chris@16
|
51
|
Chris@16
|
52 Accepts UTF-32 code points and forwards them on as UTF-8 code points.
|
Chris@16
|
53
|
Chris@16
|
54 template <class BaseIterator>
|
Chris@16
|
55 class utf16_output_iterator;
|
Chris@16
|
56
|
Chris@16
|
57 Accepts UTF-32 code points and forwards them on as UTF-16 code points.
|
Chris@16
|
58
|
Chris@16
|
59 ****************************************************************************/
|
Chris@16
|
60
|
Chris@16
|
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
|
Chris@16
|
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
|
Chris@16
|
63 #include <boost/cstdint.hpp>
|
Chris@16
|
64 #include <boost/assert.hpp>
|
Chris@16
|
65 #include <boost/iterator/iterator_facade.hpp>
|
Chris@16
|
66 #include <boost/static_assert.hpp>
|
Chris@16
|
67 #include <boost/throw_exception.hpp>
|
Chris@16
|
68 #include <stdexcept>
|
Chris@16
|
69 #ifndef BOOST_NO_STD_LOCALE
|
Chris@16
|
70 #include <sstream>
|
Chris@16
|
71 #include <ios>
|
Chris@16
|
72 #endif
|
Chris@16
|
73 #include <limits.h> // CHAR_BIT
|
Chris@16
|
74
|
Chris@16
|
75 namespace boost{
|
Chris@16
|
76
|
Chris@16
|
77 namespace detail{
|
Chris@16
|
78
|
Chris@16
|
79 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
|
Chris@16
|
80 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
|
Chris@16
|
81 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
|
Chris@16
|
82
|
Chris@16
|
83 inline bool is_high_surrogate(::boost::uint16_t v)
|
Chris@16
|
84 {
|
Chris@16
|
85 return (v & 0xFFFFFC00u) == 0xd800u;
|
Chris@16
|
86 }
|
Chris@16
|
87 inline bool is_low_surrogate(::boost::uint16_t v)
|
Chris@16
|
88 {
|
Chris@16
|
89 return (v & 0xFFFFFC00u) == 0xdc00u;
|
Chris@16
|
90 }
|
Chris@16
|
// Returns true when v is any surrogate (high or low): clearing the low
// eleven bits of a value in 0xD800..0xDFFF leaves exactly 0xD800.
template <class T>
inline bool is_surrogate(T v)
{
   if((v & 0xFFFFF800u) == 0xd800)
      return true;
   return false;
}
|
Chris@16
|
96
|
Chris@16
|
97 inline unsigned utf8_byte_count(boost::uint8_t c)
|
Chris@16
|
98 {
|
Chris@16
|
99 // if the most significant bit with a zero in it is in position
|
Chris@16
|
100 // 8-N then there are N bytes in this UTF-8 sequence:
|
Chris@16
|
101 boost::uint8_t mask = 0x80u;
|
Chris@16
|
102 unsigned result = 0;
|
Chris@16
|
103 while(c & mask)
|
Chris@16
|
104 {
|
Chris@16
|
105 ++result;
|
Chris@16
|
106 mask >>= 1;
|
Chris@16
|
107 }
|
Chris@16
|
108 return (result == 0) ? 1 : ((result > 4) ? 4 : result);
|
Chris@16
|
109 }
|
Chris@16
|
110
|
Chris@16
|
111 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
|
Chris@16
|
112 {
|
Chris@16
|
113 return utf8_byte_count(c) - 1;
|
Chris@16
|
114 }
|
Chris@16
|
115
|
Chris@16
|
116 #ifdef BOOST_MSVC
|
Chris@16
|
117 #pragma warning(push)
|
Chris@16
|
118 #pragma warning(disable:4100)
|
Chris@16
|
119 #endif
|
Chris@16
|
120 inline void invalid_utf32_code_point(::boost::uint32_t val)
|
Chris@16
|
121 {
|
Chris@16
|
122 #ifndef BOOST_NO_STD_LOCALE
|
Chris@16
|
123 std::stringstream ss;
|
Chris@16
|
124 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
|
Chris@16
|
125 std::out_of_range e(ss.str());
|
Chris@16
|
126 #else
|
Chris@16
|
127 std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
|
Chris@16
|
128 #endif
|
Chris@16
|
129 boost::throw_exception(e);
|
Chris@16
|
130 }
|
Chris@16
|
131 #ifdef BOOST_MSVC
|
Chris@16
|
132 #pragma warning(pop)
|
Chris@16
|
133 #endif
|
Chris@16
|
134
|
Chris@16
|
135
|
Chris@16
|
136 } // namespace detail
|
Chris@16
|
137
|
Chris@16
|
138 template <class BaseIterator, class U16Type = ::boost::uint16_t>
|
Chris@16
|
139 class u32_to_u16_iterator
|
Chris@16
|
140 : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
|
Chris@16
|
141 {
|
Chris@16
|
142 typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
|
Chris@16
|
143
|
Chris@101
|
144 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
|
Chris@16
|
145 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
Chris@16
|
146
|
Chris@16
|
147 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
Chris@16
|
148 BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
|
Chris@16
|
149 #endif
|
Chris@16
|
150
|
Chris@16
|
151 public:
|
Chris@16
|
152 typename base_type::reference
|
Chris@16
|
153 dereference()const
|
Chris@16
|
154 {
|
Chris@16
|
155 if(m_current == 2)
|
Chris@16
|
156 extract_current();
|
Chris@16
|
157 return m_values[m_current];
|
Chris@16
|
158 }
|
Chris@16
|
159 bool equal(const u32_to_u16_iterator& that)const
|
Chris@16
|
160 {
|
Chris@16
|
161 if(m_position == that.m_position)
|
Chris@16
|
162 {
|
Chris@16
|
163 // Both m_currents must be equal, or both even
|
Chris@16
|
164 // this is the same as saying their sum must be even:
|
Chris@16
|
165 return (m_current + that.m_current) & 1u ? false : true;
|
Chris@16
|
166 }
|
Chris@16
|
167 return false;
|
Chris@16
|
168 }
|
Chris@16
|
169 void increment()
|
Chris@16
|
170 {
|
Chris@16
|
171 // if we have a pending read then read now, so that we know whether
|
Chris@16
|
172 // to skip a position, or move to a low-surrogate:
|
Chris@16
|
173 if(m_current == 2)
|
Chris@16
|
174 {
|
Chris@16
|
175 // pending read:
|
Chris@16
|
176 extract_current();
|
Chris@16
|
177 }
|
Chris@16
|
178 // move to the next surrogate position:
|
Chris@16
|
179 ++m_current;
|
Chris@16
|
180 // if we've reached the end skip a position:
|
Chris@16
|
181 if(m_values[m_current] == 0)
|
Chris@16
|
182 {
|
Chris@16
|
183 m_current = 2;
|
Chris@16
|
184 ++m_position;
|
Chris@16
|
185 }
|
Chris@16
|
186 }
|
Chris@16
|
187 void decrement()
|
Chris@16
|
188 {
|
Chris@16
|
189 if(m_current != 1)
|
Chris@16
|
190 {
|
Chris@16
|
191 // decrementing an iterator always leads to a valid position:
|
Chris@16
|
192 --m_position;
|
Chris@16
|
193 extract_current();
|
Chris@16
|
194 m_current = m_values[1] ? 1 : 0;
|
Chris@16
|
195 }
|
Chris@16
|
196 else
|
Chris@16
|
197 {
|
Chris@16
|
198 m_current = 0;
|
Chris@16
|
199 }
|
Chris@16
|
200 }
|
Chris@16
|
201 BaseIterator base()const
|
Chris@16
|
202 {
|
Chris@16
|
203 return m_position;
|
Chris@16
|
204 }
|
Chris@16
|
205 // construct:
|
Chris@16
|
206 u32_to_u16_iterator() : m_position(), m_current(0)
|
Chris@16
|
207 {
|
Chris@16
|
208 m_values[0] = 0;
|
Chris@16
|
209 m_values[1] = 0;
|
Chris@16
|
210 m_values[2] = 0;
|
Chris@16
|
211 }
|
Chris@16
|
212 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
|
Chris@16
|
213 {
|
Chris@16
|
214 m_values[0] = 0;
|
Chris@16
|
215 m_values[1] = 0;
|
Chris@16
|
216 m_values[2] = 0;
|
Chris@16
|
217 }
|
Chris@16
|
218 private:
|
Chris@16
|
219
|
Chris@16
|
220 void extract_current()const
|
Chris@16
|
221 {
|
Chris@16
|
222 // begin by checking for a code point out of range:
|
Chris@16
|
223 ::boost::uint32_t v = *m_position;
|
Chris@16
|
224 if(v >= 0x10000u)
|
Chris@16
|
225 {
|
Chris@16
|
226 if(v > 0x10FFFFu)
|
Chris@16
|
227 detail::invalid_utf32_code_point(*m_position);
|
Chris@16
|
228 // split into two surrogates:
|
Chris@16
|
229 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
|
Chris@16
|
230 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
|
Chris@16
|
231 m_current = 0;
|
Chris@16
|
232 BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
|
Chris@16
|
233 BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
|
Chris@16
|
234 }
|
Chris@16
|
235 else
|
Chris@16
|
236 {
|
Chris@16
|
237 // 16-bit code point:
|
Chris@16
|
238 m_values[0] = static_cast<U16Type>(*m_position);
|
Chris@16
|
239 m_values[1] = 0;
|
Chris@16
|
240 m_current = 0;
|
Chris@16
|
241 // value must not be a surrogate:
|
Chris@16
|
242 if(detail::is_surrogate(m_values[0]))
|
Chris@16
|
243 detail::invalid_utf32_code_point(*m_position);
|
Chris@16
|
244 }
|
Chris@16
|
245 }
|
Chris@16
|
246 BaseIterator m_position;
|
Chris@16
|
247 mutable U16Type m_values[3];
|
Chris@16
|
248 mutable unsigned m_current;
|
Chris@16
|
249 };
|
Chris@16
|
250
|
Chris@16
|
251 template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
Chris@16
|
252 class u16_to_u32_iterator
|
Chris@16
|
253 : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
Chris@16
|
254 {
|
Chris@16
|
255 typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
Chris@16
|
256 // special values for pending iterator reads:
|
Chris@16
|
257 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
|
Chris@16
|
258
|
Chris@101
|
259 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
|
Chris@16
|
260 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
Chris@16
|
261
|
Chris@16
|
262 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
|
Chris@16
|
263 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
Chris@16
|
264 #endif
|
Chris@16
|
265
|
Chris@16
|
266 public:
|
Chris@16
|
267 typename base_type::reference
|
Chris@16
|
268 dereference()const
|
Chris@16
|
269 {
|
Chris@16
|
270 if(m_value == pending_read)
|
Chris@16
|
271 extract_current();
|
Chris@16
|
272 return m_value;
|
Chris@16
|
273 }
|
Chris@16
|
274 bool equal(const u16_to_u32_iterator& that)const
|
Chris@16
|
275 {
|
Chris@16
|
276 return m_position == that.m_position;
|
Chris@16
|
277 }
|
Chris@16
|
278 void increment()
|
Chris@16
|
279 {
|
Chris@16
|
280 // skip high surrogate first if there is one:
|
Chris@16
|
281 if(detail::is_high_surrogate(*m_position)) ++m_position;
|
Chris@16
|
282 ++m_position;
|
Chris@16
|
283 m_value = pending_read;
|
Chris@16
|
284 }
|
Chris@16
|
285 void decrement()
|
Chris@16
|
286 {
|
Chris@16
|
287 --m_position;
|
Chris@16
|
288 // if we have a low surrogate then go back one more:
|
Chris@16
|
289 if(detail::is_low_surrogate(*m_position))
|
Chris@16
|
290 --m_position;
|
Chris@16
|
291 m_value = pending_read;
|
Chris@16
|
292 }
|
Chris@16
|
293 BaseIterator base()const
|
Chris@16
|
294 {
|
Chris@16
|
295 return m_position;
|
Chris@16
|
296 }
|
Chris@16
|
297 // construct:
|
Chris@16
|
298 u16_to_u32_iterator() : m_position()
|
Chris@16
|
299 {
|
Chris@16
|
300 m_value = pending_read;
|
Chris@16
|
301 }
|
Chris@16
|
302 u16_to_u32_iterator(BaseIterator b) : m_position(b)
|
Chris@16
|
303 {
|
Chris@16
|
304 m_value = pending_read;
|
Chris@16
|
305 }
|
Chris@16
|
306 //
|
Chris@16
|
307 // Range checked version:
|
Chris@16
|
308 //
|
Chris@16
|
309 u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
|
Chris@16
|
310 {
|
Chris@16
|
311 m_value = pending_read;
|
Chris@16
|
312 //
|
Chris@16
|
313 // The range must not start with a low surrogate, or end in a high surrogate,
|
Chris@16
|
314 // otherwise we run the risk of running outside the underlying input range.
|
Chris@16
|
315 // Likewise b must not be located at a low surrogate.
|
Chris@16
|
316 //
|
Chris@16
|
317 boost::uint16_t val;
|
Chris@16
|
318 if(start != end)
|
Chris@16
|
319 {
|
Chris@16
|
320 if((b != start) && (b != end))
|
Chris@16
|
321 {
|
Chris@16
|
322 val = *b;
|
Chris@16
|
323 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
|
Chris@16
|
324 invalid_code_point(val);
|
Chris@16
|
325 }
|
Chris@16
|
326 val = *start;
|
Chris@16
|
327 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
|
Chris@16
|
328 invalid_code_point(val);
|
Chris@16
|
329 val = *--end;
|
Chris@16
|
330 if(detail::is_high_surrogate(val))
|
Chris@16
|
331 invalid_code_point(val);
|
Chris@16
|
332 }
|
Chris@16
|
333 }
|
Chris@16
|
334 private:
|
Chris@16
|
335 static void invalid_code_point(::boost::uint16_t val)
|
Chris@16
|
336 {
|
Chris@16
|
337 #ifndef BOOST_NO_STD_LOCALE
|
Chris@16
|
338 std::stringstream ss;
|
Chris@16
|
339 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
|
Chris@16
|
340 std::out_of_range e(ss.str());
|
Chris@16
|
341 #else
|
Chris@16
|
342 std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
|
Chris@16
|
343 #endif
|
Chris@16
|
344 boost::throw_exception(e);
|
Chris@16
|
345 }
|
Chris@16
|
346 void extract_current()const
|
Chris@16
|
347 {
|
Chris@16
|
348 m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
|
Chris@16
|
349 // if the last value is a high surrogate then adjust m_position and m_value as needed:
|
Chris@16
|
350 if(detail::is_high_surrogate(*m_position))
|
Chris@16
|
351 {
|
Chris@16
|
352 // precondition; next value must have be a low-surrogate:
|
Chris@16
|
353 BaseIterator next(m_position);
|
Chris@16
|
354 ::boost::uint16_t t = *++next;
|
Chris@16
|
355 if((t & 0xFC00u) != 0xDC00u)
|
Chris@16
|
356 invalid_code_point(t);
|
Chris@16
|
357 m_value = (m_value - detail::high_surrogate_base) << 10;
|
Chris@16
|
358 m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
|
Chris@16
|
359 }
|
Chris@16
|
360 // postcondition; result must not be a surrogate:
|
Chris@16
|
361 if(detail::is_surrogate(m_value))
|
Chris@16
|
362 invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
|
Chris@16
|
363 }
|
Chris@16
|
364 BaseIterator m_position;
|
Chris@16
|
365 mutable U32Type m_value;
|
Chris@16
|
366 };
|
Chris@16
|
367
|
Chris@16
|
368 template <class BaseIterator, class U8Type = ::boost::uint8_t>
|
Chris@16
|
369 class u32_to_u8_iterator
|
Chris@16
|
370 : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
|
Chris@16
|
371 {
|
Chris@16
|
372 typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
|
Chris@16
|
373
|
Chris@101
|
374 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
|
Chris@16
|
375 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
Chris@16
|
376
|
Chris@16
|
377 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
Chris@16
|
378 BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
|
Chris@16
|
379 #endif
|
Chris@16
|
380
|
Chris@16
|
381 public:
|
Chris@16
|
382 typename base_type::reference
|
Chris@16
|
383 dereference()const
|
Chris@16
|
384 {
|
Chris@16
|
385 if(m_current == 4)
|
Chris@16
|
386 extract_current();
|
Chris@16
|
387 return m_values[m_current];
|
Chris@16
|
388 }
|
Chris@16
|
389 bool equal(const u32_to_u8_iterator& that)const
|
Chris@16
|
390 {
|
Chris@16
|
391 if(m_position == that.m_position)
|
Chris@16
|
392 {
|
Chris@16
|
393 // either the m_current's must be equal, or one must be 0 and
|
Chris@16
|
394 // the other 4: which means neither must have bits 1 or 2 set:
|
Chris@16
|
395 return (m_current == that.m_current)
|
Chris@16
|
396 || (((m_current | that.m_current) & 3) == 0);
|
Chris@16
|
397 }
|
Chris@16
|
398 return false;
|
Chris@16
|
399 }
|
Chris@16
|
400 void increment()
|
Chris@16
|
401 {
|
Chris@16
|
402 // if we have a pending read then read now, so that we know whether
|
Chris@16
|
403 // to skip a position, or move to a low-surrogate:
|
Chris@16
|
404 if(m_current == 4)
|
Chris@16
|
405 {
|
Chris@16
|
406 // pending read:
|
Chris@16
|
407 extract_current();
|
Chris@16
|
408 }
|
Chris@16
|
409 // move to the next surrogate position:
|
Chris@16
|
410 ++m_current;
|
Chris@16
|
411 // if we've reached the end skip a position:
|
Chris@16
|
412 if(m_values[m_current] == 0)
|
Chris@16
|
413 {
|
Chris@16
|
414 m_current = 4;
|
Chris@16
|
415 ++m_position;
|
Chris@16
|
416 }
|
Chris@16
|
417 }
|
Chris@16
|
418 void decrement()
|
Chris@16
|
419 {
|
Chris@16
|
420 if((m_current & 3) == 0)
|
Chris@16
|
421 {
|
Chris@16
|
422 --m_position;
|
Chris@16
|
423 extract_current();
|
Chris@16
|
424 m_current = 3;
|
Chris@16
|
425 while(m_current && (m_values[m_current] == 0))
|
Chris@16
|
426 --m_current;
|
Chris@16
|
427 }
|
Chris@16
|
428 else
|
Chris@16
|
429 --m_current;
|
Chris@16
|
430 }
|
Chris@16
|
431 BaseIterator base()const
|
Chris@16
|
432 {
|
Chris@16
|
433 return m_position;
|
Chris@16
|
434 }
|
Chris@16
|
435 // construct:
|
Chris@16
|
436 u32_to_u8_iterator() : m_position(), m_current(0)
|
Chris@16
|
437 {
|
Chris@16
|
438 m_values[0] = 0;
|
Chris@16
|
439 m_values[1] = 0;
|
Chris@16
|
440 m_values[2] = 0;
|
Chris@16
|
441 m_values[3] = 0;
|
Chris@16
|
442 m_values[4] = 0;
|
Chris@16
|
443 }
|
Chris@16
|
444 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
|
Chris@16
|
445 {
|
Chris@16
|
446 m_values[0] = 0;
|
Chris@16
|
447 m_values[1] = 0;
|
Chris@16
|
448 m_values[2] = 0;
|
Chris@16
|
449 m_values[3] = 0;
|
Chris@16
|
450 m_values[4] = 0;
|
Chris@16
|
451 }
|
Chris@16
|
452 private:
|
Chris@16
|
453
|
Chris@16
|
454 void extract_current()const
|
Chris@16
|
455 {
|
Chris@16
|
456 boost::uint32_t c = *m_position;
|
Chris@16
|
457 if(c > 0x10FFFFu)
|
Chris@16
|
458 detail::invalid_utf32_code_point(c);
|
Chris@16
|
459 if(c < 0x80u)
|
Chris@16
|
460 {
|
Chris@16
|
461 m_values[0] = static_cast<unsigned char>(c);
|
Chris@16
|
462 m_values[1] = static_cast<unsigned char>(0u);
|
Chris@16
|
463 m_values[2] = static_cast<unsigned char>(0u);
|
Chris@16
|
464 m_values[3] = static_cast<unsigned char>(0u);
|
Chris@16
|
465 }
|
Chris@16
|
466 else if(c < 0x800u)
|
Chris@16
|
467 {
|
Chris@16
|
468 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
|
Chris@16
|
469 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
470 m_values[2] = static_cast<unsigned char>(0u);
|
Chris@16
|
471 m_values[3] = static_cast<unsigned char>(0u);
|
Chris@16
|
472 }
|
Chris@16
|
473 else if(c < 0x10000u)
|
Chris@16
|
474 {
|
Chris@16
|
475 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
|
Chris@16
|
476 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
Chris@16
|
477 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
478 m_values[3] = static_cast<unsigned char>(0u);
|
Chris@16
|
479 }
|
Chris@16
|
480 else
|
Chris@16
|
481 {
|
Chris@16
|
482 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
|
Chris@16
|
483 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
|
Chris@16
|
484 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
Chris@16
|
485 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
486 }
|
Chris@16
|
487 m_current= 0;
|
Chris@16
|
488 }
|
Chris@16
|
489 BaseIterator m_position;
|
Chris@16
|
490 mutable U8Type m_values[5];
|
Chris@16
|
491 mutable unsigned m_current;
|
Chris@16
|
492 };
|
Chris@16
|
493
|
Chris@16
|
494 template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
Chris@16
|
495 class u8_to_u32_iterator
|
Chris@16
|
496 : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
Chris@16
|
497 {
|
Chris@16
|
498 typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
Chris@16
|
499 // special values for pending iterator reads:
|
Chris@16
|
500 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
|
Chris@16
|
501
|
Chris@101
|
502 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
|
Chris@16
|
503 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
Chris@16
|
504
|
Chris@16
|
505 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
|
Chris@16
|
506 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
Chris@16
|
507 #endif
|
Chris@16
|
508
|
Chris@16
|
509 public:
|
Chris@16
|
510 typename base_type::reference
|
Chris@16
|
511 dereference()const
|
Chris@16
|
512 {
|
Chris@16
|
513 if(m_value == pending_read)
|
Chris@16
|
514 extract_current();
|
Chris@16
|
515 return m_value;
|
Chris@16
|
516 }
|
Chris@16
|
517 bool equal(const u8_to_u32_iterator& that)const
|
Chris@16
|
518 {
|
Chris@16
|
519 return m_position == that.m_position;
|
Chris@16
|
520 }
|
Chris@16
|
521 void increment()
|
Chris@16
|
522 {
|
Chris@16
|
523 // We must not start with a continuation character:
|
Chris@16
|
524 if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
|
Chris@16
|
525 invalid_sequence();
|
Chris@16
|
526 // skip high surrogate first if there is one:
|
Chris@16
|
527 unsigned c = detail::utf8_byte_count(*m_position);
|
Chris@16
|
528 if(m_value == pending_read)
|
Chris@16
|
529 {
|
Chris@16
|
530 // Since we haven't read in a value, we need to validate the code points:
|
Chris@16
|
531 for(unsigned i = 0; i < c; ++i)
|
Chris@16
|
532 {
|
Chris@16
|
533 ++m_position;
|
Chris@16
|
534 // We must have a continuation byte:
|
Chris@16
|
535 if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
|
Chris@16
|
536 invalid_sequence();
|
Chris@16
|
537 }
|
Chris@16
|
538 }
|
Chris@16
|
539 else
|
Chris@16
|
540 {
|
Chris@16
|
541 std::advance(m_position, c);
|
Chris@16
|
542 }
|
Chris@16
|
543 m_value = pending_read;
|
Chris@16
|
544 }
|
Chris@16
|
545 void decrement()
|
Chris@16
|
546 {
|
Chris@16
|
547 // Keep backtracking until we don't have a trailing character:
|
Chris@16
|
548 unsigned count = 0;
|
Chris@16
|
549 while((*--m_position & 0xC0u) == 0x80u) ++count;
|
Chris@16
|
550 // now check that the sequence was valid:
|
Chris@16
|
551 if(count != detail::utf8_trailing_byte_count(*m_position))
|
Chris@16
|
552 invalid_sequence();
|
Chris@16
|
553 m_value = pending_read;
|
Chris@16
|
554 }
|
Chris@16
|
555 BaseIterator base()const
|
Chris@16
|
556 {
|
Chris@16
|
557 return m_position;
|
Chris@16
|
558 }
|
Chris@16
|
559 // construct:
|
Chris@16
|
560 u8_to_u32_iterator() : m_position()
|
Chris@16
|
561 {
|
Chris@16
|
562 m_value = pending_read;
|
Chris@16
|
563 }
|
Chris@16
|
564 u8_to_u32_iterator(BaseIterator b) : m_position(b)
|
Chris@16
|
565 {
|
Chris@16
|
566 m_value = pending_read;
|
Chris@16
|
567 }
|
Chris@16
|
568 //
|
Chris@16
|
569 // Checked constructor:
|
Chris@16
|
570 //
|
Chris@16
|
571 u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
|
Chris@16
|
572 {
|
Chris@16
|
573 m_value = pending_read;
|
Chris@16
|
574 //
|
Chris@16
|
575 // We must not start with a continuation character, or end with a
|
Chris@16
|
576 // truncated UTF-8 sequence otherwise we run the risk of going past
|
Chris@16
|
577 // the start/end of the underlying sequence:
|
Chris@16
|
578 //
|
Chris@16
|
579 if(start != end)
|
Chris@16
|
580 {
|
Chris@16
|
581 unsigned char v = *start;
|
Chris@16
|
582 if((v & 0xC0u) == 0x80u)
|
Chris@16
|
583 invalid_sequence();
|
Chris@16
|
584 if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
|
Chris@16
|
585 invalid_sequence();
|
Chris@16
|
586 BaseIterator pos = end;
|
Chris@16
|
587 do
|
Chris@16
|
588 {
|
Chris@16
|
589 v = *--pos;
|
Chris@16
|
590 }
|
Chris@16
|
591 while((start != pos) && ((v & 0xC0u) == 0x80u));
|
Chris@16
|
592 std::ptrdiff_t extra = detail::utf8_byte_count(v);
|
Chris@16
|
593 if(std::distance(pos, end) < extra)
|
Chris@16
|
594 invalid_sequence();
|
Chris@16
|
595 }
|
Chris@16
|
596 }
|
Chris@16
|
597 private:
|
Chris@16
|
598 static void invalid_sequence()
|
Chris@16
|
599 {
|
Chris@16
|
600 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
|
Chris@16
|
601 boost::throw_exception(e);
|
Chris@16
|
602 }
|
Chris@16
|
603 void extract_current()const
|
Chris@16
|
604 {
|
Chris@16
|
605 m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
|
Chris@16
|
606 // we must not have a continuation character:
|
Chris@16
|
607 if((m_value & 0xC0u) == 0x80u)
|
Chris@16
|
608 invalid_sequence();
|
Chris@16
|
609 // see how many extra bytes we have:
|
Chris@16
|
610 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
|
Chris@16
|
611 // extract the extra bits, 6 from each extra byte:
|
Chris@16
|
612 BaseIterator next(m_position);
|
Chris@16
|
613 for(unsigned c = 0; c < extra; ++c)
|
Chris@16
|
614 {
|
Chris@16
|
615 ++next;
|
Chris@16
|
616 m_value <<= 6;
|
Chris@16
|
617 // We must have a continuation byte:
|
Chris@16
|
618 if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
|
Chris@16
|
619 invalid_sequence();
|
Chris@16
|
620 m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
|
Chris@16
|
621 }
|
Chris@16
|
622 // we now need to remove a few of the leftmost bits, but how many depends
|
Chris@16
|
623 // upon how many extra bytes we've extracted:
|
Chris@16
|
624 static const boost::uint32_t masks[4] =
|
Chris@16
|
625 {
|
Chris@16
|
626 0x7Fu,
|
Chris@16
|
627 0x7FFu,
|
Chris@16
|
628 0xFFFFu,
|
Chris@16
|
629 0x1FFFFFu,
|
Chris@16
|
630 };
|
Chris@16
|
631 m_value &= masks[extra];
|
Chris@101
|
632 // check the result is in range:
|
Chris@16
|
633 if(m_value > static_cast<U32Type>(0x10FFFFu))
|
Chris@16
|
634 invalid_sequence();
|
Chris@101
|
635 // The result must not be a surrogate:
|
Chris@101
|
636 if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
|
Chris@101
|
637 invalid_sequence();
|
Chris@101
|
638 // We should not have had an invalidly encoded UTF8 sequence:
|
Chris@101
|
639 if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
|
Chris@101
|
640 invalid_sequence();
|
Chris@16
|
641 }
|
Chris@16
|
642 BaseIterator m_position;
|
Chris@16
|
643 mutable U32Type m_value;
|
Chris@16
|
644 };
|
Chris@16
|
645
|
Chris@16
|
646 template <class BaseIterator>
|
Chris@16
|
647 class utf16_output_iterator
|
Chris@16
|
648 {
|
Chris@16
|
649 public:
|
Chris@16
|
650 typedef void difference_type;
|
Chris@16
|
651 typedef void value_type;
|
Chris@16
|
652 typedef boost::uint32_t* pointer;
|
Chris@16
|
653 typedef boost::uint32_t& reference;
|
Chris@16
|
654 typedef std::output_iterator_tag iterator_category;
|
Chris@16
|
655
|
Chris@16
|
656 utf16_output_iterator(const BaseIterator& b)
|
Chris@16
|
657 : m_position(b){}
|
Chris@16
|
658 utf16_output_iterator(const utf16_output_iterator& that)
|
Chris@16
|
659 : m_position(that.m_position){}
|
Chris@16
|
660 utf16_output_iterator& operator=(const utf16_output_iterator& that)
|
Chris@16
|
661 {
|
Chris@16
|
662 m_position = that.m_position;
|
Chris@16
|
663 return *this;
|
Chris@16
|
664 }
|
Chris@16
|
665 const utf16_output_iterator& operator*()const
|
Chris@16
|
666 {
|
Chris@16
|
667 return *this;
|
Chris@16
|
668 }
|
Chris@16
|
669 void operator=(boost::uint32_t val)const
|
Chris@16
|
670 {
|
Chris@16
|
671 push(val);
|
Chris@16
|
672 }
|
Chris@16
|
673 utf16_output_iterator& operator++()
|
Chris@16
|
674 {
|
Chris@16
|
675 return *this;
|
Chris@16
|
676 }
|
Chris@16
|
677 utf16_output_iterator& operator++(int)
|
Chris@16
|
678 {
|
Chris@16
|
679 return *this;
|
Chris@16
|
680 }
|
Chris@16
|
681 BaseIterator base()const
|
Chris@16
|
682 {
|
Chris@16
|
683 return m_position;
|
Chris@16
|
684 }
|
Chris@16
|
685 private:
|
Chris@16
|
686 void push(boost::uint32_t v)const
|
Chris@16
|
687 {
|
Chris@16
|
688 if(v >= 0x10000u)
|
Chris@16
|
689 {
|
Chris@16
|
690 // begin by checking for a code point out of range:
|
Chris@16
|
691 if(v > 0x10FFFFu)
|
Chris@16
|
692 detail::invalid_utf32_code_point(v);
|
Chris@16
|
693 // split into two surrogates:
|
Chris@16
|
694 *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
|
Chris@16
|
695 *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
|
Chris@16
|
696 }
|
Chris@16
|
697 else
|
Chris@16
|
698 {
|
Chris@16
|
699 // 16-bit code point:
|
Chris@16
|
700 // value must not be a surrogate:
|
Chris@16
|
701 if(detail::is_surrogate(v))
|
Chris@16
|
702 detail::invalid_utf32_code_point(v);
|
Chris@16
|
703 *m_position++ = static_cast<boost::uint16_t>(v);
|
Chris@16
|
704 }
|
Chris@16
|
705 }
|
Chris@16
|
706 mutable BaseIterator m_position;
|
Chris@16
|
707 };
|
Chris@16
|
708
|
Chris@16
|
709 template <class BaseIterator>
|
Chris@16
|
710 class utf8_output_iterator
|
Chris@16
|
711 {
|
Chris@16
|
712 public:
|
Chris@16
|
713 typedef void difference_type;
|
Chris@16
|
714 typedef void value_type;
|
Chris@16
|
715 typedef boost::uint32_t* pointer;
|
Chris@16
|
716 typedef boost::uint32_t& reference;
|
Chris@16
|
717 typedef std::output_iterator_tag iterator_category;
|
Chris@16
|
718
|
Chris@16
|
719 utf8_output_iterator(const BaseIterator& b)
|
Chris@16
|
720 : m_position(b){}
|
Chris@16
|
721 utf8_output_iterator(const utf8_output_iterator& that)
|
Chris@16
|
722 : m_position(that.m_position){}
|
Chris@16
|
723 utf8_output_iterator& operator=(const utf8_output_iterator& that)
|
Chris@16
|
724 {
|
Chris@16
|
725 m_position = that.m_position;
|
Chris@16
|
726 return *this;
|
Chris@16
|
727 }
|
Chris@16
|
728 const utf8_output_iterator& operator*()const
|
Chris@16
|
729 {
|
Chris@16
|
730 return *this;
|
Chris@16
|
731 }
|
Chris@16
|
732 void operator=(boost::uint32_t val)const
|
Chris@16
|
733 {
|
Chris@16
|
734 push(val);
|
Chris@16
|
735 }
|
Chris@16
|
736 utf8_output_iterator& operator++()
|
Chris@16
|
737 {
|
Chris@16
|
738 return *this;
|
Chris@16
|
739 }
|
Chris@16
|
740 utf8_output_iterator& operator++(int)
|
Chris@16
|
741 {
|
Chris@16
|
742 return *this;
|
Chris@16
|
743 }
|
Chris@16
|
744 BaseIterator base()const
|
Chris@16
|
745 {
|
Chris@16
|
746 return m_position;
|
Chris@16
|
747 }
|
Chris@16
|
748 private:
|
Chris@16
|
749 void push(boost::uint32_t c)const
|
Chris@16
|
750 {
|
Chris@16
|
751 if(c > 0x10FFFFu)
|
Chris@16
|
752 detail::invalid_utf32_code_point(c);
|
Chris@16
|
753 if(c < 0x80u)
|
Chris@16
|
754 {
|
Chris@16
|
755 *m_position++ = static_cast<unsigned char>(c);
|
Chris@16
|
756 }
|
Chris@16
|
757 else if(c < 0x800u)
|
Chris@16
|
758 {
|
Chris@16
|
759 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
|
Chris@16
|
760 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
761 }
|
Chris@16
|
762 else if(c < 0x10000u)
|
Chris@16
|
763 {
|
Chris@16
|
764 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
|
Chris@16
|
765 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
Chris@16
|
766 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
767 }
|
Chris@16
|
768 else
|
Chris@16
|
769 {
|
Chris@16
|
770 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
|
Chris@16
|
771 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
|
Chris@16
|
772 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
Chris@16
|
773 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
Chris@16
|
774 }
|
Chris@16
|
775 }
|
Chris@16
|
776 mutable BaseIterator m_position;
|
Chris@16
|
777 };
|
Chris@16
|
778
|
Chris@16
|
779 } // namespace boost
|
Chris@16
|
780
|
Chris@16
|
781 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
|
Chris@16
|
782
|