/*
 * Copyright Andrey Semashev 2013.
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 */
/*!
 * \file uuid/detail/uuid_x86.hpp
 *
 * \brief This header contains optimized SSE implementation of \c boost::uuid operations.
 */

|
Chris@102
|
#ifndef BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_
#define BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_

// MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set
#if defined(BOOST_UUID_USE_SSE41)
#include <smmintrin.h> // SSE4.1: provides ptest-based _mm_test_all_zeros/_mm_test_all_ones
#elif defined(BOOST_UUID_USE_SSE3)
#include <pmmintrin.h> // SSE3: provides _mm_lddqu_si128
#else
#include <emmintrin.h> // SSE2 baseline: loads/stores, pcmpeq, movemask
#endif
|
Chris@102
|
24
|
Chris@102
|
25 namespace boost {
|
Chris@102
|
26 namespace uuids {
|
Chris@102
|
27 namespace detail {
|
Chris@102
|
28
|
Chris@102
|
29 BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT
|
Chris@102
|
30 {
|
Chris@102
|
31 #if defined(BOOST_UUID_USE_SSE3)
|
Chris@102
|
32 return _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
|
Chris@102
|
33 #else
|
Chris@102
|
34 return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
|
Chris@102
|
35 #endif
|
Chris@102
|
36 }
|
Chris@102
|
37
|
Chris@102
|
38 } // namespace detail
|
Chris@102
|
39
|
Chris@102
|
40 inline bool uuid::is_nil() const BOOST_NOEXCEPT
|
Chris@102
|
41 {
|
Chris@102
|
42 register __m128i mm = uuids::detail::load_unaligned_si128(data);
|
Chris@102
|
43 #if defined(BOOST_UUID_USE_SSE41)
|
Chris@102
|
44 return _mm_test_all_zeros(mm, mm) != 0;
|
Chris@102
|
45 #else
|
Chris@102
|
46 mm = _mm_cmpeq_epi8(mm, _mm_setzero_si128());
|
Chris@102
|
47 return _mm_movemask_epi8(mm) == 0xFFFF;
|
Chris@102
|
48 #endif
|
Chris@102
|
49 }
|
Chris@102
|
50
|
Chris@102
|
51 inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT
|
Chris@102
|
52 {
|
Chris@102
|
53 register __m128i mm_this = uuids::detail::load_unaligned_si128(data);
|
Chris@102
|
54 register __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data);
|
Chris@102
|
55 _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this);
|
Chris@102
|
56 _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs);
|
Chris@102
|
57 }
|
Chris@102
|
58
|
Chris@102
|
59 inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
|
Chris@102
|
60 {
|
Chris@102
|
61 register __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
|
Chris@102
|
62 register __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
|
Chris@102
|
63
|
Chris@102
|
64 register __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right);
|
Chris@102
|
65 #if defined(BOOST_UUID_USE_SSE41)
|
Chris@102
|
66 return _mm_test_all_ones(mm_cmp);
|
Chris@102
|
67 #else
|
Chris@102
|
68 return _mm_movemask_epi8(mm_cmp) == 0xFFFF;
|
Chris@102
|
69 #endif
|
Chris@102
|
70 }
|
Chris@102
|
71
|
Chris@102
|
72 inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
|
Chris@102
|
73 {
|
Chris@102
|
74 register __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
|
Chris@102
|
75 register __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
|
Chris@102
|
76
|
Chris@102
|
77 // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one.
|
Chris@102
|
78 // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results
|
Chris@102
|
79 // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways),
|
Chris@102
|
80 // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider:
|
Chris@102
|
81 //
|
Chris@102
|
82 // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers,
|
Chris@102
|
83 // so the comparison results in the least significant bits will actually be the most signigicant for the final operation result.
|
Chris@102
|
84 // This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with
|
Chris@102
|
85 // the "(x - 1) ^ x" trick.
|
Chris@102
|
86 // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever signs of the corresponding
|
Chris@102
|
87 // bytes are different. I.e. in signed comparison it's -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right,
|
Chris@102
|
88 // making the most significant bit of each byte 1 if the signs are different, and later apply this mask with another XOR to the comparison results.
|
Chris@102
|
89 // 3. pcmpgtw compares for "greater" relation, so we swap the arguments to get what we need.
|
Chris@102
|
90
|
Chris@102
|
91 const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right);
|
Chris@102
|
92
|
Chris@102
|
93 __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right);
|
Chris@102
|
94
|
Chris@102
|
95 mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp);
|
Chris@102
|
96 mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp);
|
Chris@102
|
97
|
Chris@102
|
98 uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp));
|
Chris@102
|
99
|
Chris@102
|
100 cmp = (cmp - 1u) ^ cmp;
|
Chris@102
|
101 rcmp = (rcmp - 1u) ^ rcmp;
|
Chris@102
|
102
|
Chris@102
|
103 return static_cast< uint16_t >(cmp) < static_cast< uint16_t >(rcmp);
|
Chris@102
|
104 }
|
Chris@102
|
105
|
Chris@102
|
106 } // namespace uuids
|
Chris@102
|
107 } // namespace boost
|
Chris@102
|
108
|
Chris@102
|
109 #endif // BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_
|