Chris@102: /* Chris@102: * Copyright Andrey Semashev 2013. Chris@102: * Distributed under the Boost Software License, Version 1.0. Chris@102: * (See accompanying file LICENSE_1_0.txt or copy at Chris@102: * http://www.boost.org/LICENSE_1_0.txt) Chris@102: */ Chris@102: /*! Chris@102: * \file uuid/detail/uuid_x86.hpp Chris@102: * Chris@102: * \brief This header contains optimized SSE implementation of \c boost::uuid operations. Chris@102: */ Chris@102: Chris@102: #ifndef BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ Chris@102: #define BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ Chris@102: Chris@102: // MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set Chris@102: #if defined(BOOST_UUID_USE_SSE41) Chris@102: #include Chris@102: #elif defined(BOOST_UUID_USE_SSE3) Chris@102: #include Chris@102: #else Chris@102: #include Chris@102: #endif Chris@102: Chris@102: namespace boost { Chris@102: namespace uuids { Chris@102: namespace detail { Chris@102: Chris@102: BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT Chris@102: { Chris@102: #if defined(BOOST_UUID_USE_SSE3) Chris@102: return _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p)); Chris@102: #else Chris@102: return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p)); Chris@102: #endif Chris@102: } Chris@102: Chris@102: } // namespace detail Chris@102: Chris@102: inline bool uuid::is_nil() const BOOST_NOEXCEPT Chris@102: { Chris@102: register __m128i mm = uuids::detail::load_unaligned_si128(data); Chris@102: #if defined(BOOST_UUID_USE_SSE41) Chris@102: return _mm_test_all_zeros(mm, mm) != 0; Chris@102: #else Chris@102: mm = _mm_cmpeq_epi8(mm, _mm_setzero_si128()); Chris@102: return _mm_movemask_epi8(mm) == 0xFFFF; Chris@102: #endif Chris@102: } Chris@102: Chris@102: inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT Chris@102: { Chris@102: register __m128i mm_this = uuids::detail::load_unaligned_si128(data); Chris@102: register __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data); Chris@102: _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this); Chris@102: _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs); Chris@102: } Chris@102: Chris@102: inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT Chris@102: { Chris@102: register __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data); Chris@102: register __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data); Chris@102: Chris@102: register __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right); Chris@102: #if defined(BOOST_UUID_USE_SSE41) Chris@102: return _mm_test_all_ones(mm_cmp); Chris@102: #else Chris@102: return _mm_movemask_epi8(mm_cmp) == 0xFFFF; Chris@102: #endif Chris@102: } Chris@102: Chris@102: inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT Chris@102: { Chris@102: register __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data); Chris@102: register __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data); Chris@102: Chris@102: // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one. Chris@102: // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results Chris@102: // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways), Chris@102: // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider: Chris@102: // Chris@102: // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers, Chris@102: // so the comparison results in the least significant bits will actually be the most signigicant for the final operation result. Chris@102: // This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with Chris@102: // the "(x - 1) ^ x" trick. Chris@102: // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever signs of the corresponding Chris@102: // bytes are different. I.e. in signed comparison it's -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right, Chris@102: // making the most significant bit of each byte 1 if the signs are different, and later apply this mask with another XOR to the comparison results. Chris@102: // 3. pcmpgtw compares for "greater" relation, so we swap the arguments to get what we need. Chris@102: Chris@102: const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right); Chris@102: Chris@102: __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right); Chris@102: Chris@102: mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp); Chris@102: mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp); Chris@102: Chris@102: uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp)); Chris@102: Chris@102: cmp = (cmp - 1u) ^ cmp; Chris@102: rcmp = (rcmp - 1u) ^ rcmp; Chris@102: Chris@102: return static_cast< uint16_t >(cmp) < static_cast< uint16_t >(rcmp); Chris@102: } Chris@102: Chris@102: } // namespace uuids Chris@102: } // namespace boost Chris@102: Chris@102: #endif // BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_