Chris@16: // (C) Copyright John Maddock 2006. Chris@16: // Use, modification and distribution are subject to the Chris@16: // Boost Software License, Version 1.0. (See accompanying file Chris@16: // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) Chris@16: Chris@16: #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 Chris@16: #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2 Chris@16: Chris@16: #ifdef _MSC_VER Chris@16: #pragma once Chris@16: #endif Chris@16: Chris@16: #include Chris@16: Chris@16: #if defined(__GNUC__) || defined(__PGI) Chris@16: #define ALIGN16 __attribute__((__aligned__(16))) Chris@16: #else Chris@16: #define ALIGN16 __declspec(align(16)) Chris@16: #endif Chris@16: Chris@16: namespace boost{ namespace math{ namespace lanczos{ Chris@16: Chris@16: template <> Chris@16: inline double lanczos13m53::lanczos_sum(const double& x) Chris@16: { Chris@16: static const ALIGN16 double coeff[26] = { Chris@16: static_cast(2.506628274631000270164908177133837338626L), Chris@16: static_cast(1u), Chris@16: static_cast(210.8242777515793458725097339207133627117L), Chris@16: static_cast(66u), Chris@16: static_cast(8071.672002365816210638002902272250613822L), Chris@16: static_cast(1925u), Chris@16: static_cast(186056.2653952234950402949897160456992822L), Chris@16: static_cast(32670u), Chris@16: static_cast(2876370.628935372441225409051620849613599L), Chris@16: static_cast(357423u), Chris@16: static_cast(31426415.58540019438061423162831820536287L), Chris@16: static_cast(2637558u), Chris@16: static_cast(248874557.8620541565114603864132294232163L), Chris@16: static_cast(13339535u), Chris@16: static_cast(1439720407.311721673663223072794912393972L), Chris@16: static_cast(45995730u), Chris@16: static_cast(6039542586.35202800506429164430729792107L), Chris@16: static_cast(105258076u), Chris@16: static_cast(17921034426.03720969991975575445893111267L), Chris@16: static_cast(150917976u), Chris@16: static_cast(35711959237.35566804944018545154716670596L), Chris@16: static_cast(120543840u), Chris@16: static_cast(42919803642.64909876895789904700198885093L), Chris@16: static_cast(39916800u), Chris@16: static_cast(23531376880.41075968857200767445163675473L), Chris@16: static_cast(0u) Chris@16: }; Chris@101: __m128d vx = _mm_load1_pd(&x); Chris@101: __m128d sum_even = _mm_load_pd(coeff); Chris@101: __m128d sum_odd = _mm_load_pd(coeff+2); Chris@101: __m128d nc_odd, nc_even; Chris@101: __m128d vx2 = _mm_mul_pd(vx, vx); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 4); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 6); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 8); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 10); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 12); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 14); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 16); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 18); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 20); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 22); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 24); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_even = _mm_add_pd(sum_even, sum_odd); Chris@16: Chris@16: Chris@16: double ALIGN16 t[2]; Chris@16: _mm_store_pd(t, sum_even); Chris@16: Chris@16: return t[0] / t[1]; Chris@16: } Chris@16: Chris@16: template <> Chris@16: inline double lanczos13m53::lanczos_sum_expG_scaled(const double& x) Chris@16: { Chris@16: static const ALIGN16 double coeff[26] = { Chris@16: static_cast(0.006061842346248906525783753964555936883222L), Chris@16: static_cast(1u), Chris@16: static_cast(0.5098416655656676188125178644804694509993L), Chris@16: static_cast(66u), Chris@16: static_cast(19.51992788247617482847860966235652136208L), Chris@16: static_cast(1925u), Chris@16: static_cast(449.9445569063168119446858607650988409623L), Chris@16: static_cast(32670u), Chris@16: static_cast(6955.999602515376140356310115515198987526L), Chris@16: static_cast(357423u), Chris@16: static_cast(75999.29304014542649875303443598909137092L), Chris@16: static_cast(2637558u), Chris@16: static_cast(601859.6171681098786670226533699352302507L), Chris@16: static_cast(13339535u), Chris@16: static_cast(3481712.15498064590882071018964774556468L), Chris@16: static_cast(45995730u), Chris@16: static_cast(14605578.08768506808414169982791359218571L), Chris@16: static_cast(105258076u), Chris@16: static_cast(43338889.32467613834773723740590533316085L), Chris@16: static_cast(150917976u), Chris@16: static_cast(86363131.28813859145546927288977868422342L), Chris@16: static_cast(120543840u), Chris@16: static_cast(103794043.1163445451906271053616070238554L), Chris@16: static_cast(39916800u), Chris@16: static_cast(56906521.91347156388090791033559122686859L), Chris@16: static_cast(0u) Chris@16: }; Chris@101: __m128d vx = _mm_load1_pd(&x); Chris@101: __m128d sum_even = _mm_load_pd(coeff); Chris@101: __m128d sum_odd = _mm_load_pd(coeff+2); Chris@101: __m128d nc_odd, nc_even; Chris@101: __m128d vx2 = _mm_mul_pd(vx, vx); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 4); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 6); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 8); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 10); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 12); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 14); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 16); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 18); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 20); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx2); Chris@16: nc_odd = _mm_load_pd(coeff + 22); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_odd = _mm_add_pd(sum_odd, nc_odd); Chris@16: Chris@16: sum_even = _mm_mul_pd(sum_even, vx2); Chris@16: nc_even = _mm_load_pd(coeff + 24); Chris@16: sum_odd = _mm_mul_pd(sum_odd, vx); Chris@16: sum_even = _mm_add_pd(sum_even, nc_even); Chris@16: sum_even = _mm_add_pd(sum_even, sum_odd); Chris@16: Chris@16: Chris@16: double ALIGN16 t[2]; Chris@16: _mm_store_pd(t, sum_even); Chris@16: Chris@16: return t[0] / t[1]; Chris@16: } Chris@16: Chris@16: } // namespace lanczos Chris@16: } // namespace math Chris@16: } // namespace boost Chris@16: Chris@16: #undef ALIGN16 Chris@16: Chris@16: #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS Chris@16: Chris@16: Chris@16: Chris@16: Chris@16: