// (C) Copyright John Maddock 2006.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
#define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2

#ifdef _MSC_VER
#pragma once
#endif

#include <emmintrin.h>

#if defined(__GNUC__) || defined(__PGI)
#define ALIGN16 __attribute__((__aligned__(16)))
#else
#define ALIGN16 __declspec(align(16))
#endif

namespace boost{ namespace math{ namespace lanczos{

template <>
inline double lanczos13m53::lanczos_sum<double>(const double& x)
{
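   // Coefficients of the Lanczos rational approximation, interleaved as
   // {numerator, denominator} pairs (highest degree first) so that both
   // polynomials can be evaluated together, one in each SSE2 lane.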
   static const ALIGN16 double coeff[26] = {
      static_cast<double>(2.506628274631000270164908177133837338626L),
      static_cast<double>(1u),
      static_cast<double>(210.8242777515793458725097339207133627117L),
      static_cast<double>(66u),
      static_cast<double>(8071.672002365816210638002902272250613822L),
      static_cast<double>(1925u),
      static_cast<double>(186056.2653952234950402949897160456992822L),
      static_cast<double>(32670u),
      static_cast<double>(2876370.628935372441225409051620849613599L),
      static_cast<double>(357423u),
      static_cast<double>(31426415.58540019438061423162831820536287L),
      static_cast<double>(2637558u),
      static_cast<double>(248874557.8620541565114603864132294232163L),
      static_cast<double>(13339535u),
      static_cast<double>(1439720407.311721673663223072794912393972L),
      static_cast<double>(45995730u),
      static_cast<double>(6039542586.35202800506429164430729792107L),
      static_cast<double>(105258076u),
      static_cast<double>(17921034426.03720969991975575445893111267L),
      static_cast<double>(150917976u),
      static_cast<double>(35711959237.35566804944018545154716670596L),
      static_cast<double>(120543840u),
      static_cast<double>(42919803642.64909876895789904700198885093L),
      static_cast<double>(39916800u),
      static_cast<double>(23531376880.41075968857200767445163675473L),
      static_cast<double>(0u)
   };
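   // Evaluate the even- and odd-indexed coefficient pairs as two independent
   // Horner recurrences in x^2, which roughly halves the length of the
   // dependency chain compared with a single Horner evaluation.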
   __m128d vx = _mm_load1_pd(&x);
   __m128d sum_even = _mm_load_pd(coeff);
   __m128d sum_odd = _mm_load_pd(coeff+2);
   __m128d nc_odd, nc_even;
   __m128d vx2 = _mm_mul_pd(vx, vx);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 4);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 6);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 8);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 10);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 12);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 14);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 16);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 18);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 20);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 22);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 24);
   sum_odd = _mm_mul_pd(sum_odd, vx);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_even = _mm_add_pd(sum_even, sum_odd);
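
   // Lane 0 now holds the numerator sum and lane 1 the denominator sum.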
   double ALIGN16 t[2];
   _mm_store_pd(t, sum_even);

   return t[0] / t[1];
}

template <>
inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
{
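   // Same interleaved {numerator, denominator} layout as lanczos_sum above,
   // here with the coefficients for the exp(g)-scaled form of the sum.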
   static const ALIGN16 double coeff[26] = {
      static_cast<double>(0.006061842346248906525783753964555936883222L),
      static_cast<double>(1u),
      static_cast<double>(0.5098416655656676188125178644804694509993L),
      static_cast<double>(66u),
      static_cast<double>(19.51992788247617482847860966235652136208L),
      static_cast<double>(1925u),
      static_cast<double>(449.9445569063168119446858607650988409623L),
      static_cast<double>(32670u),
      static_cast<double>(6955.999602515376140356310115515198987526L),
      static_cast<double>(357423u),
      static_cast<double>(75999.29304014542649875303443598909137092L),
      static_cast<double>(2637558u),
      static_cast<double>(601859.6171681098786670226533699352302507L),
      static_cast<double>(13339535u),
      static_cast<double>(3481712.15498064590882071018964774556468L),
      static_cast<double>(45995730u),
      static_cast<double>(14605578.08768506808414169982791359218571L),
      static_cast<double>(105258076u),
      static_cast<double>(43338889.32467613834773723740590533316085L),
      static_cast<double>(150917976u),
      static_cast<double>(86363131.28813859145546927288977868422342L),
      static_cast<double>(120543840u),
      static_cast<double>(103794043.1163445451906271053616070238554L),
      static_cast<double>(39916800u),
      static_cast<double>(56906521.91347156388090791033559122686859L),
      static_cast<double>(0u)
   };
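   // Same even/odd split Horner evaluation scheme as in lanczos_sum above.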
   __m128d vx = _mm_load1_pd(&x);
   __m128d sum_even = _mm_load_pd(coeff);
   __m128d sum_odd = _mm_load_pd(coeff+2);
   __m128d nc_odd, nc_even;
   __m128d vx2 = _mm_mul_pd(vx, vx);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 4);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 6);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 8);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 10);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 12);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 14);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 16);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 18);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 20);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 22);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 24);
   sum_odd = _mm_mul_pd(sum_odd, vx);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_even = _mm_add_pd(sum_even, sum_odd);

   double ALIGN16 t[2];
   _mm_store_pd(t, sum_even);

   return t[0] / t[1];
}

} // namespace lanczos
} // namespace math
} // namespace boost

#undef ALIGN16

#endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2