
#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/
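/* A minimal usage sketch (illustrative, not part of the original code;
   "sse_mathfun.h" is assumed to be this header's file name):

     #define USE_SSE2          // optional: select the SSE2 path
     #include "sse_mathfun.h"

     v4sf v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // pack 4 floats
     v4sf e = exp_ps(v);                       // 4 exponentials at once
     float out[4];
     _mm_storeu_ps(out, e);                    // out[i] ~ expf(i + 1)
*/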

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
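/* For illustration only (not in the original source): on gcc, a use such as
   _PS_CONST(1, 1.0f) is expected to expand roughly to

     static const float _ps_1[4] __attribute__((aligned(16)))
       = { 1.0f, 1.0f, 1.0f, 1.0f };

   i.e. a 16-byte aligned array of four identical floats that can be read
   directly as a v4sf with a single aligned load. */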

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x" (b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x" (b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x" (b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x" (b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}
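/* Illustrative helper, not part of the original file: a sketch of applying
   log_ps across an array, assuming n is a multiple of 4 and both pointers
   are 16-byte aligned. */
static inline void log_ps_array(const float *in, float *out, int n) {
  int i;
  for (i = 0; i < n; i += 4) {
    v4sf v = _mm_load_ps(in + i);      /* load 4 floats */
    _mm_store_ps(out + i, log_ps(v));  /* 4 logarithms per iteration */
  }
}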

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
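/* Illustrative only (not in the original source): exp_ps composes with the
   other SSE intrinsics, e.g. a 4-wide logistic sigmoid 1/(1+exp(-x)) could
   be sketched as follows. */
static inline v4sf sigmoid_ps(v4sf x) {
  v4sf one = *(v4sf*)_ps_1;
  v4sf e = exp_ps(_mm_sub_ps(_mm_setzero_ps(), x)); /* exp(-x) */
  return _mm_div_ps(one, _mm_add_ps(one, e));       /* 1 / (1 + exp(-x)) */
}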

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments..

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed on the experiments with the Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
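/* Illustrative only (not in the original source): a sketch of filling a
   sine table four entries at a time, assuming n is a multiple of 4 and
   "table" is 16-byte aligned. */
static inline void sin_table_ps(float *table, int n, float step) {
  int i;
  for (i = 0; i < n; i += 4) {
    v4sf idx = _mm_setr_ps((float)i, (float)(i+1), (float)(i+2), (float)(i+3));
    v4sf phase = _mm_mul_ps(idx, _mm_set1_ps(step)); /* phase = index * step */
    _mm_store_ps(table + i, sin_ps(phase));          /* 4 sines per iteration */
  }
}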

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them...
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= 0) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
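/* Illustrative only (not in the original source): sincos_ps gives both
   results in one call, e.g. to rotate four 2D points (px, py) by four
   angles at once. The helper below is a sketch; all names are hypothetical. */
static inline void rotate4_ps(v4sf angle, v4sf *px, v4sf *py) {
  v4sf s, c;
  sincos_ps(angle, &s, &c);                 /* s = sin(angle), c = cos(angle) */
  v4sf nx = _mm_sub_ps(_mm_mul_ps(c, *px), _mm_mul_ps(s, *py)); /* x*cos - y*sin */
  v4sf ny = _mm_add_ps(_mm_mul_ps(s, *px), _mm_mul_ps(c, *py)); /* x*sin + y*cos */
  *px = nx;
  *py = ny;
}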

#endif