fft/native/bqvec/pommier/sse_mathfun.h @ 29:cf59817a5983

Build stuff for native
author Chris Cannam
date Sat, 17 Oct 2015 15:00:08 +0100

#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do not
   expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/
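
/* A minimal sketch of how the USE_SSE2 switch is intended to be used
   (assuming the header is included under the name "sse_mathfun.h"):

       #define USE_SSE2            // opt in to the SSE2 code paths
       #include "sse_mathfun.h"

   Without the define, the SSE1+MMX code paths below are compiled instead. */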

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
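/* For example, `ALIGN16_BEG float v[4] ALIGN16_END;` becomes
   `__declspec(align(16)) float v[4];` under MSVC and
   `float v[4] __attribute__((aligned(16)));` under gcc/icc. */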

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
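/* For instance, _PS_CONST(0p5, 0.5f) below expands (alignment attributes aside) to
       static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
   which the code then loads as a packed constant via *(v4sf*)_ps_0p5. */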

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* The ugly part below: many versions of gcc used to be completely buggy with
   respect to some intrinsics. The movehl_ps is fixed in mingw 3.4.5, but I
   found out that all the _mm_cmp* intrinsics were completely broken on my
   mingw gcc 3.4.5.

   Note that the bug in the _mm_cmp* intrinsics occurs only at the -O0
   optimization level. */

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {   \
    xmm_mm_union u; u.xmm = xmm_;            \
    mm0_ = u.mm[0];                          \
    mm1_ = u.mm[1];                          \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                      \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;   \
}
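/* (These two helpers shuttle a 128-bit XMM value to/from a pair of 64-bit MMX
   registers through a union in memory; the SSE1 code paths that use them call
   _mm_empty() afterwards before returning to normal FPU code.) */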

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats
   return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

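  /* The chain below is Horner's scheme for the cephes log polynomial:
     y = (((...(p0*x + p1)*x + p2)*x ... + p8)*x, evaluated on the reduced x. */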
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
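
/* Illustrative sketch of calling log_ps on four packed floats (the variable
   names here are just an example, not part of the original code):

       v4sf v = _mm_setr_ps(0.5f, 1.0f, 2.0f, 10.0f);
       float out[4];
       _mm_storeu_ps(out, log_ps(v));
       // out[i] approximates logf() of the i-th lane; lanes <= 0 come back NaN
*/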

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);
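  /* (cvttps truncates toward zero, so for negative non-integral values the
     truncated result is one too large; the compare-and-subtract above turns
     the truncation into a proper floor.) */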

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

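  /* Horner evaluation of the degree-5 cephes polynomial P(x); below this,
     exp(x) ~ 1 + x + x*x*P(x) on the reduced argument. */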
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
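  /* (n is the integer part held in fx; adding the IEEE-754 exponent bias 0x7f
     and shifting left by 23 bits places it in the exponent field of a float,
     which yields the value 2^n.) */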
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
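
/* Illustrative sketch of calling exp_ps (names and values here are just an
   example, not part of the original code):

       v4sf v = _mm_setr_ps(-1.0f, 0.0f, 1.0f, 2.0f);
       float out[4];
       _mm_storeu_ps(out, exp_ps(v));
       // out[i] approximates expf() of each lane; inputs are clamped to +/-88.376...
*/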

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
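  /* (DP1 + DP2 + DP3 together equal -Pi/4 to extended precision, so the three
     steps above subtract y*Pi/4 from x with much less rounding error than a
     single multiply-subtract would give.) */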

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
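
/* Illustrative sketch of calling sin_ps on four packed floats (variable names
   here are just an example, not part of the original code):

       v4sf angles = _mm_setr_ps(0.0f, 0.5f, 1.0f, 3.14159265f);
       float s[4];
       _mm_storeu_ps(s, sin_ps(angles));
       // each s[i] now approximates sinf() of the corresponding input lane
*/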

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
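
/* Illustrative sketch of calling sincos_ps (variable names here are just an
   example, not part of the original code):

       v4sf angles = _mm_setr_ps(0.0f, 0.5f, 1.0f, 2.0f);
       v4sf s, c;
       sincos_ps(angles, &s, &c);
       float sines[4], cosines[4];
       _mm_storeu_ps(sines, s);
       _mm_storeu_ps(cosines, c);
       // sines[i] and cosines[i] approximate sinf/cosf of each input lane
*/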

#endif