Mercurial repository: js-dsp-test
comparison: fft/native/bqvec/pommier/sse_mathfun.h @ 29:cf59817a5983 (from parent 28:77fca9c01e46)

author:  Chris Cannam
date:    Sat, 17 Oct 2015 15:00:08 +0100
summary: Build stuff for native

#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/
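
/* A minimal usage sketch (the values are arbitrary; the arrays must be
   16-byte aligned because aligned loads and stores are used):

     ALIGN16_BEG float in[4] ALIGN16_END  = { 0.1f, 0.2f, 0.3f, 0.4f };
     ALIGN16_BEG float out[4] ALIGN16_END;
     v4sf x = _mm_load_ps(in);   // pack 4 floats into one SSE register
     v4sf s = sin_ps(x);         // 4 sines computed in parallel
     _mm_store_ps(out, s);       // out[i] now holds the sine of in[i]

   exp_ps, log_ps and cos_ps follow the same pattern, and sincos_ps fills
   two output vectors at once. */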

/* Copyright (C) 2007  Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
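
/* For instance, _PS_CONST(0p5, 0.5f) expands (modulo the alignment
   attributes) to a 16-byte aligned array

     static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };

   which the functions below reload as a packed vector through a pointer
   cast: *(v4sf*)_ps_0p5. */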

_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats
   returns NaN for x <= 0
*/
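
/* The steps below mirror the scalar cephes logf: the argument is split as
   x = m * 2^e with m in [0.5, 1) (the exponent field is shifted down by 23
   and unbiased, the mantissa bits are OR-ed with 0.5), so that

     log(x) = e * log(2) + log(m).

   log(m) is then approximated by a polynomial in the reduced argument, and
   e * log(2) is added back in two pieces (cephes_log_q2 + cephes_log_q1,
   whose sum is log(2)) to limit rounding error.  The scalar branch
   "if (x < SQRTHF) ..." is made branchless with a comparison mask, so all
   four lanes follow the same instruction stream. */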
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

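/* exp_ps below uses the usual cephes-style range reduction: with
   n = round(x * log2(e)), the identity

     exp(x) = 2^n * exp(x - n*log(2))

   leaves a residual argument g small enough for a short polynomial.
   log(2) is subtracted in two parts (cephes_exp_C1 + cephes_exp_C2) to keep
   g accurate, exp(g) is approximated as 1 + g + g^2 * P(g) with the
   cephes_exp_p* coefficients, and 2^n is produced by adding the exponent
   bias 0x7f to n and shifting it into the exponent field of an IEEE float
   (e.g. n = 3 gives (3 + 127) << 23, the bit pattern of 8.0f). */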
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it runs also on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed on the experiments with Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
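
/* Sketch of the reduction used below (it follows the scalar cephes sinf):
   the argument is scaled by 4/Pi and truncated to an integer octant index
   j, which is rounded up to an even value via j = (j+1) & ~1.  Bit 2 of j
   (j & 4) says whether the sign of the result must be flipped, and bit 1
   (j & 2) selects which of the two minimax polynomials (sincof_* or
   coscof_*) applies to the reduced argument.  The reduction itself,
   x - j*Pi/4, is carried out in three steps with DP1, DP2 and DP3 (whose
   sum is -Pi/4 to extended precision) so that cancellation does not
   destroy the low-order bits. */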
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
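/* cos_ps differs from sin_ps in two ways: the input sign is dropped, since
   cosine is an even function, and the octant index is offset by 2 (a
   quarter period) before the sign flag and polynomial selection mask are
   computed -- note the _mm_andnot_* instead of _mm_and_* for the sign
   flag.  The range reduction, the two polynomials and the final blend are
   the same as in sin_ps. */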
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them;
   it is almost as fast, and gives you a free cosine with your sine */
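/* A minimal usage sketch, assuming four angles packed into one register
   (the input values are arbitrary):

     ALIGN16_BEG float angles[4] ALIGN16_END = { 0.0f, 0.5f, 1.0f, 1.5f };
     ALIGN16_BEG float s[4] ALIGN16_END;
     ALIGN16_BEG float c[4] ALIGN16_END;
     v4sf vs, vc;
     sincos_ps(_mm_load_ps(angles), &vs, &vc);
     _mm_store_ps(s, vs);   // s[i] is the sine of angles[i]
     _mm_store_ps(c, vc);   // c[i] is the cosine of angles[i]
*/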
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);
  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif