#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007  Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif
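/* Usage sketch (illustrative, not part of the original library): each of the
   functions defined further down in this file -- log_ps, exp_ps and sin_ps --
   takes one v4sf (__m128) holding four floats and produces four results at
   once.  The demo function name and the printing are my own; the block is
   disabled with #if 0 and would have to be moved below the function
   definitions (or into a separate .c file including this header) to run. */
#if 0
#include <stdio.h>

static void demo_sse_mathfun(void) {
  v4sf x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes hold 1, 2, 3, 4 */
  float out[4];

  _mm_storeu_ps(out, log_ps(x));   /* natural log of every lane   */
  printf("log: %g %g %g %g\n", out[0], out[1], out[2], out[3]);

  _mm_storeu_ps(out, exp_ps(x));   /* e^x for every lane          */
  printf("exp: %g %g %g %g\n", out[0], out[1], out[2], out[3]);

  _mm_storeu_ps(out, sin_ps(x));   /* sine of every lane, radians */
  printf("sin: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
}
#endif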
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val)                                            \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                          \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with
   respect to some intrinsics.  The movehl_ps one is fixed in mingw 3.4.5, but
   I found out that all the _mm_cmp* intrinsics were completely broken on my
   mingw gcc 3.4.5.

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
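/* Why the masks above are declared as ints rather than floats: values such as
   0x80000000 (the sign bit) or 0x00800000 (the smallest normalized float) are
   IEEE-754 bit patterns, and only an integer initializer preserves the exact
   bits; the code later reinterprets them with casts like *(v4sf*)_ps_sign_mask.
   The scalar sketch below (my own, for illustration only) shows the same
   reinterpretation on single floats. */
#if 0
#include <stdio.h>
#include <string.h>

static float bits_as_float(unsigned int u) {
  float f;
  memcpy(&f, &u, sizeof f);  /* reinterpret bits, no numeric conversion */
  return f;
}

static void demo_bit_constants(void) {
  /* 0x00800000 is the smallest normalized float (~1.1754944e-38);
     log_ps clamps its input to it to get rid of denormals */
  printf("min_norm_pos = %g\n", bits_as_float(0x00800000u));

  /* clearing the sign bit (AND with ~0x80000000) is a branch-free fabsf(),
     which is what sin_ps does with _ps_inv_sign_mask; 0xbf800000 is -1.0f */
  printf("|-1.0f| = %g\n", bits_as_float(0xbf800000u & ~0x80000000u));
}
#endif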
#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
    xmm_mm_union u; u.xmm = xmm_;                   \
    mm0_ = u.mm[0];                                 \
    mm1_ = u.mm[1];                                 \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
  }

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
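/* Scalar sketch of what log_ps computes (my own, for illustration; the real
   cephes logf also handles zero/negative/NaN inputs explicitly, which the
   vector code folds into invalid_mask instead).  The coefficients are the
   _ps_cephes_log_* constants declared above. */
#if 0
#include <math.h>

static float ref_logf(float v) {
  int e;
  float x = frexpf(v, &e);            /* v = x * 2^e, with x in [0.5, 1)  */
  if (x < 0.707106781186547524f) {    /* keep 1+x in [sqrt(1/2), sqrt(2)) */
    e -= 1;
    x = x + x - 1.0f;
  } else {
    x = x - 1.0f;
  }
  float z = x * x;
  float y = 7.0376836292E-2f;         /* cephes_log_p0..p8, Horner form   */
  y = y * x - 1.1514610310E-1f;
  y = y * x + 1.1676998740E-1f;
  y = y * x - 1.2420140846E-1f;
  y = y * x + 1.4249322787E-1f;
  y = y * x - 1.6668057665E-1f;
  y = y * x + 2.0000714765E-1f;
  y = y * x - 2.4999993993E-1f;
  y = y * x + 3.3333331174E-1f;
  y = y * x * z;                      /* y = x^3 * P(x)                   */
  y += (float)e * -2.12194440e-4f;    /* q1 ...                           */
  y -= 0.5f * z;
  return x + y + (float)e * 0.693359375f; /* ... + q2: e*ln(2) in 2 parts */
}
#endif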
_PS_CONST(exp_hi,  88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
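/* Scalar sketch of what exp_ps computes (my own, for illustration): the range
   reduction writes exp(v) = 2^n * exp(g) with |g| <= 0.5*ln(2), so a short
   polynomial suffices for exp(g).  n*ln(2) is subtracted in two pieces (C1
   then C2, a Cody-Waite style split) to keep g accurate; where the SSE code
   rebuilds 2^n by writing n+127 into the exponent field, this sketch simply
   calls ldexpf(). */
#if 0
#include <math.h>

static float ref_expf(float v) {
  if (v >  88.3762626647949f) v =  88.3762626647949f;  /* exp_hi clamp */
  if (v < -88.3762626647949f) v = -88.3762626647949f;  /* exp_lo clamp */

  /* n = round(v / ln(2)), computed as floor(v*log2(e) + 0.5) */
  float n = floorf(v * 1.44269504088896341f + 0.5f);

  /* g = v - n*ln(2), with ln(2) split as C1 + C2 */
  float g = v - n * 0.693359375f;     /* cephes_exp_C1 */
  g = g - n * -2.12194440e-4f;        /* cephes_exp_C2 */

  float z = g * g;
  float y = 1.9875691500E-4f;         /* cephes_exp_p0..p5, Horner form */
  y = y * g + 1.3981999507E-3f;
  y = y * g + 8.3334519073E-3f;
  y = y * g + 4.1665795894E-2f;
  y = y * g + 1.6666665459E-1f;
  y = y * g + 5.0000001201E-1f;
  y = y * z + g + 1.0f;               /* exp(g) ~ 1 + g + g^2 * P(g)    */

  return ldexpf(y, (int)n);           /* scale by 2^n                   */
}
#endif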
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   MacOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit builds).
   Not too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4