Chris@29: /* NEON implementation of sin, cos, exp and log Chris@29: Chris@29: Inspired by Intel Approximate Math library, and based on the Chris@29: corresponding algorithms of the cephes math library Chris@29: */ Chris@29: Chris@29: /* Copyright (C) 2011 Julien Pommier Chris@29: Chris@29: This software is provided 'as-is', without any express or implied Chris@29: warranty. In no event will the authors be held liable for any damages Chris@29: arising from the use of this software. Chris@29: Chris@29: Permission is granted to anyone to use this software for any purpose, Chris@29: including commercial applications, and to alter it and redistribute it Chris@29: freely, subject to the following restrictions: Chris@29: Chris@29: 1. The origin of this software must not be misrepresented; you must not Chris@29: claim that you wrote the original software. If you use this software Chris@29: in a product, an acknowledgment in the product documentation would be Chris@29: appreciated but is not required. Chris@29: 2. Altered source versions must be plainly marked as such, and must not be Chris@29: misrepresented as being the original software. Chris@29: 3. This notice may not be removed or altered from any source distribution. Chris@29: Chris@29: (this is the zlib license) Chris@29: */ Chris@29: Chris@29: #include Chris@29: Chris@29: typedef float32x4_t v4sf; // vector of 4 float Chris@29: typedef uint32x4_t v4su; // vector of 4 uint32 Chris@29: typedef int32x4_t v4si; // vector of 4 uint32 Chris@29: Chris@29: #define c_inv_mant_mask ~0x7f800000u Chris@29: #define c_cephes_SQRTHF 0.707106781186547524 Chris@29: #define c_cephes_log_p0 7.0376836292E-2 Chris@29: #define c_cephes_log_p1 - 1.1514610310E-1 Chris@29: #define c_cephes_log_p2 1.1676998740E-1 Chris@29: #define c_cephes_log_p3 - 1.2420140846E-1 Chris@29: #define c_cephes_log_p4 + 1.4249322787E-1 Chris@29: #define c_cephes_log_p5 - 1.6668057665E-1 Chris@29: #define c_cephes_log_p6 + 2.0000714765E-1 Chris@29: #define c_cephes_log_p7 - 2.4999993993E-1 Chris@29: #define c_cephes_log_p8 + 3.3333331174E-1 Chris@29: #define c_cephes_log_q1 -2.12194440e-4 Chris@29: #define c_cephes_log_q2 0.693359375 Chris@29: Chris@29: /* natural logarithm computed for 4 simultaneous float Chris@29: return NaN for x <= 0 Chris@29: */ Chris@29: v4sf log_ps(v4sf x) { Chris@29: v4sf one = vdupq_n_f32(1); Chris@29: Chris@29: x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ Chris@29: v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); Chris@29: Chris@29: v4si ux = vreinterpretq_s32_f32(x); Chris@29: Chris@29: v4si emm0 = vshrq_n_s32(ux, 23); Chris@29: Chris@29: /* keep only the fractional part */ Chris@29: ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); Chris@29: ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); Chris@29: x = vreinterpretq_f32_s32(ux); Chris@29: Chris@29: emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); Chris@29: v4sf e = vcvtq_f32_s32(emm0); Chris@29: Chris@29: e = vaddq_f32(e, one); Chris@29: Chris@29: /* part2: Chris@29: if( x < SQRTHF ) { Chris@29: e -= 1; Chris@29: x = x + x - 1.0; Chris@29: } else { x = x - 1.0; } Chris@29: */ Chris@29: v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); Chris@29: v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); Chris@29: x = vsubq_f32(x, one); Chris@29: e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); Chris@29: x = vaddq_f32(x, tmp); Chris@29: Chris@29: v4sf z = vmulq_f32(x,x); Chris@29: Chris@29: v4sf y = vdupq_n_f32(c_cephes_log_p0); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); Chris@29: y = vmulq_f32(y, x); Chris@29: Chris@29: y = vmulq_f32(y, z); Chris@29: Chris@29: Chris@29: tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); Chris@29: y = vaddq_f32(y, tmp); Chris@29: Chris@29: Chris@29: tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); Chris@29: y = vsubq_f32(y, tmp); Chris@29: Chris@29: tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); Chris@29: x = vaddq_f32(x, y); Chris@29: x = vaddq_f32(x, tmp); Chris@29: x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN Chris@29: return x; Chris@29: } Chris@29: Chris@29: #define c_exp_hi 88.3762626647949f Chris@29: #define c_exp_lo -88.3762626647949f Chris@29: Chris@29: #define c_cephes_LOG2EF 1.44269504088896341 Chris@29: #define c_cephes_exp_C1 0.693359375 Chris@29: #define c_cephes_exp_C2 -2.12194440e-4 Chris@29: Chris@29: #define c_cephes_exp_p0 1.9875691500E-4 Chris@29: #define c_cephes_exp_p1 1.3981999507E-3 Chris@29: #define c_cephes_exp_p2 8.3334519073E-3 Chris@29: #define c_cephes_exp_p3 4.1665795894E-2 Chris@29: #define c_cephes_exp_p4 1.6666665459E-1 Chris@29: #define c_cephes_exp_p5 5.0000001201E-1 Chris@29: Chris@29: /* exp() computed for 4 float at once */ Chris@29: v4sf exp_ps(v4sf x) { Chris@29: v4sf tmp, fx; Chris@29: Chris@29: v4sf one = vdupq_n_f32(1); Chris@29: x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); Chris@29: x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); Chris@29: Chris@29: /* express exp(x) as exp(g + n*log(2)) */ Chris@29: fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); Chris@29: Chris@29: /* perform a floorf */ Chris@29: tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); Chris@29: Chris@29: /* if greater, substract 1 */ Chris@29: v4su mask = vcgtq_f32(tmp, fx); Chris@29: mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); Chris@29: Chris@29: Chris@29: fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); Chris@29: Chris@29: tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); Chris@29: v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); Chris@29: x = vsubq_f32(x, tmp); Chris@29: x = vsubq_f32(x, z); Chris@29: Chris@29: static const float32_t cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; Chris@29: v4sf y = vld1q_dup_f32(cephes_exp_p+0); Chris@29: v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); Chris@29: v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); Chris@29: v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); Chris@29: v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); Chris@29: v4sf c5 = vld1q_dup_f32(cephes_exp_p+5); Chris@29: Chris@29: y = vmulq_f32(y, x); Chris@29: z = vmulq_f32(x,x); Chris@29: y = vaddq_f32(y, c1); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, c2); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, c3); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, c4); Chris@29: y = vmulq_f32(y, x); Chris@29: y = vaddq_f32(y, c5); Chris@29: Chris@29: y = vmulq_f32(y, z); Chris@29: y = vaddq_f32(y, x); Chris@29: y = vaddq_f32(y, one); Chris@29: Chris@29: /* build 2^n */ Chris@29: int32x4_t mm; Chris@29: mm = vcvtq_s32_f32(fx); Chris@29: mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); Chris@29: mm = vshlq_n_s32(mm, 23); Chris@29: v4sf pow2n = vreinterpretq_f32_s32(mm); Chris@29: Chris@29: y = vmulq_f32(y, pow2n); Chris@29: return y; Chris@29: } Chris@29: Chris@29: #define c_minus_cephes_DP1 -0.78515625 Chris@29: #define c_minus_cephes_DP2 -2.4187564849853515625e-4 Chris@29: #define c_minus_cephes_DP3 -3.77489497744594108e-8 Chris@29: #define c_sincof_p0 -1.9515295891E-4 Chris@29: #define c_sincof_p1 8.3321608736E-3 Chris@29: #define c_sincof_p2 -1.6666654611E-1 Chris@29: #define c_coscof_p0 2.443315711809948E-005 Chris@29: #define c_coscof_p1 -1.388731625493765E-003 Chris@29: #define c_coscof_p2 4.166664568298827E-002 Chris@29: #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI Chris@29: Chris@29: /* evaluation of 4 sines & cosines at once. Chris@29: Chris@29: The code is the exact rewriting of the cephes sinf function. Chris@29: Precision is excellent as long as x < 8192 (I did not bother to Chris@29: take into account the special handling they have for greater values Chris@29: -- it does not return garbage for arguments over 8192, though, but Chris@29: the extra precision is missing). Chris@29: Chris@29: Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the Chris@29: surprising but correct result. Chris@29: Chris@29: Note also that when you compute sin(x), cos(x) is available at Chris@29: almost no extra price so both sin_ps and cos_ps make use of Chris@29: sincos_ps.. Chris@29: */ Chris@29: void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x Chris@29: v4sf xmm1, xmm2, xmm3, y; Chris@29: Chris@29: v4su emm2; Chris@29: Chris@29: v4su sign_mask_sin, sign_mask_cos; Chris@29: sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); Chris@29: x = vabsq_f32(x); Chris@29: Chris@29: /* scale by 4/Pi */ Chris@29: y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); Chris@29: Chris@29: /* store the integer part of y in mm0 */ Chris@29: emm2 = vcvtq_u32_f32(y); Chris@29: /* j=(j+1) & (~1) (see the cephes sources) */ Chris@29: emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); Chris@29: emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); Chris@29: y = vcvtq_f32_u32(emm2); Chris@29: Chris@29: /* get the polynom selection mask Chris@29: there is one polynom for 0 <= x <= Pi/4 Chris@29: and another one for Pi/4