#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics.
   Do not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 floats (SSE1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (SSE2)
#else
typedef __m64 v2si;   // vector of 2 ints (MMX)
#endif
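
/* Example usage (a minimal sketch, not part of the library; kept inert
   with #if 0). It assumes an SSE-capable x86 target; sin_ps is defined
   further down in this header: */
#if 0
#include <stdio.h>

static void demo_sin_ps(void) {
  float in[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
  float out[4];
  v4sf x = _mm_loadu_ps(in);  // load 4 (possibly unaligned) floats
  v4sf s = sin_ps(x);         // compute 4 sines in one call
  _mm_storeu_ps(out, s);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}
#endif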

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
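
/* For instance, _PS_CONST(1, 1.0f) expands (with gcc/icc) to:

     static const float _ps_1[4] __attribute__((aligned(16)))
         = { 1.0f, 1.0f, 1.0f, 1.0f };

   i.e. a 16-byte-aligned array that can be loaded straight into an XMM
   register with *(v4sf*)_ps_1, as done throughout the code below. */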

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy
   with respect to some intrinsics. The movehl_ps bug is fixed in mingw
   3.4.5, but I found that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5.

   Note that the _mm_cmp* bug occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}
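
/* For reference, a scalar sketch of the same cephes-style algorithm (an
   illustration only, not used by this header). frexpf() produces the
   m * 2^e split that the vector code performs by bit-twiddling on the
   exponent field: */
#if 0
#include <math.h>

static float log_scalar_sketch(float x) {
  int e;
  float m = frexpf(x, &e);          // x = m * 2^e, with 0.5 <= m < 1
  if (m < 0.707106781186547524f) {  // if m < sqrt(1/2), keep |m-1| small:
    e -= 1; m = m + m - 1.0f;
  } else {
    m = m - 1.0f;
  }
  float z = m * m;
  float y = 7.0376836292E-2f;       // same minimax coefficients as above
  y = y * m - 1.1514610310E-1f;
  y = y * m + 1.1676998740E-1f;
  y = y * m - 1.2420140846E-1f;
  y = y * m + 1.4249322787E-1f;
  y = y * m - 1.6668057665E-1f;
  y = y * m + 2.0000714765E-1f;
  y = y * m - 2.4999993993E-1f;
  y = y * m + 3.3333331174E-1f;
  y = y * m * z;                    // P(m) * m^3
  y += e * -2.12194440e-4f;         // log_q1
  y -= 0.5f * z;
  return m + y + e * 0.693359375f;  // log_q2; q1+q2 = log(2), split in two
}
#endif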

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1: cvttps truncates toward zero, so this fixes
     up the truncation into a true floor for negative values */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
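
/* For reference, a scalar sketch of the range reduction above (an
   illustration only, not used by this header): exp(x) = 2^n * exp(g),
   where n = floor(x/log(2) + 1/2) and g = x - n*log(2), so that |g| is
   small enough for the degree-5 polynomial to be accurate: */
#if 0
#include <math.h>

static float exp_scalar_sketch(float x) {
  float n = floorf(x * 1.44269504088896341f + 0.5f); // round(x / log(2))
  // subtract n*log(2) in two steps (C1 + C2 = log(2)) to limit rounding error
  float g = x - n * 0.693359375f - n * -2.12194440e-4f;
  float z = g * g;
  float y = 1.9875691500E-4f;   // same minimax coefficients as above
  y = y * g + 1.3981999507E-3f;
  y = y * g + 8.3334519073E-3f;
  y = y * g + 4.1665795894E-2f;
  y = y * g + 1.6666665459E-1f;
  y = y * g + 5.0000001201E-1f;
  y = y * z + g + 1.0f;         // y ~= exp(g)
  return ldexpf(y, (int)n);     // scale by 2^n (the exponent-field add above)
}
#endif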

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so it
   also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to take
   into account the special handling they have for greater values -- it
   does not return garbage for arguments over 8192, though, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the __vrs4_sinf
   of AMD's ACML (which is only available in 64-bit). Not too bad for an
   SSE1 function (with no special tuning)! However, the latter libraries
   probably have much better handling of NaN, Inf, denormalized and
   other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately
   95 cycles.

   From what I have observed in experiments with the Intel AMath lib,
   switching to an SSE2 version would improve the performance by only
   about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
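
/* For reference, a scalar sketch of the octant-based reduction performed
   below (an illustration only, not used by this header). j counts octants
   of width pi/4; (j & 2) selects the sine or cosine polynomial, and
   (j & 4) flips the sign: */
#if 0
static float sin_scalar_sketch(float x) {
  float sign = (x < 0) ? -1.0f : 1.0f;
  if (x < 0) x = -x;
  int j = (int)(x * 1.27323954473516f);  // x * 4/pi
  j = (j + 1) & ~1;                      // j=(j+1) & (~1): make j even
  float y = (float)j;
  // extended-precision reduction: x - y*pi/4, with pi/4 = DP1+DP2+DP3
  x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f)
      - y * 3.77489497744594108e-8f;
  if (j & 4) sign = -sign;               // octants 4..7: opposite sign
  float z = x * x;
  float p;
  if (j & 2) {                           // cosine polynomial
    p = ((2.443315711809948E-005f * z - 1.388731625493765E-003f) * z
         + 4.166664568298827E-002f) * z * z - 0.5f * z + 1.0f;
  } else {                               // sine polynomial
    p = ((-1.9515295891E-4f * z + 8.3321608736E-3f) * z
         - 1.6666654611E-1f) * z * x + x;
  }
  return sign * p;
}
#endif
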
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
   both of them. It is almost as fast, and gives you a free cosine with
   your sine. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
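
/* Example usage of sincos_ps (a minimal sketch, not part of the library;
   kept inert with #if 0): */
#if 0
static void demo_sincos_ps(void) {
  float angles[4] = { 0.0f, 0.785398f, 1.570796f, 3.141593f };
  float s[4], c[4];
  v4sf vs, vc;
  sincos_ps(_mm_loadu_ps(angles), &vs, &vc); // one call, both results
  _mm_storeu_ps(s, vs);
  _mm_storeu_ps(c, vc);
}
#endif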

#endif /* _POMMIER_SSE_MATHFUN_H_ */