#ifndef _POMMIER_SSE_MATHFUN_H_
#define _POMMIER_SSE_MATHFUN_H_

/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics.
   Do not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 floats (SSE1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (SSE2)
#else
typedef __m64 v2si;   // vector of 2 ints (MMX)
#endif
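
/* Example usage (a minimal sketch, not part of the library; kept inert
   with #if 0). It assumes an SSE-capable x86 target; sin_ps is defined
   further down in this header: */
#if 0
#include <stdio.h>

static void demo_sin_ps(void) {
  float in[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
  float out[4];
  v4sf x = _mm_loadu_ps(in);  // load 4 (possibly unaligned) floats
  v4sf s = sin_ps(x);         // compute 4 sines in one call
  _mm_storeu_ps(out, s);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}
#endif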

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
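
/* For instance, _PS_CONST(1, 1.0f) expands (with gcc/icc) to:

     static const float _ps_1[4] __attribute__((aligned(16)))
         = { 1.0f, 1.0f, 1.0f, 1.0f };

   i.e. a 16-byte-aligned array that can be loaded straight into an XMM
   register with *(v4sf*)_ps_1, as done throughout the code below. */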

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy
   with respect to some intrinsics. The movehl_ps bug is fixed in mingw
   3.4.5, but I found that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5.

   Note that the _mm_cmp* bug occurs only at the -O0 optimization level.
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}
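
/* For reference, a scalar sketch of the same cephes-style algorithm (an
   illustration only, not used by this header). frexpf() produces the
   m * 2^e split that the vector code performs by bit-twiddling on the
   exponent field: */
#if 0
#include <math.h>

static float log_scalar_sketch(float x) {
  int e;
  float m = frexpf(x, &e);          // x = m * 2^e, with 0.5 <= m < 1
  if (m < 0.707106781186547524f) {  // if m < sqrt(1/2), keep |m-1| small:
    e -= 1; m = m + m - 1.0f;
  } else {
    m = m - 1.0f;
  }
  float z = m * m;
  float y = 7.0376836292E-2f;       // same minimax coefficients as above
  y = y * m - 1.1514610310E-1f;
  y = y * m + 1.1676998740E-1f;
  y = y * m - 1.2420140846E-1f;
  y = y * m + 1.4249322787E-1f;
  y = y * m - 1.6668057665E-1f;
  y = y * m + 2.0000714765E-1f;
  y = y * m - 2.4999993993E-1f;
  y = y * m + 3.3333331174E-1f;
  y = y * m * z;                    // P(m) * m^3
  y += e * -2.12194440e-4f;         // log_q1
  y -= 0.5f * z;
  return m + y + e * 0.693359375f;  // log_q2; q1+q2 = log(2), split in two
}
#endif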

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1: cvttps truncates toward zero, so this fixes
     up the truncation into a true floor for negative values */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
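
/* For reference, a scalar sketch of the range reduction above (an
   illustration only, not used by this header): exp(x) = 2^n * exp(g),
   where n = floor(x/log(2) + 1/2) and g = x - n*log(2), so that |g| is
   small enough for the degree-5 polynomial to be accurate: */
#if 0
#include <math.h>

static float exp_scalar_sketch(float x) {
  float n = floorf(x * 1.44269504088896341f + 0.5f); // round(x / log(2))
  // subtract n*log(2) in two steps (C1 + C2 = log(2)) to limit rounding error
  float g = x - n * 0.693359375f - n * -2.12194440e-4f;
  float z = g * g;
  float y = 1.9875691500E-4f;   // same minimax coefficients as above
  y = y * g + 1.3981999507E-3f;
  y = y * g + 8.3334519073E-3f;
  y = y * g + 4.1665795894E-2f;
  y = y * g + 1.6666665459E-1f;
  y = y * g + 5.0000001201E-1f;
  y = y * z + g + 1.0f;         // y ~= exp(g)
  return ldexpf(y, (int)n);     // scale by 2^n (the exponent-field add above)
}
#endif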

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so it
   also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to take
   into account the special handling they have for greater values -- it
   does not return garbage for arguments over 8192, though, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the __vrs4_sinf
   of AMD's ACML (which is only available in 64-bit). Not too bad for an
   SSE1 function (with no special tuning)! However, the latter libraries
   probably have much better handling of NaN, Inf, denormalized and
   other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately
   95 cycles.

   From what I have observed in experiments with the Intel AMath lib,
   switching to an SSE2 version would improve the performance by only
   about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
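
/* For reference, a scalar sketch of the octant-based reduction performed
   below (an illustration only, not used by this header). j counts octants
   of width pi/4; (j & 2) selects the sine or cosine polynomial, and
   (j & 4) flips the sign: */
#if 0
static float sin_scalar_sketch(float x) {
  float sign = (x < 0) ? -1.0f : 1.0f;
  if (x < 0) x = -x;
  int j = (int)(x * 1.27323954473516f);  // x * 4/pi
  j = (j + 1) & ~1;                      // j=(j+1) & (~1): make j even
  float y = (float)j;
  // extended-precision reduction: x - y*pi/4, with pi/4 = DP1+DP2+DP3
  x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f)
      - y * 3.77489497744594108e-8f;
  if (j & 4) sign = -sign;               // octants 4..7: opposite sign
  float z = x * x;
  float p;
  if (j & 2) {                           // cosine polynomial
    p = ((2.443315711809948E-005f * z - 1.388731625493765E-003f) * z
         + 4.166664568298827E-002f) * z * z - 0.5f * z + 1.0f;
  } else {                               // sine polynomial
    p = ((-1.9515295891E-4f * z + 8.3321608736E-3f) * z
         - 1.6666654611E-1f) * z * x + x;
  }
  return sign * p;
}
#endif
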
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
   both of them. It is almost as fast, and gives you a free cosine with
   your sine. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
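
/* Example usage of sincos_ps (a minimal sketch, not part of the library;
   kept inert with #if 0): */
#if 0
static void demo_sincos_ps(void) {
  float angles[4] = { 0.0f, 0.785398f, 1.570796f, 3.141593f };
  float s[4], c[4];
  v4sf vs, vc;
  sincos_ps(_mm_loadu_ps(angles), &vs, &vc); // one call, both results
  _mm_storeu_ps(s, vs);
  _mm_storeu_ps(c, vc);
}
#endif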

#endif /* _POMMIER_SSE_MATHFUN_H_ */