annotate src/fftw-3.3.8/simd-support/simd-avx.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "AVX only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
#  define DS(d,s) s /* single-precision option */
#  define SUFF(name) name ## s
#else
#  define DS(d,s) d /* double-precision option */
#  define SUFF(name) name ## d
#endif

#define SIMD_SUFFIX _avx        /* for renaming */
#define VL DS(2, 4)             /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
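
/* For example, in double precision SUFF(_mm256_add_p) expands to
   _mm256_add_pd and VL == 2 (two complex doubles per 256-bit vector);
   in single precision it expands to _mm256_add_ps and VL == 4. */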

#if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
#error "compiling simd-avx.h without -mavx"
#endif

#ifdef _MSC_VER
#ifndef inline
#define inline __inline
#endif
#endif

#include <immintrin.h>

typedef DS(__m256d, __m256) V;
#define VADD SUFF(_mm256_add_p)
#define VSUB SUFF(_mm256_sub_p)
#define VMUL SUFF(_mm256_mul_p)
#define VXOR SUFF(_mm256_xor_p)
#define VSHUF SUFF(_mm256_shuffle_p)

#define SHUFVALD(fp0,fp1) \
   (((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
#define SHUFVALS(fp0,fp1,fp2,fp3) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
#define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))
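
/* SHUFVALD and SHUFVALS build the immediates for _mm256_shuffle_pd and
   _mm256_shuffle_ps; e.g. SHUFVALD(1, 0) == 0x5 and
   SHUFVALS(1, 0, 3, 2) == 0xb1, both of which swap the real and
   imaginary slots of each complex element.  VDUPL broadcasts the real
   (even) slots and VDUPH the imaginary (odd) slots: for a double vector
   x = (r0, i0, r1, i1), VDUPL(x) is (r0, r0, r1, r1) and VDUPH(x) is
   (i0, i0, i1, i1). */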

#define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
#define DVK(var, val) V var = VLIT(val, val)
#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return SUFF(_mm256_loadu_p)(x);
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     SUFF(_mm256_storeu_p)(x, v);
}

#if FFTW_SINGLE

#  ifdef _MSC_VER
     /* Temporarily disable the warning "uninitialized local variable
        'name' used" and runtime checks for using a variable before it
        is defined, which are erroneously triggered by the LOADL / LOADH
        macros below, since each of them modifies only part of the
        value. */
#    ifndef __INTEL_COMPILER
#      pragma warning(disable : 4700)
#      pragma runtime_checks("u", off)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(disable : 592)
#  endif

#define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
#define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)

/* It seems that the only AVX way to store 4 complex floats is to
   extract two pairs of complex floats into two __m128 registers, and
   then use SSE-like half-stores.  Similarly, to load 4 complex
   floats, we load two pairs of complex floats into two __m128
   registers, and then pack the two __m128 registers into one __m256
   value. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     __m128 l, h;
     V v;
     (void)aligned_like; /* UNUSED */
     l = LOADL(x, l);
     l = LOADH(x + ivs, l);
     h = LOADL(x + 2*ivs, h);
     h = LOADH(x + 3*ivs, h);
     v = _mm256_castps128_ps256(l);
     v = _mm256_insertf128_ps(v, h, 1);
     return v;
}
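
/* With an input stride of ivs reals, LD thus gathers four complex
   floats:
     v = { x[0], x[1], x[ivs], x[ivs+1],
           x[2*ivs], x[2*ivs+1], x[3*ivs], x[3*ivs+1] }
   i.e. the k-th complex element of v comes from x + k*ivs. */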

#  ifdef _MSC_VER
#    ifndef __INTEL_COMPILER
#      pragma warning(default : 4700)
#      pragma runtime_checks("u", restore)
#    endif
#  endif
#  ifdef __INTEL_COMPILER
#    pragma warning(default : 592)
#  endif

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     __m128 h = _mm256_extractf128_ps(v, 1);
     __m128 l = _mm256_castps256_ps128(v);
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
        after STOREH */
     STOREH(x + 3*ovs, h);
     STOREL(x + 2*ovs, h);
     STOREH(x + ovs, l);
     STOREL(x, l);
}

#define STM2(x, v, ovs, aligned_like) /* no-op */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
     V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
     __m128 h0 = _mm256_extractf128_ps(x0, 1);
     __m128 l0 = _mm256_castps256_ps128(x0);
     __m128 h1 = _mm256_extractf128_ps(x1, 1);
     __m128 l1 = _mm256_castps256_ps128(x1);

     *(__m128 *)(x + 3*ovs) = h1;
     *(__m128 *)(x + 2*ovs) = h0;
     *(__m128 *)(x + 1*ovs) = l1;
     *(__m128 *)(x + 0*ovs) = l0;
}
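
/* STN2 interleaves the two vectors: the k-th complex element of v0
   followed by the k-th complex element of v1 is written to x + k*ovs,
   for k = 0..3. */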

#define STM4(x, v, ovs, aligned_like) /* no-op */
#define STN4(x, v0, v1, v2, v3, ovs)                                \
{                                                                   \
     V xxx0, xxx1, xxx2, xxx3;                                      \
     V yyy0, yyy1, yyy2, yyy3;                                      \
     xxx0 = _mm256_unpacklo_ps(v0, v2);                             \
     xxx1 = _mm256_unpackhi_ps(v0, v2);                             \
     xxx2 = _mm256_unpacklo_ps(v1, v3);                             \
     xxx3 = _mm256_unpackhi_ps(v1, v3);                             \
     yyy0 = _mm256_unpacklo_ps(xxx0, xxx2);                         \
     yyy1 = _mm256_unpackhi_ps(xxx0, xxx2);                         \
     yyy2 = _mm256_unpacklo_ps(xxx1, xxx3);                         \
     yyy3 = _mm256_unpackhi_ps(xxx1, xxx3);                         \
     *(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0);       \
     *(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1);     \
     *(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1);       \
     *(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1);     \
     *(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2);       \
     *(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1);     \
     *(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3);       \
     *(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1);     \
}
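
/* STN4 transposes a 4x8 block of floats: element k of each of v0..v3
   is gathered into four contiguous floats stored at x + k*ovs, for
   k = 0..7. */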

#else
static inline __m128d VMOVAPD_LD(const R *x)
{
     /* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
        into a 256-bit vmovapd, which requires 32-byte alignment instead of
        16-byte alignment.

        Force the use of vmovapd via asm until compilers stabilize.
     */
#if defined(__GNUC__)
     __m128d var;
     __asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
     return var;
#else
     return *(const __m128d *)x;
#endif
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
     var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
     return var;
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon the store of the low
        part occurring after the store of the high part */
     *(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
     *(__m128d *)x = _mm256_castpd256_pd128(v);
}


#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */

/* STN4 is a macro, not a function, thanks to Visual C++ developers
   deciding "it would be infrequent that people would want to pass more
   than 3 [__m128 parameters] by value."  Even though the comment
   was made about __m128 parameters, it appears to apply to __m256
   parameters as well. */
#define STN4(x, v0, v1, v2, v3, ovs)                                   \
{                                                                      \
     V xxx0, xxx1, xxx2, xxx3;                                         \
     xxx0 = _mm256_unpacklo_pd(v0, v1);                                \
     xxx1 = _mm256_unpackhi_pd(v0, v1);                                \
     xxx2 = _mm256_unpacklo_pd(v2, v3);                                \
     xxx3 = _mm256_unpackhi_pd(v2, v3);                                \
     STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0);           \
     STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0);     \
     STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
     STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
}
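
/* In double precision STN4 transposes a 4x4 block of doubles: double k
   of each of v0..v3 is gathered into four contiguous doubles stored at
   x + k*ovs, for k = 0..3. */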
#endif

static inline V FLIP_RI(V x)
{
     return VSHUF(x, x,
                  DS(SHUFVALD(1, 0),
                     SHUFVALS(1, 0, 3, 2)));
}

static inline V VCONJ(V x)
{
     /* Produce a SIMD vector[VL] of (0 + -0i).

        We really want to write this:

           V pmpm = VLIT(-0.0, 0.0);

        but historically some compilers have ignored the distinction
        between +0 and -0.  It looks like 'gcc-8 -ffast-math' treats -0
        as 0 too.
     */
     union uvec {
          unsigned u[8];
          V v;
     };
     static const union uvec pmpm = {
#ifdef FFTW_SINGLE
          { 0x00000000, 0x80000000, 0x00000000, 0x80000000,
            0x00000000, 0x80000000, 0x00000000, 0x80000000 }
#else
          { 0x00000000, 0x00000000, 0x00000000, 0x80000000,
            0x00000000, 0x00000000, 0x00000000, 0x80000000 }
#endif
     };
     return VXOR(pmpm.v, x);
}
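
/* VCONJ flips only the sign bit of each imaginary slot, so every
   complex element (a + bi) becomes (a - bi); e.g. (1 + 2i, 3 + 4i)
   maps to (1 - 2i, 3 - 4i). */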

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
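
/* VBYI(x) multiplies each complex element by i: conjugating and then
   swapping the real/imaginary slots maps (a + bi) to (-b + ai). */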

/* FMA support */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
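
/* AVX without FMA3 has no fused multiply-add, so these expand to
   separate multiplies and adds:
     VFMA(a, b, c)  = a*b + c
     VFMS(a, b, c)  = a*b - c
     VFNMS(a, b, c) = c - a*b
   The *I / *CONJ variants fold in a multiplication by i or a
   conjugation of b. */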

static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}
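
/* VZMUL computes the elementwise complex product t*s: with t = tr + ti*i
   broadcast via VDUPL/VDUPH, the result is tr*s + ti*(i*s), i.e.
   (tr*a - ti*b) + (tr*b + ti*a)i for each element s = a + bi. */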

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
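
/* The variants follow the same pattern: VZMULJ(t, s) = conj(t)*s,
   VZMULI(t, s) = i*(t*s), and VZMULIJ(t, s) = i*(conj(t)*s), all
   elementwise over the VL complex values in each vector. */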

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
#else
# define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#endif
#define TWVL1 (VL)

static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},     \
  {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x},   \
  {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
#else
# define VTW2(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},     \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}
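
/* With the duplicated layout of VTW2, twp[0] holds the cosines and
   twp[1] holds (-sin, sin) pairs, so BYTW2 forms the complex product
   t*s as tr*s + ti*FLIP_RI(s) with a single multiply-add pattern, and
   BYTWJ2 forms conj(t)*s as tr*s - ti*FLIP_RI(s). */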

/* twiddle storage #3 */
#define VTW3 VTW1
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},   \
  {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x},   \
  {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
#else
# define VTWS(v,x)                                                        \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},   \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#endif
#define TWVLS (2 * VL)


/* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
   See Intel Optimization Manual (April 2011, version 248966), Section
   11.3 */
#define VLEAVE _mm256_zeroupper

#include "simd-common.h"