annotate src/fftw-3.3.3/simd-support/simd-avx.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 #if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
Chris@10 22 #error "AVX only works in single or double precision"
Chris@10 23 #endif
Chris@10 24
Chris@10 25 #ifdef FFTW_SINGLE
Chris@10 26 # define DS(d,s) s /* single-precision option */
Chris@10 27 # define SUFF(name) name ## s
Chris@10 28 #else
Chris@10 29 # define DS(d,s) d /* double-precision option */
Chris@10 30 # define SUFF(name) name ## d
Chris@10 31 #endif
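/* Illustration of the two helper macros above: DS(d,s) selects the
   double- or single-precision variant of an expression, and SUFF()
   pastes the matching intrinsic suffix.  With FFTW_SINGLE defined,
   SUFF(_mm256_add_p) expands to _mm256_add_ps; otherwise it expands
   to _mm256_add_pd. */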
Chris@10 32
Chris@10 33 #define SIMD_SUFFIX _avx /* for renaming */
Chris@10 34 #define VL DS(2, 4) /* SIMD complex vector length */
Chris@10 35 #define SIMD_VSTRIDE_OKA(x) ((x) == 2)
Chris@10 36 #define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
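/* A 256-bit register holds 4 doubles or 8 floats, so VL = 2 complex
   doubles or 4 complex floats per vector (2*VL real values).  The
   SIMD_VSTRIDE_OKA test appears to accept only a vector stride of 2
   reals, i.e. interleaved re/im complex data. */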
Chris@10 37
Chris@10 38 #if defined(__GNUC__) && !defined(__AVX__) /* sanity check */
Chris@10 39 #error "compiling simd-avx.h without -mavx"
Chris@10 40 #endif
Chris@10 41
Chris@10 42 #ifdef _MSC_VER
Chris@10 43 #ifndef inline
Chris@10 44 #define inline __inline
Chris@10 45 #endif
Chris@10 46 #endif
Chris@10 47
Chris@10 48 #include <immintrin.h>
Chris@10 49
Chris@10 50 typedef DS(__m256d, __m256) V;
Chris@10 51 #define VADD SUFF(_mm256_add_p)
Chris@10 52 #define VSUB SUFF(_mm256_sub_p)
Chris@10 53 #define VMUL SUFF(_mm256_mul_p)
Chris@10 54 #define VXOR SUFF(_mm256_xor_p)
Chris@10 55 #define VSHUF SUFF(_mm256_shuffle_p)
Chris@10 56
Chris@10 57 #define SHUFVALD(fp0,fp1) \
Chris@10 58 (((fp1) << 3) | ((fp0) << 2) | ((fp1) << 1) | ((fp0)))
Chris@10 59 #define SHUFVALS(fp0,fp1,fp2,fp3) \
Chris@10 60 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
Chris@10 61
Chris@10 62 #define VDUPL(x) DS(_mm256_unpacklo_pd(x, x), VSHUF(x, x, SHUFVALS(0, 0, 2, 2)))
Chris@10 63 #define VDUPH(x) DS(_mm256_unpackhi_pd(x, x), VSHUF(x, x, SHUFVALS(1, 1, 3, 3)))
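/* In effect, VDUPL broadcasts the real (even-index) element of each
   complex number into both slots of its pair, giving
   {re0, re0, re1, re1, ...}, while VDUPH does the same with the
   imaginary (odd-index) element: {im0, im0, im1, im1, ...}. */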
Chris@10 64
Chris@10 65 #define VLIT(x0, x1) DS(_mm256_set_pd(x0, x1, x0, x1), _mm256_set_ps(x0, x1, x0, x1, x0, x1, x0, x1))
Chris@10 66 #define DVK(var, val) V var = VLIT(val, val)
Chris@10 67 #define LDK(x) x
Chris@10 68
Chris@10 69 static inline V LDA(const R *x, INT ivs, const R *aligned_like)
Chris@10 70 {
Chris@10 71 (void)aligned_like; /* UNUSED */
Chris@10 72 (void)ivs; /* UNUSED */
Chris@10 73 return SUFF(_mm256_loadu_p)(x);
Chris@10 74 }
Chris@10 75
Chris@10 76 static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
Chris@10 77 {
Chris@10 78 (void)aligned_like; /* UNUSED */
Chris@10 79 (void)ovs; /* UNUSED */
Chris@10 80 SUFF(_mm256_storeu_p)(x, v);
Chris@10 81 }
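/* LDA/STA move a full vector of VL consecutive complex numbers with
   unaligned 256-bit loads/stores; the ivs/ovs stride arguments are
   ignored.  The strided LD/ST variants below differ per precision. */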
Chris@10 82
Chris@10 83 #if FFTW_SINGLE
Chris@10 84
Chris@10 85 #define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
Chris@10 86 #define LOADL(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))
Chris@10 87 #define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
Chris@10 88 #define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
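/* Each of these wrappers moves one complex float (64 bits) between
   memory and the low or high half of an __m128, via the SSE
   _mm_loadl_pi / _mm_loadh_pi / _mm_storel_pi / _mm_storeh_pi
   intrinsics. */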
Chris@10 89
Chris@10 90 /* it seems like the only AVX way to store 4 complex floats is to
Chris@10 91 extract two pairs of complex floats into two __m128 registers, and
Chris@10 92 then use SSE-like half-stores. Similarly, to load 4 complex
Chris@10 93 floats, we load two pairs of complex floats into two __m128
Chris@10 94 registers, and then pack the two __m128 registers into one __m256
Chris@10 95 value. */
Chris@10 96 static inline V LD(const R *x, INT ivs, const R *aligned_like)
Chris@10 97 {
Chris@10 98 __m128 l, h;
Chris@10 99 V v;
Chris@10 100 (void)aligned_like; /* UNUSED */
Chris@10 101 l = LOADL(x, l);
Chris@10 102 l = LOADH(x + ivs, l);
Chris@10 103 h = LOADL(x + 2*ivs, h);
Chris@10 104 h = LOADH(x + 3*ivs, h);
Chris@10 105 v = _mm256_castps128_ps256(l);
Chris@10 106 v = _mm256_insertf128_ps(v, h, 1);
Chris@10 107 return v;
Chris@10 108 }
Chris@10 109
Chris@10 110 static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
Chris@10 111 {
Chris@10 112 __m128 h = _mm256_extractf128_ps(v, 1);
Chris@10 113 __m128 l = _mm256_castps256_ps128(v);
Chris@10 114 (void)aligned_like; /* UNUSED */
Chris@10 115 /* WARNING: the extra_iter hack depends upon STOREL occurring
Chris@10 116 after STOREH */
Chris@10 117 STOREH(x + 3*ovs, h);
Chris@10 118 STOREL(x + 2*ovs, h);
Chris@10 119 STOREH(x + ovs, l);
Chris@10 120 STOREL(x, l);
Chris@10 121 }
Chris@10 122
Chris@10 123 #define STM2(x, v, ovs, aligned_like) /* no-op */
Chris@10 124 static inline void STN2(R *x, V v0, V v1, INT ovs)
Chris@10 125 {
Chris@10 126 V x0 = VSHUF(v0, v1, SHUFVALS(0, 1, 0, 1));
Chris@10 127 V x1 = VSHUF(v0, v1, SHUFVALS(2, 3, 2, 3));
Chris@10 128 __m128 h0 = _mm256_extractf128_ps(x0, 1);
Chris@10 129 __m128 l0 = _mm256_castps256_ps128(x0);
Chris@10 130 __m128 h1 = _mm256_extractf128_ps(x1, 1);
Chris@10 131 __m128 l1 = _mm256_castps256_ps128(x1);
Chris@10 132 *(__m128 *)(x + 3*ovs) = h1;
Chris@10 133 *(__m128 *)(x + 2*ovs) = h0;
Chris@10 134 *(__m128 *)(x + 1*ovs) = l1;
Chris@10 135 *(__m128 *)(x + 0*ovs) = l0;
Chris@10 136 }
Chris@10 137
Chris@10 138 #define STM4(x, v, ovs, aligned_like) /* no-op */
Chris@10 139 #define STN4(x, v0, v1, v2, v3, ovs) \
Chris@10 140 { \
Chris@10 141 V xxx0, xxx1, xxx2, xxx3; \
Chris@10 142 V yyy0, yyy1, yyy2, yyy3; \
Chris@10 143 xxx0 = _mm256_unpacklo_ps(v0, v2); \
Chris@10 144 xxx1 = _mm256_unpackhi_ps(v0, v2); \
Chris@10 145 xxx2 = _mm256_unpacklo_ps(v1, v3); \
Chris@10 146 xxx3 = _mm256_unpackhi_ps(v1, v3); \
Chris@10 147 yyy0 = _mm256_unpacklo_ps(xxx0, xxx2); \
Chris@10 148 yyy1 = _mm256_unpackhi_ps(xxx0, xxx2); \
Chris@10 149 yyy2 = _mm256_unpacklo_ps(xxx1, xxx3); \
Chris@10 150 yyy3 = _mm256_unpackhi_ps(xxx1, xxx3); \
Chris@10 151 *(__m128 *)(x + 0 * ovs) = _mm256_castps256_ps128(yyy0); \
Chris@10 152 *(__m128 *)(x + 4 * ovs) = _mm256_extractf128_ps(yyy0, 1); \
Chris@10 153 *(__m128 *)(x + 1 * ovs) = _mm256_castps256_ps128(yyy1); \
Chris@10 154 *(__m128 *)(x + 5 * ovs) = _mm256_extractf128_ps(yyy1, 1); \
Chris@10 155 *(__m128 *)(x + 2 * ovs) = _mm256_castps256_ps128(yyy2); \
Chris@10 156 *(__m128 *)(x + 6 * ovs) = _mm256_extractf128_ps(yyy2, 1); \
Chris@10 157 *(__m128 *)(x + 3 * ovs) = _mm256_castps256_ps128(yyy3); \
Chris@10 158 *(__m128 *)(x + 7 * ovs) = _mm256_extractf128_ps(yyy3, 1); \
Chris@10 159 }
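/* The unpack cascade above is, in effect, a transpose: after the two
   rounds of unpacklo/unpackhi, the output row stored at x + j*ovs
   (j = 0..7) receives element j of each input, i.e.
   {v0[j], v1[j], v2[j], v3[j]}. */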
Chris@10 160
Chris@10 161 #else
Chris@10 162 static inline __m128d VMOVAPD_LD(const R *x)
Chris@10 163 {
Chris@10 164 /* gcc-4.6 miscompiles the combination _mm256_castpd128_pd256(VMOVAPD_LD(x))
Chris@10 165 into a 256-bit vmovapd, which requires 32-byte alignment instead of
Chris@10 166 16-byte alignment.
Chris@10 167
Chris@10 168 Force the use of vmovapd via asm until compilers stabilize.
Chris@10 169 */
Chris@10 170 #if defined(__GNUC__)
Chris@10 171 __m128d var;
Chris@10 172 __asm__("vmovapd %1, %0\n" : "=x"(var) : "m"(x[0]));
Chris@10 173 return var;
Chris@10 174 #else
Chris@10 175 return *(const __m128d *)x;
Chris@10 176 #endif
Chris@10 177 }
Chris@10 178
Chris@10 179 static inline V LD(const R *x, INT ivs, const R *aligned_like)
Chris@10 180 {
Chris@10 181 V var;
Chris@10 182 (void)aligned_like; /* UNUSED */
Chris@10 183 var = _mm256_castpd128_pd256(VMOVAPD_LD(x));
Chris@10 184 var = _mm256_insertf128_pd(var, *(const __m128d *)(x+ivs), 1);
Chris@10 185 return var;
Chris@10 186 }
Chris@10 187
Chris@10 188 static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
Chris@10 189 {
Chris@10 190 (void)aligned_like; /* UNUSED */
Chris@10 191 /* WARNING: the extra_iter hack depends upon the store of the low
Chris@10 192 part occurring after the store of the high part */
Chris@10 193 *(__m128d *)(x + ovs) = _mm256_extractf128_pd(v, 1);
Chris@10 194 *(__m128d *)x = _mm256_castpd256_pd128(v);
Chris@10 195 }
Chris@10 196
Chris@10 197
Chris@10 198 #define STM2 ST
Chris@10 199 #define STN2(x, v0, v1, ovs) /* nop */
Chris@10 200 #define STM4(x, v, ovs, aligned_like) /* no-op */
Chris@10 201
Chris@10 202 /* STN4 is a macro, not a function, thanks to Visual C++ developers
Chris@10 203 deciding "it would be infrequent that people would want to pass more
Chris@10 204 than 3 [__m128 parameters] by value." Even though the comment
Chris@10 205 was made about __m128 parameters, it appears to apply to __m256
Chris@10 206 parameters as well. */
Chris@10 207 #define STN4(x, v0, v1, v2, v3, ovs) \
Chris@10 208 { \
Chris@10 209 V xxx0, xxx1, xxx2, xxx3; \
Chris@10 210 xxx0 = _mm256_unpacklo_pd(v0, v1); \
Chris@10 211 xxx1 = _mm256_unpackhi_pd(v0, v1); \
Chris@10 212 xxx2 = _mm256_unpacklo_pd(v2, v3); \
Chris@10 213 xxx3 = _mm256_unpackhi_pd(v2, v3); \
Chris@10 214 STA(x, _mm256_permute2f128_pd(xxx0, xxx2, 0x20), 0, 0); \
Chris@10 215 STA(x + ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x20), 0, 0); \
Chris@10 216 STA(x + 2 * ovs, _mm256_permute2f128_pd(xxx0, xxx2, 0x31), 0, 0); \
Chris@10 217 STA(x + 3 * ovs, _mm256_permute2f128_pd(xxx1, xxx3, 0x31), 0, 0); \
Chris@10 218 }
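/* The double-precision STN4 performs the analogous 4x4 transpose:
   the unpacks interleave elements within each 128-bit lane and
   _mm256_permute2f128_pd recombines the lanes, so the row stored at
   x + j*ovs ends up holding {v0[j], v1[j], v2[j], v3[j]} for j = 0..3. */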
Chris@10 219 #endif
Chris@10 220
Chris@10 221 static inline V FLIP_RI(V x)
Chris@10 222 {
Chris@10 223 return VSHUF(x, x,
Chris@10 224 DS(SHUFVALD(1, 0),
Chris@10 225 SHUFVALS(1, 0, 3, 2)));
Chris@10 226 }
Chris@10 227
Chris@10 228 static inline V VCONJ(V x)
Chris@10 229 {
Chris@10 230 V pmpm = VLIT(-0.0, 0.0);
Chris@10 231 return VXOR(pmpm, x);
Chris@10 232 }
Chris@10 233
Chris@10 234 static inline V VBYI(V x)
Chris@10 235 {
Chris@10 236 return FLIP_RI(VCONJ(x));
Chris@10 237 }
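/* Per complex element: FLIP_RI swaps the real and imaginary slots,
   VCONJ flips the sign bit of the imaginary slot (VLIT(-0.0, 0.0)
   places -0.0 in the odd positions, since _mm256_set_pd/_mm256_set_ps
   list elements from the highest index down), and VBYI(x) therefore
   computes i*x, i.e. {-im, re}. */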
Chris@10 238
Chris@10 239 /* FMA support */
Chris@10 240 #define VFMA(a, b, c) VADD(c, VMUL(a, b))
Chris@10 241 #define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
Chris@10 242 #define VFMS(a, b, c) VSUB(VMUL(a, b), c)
Chris@10 243 #define VFMAI(b, c) VADD(c, VBYI(b))
Chris@10 244 #define VFNMSI(b, c) VSUB(c, VBYI(b))
Chris@10 245 #define VFMACONJ(b,c) VADD(VCONJ(b),c)
Chris@10 246 #define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
Chris@10 247 #define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
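/* Plain AVX has no fused multiply-add instruction, so these macros
   spell the fused forms out with separate VMUL and VADD/VSUB:
   VFMA(a,b,c) = a*b + c, VFMS(a,b,c) = a*b - c, VFNMS(a,b,c) = c - a*b.
   The complex-flavoured variants read as VFMAI(b,c) = c + i*b,
   VFNMSI(b,c) = c - i*b, VFMACONJ(b,c) = conj(b) + c, and so on. */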
Chris@10 248
Chris@10 249 static inline V VZMUL(V tx, V sr)
Chris@10 250 {
Chris@10 251 V tr = VDUPL(tx);
Chris@10 252 V ti = VDUPH(tx);
Chris@10 253 tr = VMUL(sr, tr);
Chris@10 254 sr = VBYI(sr);
Chris@10 255 return VFMA(ti, sr, tr);
Chris@10 256 }
Chris@10 257
Chris@10 258 static inline V VZMULJ(V tx, V sr)
Chris@10 259 {
Chris@10 260 V tr = VDUPL(tx);
Chris@10 261 V ti = VDUPH(tx);
Chris@10 262 tr = VMUL(sr, tr);
Chris@10 263 sr = VBYI(sr);
Chris@10 264 return VFNMS(ti, sr, tr);
Chris@10 265 }
Chris@10 266
Chris@10 267 static inline V VZMULI(V tx, V sr)
Chris@10 268 {
Chris@10 269 V tr = VDUPL(tx);
Chris@10 270 V ti = VDUPH(tx);
Chris@10 271 ti = VMUL(ti, sr);
Chris@10 272 sr = VBYI(sr);
Chris@10 273 return VFMS(tr, sr, ti);
Chris@10 274 }
Chris@10 275
Chris@10 276 static inline V VZMULIJ(V tx, V sr)
Chris@10 277 {
Chris@10 278 V tr = VDUPL(tx);
Chris@10 279 V ti = VDUPH(tx);
Chris@10 280 ti = VMUL(ti, sr);
Chris@10 281 sr = VBYI(sr);
Chris@10 282 return VFMA(tr, sr, ti);
Chris@10 283 }
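/* In complex-arithmetic terms, per complex element: VZMUL(t,s) = t*s,
   VZMULJ(t,s) = conj(t)*s, VZMULI(t,s) = i*(t*s), and
   VZMULIJ(t,s) = i*(conj(t)*s).  A scalar sketch of the basic case,
   kept under #if 0 and using a hypothetical struct that is not part
   of the real API: */
#if 0
typedef struct { R re, im; } cplx_sketch;
static inline cplx_sketch zmul_sketch(cplx_sketch t, cplx_sketch s)
{
     /* what VZMUL computes for each of the VL complex slots */
     cplx_sketch y;
     y.re = t.re * s.re - t.im * s.im;
     y.im = t.re * s.im + t.im * s.re;
     return y;
}
#endif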
Chris@10 284
Chris@10 285 /* twiddle storage #1: compact, slower */
Chris@10 286 #ifdef FFTW_SINGLE
Chris@10 287 # define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}, {TW_CEXP, v+2, x}, {TW_CEXP, v+3, x}
Chris@10 288 #else
Chris@10 289 # define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
Chris@10 290 #endif
Chris@10 291 #define TWVL1 (VL)
Chris@10 292
Chris@10 293 static inline V BYTW1(const R *t, V sr)
Chris@10 294 {
Chris@10 295 return VZMUL(LDA(t, 2, t), sr);
Chris@10 296 }
Chris@10 297
Chris@10 298 static inline V BYTWJ1(const R *t, V sr)
Chris@10 299 {
Chris@10 300 return VZMULJ(LDA(t, 2, t), sr);
Chris@10 301 }
Chris@10 302
Chris@10 303 /* twiddle storage #2: twice the space, faster (when in cache) */
Chris@10 304 #ifdef FFTW_SINGLE
Chris@10 305 # define VTW2(v,x) \
Chris@10 306 {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
Chris@10 307 {TW_COS, v+2, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, {TW_COS, v+3, x}, \
Chris@10 308 {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}, \
Chris@10 309 {TW_SIN, v+2, -x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, -x}, {TW_SIN, v+3, x}
Chris@10 310 #else
Chris@10 311 # define VTW2(v,x) \
Chris@10 312 {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
Chris@10 313 {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
Chris@10 314 #endif
Chris@10 315 #define TWVL2 (2 * VL)
Chris@10 316
Chris@10 317 static inline V BYTW2(const R *t, V sr)
Chris@10 318 {
Chris@10 319 const V *twp = (const V *)t;
Chris@10 320 V si = FLIP_RI(sr);
Chris@10 321 V tr = twp[0], ti = twp[1];
Chris@10 322 return VFMA(tr, sr, VMUL(ti, si));
Chris@10 323 }
Chris@10 324
Chris@10 325 static inline V BYTWJ2(const R *t, V sr)
Chris@10 326 {
Chris@10 327 const V *twp = (const V *)t;
Chris@10 328 V si = FLIP_RI(sr);
Chris@10 329 V tr = twp[0], ti = twp[1];
Chris@10 330 return VFNMS(ti, si, VMUL(tr, sr));
Chris@10 331 }
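/* With storage #2, the twiddle factors for one vector occupy two
   consecutive vectors: twp[0] holds each cosine duplicated into both
   slots of its pair and twp[1] holds {-sin, +sin} pairs.  BYTW2 then
   evaluates tr*sr + ti*FLIP_RI(sr), which per element is
   {c*x - s*y, c*y + s*x}, i.e. multiplication by the twiddle
   (c + i*s); BYTWJ2 gives the conjugate-twiddle product. */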
Chris@10 332
Chris@10 333 /* twiddle storage #3 */
Chris@10 334 #define VTW3 VTW1
Chris@10 335 #define TWVL3 TWVL1
Chris@10 336
Chris@10 337 /* twiddle storage for split arrays */
Chris@10 338 #ifdef FFTW_SINGLE
Chris@10 339 # define VTWS(v,x) \
Chris@10 340 {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
Chris@10 341 {TW_COS, v+4, x}, {TW_COS, v+5, x}, {TW_COS, v+6, x}, {TW_COS, v+7, x}, \
Chris@10 342 {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}, \
Chris@10 343 {TW_SIN, v+4, x}, {TW_SIN, v+5, x}, {TW_SIN, v+6, x}, {TW_SIN, v+7, x}
Chris@10 344 #else
Chris@10 345 # define VTWS(v,x) \
Chris@10 346 {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
Chris@10 347 {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
Chris@10 348 #endif
Chris@10 349 #define TWVLS (2 * VL)
Chris@10 350
Chris@10 351
Chris@10 352 /* Use VZEROUPPER to avoid the penalty of switching from AVX to SSE.
Chris@10 353 See Intel Optimization Manual (April 2011, version 248966), Section
Chris@10 354 11.3 */
Chris@10 355 #define VLEAVE _mm256_zeroupper
Chris@10 356
Chris@10 357 #include "simd-common.h"