annotate src/fftw-3.3.8/simd-support/simd-generic256.h @ 168:ceec0dd9ec9c

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam <cannam@all-day-breakfast.com>
date Fri, 07 Feb 2020 11:51:13 +0000
parents bd3cc4d1df30
children
rev   line source
cannam@167 1 /*
cannam@167 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
cannam@167 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
cannam@167 4 *
cannam@167 5 * Generic256d added by Romain Dolbeau, and turned into simd-generic256.h
cannam@167 6 * with single & double precision by Erik Lindahl.
cannam@167 7 * Romain Dolbeau hereby places his modifications in the public domain.
cannam@167 8 * Erik Lindahl hereby places his modifications in the public domain.
cannam@167 9 *
cannam@167 10 * This program is free software; you can redistribute it and/or modify
cannam@167 11 * it under the terms of the GNU General Public License as published by
cannam@167 12 * the Free Software Foundation; either version 2 of the License, or
cannam@167 13 * (at your option) any later version.
cannam@167 14 *
cannam@167 15 * This program is distributed in the hope that it will be useful,
cannam@167 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@167 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@167 18 * GNU General Public License for more details.
cannam@167 19 *
cannam@167 20 * You should have received a copy of the GNU General Public License
cannam@167 21 * along with this program; if not, write to the Free Software
cannam@167 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@167 23 *
cannam@167 24 */
cannam@167 25
/* This header only provides 32-byte vectors of float or double elements. */
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#  error "Generic simd256 only works in single or double precision"
#endif

#define SIMD_SUFFIX _generic_simd256  /* for renaming */

/*
 * Precision-dependent helpers:
 *   DS(d,s)       selects d in double precision and s in single precision.
 *   VDUPL(x)      brace initializer repeating every even-indexed element of x.
 *   VDUPH(x)      brace initializer repeating every odd-indexed element of x.
 *   DVK(var,val)  declares a V named var with every element set to val.
 *
 * The arguments of VDUPL, VDUPH and DVK are parenthesized so that an
 * expression argument (e.g. `p + 0`) still expands to the intended
 * subscript/initializer.
 */
#ifdef FFTW_SINGLE
#  define DS(d,s) s /* single-precision option */
#  define VDUPL(x) {(x)[0],(x)[0],(x)[2],(x)[2],(x)[4],(x)[4],(x)[6],(x)[6]}
#  define VDUPH(x) {(x)[1],(x)[1],(x)[3],(x)[3],(x)[5],(x)[5],(x)[7],(x)[7]}
#  define DVK(var, val) V var = {(val),(val),(val),(val),(val),(val),(val),(val)}
#else
#  define DS(d,s) d /* double-precision option */
#  define VDUPL(x) {(x)[0],(x)[0],(x)[2],(x)[2]}
#  define VDUPH(x) {(x)[1],(x)[1],(x)[3],(x)[3]}
#  define DVK(var, val) V var = {(val), (val), (val), (val)}
#endif

#define VL DS(2,4)         /* SIMD vector length, in term of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
cannam@167 47
cannam@167 48 typedef DS(double,float) V __attribute__ ((vector_size(32)));
cannam@167 49
cannam@167 50 #define VADD(a,b) ((a)+(b))
cannam@167 51 #define VSUB(a,b) ((a)-(b))
cannam@167 52 #define VMUL(a,b) ((a)*(b))
cannam@167 53
cannam@167 54 #define LDK(x) x
cannam@167 55
cannam@167 56 static inline V LDA(const R *x, INT ivs, const R *aligned_like)
cannam@167 57 {
cannam@167 58 V var;
cannam@167 59 (void)aligned_like; /* UNUSED */
cannam@167 60 return *(const V *)x;
cannam@167 61 }
cannam@167 62
cannam@167 63 static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
cannam@167 64 {
cannam@167 65 (void)aligned_like; /* UNUSED */
cannam@167 66 (void)ovs; /* UNUSED */
cannam@167 67 *(V *)x = v;
cannam@167 68 }
cannam@167 69
cannam@167 70 static inline V LD(const R *x, INT ivs, const R *aligned_like)
cannam@167 71 {
cannam@167 72 V var;
cannam@167 73 (void)aligned_like; /* UNUSED */
cannam@167 74 var[0] = x[0];
cannam@167 75 var[1] = x[1];
cannam@167 76 var[2] = x[ivs];
cannam@167 77 var[3] = x[ivs+1];
cannam@167 78 #ifdef FFTW_SINGLE
cannam@167 79 var[4] = x[2*ivs];
cannam@167 80 var[5] = x[2*ivs+1];
cannam@167 81 var[6] = x[3*ivs];
cannam@167 82 var[7] = x[3*ivs+1];
cannam@167 83 #endif
cannam@167 84 return var;
cannam@167 85 }
cannam@167 86
cannam@167 87
cannam@167 88 /* ST has to be separate due to the storage hack requiring reverse order */
cannam@167 89
cannam@167 90 static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
cannam@167 91 {
cannam@167 92 (void)aligned_like; /* UNUSED */
cannam@167 93 #ifdef FFTW_SINGLE
cannam@167 94 *(x + 3*ovs ) = v[6];
cannam@167 95 *(x + 3*ovs + 1) = v[7];
cannam@167 96 *(x + 2*ovs ) = v[4];
cannam@167 97 *(x + 2*ovs + 1) = v[5];
cannam@167 98 *(x + ovs ) = v[2];
cannam@167 99 *(x + ovs + 1) = v[3];
cannam@167 100 *(x ) = v[0];
cannam@167 101 *(x + 1) = v[1];
cannam@167 102 #else
cannam@167 103 *(x + ovs ) = v[2];
cannam@167 104 *(x + ovs + 1) = v[3];
cannam@167 105 *(x ) = v[0];
cannam@167 106 *(x + 1) = v[1];
cannam@167 107 #endif
cannam@167 108 }
cannam@167 109
cannam@167 110 #ifdef FFTW_SINGLE
cannam@167 111 #define STM2(x, v, ovs, a) /* no-op */
cannam@167 112 static inline void STN2(R *x, V v0, V v1, INT ovs)
cannam@167 113 {
cannam@167 114 x[ 0] = v0[0];
cannam@167 115 x[ 1] = v0[1];
cannam@167 116 x[ 2] = v1[0];
cannam@167 117 x[ 3] = v1[1];
cannam@167 118 x[ ovs ] = v0[2];
cannam@167 119 x[ ovs + 1] = v0[3];
cannam@167 120 x[ ovs + 2] = v1[2];
cannam@167 121 x[ ovs + 3] = v1[3];
cannam@167 122 x[2*ovs ] = v0[4];
cannam@167 123 x[2*ovs + 1] = v0[5];
cannam@167 124 x[2*ovs + 2] = v1[4];
cannam@167 125 x[2*ovs + 3] = v1[5];
cannam@167 126 x[3*ovs ] = v0[6];
cannam@167 127 x[3*ovs + 1] = v0[7];
cannam@167 128 x[3*ovs + 2] = v1[6];
cannam@167 129 x[3*ovs + 3] = v1[7];
cannam@167 130 }
cannam@167 131
cannam@167 132 # define STM4(x, v, ovs, aligned_like) /* no-op */
cannam@167 133 static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
cannam@167 134 {
cannam@167 135 *(x ) = v0[0];
cannam@167 136 *(x + 1) = v1[0];
cannam@167 137 *(x + 2) = v2[0];
cannam@167 138 *(x + 3) = v3[0];
cannam@167 139 *(x + ovs ) = v0[1];
cannam@167 140 *(x + ovs + 1) = v1[1];
cannam@167 141 *(x + ovs + 2) = v2[1];
cannam@167 142 *(x + ovs + 3) = v3[1];
cannam@167 143 *(x + 2 * ovs ) = v0[2];
cannam@167 144 *(x + 2 * ovs + 1) = v1[2];
cannam@167 145 *(x + 2 * ovs + 2) = v2[2];
cannam@167 146 *(x + 2 * ovs + 3) = v3[2];
cannam@167 147 *(x + 3 * ovs ) = v0[3];
cannam@167 148 *(x + 3 * ovs + 1) = v1[3];
cannam@167 149 *(x + 3 * ovs + 2) = v2[3];
cannam@167 150 *(x + 3 * ovs + 3) = v3[3];
cannam@167 151 *(x + 4 * ovs ) = v0[4];
cannam@167 152 *(x + 4 * ovs + 1) = v1[4];
cannam@167 153 *(x + 4 * ovs + 2) = v2[4];
cannam@167 154 *(x + 4 * ovs + 3) = v3[4];
cannam@167 155 *(x + 5 * ovs ) = v0[5];
cannam@167 156 *(x + 5 * ovs + 1) = v1[5];
cannam@167 157 *(x + 5 * ovs + 2) = v2[5];
cannam@167 158 *(x + 5 * ovs + 3) = v3[5];
cannam@167 159 *(x + 6 * ovs ) = v0[6];
cannam@167 160 *(x + 6 * ovs + 1) = v1[6];
cannam@167 161 *(x + 6 * ovs + 2) = v2[6];
cannam@167 162 *(x + 6 * ovs + 3) = v3[6];
cannam@167 163 *(x + 7 * ovs ) = v0[7];
cannam@167 164 *(x + 7 * ovs + 1) = v1[7];
cannam@167 165 *(x + 7 * ovs + 2) = v2[7];
cannam@167 166 *(x + 7 * ovs + 3) = v3[7];
cannam@167 167 }
cannam@167 168
cannam@167 169 #else
cannam@167 170 /* FFTW_DOUBLE */
cannam@167 171
cannam@167 172 #define STM2 ST
cannam@167 173 #define STN2(x, v0, v1, ovs) /* nop */
cannam@167 174 #define STM4(x, v, ovs, aligned_like) /* no-op */
cannam@167 175
cannam@167 176 static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs) {
cannam@167 177 *(x ) = v0[0];
cannam@167 178 *(x + 1) = v1[0];
cannam@167 179 *(x + 2) = v2[0];
cannam@167 180 *(x + 3) = v3[0];
cannam@167 181 *(x + ovs ) = v0[1];
cannam@167 182 *(x + ovs + 1) = v1[1];
cannam@167 183 *(x + ovs + 2) = v2[1];
cannam@167 184 *(x + ovs + 3) = v3[1];
cannam@167 185 *(x + 2 * ovs ) = v0[2];
cannam@167 186 *(x + 2 * ovs + 1) = v1[2];
cannam@167 187 *(x + 2 * ovs + 2) = v2[2];
cannam@167 188 *(x + 2 * ovs + 3) = v3[2];
cannam@167 189 *(x + 3 * ovs ) = v0[3];
cannam@167 190 *(x + 3 * ovs + 1) = v1[3];
cannam@167 191 *(x + 3 * ovs + 2) = v2[3];
cannam@167 192 *(x + 3 * ovs + 3) = v3[3];
cannam@167 193 }
cannam@167 194 #endif
cannam@167 195
cannam@167 196 static inline V FLIP_RI(V x)
cannam@167 197 {
cannam@167 198 #ifdef FFTW_SINGLE
cannam@167 199 return (V){x[1],x[0],x[3],x[2],x[5],x[4],x[7],x[6]};
cannam@167 200 #else
cannam@167 201 return (V){x[1],x[0],x[3],x[2]};
cannam@167 202 #endif
cannam@167 203 }
cannam@167 204
cannam@167 205 static inline V VCONJ(V x)
cannam@167 206 {
cannam@167 207 #ifdef FFTW_SINGLE
cannam@167 208 return (x * (V){1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0});
cannam@167 209 #else
cannam@167 210 return (x * (V){1.0,-1.0,1.0,-1.0});
cannam@167 211 #endif
cannam@167 212 }
cannam@167 213
cannam@167 214 static inline V VBYI(V x)
cannam@167 215 {
cannam@167 216 return FLIP_RI(VCONJ(x));
cannam@167 217 }
cannam@167 218
/*
 * FMA support.  Every helper below is composed from VADD/VSUB/VMUL (plus
 * VBYI/VCONJ for the complex variants).
 */

/* real multiply-add family */
#define VFMA(a, b, c)   VADD(c, VMUL(a, b))   /*  a*b + c */
#define VFNMS(a, b, c)  VSUB(c, VMUL(a, b))   /*  c - a*b */
#define VFMS(a, b, c)   VSUB(VMUL(a, b), c)   /*  a*b - c */

/* add/subtract VBYI(b) */
#define VFMAI(b, c)     VADD(c, VBYI(b))      /*  c + VBYI(b) */
#define VFNMSI(b, c)    VSUB(c, VBYI(b))      /*  c - VBYI(b) */

/* add/subtract VCONJ(b) */
#define VFMACONJ(b,c)   VADD(VCONJ(b),c)      /*  VCONJ(b) + c */
#define VFMSCONJ(b,c)   VSUB(VCONJ(b),c)      /*  VCONJ(b) - c */
#define VFNMSCONJ(b,c)  VSUB(c, VCONJ(b))     /*  c - VCONJ(b) */
cannam@167 228
cannam@167 229 static inline V VZMUL(V tx, V sr)
cannam@167 230 {
cannam@167 231 V tr = VDUPL(tx);
cannam@167 232 V ti = VDUPH(tx);
cannam@167 233 tr = VMUL(sr, tr);
cannam@167 234 sr = VBYI(sr);
cannam@167 235 return VFMA(ti, sr, tr);
cannam@167 236 }
cannam@167 237
cannam@167 238 static inline V VZMULJ(V tx, V sr)
cannam@167 239 {
cannam@167 240 V tr = VDUPL(tx);
cannam@167 241 V ti = VDUPH(tx);
cannam@167 242 tr = VMUL(sr, tr);
cannam@167 243 sr = VBYI(sr);
cannam@167 244 return VFNMS(ti, sr, tr);
cannam@167 245 }
cannam@167 246
cannam@167 247 static inline V VZMULI(V tx, V sr)
cannam@167 248 {
cannam@167 249 V tr = VDUPL(tx);
cannam@167 250 V ti = VDUPH(tx);
cannam@167 251 ti = VMUL(ti, sr);
cannam@167 252 sr = VBYI(sr);
cannam@167 253 return VFMS(tr, sr, ti);
cannam@167 254 }
cannam@167 255
cannam@167 256 static inline V VZMULIJ(V tx, V sr)
cannam@167 257 {
cannam@167 258 V tr = VDUPL(tx);
cannam@167 259 V ti = VDUPH(tx);
cannam@167 260 ti = VMUL(ti, sr);
cannam@167 261 sr = VBYI(sr);
cannam@167 262 return VFMA(tr, sr, ti);
cannam@167 263 }
cannam@167 264
/*
 * twiddle storage #1: compact, slower
 *
 * VTW1(v,x) expands to one {TW_CEXP, v+k, x} initializer per complex slot
 * of a vector (k = 0..3 in single precision, k = 0..1 in double).  The
 * arguments are parenthesized so expression arguments keep their meaning.
 */
#ifdef FFTW_SINGLE
#  define VTW1(v,x) \
     {TW_CEXP, (v), (x)}, {TW_CEXP, (v)+1, (x)}, \
     {TW_CEXP, (v)+2, (x)}, {TW_CEXP, (v)+3, (x)}
#else
#  define VTW1(v,x) {TW_CEXP, (v), (x)}, {TW_CEXP, (v)+1, (x)}
#endif
#define TWVL1 (VL)
cannam@167 272
cannam@167 273 static inline V BYTW1(const R *t, V sr)
cannam@167 274 {
cannam@167 275 return VZMUL(LDA(t, 2, t), sr);
cannam@167 276 }
cannam@167 277
cannam@167 278 static inline V BYTWJ1(const R *t, V sr)
cannam@167 279 {
cannam@167 280 return VZMULJ(LDA(t, 2, t), sr);
cannam@167 281 }
cannam@167 282
/*
 * twiddle storage #2: twice the space, faster (when in cache)
 *
 * VTW2(v,x) first lists each TW_COS entry twice per index, then for each
 * index a TW_SIN entry with -x followed by one with +x.  The arguments are
 * parenthesized so expression arguments keep their meaning (in particular
 * the negation applies to the whole of x).
 */
#ifdef FFTW_SINGLE
#  define VTW2(v,x)                                                     \
     {TW_COS, (v), (x)},      {TW_COS, (v), (x)},                        \
     {TW_COS, (v)+1, (x)},    {TW_COS, (v)+1, (x)},                      \
     {TW_COS, (v)+2, (x)},    {TW_COS, (v)+2, (x)},                      \
     {TW_COS, (v)+3, (x)},    {TW_COS, (v)+3, (x)},                      \
     {TW_SIN, (v), -(x)},     {TW_SIN, (v), (x)},                        \
     {TW_SIN, (v)+1, -(x)},   {TW_SIN, (v)+1, (x)},                      \
     {TW_SIN, (v)+2, -(x)},   {TW_SIN, (v)+2, (x)},                      \
     {TW_SIN, (v)+3, -(x)},   {TW_SIN, (v)+3, (x)}
#else
#  define VTW2(v,x)                                                     \
     {TW_COS, (v), (x)},      {TW_COS, (v), (x)},                        \
     {TW_COS, (v)+1, (x)},    {TW_COS, (v)+1, (x)},                      \
     {TW_SIN, (v), -(x)},     {TW_SIN, (v), (x)},                        \
     {TW_SIN, (v)+1, -(x)},   {TW_SIN, (v)+1, (x)}
#endif
#define TWVL2 (2 * VL)
cannam@167 296
cannam@167 297 static inline V BYTW2(const R *t, V sr)
cannam@167 298 {
cannam@167 299 const V *twp = (const V *)t;
cannam@167 300 V si = FLIP_RI(sr);
cannam@167 301 V tr = twp[0], ti = twp[1];
cannam@167 302 return VFMA(tr, sr, VMUL(ti, si));
cannam@167 303 }
cannam@167 304
cannam@167 305 static inline V BYTWJ2(const R *t, V sr)
cannam@167 306 {
cannam@167 307 const V *twp = (const V *)t;
cannam@167 308 V si = FLIP_RI(sr);
cannam@167 309 V tr = twp[0], ti = twp[1];
cannam@167 310 return VFNMS(ti, si, VMUL(tr, sr));
cannam@167 311 }
cannam@167 312
/* twiddle storage #3: same table layout and length as scheme #1 */
#define VTW3 VTW1
#define TWVL3 TWVL1

/*
 * twiddle storage for split arrays
 *
 * VTWS(v,x) lists the TW_COS entries for indices v .. v+N-1 followed by the
 * TW_SIN entries for the same indices, with N = 8 in single precision and
 * N = 4 in double.  The arguments are parenthesized so expression arguments
 * keep their meaning.
 */
#ifdef FFTW_SINGLE
#  define VTWS(v,x)                                                          \
     {TW_COS, (v), (x)},   {TW_COS, (v)+1, (x)}, {TW_COS, (v)+2, (x)},        \
     {TW_COS, (v)+3, (x)}, {TW_COS, (v)+4, (x)}, {TW_COS, (v)+5, (x)},        \
     {TW_COS, (v)+6, (x)}, {TW_COS, (v)+7, (x)},                              \
     {TW_SIN, (v), (x)},   {TW_SIN, (v)+1, (x)}, {TW_SIN, (v)+2, (x)},        \
     {TW_SIN, (v)+3, (x)}, {TW_SIN, (v)+4, (x)}, {TW_SIN, (v)+5, (x)},        \
     {TW_SIN, (v)+6, (x)}, {TW_SIN, (v)+7, (x)}
#else
#  define VTWS(v,x)                                                          \
     {TW_COS, (v), (x)},   {TW_COS, (v)+1, (x)},                              \
     {TW_COS, (v)+2, (x)}, {TW_COS, (v)+3, (x)},                              \
     {TW_SIN, (v), (x)},   {TW_SIN, (v)+1, (x)},                              \
     {TW_SIN, (v)+2, (x)}, {TW_SIN, (v)+3, (x)}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */
cannam@167 332
cannam@167 333 #include "simd-common.h"