annotate src/fftw-3.3.5/simd-support/simd-generic128.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Generic128d added by Romain Dolbeau, and turned into simd-generic128.h
 * with single & double precision by Erik Lindahl.
 * Romain Dolbeau hereby places his modifications in the public domain.
 * Erik Lindahl hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */


#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd128 only works in single or double precision"
#endif

#define SIMD_SUFFIX _generic_simd128 /* for renaming */

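/*
 * DS(d,s) selects the double- or single-precision variant of an expression.
 * VDUPL duplicates the even-indexed element of each adjacent pair and VDUPH
 * the odd-indexed one; with the usual interleaved (re, im) complex layout
 * this broadcasts the real and imaginary parts respectively, e.g. in double
 * precision {re, im} -> {re, re} and {im, im}.  DVK declares a vector
 * constant with every slot equal to val.
 */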
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) (V){x[0],x[0],x[2],x[2]}
# define VDUPH(x) (V){x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) (V){x[0],x[0]}
# define VDUPH(x) (V){x[1],x[1]}
# define DVK(var, val) V var = {val, val}
#endif

#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

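/*
 * V relies on the GCC/Clang vector_size extension: a 16-byte vector holds
 * two doubles or four floats, and the ordinary +, -, * operators used by
 * VADD/VSUB/VMUL below are applied element-wise by the compiler.
 */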
typedef DS(double,float) V __attribute__ ((vector_size(16)));

#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))


#define LDK(x) x

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}

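/*
 * LD loads VL complex numbers.  In double precision this is a plain
 * 16-byte load; in single precision the vector packs two complex numbers,
 * so the second one is gathered from x[ivs], x[ivs+1].
 */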
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     V res;
     res[0] = x[0];
     res[1] = x[1];
#ifdef FFTW_SINGLE
     res[2] = x[ivs];
     res[3] = x[ivs+1];
#endif
     return res;
}

#ifdef FFTW_SINGLE
/* ST has to be separate due to the storage hack requiring reverse order */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     *(x + ovs    ) = v[2];
     *(x + ovs + 1) = v[3];
     *(x          ) = v[0];
     *(x       + 1) = v[1];
}
#else
/* FFTW_DOUBLE */
# define ST STA
#endif

#ifdef FFTW_SINGLE
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

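/*
 * STN4 below scatters a 4x4 transpose: element r of each of v0..v3 is
 * written contiguously at x + r*ovs, i.e. row r receives
 * {v0[r], v1[r], v2[r], v3[r]}.
 */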
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     *(x)               = v0[0];
     *(x + 1)           = v1[0];
     *(x + 2)           = v2[0];
     *(x + 3)           = v3[0];
     *(x + ovs)         = v0[1];
     *(x + ovs + 1)     = v1[1];
     *(x + ovs + 2)     = v2[1];
     *(x + ovs + 3)     = v3[1];
     *(x + 2 * ovs)     = v0[2];
     *(x + 2 * ovs + 1) = v1[2];
     *(x + 2 * ovs + 2) = v2[2];
     *(x + 2 * ovs + 3) = v3[2];
     *(x + 3 * ovs)     = v0[3];
     *(x + 3 * ovs + 1) = v1[3];
     *(x + 3 * ovs + 2) = v2[3];
     *(x + 3 * ovs + 3) = v3[3];
}
#define STM4(x, v, ovs, aligned_like) /* no-op */


#else
/* FFTW_DOUBLE */

#define STM2 STA
#define STN2(x, v0, v1, ovs) /* nop */

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     *(x)     = v[0];
     *(x+ovs) = v[1];
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif


static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
     return (V){x[1],x[0],x[3],x[2]};
#else
     return (V){x[1],x[0]};
#endif
}

static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
     return (V){x[0],-x[1],x[2],-x[3]};
#else
     return (V){x[0],-x[1]};
#endif
}

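/*
 * VBYI(x) multiplies each complex element by i: conjugating and then
 * swapping the real/imaginary slots turns (a, b) into (-b, a).
 */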
static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

/* FMA support */
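/* No fused instructions in this generic port: VFMA/VFNMS/VFMS are emulated
   with a separate multiply and add/subtract, e.g. VFMA(a, b, c) = a*b + c. */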
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

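/*
 * Complex multiplication helpers.  With tx and sr holding complex numbers
 * as (re, im) pairs, these compute, per complex element:
 *   VZMUL(tx, sr)   = tx * sr
 *   VZMULJ(tx, sr)  = conj(tx) * sr
 *   VZMULI(tx, sr)  = i * tx * sr
 *   VZMULIJ(tx, sr) = i * conj(tx) * sr
 * built from the VDUPL/VDUPH broadcasts of tx and VBYI(sr) = i*sr.
 */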
static inline V VZMUL(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = VDUPL(tx);
     V ti = VDUPH(tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
     {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
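/*
 * Twiddle storage #2 keeps a full vector of cosines followed by a full
 * vector of sines with alternating signs ({-sin, sin, ...}), so BYTW2 can
 * form the complex product as tr*sr + ti*FLIP_RI(sr), and BYTWJ2 the
 * conjugate product as tr*sr - ti*FLIP_RI(sr), with no shuffle beyond
 * FLIP_RI.
 */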
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}
static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
     {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"