annotate src/fftw-3.3.8/simd-support/simd-vsx.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * VSX SIMD implementation added 2015 Erik Lindahl.
 * Erik Lindahl places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "VSX only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## s
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## d
#endif

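/* Illustration (not part of the original source): DS() selects the
   double- or single-precision variant of an expression at preprocessing
   time, and SUFF() pastes the matching suffix onto a name.  With a
   hypothetical name "fn", when FFTW_SINGLE is defined:

       DS(1.0, 1.0f)  ->  1.0f
       SUFF(fn)       ->  fns

   and otherwise:

       DS(1.0, 1.0f)  ->  1.0
       SUFF(fn)       ->  fnd
*/
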
#define SIMD_SUFFIX _vsx /* for renaming */
#define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#include <altivec.h>
#include <stdio.h>

typedef DS(vector double,vector float) V;

#define VADD(a,b) vec_add(a,b)
#define VSUB(a,b) vec_sub(a,b)
#define VMUL(a,b) vec_mul(a,b)
#define VXOR(a,b) vec_xor(a,b)
#define UNPCKL(a,b) vec_mergel(a,b)
#define UNPCKH(a,b) vec_mergeh(a,b)
#ifdef FFTW_SINGLE
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {4,5,6,7,4,5,6,7,12,13,14,15,12,13,14,15}; vec_perm(a,a,perm); })
#else
# define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; vec_perm(a,a,perm); })
# define VDUPH(a) ({ const vector unsigned char perm = {8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15}; vec_perm(a,a,perm); })
#endif

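/* Worked example (added for clarity): with FFTW_SINGLE, a V holds two
   complex numbers laid out as {r0, i0, r1, i1}, and the byte
   permutations above duplicate the real (low) or imaginary (high)
   element of each pair:

       VDUPL({r0, i0, r1, i1}) == {r0, r0, r1, r1}
       VDUPH({r0, i0, r1, i1}) == {i0, i0, i1, i1}

   In double precision a V holds one complex number {re, im}, and
   VDUPL/VDUPH splat its real/imaginary half across the vector. */
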
static inline V LDK(R f) { return vec_splats(f); }

#define DVK(var, val) const R var = K(val)

static inline V VCONJ(V x)
{
    const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
    return vec_xor(x, pmpm);
}

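/* Note (added for clarity): vec_mergel of splatted +0.0 and -0.0 builds
   a mask whose imaginary lanes are -0.0, i.e. only their sign bits are
   set, so the XOR above flips the sign of each imaginary part and
   conjugates every complex element without any arithmetic.  A sketch of
   the effect, assuming FFTW_SINGLE:

       V x = {1.0f, 2.0f, 3.0f, 4.0f};   (1+2i, 3+4i)
       V y = VCONJ(x);                   {1.0f, -2.0f, 3.0f, -4.0f}
*/
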
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
#ifdef __ibmxl__
    return vec_xl(0,(DS(double,float) *)x);
#else
    return (*(const V *)(x));
#endif
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
#ifdef __ibmxl__
    vec_xst(v,0,x);
#else
    *(V *)x = v;
#endif
}

static inline V FLIP_RI(V x)
{
#ifdef FFTW_SINGLE
    const vector unsigned char perm = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 };
#else
    const vector unsigned char perm = { 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 };
#endif
    return vec_perm(x,x,perm);
}

#ifdef FFTW_SINGLE

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
    const vector unsigned char perm = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};

    return vec_perm((vector float)vec_splats(*(double *)(x)),
                    (vector float)vec_splats(*(double *)(x+ivs)),perm);
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
    *(double *)(x+ovs) = vec_extract( (vector double)v, 1 );
    *(double *)x = vec_extract( (vector double)v, 0 );
}
#else
/* DOUBLE */

# define LD LDA
# define ST STA

#endif

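/* Note (added for clarity): the single-precision LD/ST above move each
   complex number, i.e. two consecutive floats (8 bytes), as a single
   double: LD splats the pairs at x and at x+ivs into two vectors and
   merges one pair from each with vec_perm, so the result holds the two
   complex numbers {x[0], x[1], x[ivs], x[ivs+1]}; ST reverses this
   with two 8-byte scalar stores.  This reduces strided complex access
   to ordinary scalar loads/stores plus one permute. */
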
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE

# define STM4(x, v, ovs, aligned_like) /* no-op */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
    V xxx0, xxx1, xxx2, xxx3;
    xxx0 = vec_mergeh(v0,v1);
    xxx1 = vec_mergel(v0,v1);
    xxx2 = vec_mergeh(v2,v3);
    xxx3 = vec_mergel(v2,v3);
    *(double *)x = vec_extract( (vector double)xxx0, 0 );
    *(double *)(x+ovs) = vec_extract( (vector double)xxx0, 1 );
    *(double *)(x+2*ovs) = vec_extract( (vector double)xxx1, 0 );
    *(double *)(x+3*ovs) = vec_extract( (vector double)xxx1, 1 );
    *(double *)(x+2) = vec_extract( (vector double)xxx2, 0 );
    *(double *)(x+ovs+2) = vec_extract( (vector double)xxx2, 1 );
    *(double *)(x+2*ovs+2) = vec_extract( (vector double)xxx3, 0 );
    *(double *)(x+3*ovs+2) = vec_extract( (vector double)xxx3, 1 );
}
#else /* !FFTW_SINGLE */

static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
    (void)aligned_like; /* UNUSED */
    x[0] = vec_extract(v,0);
    x[ovs] = vec_extract(v,1);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

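/* Note (added for clarity): the single-precision STN4 above is a 4x4
   real transpose: the mergeh/mergel pairs interleave v0..v3 so that
   each half of xxx0..xxx3 holds one two-float column pair, which is
   then written to its strided row with a single 8-byte store through
   the (vector double) reinterpretation, mirroring the LD/ST trick. */
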
static inline V VBYI(V x)
{
    /* FIXME [matteof 2017-09-21] It is possible to use vpermxor(),
       but gcc and xlc treat the permutation bits differently, and
       gcc-6 seems to generate incorrect code when using
       __builtin_crypto_vpermxor() (i.e., VBYI() works for a small
       test case but fails in the large).

       Punt on vpermxor() for now and do the simple thing.
    */
    return FLIP_RI(VCONJ(x));
}

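/* Worked example (added for clarity): VBYI(x) computes i*x.
   Conjugating (r, im) gives (r, -im) and swapping real/imaginary parts
   gives (-im, r), which matches i*(r + i*im) = -im + i*r; e.g. VBYI of
   3+4i is -4+3i. */
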
/* FMA support */
#define VFMA(a, b, c) vec_madd(a,b,c)
#define VFNMS(a, b, c) vec_nmsub(a,b,c)
#define VFMS(a, b, c) vec_msub(a,b,c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V VZMUL(V tx, V sr)
{
    V tr = VDUPL(tx);
    V ti = VDUPH(tx);
    tr = VMUL(sr, tr);
    sr = VBYI(sr);
    return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
    V tr = VDUPL(tx);
    V ti = VDUPH(tx);
    tr = VMUL(sr, tr);
    sr = VBYI(sr);
    return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
    V tr = VDUPL(tx);
    V ti = VDUPH(tx);
    ti = VMUL(ti, sr);
    sr = VBYI(sr);
    return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
    V tr = VDUPL(tx);
    V ti = VDUPH(tx);
    ti = VMUL(ti, sr);
    sr = VBYI(sr);
    return VFMA(tr, sr, ti);
}

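/* Derivation (added for clarity): the four helpers above all follow
   from the identity t*s = Re(t)*s + Im(t)*(i*s), with the duplicated
   real/imaginary parts tr = Re(t) and ti = Im(t) coming from
   VDUPL/VDUPH and i*s from VBYI:

       VZMUL(t,s)   = t*s          = VFMA(ti, i*s, tr*s)
       VZMULJ(t,s)  = conj(t)*s    = VFNMS(ti, i*s, tr*s)
       VZMULI(t,s)  = i*t*s        = VFMS(tr, i*s, ti*s)
       VZMULIJ(t,s) = i*conj(t)*s  = VFMA(tr, i*s, ti*s)
*/
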
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
# define VTW1(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
static inline V BYTW1(const R *t, V sr)
{
    V tx = LDA(t,0,t);
    V tr = UNPCKH(tx, tx);
    V ti = UNPCKL(tx, tx);
    tr = VMUL(tr, sr);
    sr = VBYI(sr);
    return VFMA(ti, sr, tr);
}
static inline V BYTWJ1(const R *t, V sr)
{
    V tx = LDA(t,0,t);
    V tr = UNPCKH(tx, tx);
    V ti = UNPCKL(tx, tx);
    tr = VMUL(tr, sr);
    sr = VBYI(sr);
    return VFNMS(ti, sr, tr);
}
#else /* !FFTW_SINGLE */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
    V tx = LD(t, 1, t);
    return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
    V tx = LD(t, 1, t);
    return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL)

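/* Note (added for clarity): in single precision VTW1 stores each pair
   of twiddle factors as {cos0, cos1, sin0, sin1}, so UNPCKH/UNPCKL
   (vec_mergeh/vec_mergel of the vector with itself) yield the splatted
   real parts {cos0, cos0, cos1, cos1} and imaginary parts
   {sin0, sin0, sin1, sin1} directly; BYTW1/BYTWJ1 then apply the same
   multiply-by-t and multiply-by-conj(t) identities as VZMUL/VZMULJ. */
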
/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
static inline V BYTW2(const R *t, V sr)
{
    V si = FLIP_RI(sr);
    V ti = LDA(t+2*VL,0,t);
    V tt = VMUL(ti, si);
    V tr = LDA(t,0,t);
    return VFMA(tr, sr, tt);
}
static inline V BYTWJ2(const R *t, V sr)
{
    V si = FLIP_RI(sr);
    V tr = LDA(t,0,t);
    V tt = VMUL(tr, sr);
    V ti = LDA(t+2*VL,0,t);
    return VFNMS(ti, si, tt);
}

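/* Derivation (added for clarity): VTW2 trades space for speed by
   pre-duplicating the cosines and storing the sines with alternating
   signs {-sin, sin}.  For a twiddle t = c + i*s and one complex input
   pair sr = {x, y}:

       si = FLIP_RI(sr)       = {y, x}
       tt = {-s, s} * si      = {-s*y, s*x}
       VFMA({c, c}, sr, tt)   = {c*x - s*y, c*y + s*x} = t*sr

   so BYTW2 needs no VBYI/VCONJ at all, and BYTWJ2 obtains conj(t)*sr
   the same way via VFNMS. */
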
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"