sv-dependency-builds: src/fftw-3.3.5/simd-support/simd-vsx.h annotate

annotate src/fftw-3.3.5/simd-support/simd-vsx.h @ 148:b4bfdf10c4b3

Update Win64 capnp builds to v0.6

author	Chris Cannam <cannam@all-day-breakfast.com>
date	Mon, 22 May 2017 18:56:49 +0100
parents	7867fa7e1b6b
children

rev	line source
cannam@127	1 /*
cannam@127	2 * Copyright (c) 2003, 2007-14 Matteo Frigo
cannam@127	3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
cannam@127	4 *
cannam@127	5 * VSX SIMD implementation added 2015 Erik Lindahl.
cannam@127	6 * Erik Lindahl places his modifications in the public domain.
cannam@127	7 *
cannam@127	8 * This program is free software; you can redistribute it and/or modify
cannam@127	9 * it under the terms of the GNU General Public License as published by
cannam@127	10 * the Free Software Foundation; either version 2 of the License, or
cannam@127	11 * (at your option) any later version.
cannam@127	12 *
cannam@127	13 * This program is distributed in the hope that it will be useful,
cannam@127	14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@127	15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@127	16 * GNU General Public License for more details.
cannam@127	17 *
cannam@127	18 * You should have received a copy of the GNU General Public License
cannam@127	19 * along with this program; if not, write to the Free Software
cannam@127	20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@127	21 *
cannam@127	22 */
cannam@127	23
cannam@127	24 #if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
cannam@127	25 # error "VSX only works in single or double precision"
cannam@127	26 #endif
cannam@127	27
cannam@127	28 #ifdef FFTW_SINGLE
cannam@127	29 # define DS(d,s) s /* single-precision option */
cannam@127	30 # define SUFF(name) name ## s
cannam@127	31 #else
cannam@127	32 # define DS(d,s) d /* double-precision option */
cannam@127	33 # define SUFF(name) name ## d
cannam@127	34 #endif
cannam@127	35
cannam@127	36 #define SIMD_SUFFIX _vsx /* for renaming */
cannam@127	37 #define VL DS(1,2) /* SIMD vector length, in terms of complex numbers */
cannam@127	38 #define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
cannam@127	39 #define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
cannam@127	40
cannam@127	41 #include <altivec.h>
cannam@127	42 #include <stdio.h>
cannam@127	43
cannam@127	44 typedef DS(vector double,vector float) V;
cannam@127	45
cannam@127	46 #define VADD(a,b) vec_add(a,b)
cannam@127	47 #define VSUB(a,b) vec_sub(a,b)
cannam@127	48 #define VMUL(a,b) vec_mul(a,b)
cannam@127	49 #define VXOR(a,b) vec_xor(a,b)
cannam@127	50 #define UNPCKL(a,b) vec_mergel(a,b)
cannam@127	51 #define UNPCKH(a,b) vec_mergeh(a,b)
cannam@127	52 #ifdef FFTW_SINGLE
cannam@127	53 # define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}; vec_perm(a,a,perm); })
cannam@127	54 # define VDUPH(a) ({ const vector unsigned char perm = {4,5,6,7,4,5,6,7,12,13,14,15,12,13,14,15}; vec_perm(a,a,perm); })
cannam@127	55 #else
cannam@127	56 # define VDUPL(a) ({ const vector unsigned char perm = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; vec_perm(a,a,perm); })
cannam@127	57 # define VDUPH(a) ({ const vector unsigned char perm = {8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15}; vec_perm(a,a,perm); })
cannam@127	58 #endif
cannam@127	59
cannam@127	60 static inline V LDK(R f) { return vec_splats(f); }
cannam@127	61
cannam@127	62 #define DVK(var, val) const R var = K(val)
cannam@127	63
cannam@127	64 static inline V VCONJ(V x)
cannam@127	65 {
cannam@127	66 const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
cannam@127	67 return vec_xor(x, pmpm);
cannam@127	68 }
cannam@127	69
cannam@127	70 static inline V LDA(const R *x, INT ivs, const R *aligned_like)
cannam@127	71 {
cannam@127	72 #ifdef __ibmxl__
cannam@127	73 return vec_xl(0,(DS(double,float) *)x);
cannam@127	74 #else
cannam@127	75 return (*(const V *)(x));
cannam@127	76 #endif
cannam@127	77 }
cannam@127	78
cannam@127	79 static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
cannam@127	80 {
cannam@127	81 #ifdef __ibmxl__
cannam@127	82 vec_xst(v,0,x);
cannam@127	83 #else
cannam@127	84 *(V *)x = v;
cannam@127	85 #endif
cannam@127	86 }
cannam@127	87
cannam@127	88 static inline V FLIP_RI(V x)
cannam@127	89 {
cannam@127	90 #ifdef FFTW_SINGLE
cannam@127	91 const vector unsigned char perm = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 };
cannam@127	92 #else
cannam@127	93 const vector unsigned char perm = { 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 };
cannam@127	94 #endif
cannam@127	95 return vec_perm(x,x,perm);
cannam@127	96 }
cannam@127	97
cannam@127	98 #ifdef FFTW_SINGLE
cannam@127	99
cannam@127	100 static inline V LD(const R *x, INT ivs, const R *aligned_like)
cannam@127	101 {
cannam@127	102 const vector unsigned char perm = {0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
cannam@127	103
cannam@127	104 return vec_perm((vector float)vec_splats(*(double *)(x)),
cannam@127	105 (vector float)vec_splats(*(double *)(x+ivs)),perm);
cannam@127	106 }
cannam@127	107
cannam@127	108 static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
cannam@127	109 {
cannam@127	110 *(double *)(x+ovs) = vec_extract( (vector double)v, 1 );
cannam@127	111 *(double *)x = vec_extract( (vector double)v, 0 );
cannam@127	112 }
cannam@127	113 #else
cannam@127	114 /* DOUBLE */
cannam@127	115
cannam@127	116 # define LD LDA
cannam@127	117 # define ST STA
cannam@127	118
cannam@127	119 #endif
cannam@127	120
cannam@127	121 #define STM2 DS(STA,ST)
cannam@127	122 #define STN2(x, v0, v1, ovs) /* nop */
cannam@127	123
cannam@127	124 #ifdef FFTW_SINGLE
cannam@127	125
cannam@127	126 # define STM4(x, v, ovs, aligned_like) /* no-op */
cannam@127	127 static inline void STN4(R *x, V v0, V v1, V v2, V v3, int ovs)
cannam@127	128 {
cannam@127	129 V xxx0, xxx1, xxx2, xxx3;
cannam@127	130 xxx0 = vec_mergeh(v0,v1);
cannam@127	131 xxx1 = vec_mergel(v0,v1);
cannam@127	132 xxx2 = vec_mergeh(v2,v3);
cannam@127	133 xxx3 = vec_mergel(v2,v3);
cannam@127	134 *(double *)x = vec_extract( (vector double)xxx0, 0 );
cannam@127	135 *(double *)(x+ovs) = vec_extract( (vector double)xxx0, 1 );
cannam@127	136 *(double *)(x+2*ovs) = vec_extract( (vector double)xxx1, 0 );
cannam@127	137 *(double *)(x+3*ovs) = vec_extract( (vector double)xxx1, 1 );
cannam@127	138 *(double *)(x+2) = vec_extract( (vector double)xxx2, 0 );
cannam@127	139 *(double *)(x+ovs+2) = vec_extract( (vector double)xxx2, 1 );
cannam@127	140 *(double *)(x+2*ovs+2) = vec_extract( (vector double)xxx3, 0 );
cannam@127	141 *(double *)(x+3*ovs+2) = vec_extract( (vector double)xxx3, 1 );
cannam@127	142 }
cannam@127	143 #else /* !FFTW_SINGLE */
cannam@127	144
cannam@127	145 static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
cannam@127	146 {
cannam@127	147 (void)aligned_like; /* UNUSED */
cannam@127	148 x[0] = vec_extract(v,0);
cannam@127	149 x[ovs] = vec_extract(v,1);
cannam@127	150 }
cannam@127	151 # define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
cannam@127	152 #endif
cannam@127	153
cannam@127	154 static inline V VBYI(V x)
cannam@127	155 {
cannam@127	156 /* Complicated low-level stuff. vpermxor is really a cryptographic instruction that is only
cannam@127	157 * available in the low-level interface both for GCC and XLC. However, on little-endian
cannam@127	158 * platforms there is also the complicated swapping going on. XLC does this here too, but
cannam@127	159 * not GCC, so we need different permute constants.
cannam@127	160 */
cannam@127	161 #if defined(__POWER8_VECTOR__) && defined(__GNUC__) && defined(__LITTLE_ENDIAN__)
cannam@127	162 # ifdef FFTW_SINGLE
cannam@127	163 const vector unsigned char perm = { 0xbb, 0xaa, 0x99, 0x88, 0xff, 0xee, 0xdd, 0xcc, 0x33, 0x22, 0x11, 0x00, 0x77, 0x66, 0x55, 0x44 };
cannam@127	164 # else
cannam@127	165 const vector unsigned char perm = { 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88 };
cannam@127	166 # endif
cannam@127	167 const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
cannam@127	168 return (V)__builtin_crypto_vpermxor((vector unsigned char)x,(vector unsigned char)pmpm,perm);
cannam@127	169 #elif defined(__POWER8_VECTOR__) && (defined(__ibmxl__) || (defined(__GNUC__) && !defined(__LITTLE_ENDIAN__)))
cannam@127	170 # ifdef FFTW_SINGLE
cannam@127	171 const vector unsigned char perm = { 0x44, 0x55, 0x66, 0x77, 0x00, 0x11, 0x22, 0x33, 0xCC, 0xDD, 0xEE, 0xFF, 0x88, 0x99, 0xAA, 0xBB };
cannam@127	172 # else
cannam@127	173 const vector unsigned char perm = { 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 };
cannam@127	174 # endif
cannam@127	175 const V pmpm = vec_mergel(vec_splats((R)0.0),-(vec_splats((R)0.0)));
cannam@127	176 return (V)__vpermxor((vector unsigned char)x,(vector unsigned char)pmpm,perm);
cannam@127	177 #else
cannam@127	178 /* The safe option */
cannam@127	179 return FLIP_RI(VCONJ(x));
cannam@127	180 #endif
cannam@127	181 }
cannam@127	182
cannam@127	183 /* FMA support */
cannam@127	184 #define VFMA(a, b, c) vec_madd(a,b,c)
cannam@127	185 #define VFNMS(a, b, c) vec_nmsub(a,b,c)
cannam@127	186 #define VFMS(a, b, c) vec_msub(a,b,c)
cannam@127	187 #define VFMAI(b, c) VADD(c, VBYI(b))
cannam@127	188 #define VFNMSI(b, c) VSUB(c, VBYI(b))
cannam@127	189 #define VFMACONJ(b,c) VADD(VCONJ(b),c)
cannam@127	190 #define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
cannam@127	191 #define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
cannam@127	192
cannam@127	193 static inline V VZMUL(V tx, V sr)
cannam@127	194 {
cannam@127	195 V tr = VDUPL(tx);
cannam@127	196 V ti = VDUPH(tx);
cannam@127	197 tr = VMUL(sr, tr);
cannam@127	198 sr = VBYI(sr);
cannam@127	199 return VFMA(ti, sr, tr);
cannam@127	200 }
cannam@127	201
cannam@127	202 static inline V VZMULJ(V tx, V sr)
cannam@127	203 {
cannam@127	204 V tr = VDUPL(tx);
cannam@127	205 V ti = VDUPH(tx);
cannam@127	206 tr = VMUL(sr, tr);
cannam@127	207 sr = VBYI(sr);
cannam@127	208 return VFNMS(ti, sr, tr);
cannam@127	209 }
cannam@127	210
cannam@127	211 static inline V VZMULI(V tx, V sr)
cannam@127	212 {
cannam@127	213 V tr = VDUPL(tx);
cannam@127	214 V ti = VDUPH(tx);
cannam@127	215 ti = VMUL(ti, sr);
cannam@127	216 sr = VBYI(sr);
cannam@127	217 return VFMS(tr, sr, ti);
cannam@127	218 }
cannam@127	219
cannam@127	220 static inline V VZMULIJ(V tx, V sr)
cannam@127	221 {
cannam@127	222 V tr = VDUPL(tx);
cannam@127	223 V ti = VDUPH(tx);
cannam@127	224 ti = VMUL(ti, sr);
cannam@127	225 sr = VBYI(sr);
cannam@127	226 return VFMA(tr, sr, ti);
cannam@127	227 }
cannam@127	228
cannam@127	229 /* twiddle storage #1: compact, slower */
cannam@127	230 #ifdef FFTW_SINGLE
cannam@127	231 # define VTW1(v,x) \
cannam@127	232 {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
cannam@127	233 static inline V BYTW1(const R *t, V sr)
cannam@127	234 {
cannam@127	235 V tx = LDA(t,0,t);
cannam@127	236 V tr = UNPCKH(tx, tx);
cannam@127	237 V ti = UNPCKL(tx, tx);
cannam@127	238 tr = VMUL(tr, sr);
cannam@127	239 sr = VBYI(sr);
cannam@127	240 return VFMA(ti, sr, tr);
cannam@127	241 }
cannam@127	242 static inline V BYTWJ1(const R *t, V sr)
cannam@127	243 {
cannam@127	244 V tx = LDA(t,0,t);
cannam@127	245 V tr = UNPCKH(tx, tx);
cannam@127	246 V ti = UNPCKL(tx, tx);
cannam@127	247 tr = VMUL(tr, sr);
cannam@127	248 sr = VBYI(sr);
cannam@127	249 return VFNMS(ti, sr, tr);
cannam@127	250 }
cannam@127	251 #else /* !FFTW_SINGLE */
cannam@127	252 # define VTW1(v,x) {TW_CEXP, v, x}
cannam@127	253 static inline V BYTW1(const R *t, V sr)
cannam@127	254 {
cannam@127	255 V tx = LD(t, 1, t);
cannam@127	256 return VZMUL(tx, sr);
cannam@127	257 }
cannam@127	258 static inline V BYTWJ1(const R *t, V sr)
cannam@127	259 {
cannam@127	260 V tx = LD(t, 1, t);
cannam@127	261 return VZMULJ(tx, sr);
cannam@127	262 }
cannam@127	263 #endif
cannam@127	264 #define TWVL1 (VL)
cannam@127	265
cannam@127	266 /* twiddle storage #2: twice the space, faster (when in cache) */
cannam@127	267 #ifdef FFTW_SINGLE
cannam@127	268 # define VTW2(v,x) \
cannam@127	269 {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
cannam@127	270 {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
cannam@127	271 #else /* !FFTW_SINGLE */
cannam@127	272 # define VTW2(v,x) \
cannam@127	273 {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
cannam@127	274 #endif
cannam@127	275 #define TWVL2 (2 * VL)
cannam@127	276 static inline V BYTW2(const R *t, V sr)
cannam@127	277 {
cannam@127	278 V si = FLIP_RI(sr);
cannam@127	279 V ti = LDA(t+2*VL,0,t);
cannam@127	280 V tt = VMUL(ti, si);
cannam@127	281 V tr = LDA(t,0,t);
cannam@127	282 return VFMA(tr, sr, tt);
cannam@127	283 }
cannam@127	284 static inline V BYTWJ2(const R *t, V sr)
cannam@127	285 {
cannam@127	286 V si = FLIP_RI(sr);
cannam@127	287 V tr = LDA(t,0,t);
cannam@127	288 V tt = VMUL(tr, sr);
cannam@127	289 V ti = LDA(t+2*VL,0,t);
cannam@127	290 return VFNMS(ti, si, tt);
cannam@127	291 }
cannam@127	292
cannam@127	293 /* twiddle storage #3 */
cannam@127	294 #ifdef FFTW_SINGLE
cannam@127	295 # define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
cannam@127	296 # define TWVL3 (VL)
cannam@127	297 #else
cannam@127	298 # define VTW3(v,x) VTW1(v,x)
cannam@127	299 # define TWVL3 TWVL1
cannam@127	300 #endif
cannam@127	301
cannam@127	302 /* twiddle storage for split arrays */
cannam@127	303 #ifdef FFTW_SINGLE
cannam@127	304 # define VTWS(v,x) \
cannam@127	305 {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
cannam@127	306 {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
cannam@127	307 #else
cannam@127	308 # define VTWS(v,x) \
cannam@127	309 {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
cannam@127	310 #endif
cannam@127	311 #define TWVLS (2 * VL)
cannam@127	312
cannam@127	313 #define VLEAVE() /* nothing */
cannam@127	314
cannam@127	315 #include "simd-common.h"

Mercurial > hg > sv-dependency-builds

annotate src/fftw-3.3.5/simd-support/simd-vsx.h @ 148:b4bfdf10c4b3