annotate src/fftw-3.3.5/simd-support/simd-altivec.h @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
/* An AltiVec register holds four 32-bit floats, i.e. two
   single-precision complex numbers; this header has no
   double-precision counterpart. */
#ifndef FFTW_SINGLE
#error "ALTIVEC only works in single precision"
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without altivec */
#define SIMD_SUFFIX _altivec /* for renaming */
#define VL 2 /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA

/* __VEC__ is defined by compilers when AltiVec code generation is
   enabled (-maltivec); FAKE__VEC__ is an escape hatch for toolchains
   that do not define it. */
#if !defined(__VEC__) && !defined(FAKE__VEC__)
# error "compiling simd-altivec.h requires -maltivec or equivalent"
#endif

#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
Chris@42 39
/* V is the basic SIMD type: one AltiVec register = 4 floats
   = VL (2) single-precision complex numbers. */
typedef vector float V;

/* literal four-lane vector constant */
#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
/* "load" a constant -- a no-op here, kept for interface symmetry with
   other simd-support headers */
#define LDK(x) x
/* declare a vector constant with all four lanes equal to val */
#define DVK(var, val) const V var = VLIT(val, val, val, val)
Chris@42 44
/* elementwise arithmetic: thin wrappers around AltiVec intrinsics */
static inline V VADD(V a, V b) { return vec_add(a, b); }          /* a + b */
static inline V VSUB(V a, V b) { return vec_sub(a, b); }          /* a - b */
static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); } /* a * b + c */
static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); } /* c - a * b */
Chris@42 49
/* a * b.  AltiVec has no plain float multiply, only fused
   multiply-add, so multiply by adding -0.0.  -0.0 (not +0.0) is the
   true additive identity under IEEE-754: (-0) + (-0) == -0 but
   (-0) + (+0) == +0, so +0.0 would corrupt the sign of zero
   products. */
static inline V VMUL(V a, V b)
{
     DVK(zero, -0.0);
     return VFMA(a, b, zero);
}
Chris@42 55
/* a * b - c, composed from VMUL and VSUB (no fused intrinsic used) */
static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }
Chris@42 57
/* aligned load of one vector from x.  ivs and aligned_like are part
   of the common SIMD load interface but unused in the aligned case.
   vec_ld requires x to be 16-byte aligned. */
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     UNUSED(ivs);
     UNUSED(aligned_like);
     return vec_ld(0, x);
}
Chris@42 64
/* Load two complex numbers, from x and x + ivs (ivs = input vector
   stride, counted in reals), into one vector.  The lvsl/lvsr + perm
   sequence builds a byte-permute mask that picks the correct 8-byte
   half out of each of the two aligned vec_ld loads; presumably x is
   8-byte (complex) aligned but not necessarily 16-byte aligned. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     /* common subexpressions */
     const INT fivs = sizeof(R) * ivs;   /* stride in bytes */
     /* you are not expected to understand this: */
     /* perm selects mh for the low half of msk, ml for the high half */
     const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
     vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
     vector unsigned char mh = vec_lvsl(0, aligned_like);
     vector unsigned char msk =
	  (vector unsigned char)vec_sel((V)mh, (V)ml, perm);
     /* end of common subexpressions */

     return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
}
Chris@42 79
/* store lower half */
/* Rotate v (via the vec_lvsr shift-permute mask of aligned_like) so
   that its low two floats line up with x's alignment, then store them
   one scalar element at a time with vec_ste. */
static inline void STH(R *x, V v, R *aligned_like)
{
     v = vec_perm(v, v, vec_lvsr(0, aligned_like));
     vec_ste(v, 0, x);
     vec_ste(v, sizeof(R), x);
}
Chris@42 87
/* store upper half of v at x + ovs (ovs = output stride, in reals);
   same rotate-then-vec_ste technique as STH, offset by fovs + 8 bytes
   to select the high two floats */
static inline void STL(R *x, V v, INT ovs, R *aligned_like)
{
     const INT fovs = sizeof(R) * ovs;   /* stride in bytes */
     v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
     vec_ste(v, fovs, x);
     vec_ste(v, sizeof(R) + fovs, x);
}
Chris@42 95
/* aligned store of a full vector at x (x must be 16-byte aligned for
   vec_st); ovs and aligned_like are interface-mandated but unused */
static inline void STA(R *x, V v, INT ovs, R *aligned_like)
{
     UNUSED(ovs);
     UNUSED(aligned_like);
     vec_st(v, 0, x);
}
Chris@42 102
/* general store of two complex numbers, at x and x + ovs */
static inline void ST(R *x, V v, INT ovs, R *aligned_like)
{
     /* WARNING: the extra_iter hack depends upon STH occurring after
	STL */
     STL(x, v, ovs, aligned_like);
     STH(x, v, aligned_like);
}
Chris@42 110
Chris@42 111 #define STM2(x, v, ovs, aligned_like) /* no-op */
Chris@42 112
Chris@42 113 static inline void STN2(R *x, V v0, V v1, INT ovs)
Chris@42 114 {
Chris@42 115 const INT fovs = sizeof(R) * ovs;
Chris@42 116 const vector unsigned int even =
Chris@42 117 VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
Chris@42 118 const vector unsigned int odd =
Chris@42 119 VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
Chris@42 120 vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
Chris@42 121 vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
Chris@42 122 }
Chris@42 123
Chris@42 124 #define STM4(x, v, ovs, aligned_like) /* no-op */
Chris@42 125
Chris@42 126 static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
Chris@42 127 {
Chris@42 128 const INT fovs = sizeof(R) * ovs;
Chris@42 129 V x0 = vec_mergeh(v0, v2);
Chris@42 130 V x1 = vec_mergel(v0, v2);
Chris@42 131 V x2 = vec_mergeh(v1, v3);
Chris@42 132 V x3 = vec_mergel(v1, v3);
Chris@42 133 V y0 = vec_mergeh(x0, x2);
Chris@42 134 V y1 = vec_mergel(x0, x2);
Chris@42 135 V y2 = vec_mergeh(x1, x3);
Chris@42 136 V y3 = vec_mergel(x1, x3);
Chris@42 137 vec_st(y0, 0, x);
Chris@42 138 vec_st(y1, fovs, x);
Chris@42 139 vec_st(y2, 2 * fovs, x);
Chris@42 140 vec_st(y3, 3 * fovs, x);
Chris@42 141 }
Chris@42 142
Chris@42 143 static inline V FLIP_RI(V x)
Chris@42 144 {
Chris@42 145 const vector unsigned int perm =
Chris@42 146 VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
Chris@42 147 return vec_perm(x, x, (vector unsigned char)perm);
Chris@42 148 }
Chris@42 149
Chris@42 150 static inline V VCONJ(V x)
Chris@42 151 {
Chris@42 152 const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
Chris@42 153 return vec_xor(x, pmpm);
Chris@42 154 }
Chris@42 155
/* multiply by i: (r, i) -> (-i, r), built as conjugate followed by a
   real/imaginary swap */
static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
Chris@42 160
Chris@42 161 static inline V VFMAI(V b, V c)
Chris@42 162 {
Chris@42 163 const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
Chris@42 164 return VFMA(FLIP_RI(b), mpmp, c);
Chris@42 165 }
Chris@42 166
Chris@42 167 static inline V VFNMSI(V b, V c)
Chris@42 168 {
Chris@42 169 const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
Chris@42 170 return VFNMS(FLIP_RI(b), mpmp, c);
Chris@42 171 }
Chris@42 172
Chris@42 173 static inline V VFMACONJ(V b, V c)
Chris@42 174 {
Chris@42 175 const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
Chris@42 176 return VFMA(b, pmpm, c);
Chris@42 177 }
Chris@42 178
Chris@42 179 static inline V VFNMSCONJ(V b, V c)
Chris@42 180 {
Chris@42 181 const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
Chris@42 182 return VFNMS(b, pmpm, c);
Chris@42 183 }
Chris@42 184
/* conj(b) - c (no fused form; built from VCONJ and VSUB) */
static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}
Chris@42 189
Chris@42 190 static inline V VZMUL(V tx, V sr)
Chris@42 191 {
Chris@42 192 const vector unsigned int real =
Chris@42 193 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
Chris@42 194 const vector unsigned int imag =
Chris@42 195 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
Chris@42 196 V si = VBYI(sr);
Chris@42 197 V tr = vec_perm(tx, tx, (vector unsigned char)real);
Chris@42 198 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
Chris@42 199 return VFMA(ti, si, VMUL(tr, sr));
Chris@42 200 }
Chris@42 201
Chris@42 202 static inline V VZMULJ(V tx, V sr)
Chris@42 203 {
Chris@42 204 const vector unsigned int real =
Chris@42 205 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
Chris@42 206 const vector unsigned int imag =
Chris@42 207 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
Chris@42 208 V si = VBYI(sr);
Chris@42 209 V tr = vec_perm(tx, tx, (vector unsigned char)real);
Chris@42 210 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
Chris@42 211 return VFNMS(ti, si, VMUL(tr, sr));
Chris@42 212 }
Chris@42 213
Chris@42 214 static inline V VZMULI(V tx, V si)
Chris@42 215 {
Chris@42 216 const vector unsigned int real =
Chris@42 217 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
Chris@42 218 const vector unsigned int imag =
Chris@42 219 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
Chris@42 220 V sr = VBYI(si);
Chris@42 221 V tr = vec_perm(tx, tx, (vector unsigned char)real);
Chris@42 222 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
Chris@42 223 return VFNMS(ti, si, VMUL(tr, sr));
Chris@42 224 }
Chris@42 225
Chris@42 226 static inline V VZMULIJ(V tx, V si)
Chris@42 227 {
Chris@42 228 const vector unsigned int real =
Chris@42 229 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
Chris@42 230 const vector unsigned int imag =
Chris@42 231 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
Chris@42 232 V sr = VBYI(si);
Chris@42 233 V tr = vec_perm(tx, tx, (vector unsigned char)real);
Chris@42 234 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
Chris@42 235 return VFMA(ti, si, VMUL(tr, sr));
Chris@42 236 }
Chris@42 237
/* twiddle storage #1: compact, slower */
/* one vector per VL twiddles, laid out (cos0, cos1, sin0, sin1) to
   match the vec_mergeh/vec_mergel unpacking in BYTW1/BYTWJ1 */
#define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (VL)
Chris@42 242
Chris@42 243 static inline V BYTW1(const R *t, V sr)
Chris@42 244 {
Chris@42 245 const V *twp = (const V *)t;
Chris@42 246 V si = VBYI(sr);
Chris@42 247 V tx = twp[0];
Chris@42 248 V tr = vec_mergeh(tx, tx);
Chris@42 249 V ti = vec_mergel(tx, tx);
Chris@42 250 return VFMA(ti, si, VMUL(tr, sr));
Chris@42 251 }
Chris@42 252
Chris@42 253 static inline V BYTWJ1(const R *t, V sr)
Chris@42 254 {
Chris@42 255 const V *twp = (const V *)t;
Chris@42 256 V si = VBYI(sr);
Chris@42 257 V tx = twp[0];
Chris@42 258 V tr = vec_mergeh(tx, tx);
Chris@42 259 V ti = vec_mergel(tx, tx);
Chris@42 260 return VFNMS(ti, si, VMUL(tr, sr));
Chris@42 261 }
Chris@42 262
/* twiddle storage #2: twice the space, faster (when in cache) */
/* cosines are stored duplicated and sines as (-sin, sin) pairs, so
   BYTW2/BYTWJ2 need no permute beyond FLIP_RI */
#define VTW2(v,x) \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)
Chris@42 268
Chris@42 269 static inline V BYTW2(const R *t, V sr)
Chris@42 270 {
Chris@42 271 const V *twp = (const V *)t;
Chris@42 272 V si = FLIP_RI(sr);
Chris@42 273 V tr = twp[0], ti = twp[1];
Chris@42 274 return VFMA(ti, si, VMUL(tr, sr));
Chris@42 275 }
Chris@42 276
Chris@42 277 static inline V BYTWJ2(const R *t, V sr)
Chris@42 278 {
Chris@42 279 const V *twp = (const V *)t;
Chris@42 280 V si = FLIP_RI(sr);
Chris@42 281 V tr = twp[0], ti = twp[1];
Chris@42 282 return VFNMS(ti, si, VMUL(tr, sr));
Chris@42 283 }
Chris@42 284
/* twiddle storage #3: one TW_CEXP (complex exponential) entry per
   twiddle */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)

/* twiddle storage for split arrays: a run of cosines followed by a
   run of sines */
#define VTWS(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

/* no per-call SIMD state to restore on AltiVec, so VLEAVE is empty */
#define VLEAVE() /* nothing */
Chris@42 296
Chris@42 297 #include "simd-common.h"