cannam@95
|
1 /*
|
cannam@95
|
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
|
cannam@95
|
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
|
cannam@95
|
4 *
|
cannam@95
|
5 * This program is free software; you can redistribute it and/or modify
|
cannam@95
|
6 * it under the terms of the GNU General Public License as published by
|
cannam@95
|
7 * the Free Software Foundation; either version 2 of the License, or
|
cannam@95
|
8 * (at your option) any later version.
|
cannam@95
|
9 *
|
cannam@95
|
10 * This program is distributed in the hope that it will be useful,
|
cannam@95
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
cannam@95
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
cannam@95
|
13 * GNU General Public License for more details.
|
cannam@95
|
14 *
|
cannam@95
|
15 * You should have received a copy of the GNU General Public License
|
cannam@95
|
16 * along with this program; if not, write to the Free Software
|
cannam@95
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
cannam@95
|
18 *
|
cannam@95
|
19 */
|
cannam@95
|
20
|
cannam@95
|
21 #ifndef FFTW_SINGLE
|
cannam@95
|
22 #error "ALTIVEC only works in single precision"
|
cannam@95
|
23 #endif
|
cannam@95
|
24
|
cannam@95
|
/* The macros below are defined unconditionally because taint.c, which
   is compiled without altivec, also uses them. */
#define SIMD_SUFFIX _altivec /* suffix used for renaming */
#define VL 2 /* number of complex values in one SIMD vector */
/* accepts a vector stride only when it is exactly 2 */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
/* SIMD_STRIDE_OKA is not defined in this part of the file */
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA
|
cannam@95
|
31
|
cannam@95
|
32 #if !defined(__VEC__) && !defined(FAKE__VEC__)
|
cannam@95
|
33 # error "compiling simd-altivec.h requires -maltivec or equivalent"
|
cannam@95
|
34 #endif
|
cannam@95
|
35
|
cannam@95
|
36 #ifdef HAVE_ALTIVEC_H
|
cannam@95
|
37 # include <altivec.h>
|
cannam@95
|
38 #endif
|
cannam@95
|
39
|
cannam@95
|
40 typedef vector float V;
|
cannam@95
|
41 #define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
|
cannam@95
|
42 #define LDK(x) x
|
cannam@95
|
43 #define DVK(var, val) const V var = VLIT(val, val, val, val)
|
cannam@95
|
44
|
cannam@95
|
45 static inline V VADD(V a, V b) { return vec_add(a, b); }
|
cannam@95
|
46 static inline V VSUB(V a, V b) { return vec_sub(a, b); }
|
cannam@95
|
47 static inline V VFMA(V a, V b, V c) { return vec_madd(a, b, c); }
|
cannam@95
|
48 static inline V VFNMS(V a, V b, V c) { return vec_nmsub(a, b, c); }
|
cannam@95
|
49
|
cannam@95
|
50 static inline V VMUL(V a, V b)
|
cannam@95
|
51 {
|
cannam@95
|
52 DVK(zero, -0.0);
|
cannam@95
|
53 return VFMA(a, b, zero);
|
cannam@95
|
54 }
|
cannam@95
|
55
|
cannam@95
|
56 static inline V VFMS(V a, V b, V c) { return VSUB(VMUL(a, b), c); }
|
cannam@95
|
57
|
cannam@95
|
58 static inline V LDA(const R *x, INT ivs, const R *aligned_like)
|
cannam@95
|
59 {
|
cannam@95
|
60 UNUSED(ivs);
|
cannam@95
|
61 UNUSED(aligned_like);
|
cannam@95
|
62 return vec_ld(0, x);
|
cannam@95
|
63 }
|
cannam@95
|
64
|
cannam@95
|
65 static inline V LD(const R *x, INT ivs, const R *aligned_like)
|
cannam@95
|
66 {
|
cannam@95
|
67 /* common subexpressions */
|
cannam@95
|
68 const INT fivs = sizeof(R) * ivs;
|
cannam@95
|
69 /* you are not expected to understand this: */
|
cannam@95
|
70 const vector unsigned int perm = VLIT(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
|
cannam@95
|
71 vector unsigned char ml = vec_lvsr(fivs + 8, aligned_like);
|
cannam@95
|
72 vector unsigned char mh = vec_lvsl(0, aligned_like);
|
cannam@95
|
73 vector unsigned char msk =
|
cannam@95
|
74 (vector unsigned char)vec_sel((V)mh, (V)ml, perm);
|
cannam@95
|
75 /* end of common subexpressions */
|
cannam@95
|
76
|
cannam@95
|
77 return vec_perm(vec_ld(0, x), vec_ld(fivs, x), msk);
|
cannam@95
|
78 }
|
cannam@95
|
79
|
cannam@95
|
80 /* store lower half */
|
cannam@95
|
81 static inline void STH(R *x, V v, R *aligned_like)
|
cannam@95
|
82 {
|
cannam@95
|
83 v = vec_perm(v, v, vec_lvsr(0, aligned_like));
|
cannam@95
|
84 vec_ste(v, 0, x);
|
cannam@95
|
85 vec_ste(v, sizeof(R), x);
|
cannam@95
|
86 }
|
cannam@95
|
87
|
cannam@95
|
88 static inline void STL(R *x, V v, INT ovs, R *aligned_like)
|
cannam@95
|
89 {
|
cannam@95
|
90 const INT fovs = sizeof(R) * ovs;
|
cannam@95
|
91 v = vec_perm(v, v, vec_lvsr(fovs + 8, aligned_like));
|
cannam@95
|
92 vec_ste(v, fovs, x);
|
cannam@95
|
93 vec_ste(v, sizeof(R) + fovs, x);
|
cannam@95
|
94 }
|
cannam@95
|
95
|
cannam@95
|
96 static inline void STA(R *x, V v, INT ovs, R *aligned_like)
|
cannam@95
|
97 {
|
cannam@95
|
98 UNUSED(ovs);
|
cannam@95
|
99 UNUSED(aligned_like);
|
cannam@95
|
100 vec_st(v, 0, x);
|
cannam@95
|
101 }
|
cannam@95
|
102
|
cannam@95
|
103 static inline void ST(R *x, V v, INT ovs, R *aligned_like)
|
cannam@95
|
104 {
|
cannam@95
|
105 /* WARNING: the extra_iter hack depends upon STH occurring after
|
cannam@95
|
106 STL */
|
cannam@95
|
107 STL(x, v, ovs, aligned_like);
|
cannam@95
|
108 STH(x, v, aligned_like);
|
cannam@95
|
109 }
|
cannam@95
|
110
|
cannam@95
|
/* STM2 expands to nothing here (no-op); see STN2 for the two-vector store. */
#define STM2(x, v, ovs, aligned_like) /* no-op */
|
cannam@95
|
112
|
cannam@95
|
113 static inline void STN2(R *x, V v0, V v1, INT ovs)
|
cannam@95
|
114 {
|
cannam@95
|
115 const INT fovs = sizeof(R) * ovs;
|
cannam@95
|
116 const vector unsigned int even =
|
cannam@95
|
117 VLIT(0x00010203, 0x04050607, 0x10111213, 0x14151617);
|
cannam@95
|
118 const vector unsigned int odd =
|
cannam@95
|
119 VLIT(0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f);
|
cannam@95
|
120 vec_st(vec_perm(v0, v1, (vector unsigned char)even), 0, x);
|
cannam@95
|
121 vec_st(vec_perm(v0, v1, (vector unsigned char)odd), fovs, x);
|
cannam@95
|
122 }
|
cannam@95
|
123
|
cannam@95
|
/* STM4 expands to nothing here (no-op); see STN4 for the four-vector store. */
#define STM4(x, v, ovs, aligned_like) /* no-op */
|
cannam@95
|
125
|
cannam@95
|
126 static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
|
cannam@95
|
127 {
|
cannam@95
|
128 const INT fovs = sizeof(R) * ovs;
|
cannam@95
|
129 V x0 = vec_mergeh(v0, v2);
|
cannam@95
|
130 V x1 = vec_mergel(v0, v2);
|
cannam@95
|
131 V x2 = vec_mergeh(v1, v3);
|
cannam@95
|
132 V x3 = vec_mergel(v1, v3);
|
cannam@95
|
133 V y0 = vec_mergeh(x0, x2);
|
cannam@95
|
134 V y1 = vec_mergel(x0, x2);
|
cannam@95
|
135 V y2 = vec_mergeh(x1, x3);
|
cannam@95
|
136 V y3 = vec_mergel(x1, x3);
|
cannam@95
|
137 vec_st(y0, 0, x);
|
cannam@95
|
138 vec_st(y1, fovs, x);
|
cannam@95
|
139 vec_st(y2, 2 * fovs, x);
|
cannam@95
|
140 vec_st(y3, 3 * fovs, x);
|
cannam@95
|
141 }
|
cannam@95
|
142
|
cannam@95
|
143 static inline V FLIP_RI(V x)
|
cannam@95
|
144 {
|
cannam@95
|
145 const vector unsigned int perm =
|
cannam@95
|
146 VLIT(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
|
cannam@95
|
147 return vec_perm(x, x, (vector unsigned char)perm);
|
cannam@95
|
148 }
|
cannam@95
|
149
|
cannam@95
|
150 static inline V VCONJ(V x)
|
cannam@95
|
151 {
|
cannam@95
|
152 const V pmpm = VLIT(0.0, -0.0, 0.0, -0.0);
|
cannam@95
|
153 return vec_xor(x, pmpm);
|
cannam@95
|
154 }
|
cannam@95
|
155
|
cannam@95
|
156 static inline V VBYI(V x)
|
cannam@95
|
157 {
|
cannam@95
|
158 return FLIP_RI(VCONJ(x));
|
cannam@95
|
159 }
|
cannam@95
|
160
|
cannam@95
|
161 static inline V VFMAI(V b, V c)
|
cannam@95
|
162 {
|
cannam@95
|
163 const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
|
cannam@95
|
164 return VFMA(FLIP_RI(b), mpmp, c);
|
cannam@95
|
165 }
|
cannam@95
|
166
|
cannam@95
|
167 static inline V VFNMSI(V b, V c)
|
cannam@95
|
168 {
|
cannam@95
|
169 const V mpmp = VLIT(-1.0, 1.0, -1.0, 1.0);
|
cannam@95
|
170 return VFNMS(FLIP_RI(b), mpmp, c);
|
cannam@95
|
171 }
|
cannam@95
|
172
|
cannam@95
|
173 static inline V VFMACONJ(V b, V c)
|
cannam@95
|
174 {
|
cannam@95
|
175 const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
|
cannam@95
|
176 return VFMA(b, pmpm, c);
|
cannam@95
|
177 }
|
cannam@95
|
178
|
cannam@95
|
179 static inline V VFNMSCONJ(V b, V c)
|
cannam@95
|
180 {
|
cannam@95
|
181 const V pmpm = VLIT(1.0, -1.0, 1.0, -1.0);
|
cannam@95
|
182 return VFNMS(b, pmpm, c);
|
cannam@95
|
183 }
|
cannam@95
|
184
|
cannam@95
|
185 static inline V VFMSCONJ(V b, V c)
|
cannam@95
|
186 {
|
cannam@95
|
187 return VSUB(VCONJ(b), c);
|
cannam@95
|
188 }
|
cannam@95
|
189
|
cannam@95
|
190 static inline V VZMUL(V tx, V sr)
|
cannam@95
|
191 {
|
cannam@95
|
192 const vector unsigned int real =
|
cannam@95
|
193 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
|
cannam@95
|
194 const vector unsigned int imag =
|
cannam@95
|
195 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
|
cannam@95
|
196 V si = VBYI(sr);
|
cannam@95
|
197 V tr = vec_perm(tx, tx, (vector unsigned char)real);
|
cannam@95
|
198 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
|
cannam@95
|
199 return VFMA(ti, si, VMUL(tr, sr));
|
cannam@95
|
200 }
|
cannam@95
|
201
|
cannam@95
|
202 static inline V VZMULJ(V tx, V sr)
|
cannam@95
|
203 {
|
cannam@95
|
204 const vector unsigned int real =
|
cannam@95
|
205 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
|
cannam@95
|
206 const vector unsigned int imag =
|
cannam@95
|
207 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
|
cannam@95
|
208 V si = VBYI(sr);
|
cannam@95
|
209 V tr = vec_perm(tx, tx, (vector unsigned char)real);
|
cannam@95
|
210 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
|
cannam@95
|
211 return VFNMS(ti, si, VMUL(tr, sr));
|
cannam@95
|
212 }
|
cannam@95
|
213
|
cannam@95
|
214 static inline V VZMULI(V tx, V si)
|
cannam@95
|
215 {
|
cannam@95
|
216 const vector unsigned int real =
|
cannam@95
|
217 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
|
cannam@95
|
218 const vector unsigned int imag =
|
cannam@95
|
219 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
|
cannam@95
|
220 V sr = VBYI(si);
|
cannam@95
|
221 V tr = vec_perm(tx, tx, (vector unsigned char)real);
|
cannam@95
|
222 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
|
cannam@95
|
223 return VFNMS(ti, si, VMUL(tr, sr));
|
cannam@95
|
224 }
|
cannam@95
|
225
|
cannam@95
|
226 static inline V VZMULIJ(V tx, V si)
|
cannam@95
|
227 {
|
cannam@95
|
228 const vector unsigned int real =
|
cannam@95
|
229 VLIT(0x00010203, 0x00010203, 0x08090a0b, 0x08090a0b);
|
cannam@95
|
230 const vector unsigned int imag =
|
cannam@95
|
231 VLIT(0x04050607, 0x04050607, 0x0c0d0e0f, 0x0c0d0e0f);
|
cannam@95
|
232 V sr = VBYI(si);
|
cannam@95
|
233 V tr = vec_perm(tx, tx, (vector unsigned char)real);
|
cannam@95
|
234 V ti = vec_perm(tx, tx, (vector unsigned char)imag);
|
cannam@95
|
235 return VFMA(ti, si, VMUL(tr, sr));
|
cannam@95
|
236 }
|
cannam@95
|
237
|
cannam@95
|
/* twiddle storage #1: compact, slower
   One vector per twiddle pair, laid out as
   (cos v, cos v+1, sin v, sin v+1); BYTW1/BYTWJ1 expand it at run time. */
#define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (VL)
|
cannam@95
|
242
|
cannam@95
|
243 static inline V BYTW1(const R *t, V sr)
|
cannam@95
|
244 {
|
cannam@95
|
245 const V *twp = (const V *)t;
|
cannam@95
|
246 V si = VBYI(sr);
|
cannam@95
|
247 V tx = twp[0];
|
cannam@95
|
248 V tr = vec_mergeh(tx, tx);
|
cannam@95
|
249 V ti = vec_mergel(tx, tx);
|
cannam@95
|
250 return VFMA(ti, si, VMUL(tr, sr));
|
cannam@95
|
251 }
|
cannam@95
|
252
|
cannam@95
|
253 static inline V BYTWJ1(const R *t, V sr)
|
cannam@95
|
254 {
|
cannam@95
|
255 const V *twp = (const V *)t;
|
cannam@95
|
256 V si = VBYI(sr);
|
cannam@95
|
257 V tx = twp[0];
|
cannam@95
|
258 V tr = vec_mergeh(tx, tx);
|
cannam@95
|
259 V ti = vec_mergel(tx, tx);
|
cannam@95
|
260 return VFNMS(ti, si, VMUL(tr, sr));
|
cannam@95
|
261 }
|
cannam@95
|
262
|
cannam@95
|
/* twiddle storage #2: twice the space, faster (when in cache)
   Two vectors per twiddle pair: the first holds each cosine twice, the
   second holds the sine entries with arguments -x and x alternating.
   BYTW2/BYTWJ2 consume both vectors without further shuffling of t. */
#define VTW2(v,x) \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)
|
cannam@95
|
268
|
cannam@95
|
269 static inline V BYTW2(const R *t, V sr)
|
cannam@95
|
270 {
|
cannam@95
|
271 const V *twp = (const V *)t;
|
cannam@95
|
272 V si = FLIP_RI(sr);
|
cannam@95
|
273 V tr = twp[0], ti = twp[1];
|
cannam@95
|
274 return VFMA(ti, si, VMUL(tr, sr));
|
cannam@95
|
275 }
|
cannam@95
|
276
|
cannam@95
|
277 static inline V BYTWJ2(const R *t, V sr)
|
cannam@95
|
278 {
|
cannam@95
|
279 const V *twp = (const V *)t;
|
cannam@95
|
280 V si = FLIP_RI(sr);
|
cannam@95
|
281 V tr = twp[0], ti = twp[1];
|
cannam@95
|
282 return VFNMS(ti, si, VMUL(tr, sr));
|
cannam@95
|
283 }
|
cannam@95
|
284
|
cannam@95
|
/* twiddle storage #3: one TW_CEXP entry for v and one for v+1 */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)
|
cannam@95
|
288
|
cannam@95
|
/* twiddle storage for split arrays: four cosine entries (v .. v+3)
   followed by four sine entries (v .. v+3) */
#define VTWS(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)
|
cannam@95
|
294
|
cannam@95
|
/* VLEAVE expands to nothing in this header. */
#define VLEAVE() /* nothing */
|
cannam@95
|
296
|
cannam@95
|
297 #include "simd-common.h"
|