/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Double-precision support added by Romain Dolbeau.
 * Romain Dolbeau hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if !defined(FFTW_SINGLE) && !defined(__aarch64__)
#error "NEON only works in single precision on 32-bit ARM"
#endif
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "NEON only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _f32
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _f64
#endif
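
/* Illustrative note: SUFF pastes the precision suffix onto an intrinsic
   name, so SUFF(vaddq) expands to vaddq_f32 in single precision and to
   vaddq_f64 in double precision; DS likewise selects between a
   double-precision and a single-precision token. */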

/* define these unconditionally, because they are used by
   taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon /* for renaming */
#define VL DS(1,2) /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
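
/* Note (illustrative): VL counts complex numbers per SIMD register --
   a float32x4_t holds two single-precision complex values, while a
   float64x2_t holds exactly one double-precision complex value.  Hence
   SIMD_VSTRIDE_OKA accepts only vector stride 2 in single precision. */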

#if defined(__GNUC__) && !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering.  VLIT may or may not be wrong for big-endian systems. */
typedef DS(float64x2_t, float32x4_t) V;

#ifdef FFTW_SINGLE
# define VLIT(x0, x1) {x0, x1, x0, x1}
#else
# define VLIT(x0, x1) {x0, x1}
#endif
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b*c
      t1 = a - b*c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b*c
      t1 = a
      t1 -= b*c

   At least one move must be implemented, negating the advantage of
   the FMA in the first place.  At least some versions of gcc generate
   both moves.  So we are better off generating t = b*c; t0 = a+t;
   t1 = a-t. */
#if HAVE_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif

#define VADD(a, b) SUFF(vaddq)(a, b)
#define VSUB(a, b) SUFF(vsubq)(a, b)
#define VMUL(a, b) SUFF(vmulq)(a, b)
#define VFMA(a, b, c) SUFF(vmlaq)(c, a, b) /* a*b + c */
#define VFNMS(a, b, c) SUFF(vmlsq)(c, a, b) /* FNMS = -(a*b - c) in powerpc
                                               terminology; MLS = c - a*b
                                               in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS = a*b - c in powerpc
                                             terminology; no equivalent
                                             ARM instruction (?) */
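
/* Operand-order note (illustrative): the NEON intrinsic vmlaq(acc, a, b)
   returns acc + a*b, so passing (c, a, b) above yields the generic
   VFMA(a, b, c) = a*b + c.  NEON has no fused a*b - c, which is why
   VFMS is synthesized from VMUL and VSUB. */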

#define STOREH(a, v) SUFF(vst1)((a), SUFF(vget_high)(v))
#define STOREL(a, v) SUFF(vst1)((a), SUFF(vget_low)(v))

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return SUFF(vld1q)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     SUFF(vst1q)(x, v);
}


#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return SUFF(vcombine)(SUFF(vld1)(x), SUFF(vld1)((x + ivs)));
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon store-low occurring
        after store-high */
     STOREH(x + ovs, v);
     STOREL(x, v);
}
#else /* !FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
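
/* Layout sketch (single precision, illustrative): one V holds two
   complex numbers, so LD gathers the complex value at x into the low
   64-bit half of the register and the one at x + ivs into the high
   half; ST scatters the two halves back to the same strided locations,
   high half first (see the WARNING above). */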

/* 2x2 complex transpose and store */
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE
/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     SUFF(vst1_lane)((x),           SUFF(vget_low)(v), 0);
     SUFF(vst1_lane)((x + ovs),     SUFF(vget_low)(v), 1);
     SUFF(vst1_lane)((x + 2 * ovs), SUFF(vget_high)(v), 0);
     SUFF(vst1_lane)((x + 3 * ovs), SUFF(vget_high)(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif

#ifdef FFTW_SINGLE
#define FLIP_RI(x) SUFF(vrev64q)(x)
#else
/* FIXME */
#define FLIP_RI(x) SUFF(vcombine)(SUFF(vget_high)(x), SUFF(vget_low)(x))
#endif

static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
     static const uint64x2_t pm = {0, 0x8000000000000000ull};
     /* gcc-4.9.2 still does not include vreinterpretq_f64_u64, but
        simple casts generate the correct assembly. */
     return (float64x2_t)(veorq_u64((uint64x2_t)(x), (uint64x2_t)(pm)));
#endif
}
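
/* Worked example (single precision, illustrative): for
   x = {a0, b0, a1, b1} representing a0 + i*b0 and a1 + i*b1, XORing
   the odd (imaginary) lanes with the sign-bit mask pm gives
   {a0, -b0, a1, -b1}, i.e. the complex conjugates. */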

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

static inline V VFMAI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0);
     return VFMA(FLIP_RI(b), mp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0);
     return VFNMS(FLIP_RI(b), mp, c);
}
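
/* Sketch (illustrative): VFMAI computes c + i*b and VFNMSI computes
   c - i*b.  For one complex element b = {br, bi}, FLIP_RI(b) =
   {bi, br}; multiplying elementwise by mp = {-1, 1} gives {-bi, br},
   which is exactly i*b, and that is added to or subtracted from c. */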

static inline V VFMACONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0);
     return VFMA(b, pm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0);
     return VFNMS(b, pm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}

#ifdef FFTW_SINGLE
#if 1
#define VEXTRACT_REIM(tr, ti, tx)                                       \
     {                                                                  \
          tr = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 0),   \
                              SUFF(vdup_lane)(SUFF(vget_high)(tx), 0)); \
          ti = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 1),   \
                              SUFF(vdup_lane)(SUFF(vget_high)(tx), 1)); \
     }
#else
/* this alternative might be faster in an ideal world, but gcc likes
   to spill vvv onto the stack */
#define VEXTRACT_REIM(tr, ti, tx)                    \
     {                                               \
          float32x4x2_t vvv = SUFF(vtrnq)(tx, tx);   \
          tr = vvv.val[0];                           \
          ti = vvv.val[1];                           \
     }
#endif
#else
#define VEXTRACT_REIM(tr, ti, tx)         \
     {                                    \
          tr = SUFF(vtrn1q)(tx, tx);      \
          ti = SUFF(vtrn2q)(tx, tx);      \
     }
#endif
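
/* Illustrative expansion: VEXTRACT_REIM broadcasts the real part of
   each complex element of tx into tr and the imaginary part into ti.
   In single precision, tx = {r0, i0, r1, i1} yields
   tr = {r0, r0, r1, r1} and ti = {i0, i0, i1, i1}. */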

static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}
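
/* Derivation sketch: for t = t.r + i*t.i and s = s.r + i*s.i,
   t*s = (t.r*s.r - t.i*s.i) + i*(t.r*s.i + t.i*s.r).  Above,
   VMUL(sr, tr) forms {t.r*s.r, t.r*s.i}, VBYI(sr) forms {-s.i, s.r},
   and the final VFMA adds t.i*{-s.i, s.r}, giving exactly the complex
   product.  VZMULJ/VZMULI/VZMULIJ below are the analogous variants
   with the twiddle conjugated and/or the result multiplied by i. */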

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
#define VTW1(v,x) {TW_CEXP, v, x}
#endif
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                       \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
     {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else
# define VTW2(v,x)                                                       \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}
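
/* Sketch (illustrative): with storage #2 a twiddle t = c + i*s is kept
   as two preshuffled vectors tr = {c, c} and ti = {-s, s}, so for
   sr = {x, y}, BYTW2 returns VMUL(tr, sr) + ti*FLIP_RI(sr) =
   {c*x - s*y, c*y + s*x}, the complex product t*sr, using only one
   shuffle (the FLIP_RI). */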

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
# define VTW3(v,x) {TW_CEXP, v, x}
#endif
# define TWVL3 (VL)

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                         \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
     {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                         \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"