comparison src/fftw-3.3.5/simd-support/simd-neon.h @ 127:7867fa7e1b6b

Current fftw source
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 18 Oct 2016 13:40:26 +0100
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * Double-precision support added by Romain Dolbeau.
 * Romain Dolbeau hereby places his modifications in the public domain.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#if !defined(FFTW_SINGLE) && !defined(__aarch64__)
#error "NEON only works in single precision on 32-bit ARM"
#endif
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
#error "NEON only works in single or double precision"
#endif

#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define SUFF(name) name ## _f32
#else
# define DS(d,s) d /* double-precision option */
# define SUFF(name) name ## _f64
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon /* for renaming */
#define VL DS(1,2)        /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__ARM_NEON__) && !defined(__ARM_NEON)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering.  VLIT may or may not be wrong for big-endian systems. */
typedef DS(float64x2_t, float32x4_t) V;

#ifdef FFTW_SINGLE
# define VLIT(x0, x1) {x0, x1, x0, x1}
#else
# define VLIT(x0, x1) {x0, x1}
#endif
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b*c
      t1 = a - b*c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b*c
      t1 = a
      t1 -= b*c

   At least one move must be implemented, negating the advantage of
   the FMA in the first place.  At least some versions of gcc generate
   both moves.  So we are better off generating t = b*c; t0 = a+t; t1 = a-t. */
#if HAVE_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif
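
/* To make the trade-off above concrete, here is a minimal sketch (compiled
   out, not part of the kernels; the function name is illustrative only) of
   the shared-product idiom expressed with the V macros defined just below:
   one multiply feeds both the sum and the difference. */
#if 0
static inline void fma_pair_sketch(V a, V b, V c, V *t0, V *t1)
{
     V t = VMUL(b, c);  /* single multiply, shared by both results */
     *t0 = VADD(a, t);  /* t0 = a + b*c */
     *t1 = VSUB(a, t);  /* t1 = a - b*c */
}
#endif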

#define VADD(a, b) SUFF(vaddq)(a, b)
#define VSUB(a, b) SUFF(vsubq)(a, b)
#define VMUL(a, b) SUFF(vmulq)(a, b)
#define VFMA(a, b, c) SUFF(vmlaq)(c, a, b)  /* a*b + c */
#define VFNMS(a, b, c) SUFF(vmlsq)(c, a, b) /* FNMS = -(a*b - c) in powerpc terminology;
                                               MLS = c - a*b in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)   /* FMS = a*b - c in powerpc terminology; no
                                               equivalent ARM instruction (?) */
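
/* As a reference for the operand ordering, the per-lane semantics of the
   three fused ops above work out to:

      VFMA(a, b, c)  ==  a*b + c    (vmlaq accumulates into its first operand,
                                     hence the argument swap to (c, a, b))
      VFNMS(a, b, c) ==  c - a*b    (maps directly onto vmlsq)
      VFMS(a, b, c)  ==  a*b - c    (no single NEON instruction; synthesized
                                     from a multiply and a subtract)
*/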

#define STOREH(a, v) SUFF(vst1)((a), SUFF(vget_high)(v))
#define STOREL(a, v) SUFF(vst1)((a), SUFF(vget_low)(v))

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return SUFF(vld1q)(x);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     SUFF(vst1q)(x, v);
}

#ifdef FFTW_SINGLE
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return SUFF(vcombine)(SUFF(vld1)(x), SUFF(vld1)(x + ivs));
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon store-low occurring
        after store-high */
     STOREH(x + ovs, v);
     STOREL(x, v);
}
#else /* !FFTW_SINGLE */
# define LD LDA
# define ST STA
#endif
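
/* For orientation, the single-precision LD/ST above move one complex number
   per half-register: assuming the interleaved {re, im} layout used throughout
   this header, LD(x, ivs, ...) produces the lanes

      { x[0], x[1], x[ivs], x[ivs+1] }  ==  { r0, i0, r1, i1 }

   and ST writes the high pair to x + ovs before the low pair goes to x,
   which is the ordering the extra_iter warning above depends on. */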

/* 2x2 complex transpose and store */
#define STM2 DS(STA,ST)
#define STN2(x, v0, v1, ovs) /* nop */

#ifdef FFTW_SINGLE
/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     SUFF(vst1_lane)((x)          , SUFF(vget_low)(v), 0);
     SUFF(vst1_lane)((x + ovs)    , SUFF(vget_low)(v), 1);
     SUFF(vst1_lane)((x + 2 * ovs), SUFF(vget_high)(v), 0);
     SUFF(vst1_lane)((x + 3 * ovs), SUFF(vget_high)(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
#else /* !FFTW_SINGLE */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
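
/* The single-precision STM4 above scatters one lane per output row, i.e.
   with v = { v0, v1, v2, v3 } it performs

      x[0]       = v0;
      x[ovs]     = v1;
      x[2 * ovs] = v2;
      x[3 * ovs] = v3;

   which is one column of the 4x4 real transpose mentioned in the comment;
   the double-precision variant stores its two lanes to x and x + ovs. */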

#ifdef FFTW_SINGLE
#define FLIP_RI(x) SUFF(vrev64q)(x)
#else
/* FIXME */
#define FLIP_RI(x) SUFF(vcombine)(SUFF(vget_high)(x), SUFF(vget_low)(x))
#endif

static inline V VCONJ(V x)
{
#ifdef FFTW_SINGLE
     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
     static const uint64x2_t pm = {0, 0x8000000000000000ull};
     /* gcc-4.9.2 still does not include vreinterpretq_f64_u64, but simple
      * casts generate the correct assembly. */
     return (float64x2_t)(veorq_u64((uint64x2_t)(x), (uint64x2_t)(pm)));
#endif
}
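
/* Per-complex effect of the two primitives above, written out for one
   {re, im} pair (assuming the interleaved lane layout):

      FLIP_RI({re, im}) == {im, re}    -- swap the halves of each complex
      VCONJ({re, im})   == {re, -im}   -- XOR the sign bit of the imaginary
                                          lane only
*/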

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

static inline V VFMAI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0);
     return VFMA(FLIP_RI(b), mp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0);
     return VFNMS(FLIP_RI(b), mp, c);
}

static inline V VFMACONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0);
     return VFMA(b, pm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0);
     return VFNMS(b, pm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}
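
/* Treating each lane pair as a complex number z = re + i*im, the helpers
   above compute (identities only, easy to verify lane by lane):

      VBYI(x)         ==  i * x
      VFMAI(b, c)     ==  c + i*b
      VFNMSI(b, c)    ==  c - i*b
      VFMACONJ(b, c)  ==  conj(b) + c
      VFNMSCONJ(b, c) ==  c - conj(b)
      VFMSCONJ(b, c)  ==  conj(b) - c
*/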

#ifdef FFTW_SINGLE
#if 1
#define VEXTRACT_REIM(tr, ti, tx)                                      \
{                                                                      \
     tr = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 0),       \
                         SUFF(vdup_lane)(SUFF(vget_high)(tx), 0));     \
     ti = SUFF(vcombine)(SUFF(vdup_lane)(SUFF(vget_low)(tx), 1),       \
                         SUFF(vdup_lane)(SUFF(vget_high)(tx), 1));     \
}
#else
/* this alternative might be faster in an ideal world, but gcc likes
   to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx)                                      \
{                                                                      \
     float32x4x2_t vvv = SUFF(vtrnq)(tx, tx);                          \
     tr = vvv.val[0];                                                  \
     ti = vvv.val[1];                                                  \
}
#endif
#else
#define VEXTRACT_REIM(tr, ti, tx)                                      \
{                                                                      \
     tr = SUFF(vtrn1q)(tx, tx);                                        \
     ti = SUFF(vtrn2q)(tx, tx);                                        \
}
#endif
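
/* Lane-level effect of VEXTRACT_REIM, shown for the single-precision case
   with tx = { r0, i0, r1, i1 } (the double-precision case is the obvious
   two-lane analogue):

      tr == { r0, r0, r1, r1 }   -- real parts broadcast within each complex
      ti == { i0, i0, i1, i1 }   -- imaginary parts broadcast likewise
*/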

static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
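
/* Viewing tx and sr as complex numbers t and s, the four routines above
   compute, per complex lane:

      VZMUL(t, s)   ==  t * s
      VZMULJ(t, s)  ==  conj(t) * s
      VZMULI(t, s)  ==  i * (t * s)
      VZMULIJ(t, s) ==  i * (conj(t) * s)

   A scalar reference sketch of the first identity (compiled out; the helper
   name is illustrative only and not used elsewhere): */
#if 0
static inline void vzmul_scalar_sketch(const R t[2], const R s[2], R out[2])
{
     /* out = t * s, with {re, im} pairs; VZMUL does this per complex lane */
     out[0] = t[0] * s[0] - t[1] * s[1];
     out[1] = t[0] * s[1] + t[1] * s[0];
}
#endif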

/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
#define VTW1(v,x) {TW_CEXP, v, x}
#endif
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#ifdef FFTW_SINGLE
# define VTW2(v,x)                                                      \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},  \
   {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else
# define VTW2(v,x)                                                      \
   {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2 * VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2 * VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}
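
/* With the #2 layout, the first loaded vector holds duplicated cosines and
   the second holds {-sin, sin} pairs, so for a twiddle w = cos + i*sin and a
   data element s the lane arithmetic works out to

      BYTW2(t, s)  ==  { cos*s_re - sin*s_im, cos*s_im + sin*s_re }  ==  w * s
      BYTWJ2(t, s) ==  conj(w) * s

   i.e. the same products as BYTW1/BYTWJ1, but without the in-register
   real/imaginary extraction. */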

/* twiddle storage #3 */
#ifdef FFTW_SINGLE
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#else
# define VTW3(v,x) {TW_CEXP, v, x}
#endif
# define TWVL3 (VL)

/* twiddle storage for split arrays */
#ifdef FFTW_SINGLE
# define VTWS(v,x)                                                        \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
   {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x)                                                        \
   {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"