src/fftw-3.3.3/simd-support/simd-neon.h @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef FFTW_SINGLE
#error "NEON only works in single precision"
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon       /* for renaming */
#define VL 2                    /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__ARM_NEON__)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering.  VLIT may or may not be wrong for big-endian systems. */
typedef float32x4_t V;

#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b * c
      t1 = a - b * c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b * c
      t1 = a
      t1 -= b * c

   At least one move must be implemented, negating the advantage of
   the FMA in the first place.  At least some versions of gcc generate
   both moves.  So we are better off generating t = b*c; t0 = a+t; t1 = a-t; */
#if HAVE_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif

#define VADD(a, b) vaddq_f32(a, b)
#define VSUB(a, b) vsubq_f32(a, b)
#define VMUL(a, b) vmulq_f32(a, b)
#define VFMA(a, b, c) vmlaq_f32(c, a, b)    /* a*b + c */
#define VFNMS(a, b, c) vmlsq_f32(c, a, b)   /* FNMS = -(a*b - c) in PowerPC terminology;
                                               MLS = c - a*b in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)   /* FMS = a*b - c in PowerPC terminology;
                                               no equivalent ARM instruction (?) */
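
/* For reference, the macro arguments follow the PowerPC operand order used
   throughout FFTW, not the NEON intrinsic order (accumulator first).
   Illustrative lane-wise example with a = 2, b = 3, c = 4:
        VFMA(a, b, c)  = a*b + c = 10
        VFNMS(a, b, c) = c - a*b = -2
        VFMS(a, b, c)  = a*b - c =  2 */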

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vld1q_f32((const float32_t *)x);
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vcombine_f32(vld1_f32((float32_t *)x), vld1_f32((float32_t *)(x + ivs)));
}
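
/* LDA loads VL = 2 consecutive complex numbers in one quadword load;
   LD gathers two complex numbers that lie ivs reals apart, combining
   two 64-bit loads into a single vector. */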

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1q_f32((float32_t *)x, v);
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon store-low occurring
        after store-high */
     vst1_f32((float32_t *)(x + ovs), vget_high_f32(v));
     vst1_f32((float32_t *)x, vget_low_f32(v));
}

/* 2x2 complex transpose and store */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1_lane_f32((float32_t *)(x),           vget_low_f32(v),  0);
     vst1_lane_f32((float32_t *)(x + ovs),     vget_low_f32(v),  1);
     vst1_lane_f32((float32_t *)(x + 2 * ovs), vget_high_f32(v), 0);
     vst1_lane_f32((float32_t *)(x + 3 * ovs), vget_high_f32(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
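
/* STM4 scatters the four lanes of v to x[0], x[ovs], x[2*ovs] and x[3*ovs];
   four such stores with consecutive x addresses implement the 4x4 real
   transpose, which is why STN4 can be a no-op. */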

#define FLIP_RI(x) vrev64q_f32(x)

static inline V VCONJ(V x)
{
#if 1
     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VMUL(x, pm);
#endif
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
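
/* VBYI multiplies each complex element by i: VCONJ negates the imaginary
   lanes and vrev64q_f32 then swaps the real/imaginary halves of each
   64-bit pair, so (a, b) becomes (-b, a). */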

static inline V VFMAI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFMA(FLIP_RI(b), mp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFNMS(FLIP_RI(b), mp, c);
}

static inline V VFMACONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFMA(b, pm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFNMS(b, pm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}
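
/* In complex terms, per complex element:
     VFMAI(b, c)     = c + i*b
     VFNMSI(b, c)    = c - i*b
     VFMACONJ(b, c)  = conj(b) + c
     VFNMSCONJ(b, c) = c - conj(b)
     VFMSCONJ(b, c)  = conj(b) - c
   where FLIP_RI(b) times the (-1, 1) pattern produces i*b and the
   (1, -1) pattern produces the conjugate. */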

#if 1
#define VEXTRACT_REIM(tr, ti, tx)                                  \
     {                                                             \
          tr = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 0),    \
                            vdup_lane_f32(vget_high_f32(tx), 0));  \
          ti = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 1),    \
                            vdup_lane_f32(vget_high_f32(tx), 1));  \
     }
#else
/* this alternative might be faster in an ideal world, but gcc likes
   to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx)                \
     {                                           \
          float32x4x2_t vvv = vtrnq_f32(tx, tx); \
          tr = vvv.val[0];                       \
          ti = vvv.val[1];                       \
     }
#endif
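
/* With tx = {r0, i0, r1, i1}, VEXTRACT_REIM yields tr = {r0, r0, r1, r1}
   and ti = {i0, i0, i1, i1}, i.e. the real and imaginary parts of each
   complex element duplicated across both lanes of its slot. */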

static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
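
/* Writing t for the complex value extracted from tx and s for sr, the
   four helpers compute, per complex element:
     VZMUL(tx, sr)   = t * s
     VZMULJ(tx, sr)  = conj(t) * s
     VZMULI(tx, sr)  = i * t * s
     VZMULIJ(tx, sr) = i * conj(t) * s */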

/* twiddle storage #1: compact, slower */
#define VTW1(v, x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v, x)                                                        \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},  \
     {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}
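
/* With the VTW2 layout above, tr holds the duplicated cosines and ti the
   sign-alternated sines, so BYTW2 multiplies sr by the twiddle factor and
   BYTWJ2 by its conjugate; the doubled storage avoids the VEXTRACT_REIM
   shuffles used by BYTW1/BYTWJ1. */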

/* twiddle storage #3 */
#define VTW3(v, x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)

/* twiddle storage for split arrays */
#define VTWS(v, x)                                                          \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
     {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"