comparison src/fftw-3.3.3/simd-support/simd-neon.h @ 10:37bf6b4a2645

Add FFTW3

author: Chris Cannam
date:   Wed, 20 Mar 2013 15:35:50 +0000
comparing: 9:c0fb53affa76 with 10:37bf6b4a2645

/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef FFTW_SINGLE
#error "NEON only works in single precision"
#endif

/* define these unconditionally, because they are used by
   taint.c which is compiled without neon */
#define SIMD_SUFFIX _neon  /* for renaming */
#define VL 2               /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#if defined(__GNUC__) && !defined(__ARM_NEON__)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering.  VLIT may or may not be wrong for big-endian systems. */
typedef float32x4_t V;

#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b*c
      t1 = a - b*c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b*c
      t1 = a
      t1 -= b*c

   At least one move must be implemented, negating the advantage of
   the FMA in the first place.  At least some versions of gcc generate
   both moves.  So we are better off generating t = b*c; t0 = a+t; t1 = a-t. */
#if HAVE_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif

#define VADD(a, b) vaddq_f32(a, b)
#define VSUB(a, b) vsubq_f32(a, b)
#define VMUL(a, b) vmulq_f32(a, b)
#define VFMA(a, b, c) vmlaq_f32(c, a, b)  /* a*b + c */
#define VFNMS(a, b, c) vmlsq_f32(c, a, b) /* FNMS = -(a*b - c) in powerpc terminology;
                                             MLS = c - a*b in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS = a*b - c in powerpc terminology;
                                             no equivalent ARM instruction (?) */
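
/* Editorial sketch, not part of the original FFTW source: the non-FMA
   scheme described in the comment above shares a single multiply
   between the add and the subtract of a butterfly.  A hypothetical
   helper, disabled so it does not affect the build: */
#if 0
static inline void butterfly_sketch(V a, V b, V c, V *t0, V *t1)
{
     V t = VMUL(b, c);   /* one multiply... */
     *t0 = VADD(a, t);   /* ...feeds the add... */
     *t1 = VSUB(a, t);   /* ...and the subtract, with no extra register moves */
}
#endif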

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vld1q_f32((const float32_t *)x);
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vcombine_f32(vld1_f32((float32_t *)x), vld1_f32((float32_t *)(x + ivs)));
}

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1q_f32((float32_t *)x, v);
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon store-low occurring
        after store-high */
     vst1_f32((float32_t *)(x + ovs), vget_high_f32(v));
     vst1_f32((float32_t *)x, vget_low_f32(v));
}
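
/* Editorial note, inferred from the definitions above rather than
   stated in the original: LD gathers two complex numbers from a
   strided array, so the vector holds {x[0], x[1], x[ivs], x[ivs+1]},
   and ST scatters them back, high half first.  A disabled usage
   sketch with a hypothetical helper name: */
#if 0
static inline void copy_two_complex_sketch(R *dst, const R *src, INT stride)
{
     V v = LD(src, stride, 0);  /* {re0, im0, re1, im1} */
     ST(dst, v, stride, 0);     /* writes re1,im1 at dst+stride, then re0,im0 at dst */
}
#endif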

/* 2x2 complex transpose and store */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1_lane_f32((float32_t *)(x)          , vget_low_f32(v), 0);
     vst1_lane_f32((float32_t *)(x + ovs)    , vget_low_f32(v), 1);
     vst1_lane_f32((float32_t *)(x + 2 * ovs), vget_high_f32(v), 0);
     vst1_lane_f32((float32_t *)(x + 3 * ovs), vget_high_f32(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */

#define FLIP_RI(x) vrev64q_f32(x)

static inline V VCONJ(V x)
{
#if 1
     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VMUL(x, pm);
#endif
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}

static inline V VFMAI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFMA(FLIP_RI(b), mp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFNMS(FLIP_RI(b), mp, c);
}

static inline V VFMACONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFMA(b, pm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFNMS(b, pm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}

#if 1
#define VEXTRACT_REIM(tr, ti, tx)                                  \
     {                                                             \
          tr = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 0),    \
                            vdup_lane_f32(vget_high_f32(tx), 0));  \
          ti = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 1),    \
                            vdup_lane_f32(vget_high_f32(tx), 1));  \
     }
#else
/* this alternative might be faster in an ideal world, but gcc likes
   to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx)                   \
     {                                              \
          float32x4x2_t vvv = vtrnq_f32(tx, tx);    \
          tr = vvv.val[0];                          \
          ti = vvv.val[1];                          \
     }
#endif

static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
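
/* Editorial summary, derived by reading the four routines above (not
   a comment from the original source): with each complex element of
   tx taken as t = tr + i*ti and each complex element of sr as s,
   these compute

      VZMUL(tx, sr)   = t * s
      VZMULJ(tx, sr)  = conj(t) * s
      VZMULI(tx, sr)  = i * (t * s)
      VZMULIJ(tx, sr) = i * (conj(t) * s)

   A disabled scalar model of the first variant, for cross-checking: */
#if 0
#include <complex.h>
static inline float complex vzmul_scalar_model(float complex t, float complex s)
{
     return t * s; /* should match each VZMUL lane pair {re, im} */
}
#endif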

/* twiddle storage #1: compact, slower */
#define VTW1(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
# define VTW2(v,x)                                                        \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},  \
     {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2*VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}
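
/* Editorial note, inferred from VTW2 above rather than stated in the
   original: storage #2 keeps the twiddle's cosine and sign-alternated
   sine parts as two separate, lane-duplicated vectors, so BYTW2/BYTWJ2
   reach the same complex product as BYTW1/BYTWJ1 with one FLIP_RI and
   no VEXTRACT_REIM shuffles, at the cost of twice the table space. */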

/* twiddle storage #3 */
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)

/* twiddle storage for split arrays */
# define VTWS(v,x)                                                          \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},  \
     {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"