comparison src/fftw-3.3.5/simd-support/simd-generic128.h @ 42:2cd0e3b3e1fd

Current fftw source
author Chris Cannam
date Tue, 18 Oct 2016 13:40:26 +0100
parents
children
comparison
equal deleted inserted replaced
41:481f5f8c5634 42:2cd0e3b3e1fd
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * Generic128d added by Romain Dolbeau, and turned into simd-generic128.h
6 * with single & double precision by Erik Lindahl.
7 * Romain Dolbeau hereby places his modifications in the public domain.
8 * Erik Lindahl hereby places his modifications in the public domain.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 *
24 */
25
26
/* This backend supports only single or double precision; long-double and
   quad-precision builds must use a different SIMD implementation. */
#if defined(FFTW_LDOUBLE) || defined(FFTW_QUAD)
# error "Generic simd128 only works in single or double precision"
#endif

#define SIMD_SUFFIX _generic_simd128 /* for renaming */

/* DS(d,s) selects the double- or single-precision variant of an expression.
   A 128-bit vector holds two interleaved complex floats {r0,i0,r1,i1} in
   single precision, or one complex double {r0,i0} in double precision.
   VDUPL broadcasts the real (even-lane) part of each complex element;
   VDUPH broadcasts the imaginary (odd-lane) part.
   DVK declares a vector constant with every lane equal to val. */
#ifdef FFTW_SINGLE
# define DS(d,s) s /* single-precision option */
# define VDUPL(x) (V){x[0],x[0],x[2],x[2]}
# define VDUPH(x) (V){x[1],x[1],x[3],x[3]}
# define DVK(var, val) V var = {val,val,val,val}
#else
# define DS(d,s) d /* double-precision option */
# define VDUPL(x) (V){x[0],x[0]}
# define VDUPH(x) (V){x[1],x[1]}
# define DVK(var, val) V var = {val, val}
#endif

#define VL DS(1,2) /* SIMD vector length, in term of complex numbers */
/* In single precision the vector stride (in reals) must be exactly 2 so the
   two complex numbers in a vector are adjacent; any stride is fine for the
   one-complex-per-vector double case. */
#define SIMD_VSTRIDE_OKA(x) DS(1,((x) == 2))
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

/* V: a 16-byte vector of the real scalar type R, using the GCC/Clang
   vector_size extension (supports element-wise +, -, * and [] indexing). */
typedef DS(double,float) V __attribute__ ((vector_size(16)));

/* Element-wise vector arithmetic. */
#define VADD(a,b) ((a)+(b))
#define VSUB(a,b) ((a)-(b))
#define VMUL(a,b) ((a)*(b))


/* LDK: "load" a constant operand; a plain value for this generic backend. */
#define LDK(x) x
57
58 static inline V LDA(const R *x, INT ivs, const R *aligned_like)
59 {
60 (void)aligned_like; /* UNUSED */
61 (void)ivs; /* UNUSED */
62 return *(const V *)x;
63 }
64
65 static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
66 {
67 (void)aligned_like; /* UNUSED */
68 (void)ovs; /* UNUSED */
69 *(V *)x = v;
70 }
71
/* Strided (possibly unaligned) load of VL complex numbers: the first
   complex comes from x[0..1]; in single precision a second complex is
   gathered from x[ivs..ivs+1], where ivs is the input vector stride in
   reals (ivs is unused in double precision). */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     V res;
     res[0] = x[0];
     res[1] = x[1];
#ifdef FFTW_SINGLE
     res[2] = x[ivs];
     res[3] = x[ivs+1];
#endif
     return res;
}
84
#ifdef FFTW_SINGLE
/* Strided store of VL complex numbers: the high complex pair of v goes to
   x[ovs..ovs+1] and the low pair to x[0..1], where ovs is the output vector
   stride in reals.  ST has to be separate (rather than mirroring LD) due to
   the storage hack requiring reverse order: the high half must be written
   before the low half.
   Fix: removed the spurious `(void)ovs; /* UNUSED *` cast — ovs IS used
   below, so the cast was a no-op with a misleading comment. */
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* deliberately store the high pair first */
     *(x + ovs    ) = v[2];
     *(x + ovs + 1) = v[3];
     *(x          ) = v[0];
     *(x       + 1) = v[1];
}
#else
/* FFTW_DOUBLE: one complex per vector, so a plain aligned store suffices. */
# define ST STA
#endif
100
#ifdef FFTW_SINGLE
/* STM2 reuses ST (whose reverse-order stores implement the required
   interleaving); STN2 is then a no-op. */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

/* Store four vectors as a transposed 4x4 block of reals: lane j of each
   vector vk goes to x[j*ovs + k], i.e. row j holds lane j of v0..v3.
   STM4 below is a no-op because STN4 does all of the work. */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     *(x              ) = v0[0];
     *(x           + 1) = v1[0];
     *(x           + 2) = v2[0];
     *(x           + 3) = v3[0];
     *(x +     ovs    ) = v0[1];
     *(x +     ovs + 1) = v1[1];
     *(x +     ovs + 2) = v2[1];
     *(x +     ovs + 3) = v3[1];
     *(x + 2 * ovs    ) = v0[2];
     *(x + 2 * ovs + 1) = v1[2];
     *(x + 2 * ovs + 2) = v2[2];
     *(x + 2 * ovs + 3) = v3[2];
     *(x + 3 * ovs    ) = v0[3];
     *(x + 3 * ovs + 1) = v1[3];
     *(x + 3 * ovs + 2) = v2[3];
     *(x + 3 * ovs + 3) = v3[3];
}
#define STM4(x, v, ovs, aligned_like) /* no-op */


#else
/* FFTW_DOUBLE */

#define STM2 STA
#define STN2(x, v0, v1, ovs) /* nop */

/* Scatter the two lanes of v to x[0] and x[ovs]; the STN4 counterpart is
   then a no-op. */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     *(x) = v[0];
     *(x+ovs) = v[1];
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nothing */
#endif
141
142
143 static inline V FLIP_RI(V x)
144 {
145 #ifdef FFTW_SINGLE
146 return (V){x[1],x[0],x[3],x[2]};
147 #else
148 return (V){x[1],x[0]};
149 #endif
150 }
151
152 static inline V VCONJ(V x)
153 {
154 #ifdef FFTW_SINGLE
155 return (V){x[0],-x[1],x[2],-x[3]};
156 #else
157 return (V){x[0],-x[1]};
158 #endif
159 }
160
161 static inline V VBYI(V x)
162 {
163 x = VCONJ(x);
164 x = FLIP_RI(x);
165 return x;
166 }
167
/* FMA support, expressed with plain mul/add (the compiler may fuse them):
     VFMA(a,b,c)    = a*b + c
     VFNMS(a,b,c)   = c - a*b
     VFMS(a,b,c)    = a*b - c
     VFMAI(b,c)     = c + i*b        (complex, per element)
     VFNMSI(b,c)    = c - i*b
     VFMACONJ(b,c)  = conj(b) + c
     VFMSCONJ(b,c)  = conj(b) - c
     VFNMSCONJ(b,c) = c - conj(b) */
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))
177
178 static inline V VZMUL(V tx, V sr)
179 {
180 V tr = VDUPL(tx);
181 V ti = VDUPH(tx);
182 tr = VMUL(sr, tr);
183 sr = VBYI(sr);
184 return VFMA(ti, sr, tr);
185 }
186
187 static inline V VZMULJ(V tx, V sr)
188 {
189 V tr = VDUPL(tx);
190 V ti = VDUPH(tx);
191 tr = VMUL(sr, tr);
192 sr = VBYI(sr);
193 return VFNMS(ti, sr, tr);
194 }
195
196 static inline V VZMULI(V tx, V sr)
197 {
198 V tr = VDUPL(tx);
199 V ti = VDUPH(tx);
200 ti = VMUL(ti, sr);
201 sr = VBYI(sr);
202 return VFMS(tr, sr, ti);
203 }
204
205 static inline V VZMULIJ(V tx, V sr)
206 {
207 V tr = VDUPL(tx);
208 V ti = VDUPH(tx);
209 ti = VMUL(ti, sr);
210 sr = VBYI(sr);
211 return VFMA(tr, sr, ti);
212 }
213
/* twiddle storage #1: compact, slower */
#ifdef FFTW_SINGLE
/* Two adjacent twiddles stored split as {cos,cos',sin,sin'} so that one
   aligned vector load yields them in VDUPL/VDUPH-ready form. */
# define VTW1(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
/* Multiply sr by the twiddle factors stored at t. */
static inline V BYTW1(const R *t, V sr)
{
     return VZMUL(LDA(t, 2, t), sr);
}
/* Multiply sr by the conjugate of the twiddle factors stored at t. */
static inline V BYTWJ1(const R *t, V sr)
{
     return VZMULJ(LDA(t, 2, t), sr);
}
#else /* !FFTW_SINGLE */
/* One complex exponential twiddle per vector. */
# define VTW1(v,x) {TW_CEXP, v, x}
static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}
static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}
#endif
#define TWVL1 (VL) /* twiddle storage consumed per VTW1, in complex numbers */
240
/* twiddle storage #2: twice the space, faster (when in cache) */
/* Each twiddle is stored pre-expanded as a {c,c,...} vector followed by a
   {-s,s,...} vector, so BYTW2/BYTWJ2 need no run-time shuffling of the
   twiddle data. */
#ifdef FFTW_SINGLE
# define VTW2(v,x) \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x}, \
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#else /* !FFTW_SINGLE */
# define VTW2(v,x) \
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#endif
#define TWVL2 (2 * VL)
251 static inline V BYTW2(const R *t, V sr)
252 {
253 const V *twp = (const V *)t;
254 V si = FLIP_RI(sr);
255 V tr = twp[0], ti = twp[1];
256 return VFMA(tr, sr, VMUL(ti, si));
257 }
258 static inline V BYTWJ2(const R *t, V sr)
259 {
260 const V *twp = (const V *)t;
261 V si = FLIP_RI(sr);
262 V tr = twp[0], ti = twp[1];
263 return VFNMS(ti, si, VMUL(tr, sr));
264 }
265
/* twiddle storage #3 */
#ifdef FFTW_SINGLE
/* Two complex exponentials per vector. */
# define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
# define TWVL3 (VL)
#else
/* In double precision, storage #3 coincides with the compact storage #1. */
# define VTW3(v,x) VTW1(v,x)
# define TWVL3 TWVL1
#endif

/* twiddle storage for split arrays */
/* All cosines first, then all sines (2*VL reals per VTWS entry). */
#ifdef FFTW_SINGLE
# define VTWS(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x}, \
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#else
# define VTWS(v,x) \
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#endif
#define TWVLS (2 * VL)

/* No per-function SIMD state to restore in this generic backend. */
#define VLEAVE() /* nothing */

#include "simd-common.h"