Chris@42
|
1 /*
|
Chris@42
|
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
|
Chris@42
|
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
Chris@42
|
4 *
|
Chris@42
|
5 * This program is free software; you can redistribute it and/or modify
|
Chris@42
|
6 * it under the terms of the GNU General Public License as published by
|
Chris@42
|
7 * the Free Software Foundation; either version 2 of the License, or
|
Chris@42
|
8 * (at your option) any later version.
|
Chris@42
|
9 *
|
Chris@42
|
10 * This program is distributed in the hope that it will be useful,
|
Chris@42
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Chris@42
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Chris@42
|
13 * GNU General Public License for more details.
|
Chris@42
|
14 *
|
Chris@42
|
15 * You should have received a copy of the GNU General Public License
|
Chris@42
|
16 * along with this program; if not, write to the Free Software
|
Chris@42
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Chris@42
|
18 *
|
Chris@42
|
19 */
|
Chris@42
|
20
|
Chris@42
|
21 #include "codelet-dft.h"
|
Chris@42
|
22 #include SIMD_HEADER
|
Chris@42
|
23
|
Chris@42
|
/* Expands to an `extern const t x;` declaration followed by the start of
   the definition `const t x`; the initializer is written after the macro
   invocation, as in  EXTERN_CONST(type, name) = { ... };  */
#define EXTERN_CONST(t, x) extern const t x; const t x
|
Chris@42
|
25
|
Chris@42
|
26 static int n1b_okp(const kdft_desc *d,
|
Chris@42
|
27 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@42
|
28 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@42
|
29 const planner *plnr)
|
Chris@42
|
30 {
|
Chris@42
|
31 return (1
|
Chris@42
|
32 && ALIGNED(ii)
|
Chris@42
|
33 && ALIGNED(io)
|
Chris@42
|
34 && !NO_SIMDP(plnr)
|
Chris@42
|
35 && SIMD_STRIDE_OK(is)
|
Chris@42
|
36 && SIMD_STRIDE_OK(os)
|
Chris@42
|
37 && SIMD_VSTRIDE_OK(ivs)
|
Chris@42
|
38 && SIMD_VSTRIDE_OK(ovs)
|
Chris@42
|
39 && ri == ii + 1
|
Chris@42
|
40 && ro == io + 1
|
Chris@42
|
41 && (vl % VL) == 0
|
Chris@42
|
42 && (!d->is || (d->is == is))
|
Chris@42
|
43 && (!d->os || (d->os == os))
|
Chris@42
|
44 && (!d->ivs || (d->ivs == ivs))
|
Chris@42
|
45 && (!d->ovs || (d->ovs == ovs))
|
Chris@42
|
46 );
|
Chris@42
|
47 }
|
Chris@42
|
48
|
Chris@42
|
49 EXTERN_CONST(kdft_genus, XSIMD(dft_n1bsimd_genus)) = { n1b_okp, VL };
|
Chris@42
|
50
|
Chris@42
|
51 static int n1f_okp(const kdft_desc *d,
|
Chris@42
|
52 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@42
|
53 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@42
|
54 const planner *plnr)
|
Chris@42
|
55 {
|
Chris@42
|
56 return (1
|
Chris@42
|
57 && ALIGNED(ri)
|
Chris@42
|
58 && ALIGNED(ro)
|
Chris@42
|
59 && !NO_SIMDP(plnr)
|
Chris@42
|
60 && SIMD_STRIDE_OK(is)
|
Chris@42
|
61 && SIMD_STRIDE_OK(os)
|
Chris@42
|
62 && SIMD_VSTRIDE_OK(ivs)
|
Chris@42
|
63 && SIMD_VSTRIDE_OK(ovs)
|
Chris@42
|
64 && ii == ri + 1
|
Chris@42
|
65 && io == ro + 1
|
Chris@42
|
66 && (vl % VL) == 0
|
Chris@42
|
67 && (!d->is || (d->is == is))
|
Chris@42
|
68 && (!d->os || (d->os == os))
|
Chris@42
|
69 && (!d->ivs || (d->ivs == ivs))
|
Chris@42
|
70 && (!d->ovs || (d->ovs == ovs))
|
Chris@42
|
71 );
|
Chris@42
|
72 }
|
Chris@42
|
73
|
Chris@42
|
74 EXTERN_CONST(kdft_genus, XSIMD(dft_n1fsimd_genus)) = { n1f_okp, VL };
|
Chris@42
|
75
|
Chris@42
|
76 static int n2b_okp(const kdft_desc *d,
|
Chris@42
|
77 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@42
|
78 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@42
|
79 const planner *plnr)
|
Chris@42
|
80 {
|
Chris@42
|
81 return (1
|
Chris@42
|
82 && ALIGNEDA(ii)
|
Chris@42
|
83 && ALIGNEDA(io)
|
Chris@42
|
84 && !NO_SIMDP(plnr)
|
Chris@42
|
85 && SIMD_STRIDE_OKA(is)
|
Chris@42
|
86 && SIMD_VSTRIDE_OKA(ivs)
|
Chris@42
|
87 && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
|
Chris@42
|
88 && SIMD_STRIDE_OKPAIR(ovs)
|
Chris@42
|
89 && ri == ii + 1
|
Chris@42
|
90 && ro == io + 1
|
Chris@42
|
91 && (vl % VL) == 0
|
Chris@42
|
92 && (!d->is || (d->is == is))
|
Chris@42
|
93 && (!d->os || (d->os == os))
|
Chris@42
|
94 && (!d->ivs || (d->ivs == ivs))
|
Chris@42
|
95 && (!d->ovs || (d->ovs == ovs))
|
Chris@42
|
96 );
|
Chris@42
|
97 }
|
Chris@42
|
98
|
Chris@42
|
99 EXTERN_CONST(kdft_genus, XSIMD(dft_n2bsimd_genus)) = { n2b_okp, VL };
|
Chris@42
|
100
|
Chris@42
|
101 static int n2f_okp(const kdft_desc *d,
|
Chris@42
|
102 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@42
|
103 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@42
|
104 const planner *plnr)
|
Chris@42
|
105 {
|
Chris@42
|
106 return (1
|
Chris@42
|
107 && ALIGNEDA(ri)
|
Chris@42
|
108 && ALIGNEDA(ro)
|
Chris@42
|
109 && !NO_SIMDP(plnr)
|
Chris@42
|
110 && SIMD_STRIDE_OKA(is)
|
Chris@42
|
111 && SIMD_VSTRIDE_OKA(ivs)
|
Chris@42
|
112 && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
|
Chris@42
|
113 && SIMD_STRIDE_OKPAIR(ovs)
|
Chris@42
|
114 && ii == ri + 1
|
Chris@42
|
115 && io == ro + 1
|
Chris@42
|
116 && (vl % VL) == 0
|
Chris@42
|
117 && (!d->is || (d->is == is))
|
Chris@42
|
118 && (!d->os || (d->os == os))
|
Chris@42
|
119 && (!d->ivs || (d->ivs == ivs))
|
Chris@42
|
120 && (!d->ovs || (d->ovs == ovs))
|
Chris@42
|
121 );
|
Chris@42
|
122 }
|
Chris@42
|
123
|
Chris@42
|
124 EXTERN_CONST(kdft_genus, XSIMD(dft_n2fsimd_genus)) = { n2f_okp, VL };
|
Chris@42
|
125
|
Chris@42
|
126 static int n2s_okp(const kdft_desc *d,
|
Chris@42
|
127 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@42
|
128 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@42
|
129 const planner *plnr)
|
Chris@42
|
130 {
|
Chris@42
|
131 return (1
|
Chris@42
|
132 && !NO_SIMDP(plnr)
|
Chris@42
|
133 && ALIGNEDA(ri)
|
Chris@42
|
134 && ALIGNEDA(ii)
|
Chris@42
|
135 && ALIGNEDA(ro)
|
Chris@42
|
136 && ALIGNEDA(io)
|
Chris@42
|
137 && SIMD_STRIDE_OKA(is)
|
Chris@42
|
138 && ivs == 1
|
Chris@42
|
139 && os == 1
|
Chris@42
|
140 && SIMD_STRIDE_OKA(ovs)
|
Chris@42
|
141 && (vl % (2 * VL)) == 0
|
Chris@42
|
142 && (!d->is || (d->is == is))
|
Chris@42
|
143 && (!d->os || (d->os == os))
|
Chris@42
|
144 && (!d->ivs || (d->ivs == ivs))
|
Chris@42
|
145 && (!d->ovs || (d->ovs == ovs))
|
Chris@42
|
146 );
|
Chris@42
|
147 }
|
Chris@42
|
148
|
Chris@42
|
149 EXTERN_CONST(kdft_genus, XSIMD(dft_n2ssimd_genus)) = { n2s_okp, 2 * VL };
|
Chris@42
|
150
|
Chris@42
|
151 static int q1b_okp(const ct_desc *d,
|
Chris@42
|
152 const R *rio, const R *iio,
|
Chris@42
|
153 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
154 const planner *plnr)
|
Chris@42
|
155 {
|
Chris@42
|
156 return (1
|
Chris@42
|
157 && ALIGNED(iio)
|
Chris@42
|
158 && !NO_SIMDP(plnr)
|
Chris@42
|
159 && SIMD_STRIDE_OK(rs)
|
Chris@42
|
160 && SIMD_STRIDE_OK(vs)
|
Chris@42
|
161 && SIMD_VSTRIDE_OK(ms)
|
Chris@42
|
162 && rio == iio + 1
|
Chris@42
|
163 && (m % VL) == 0
|
Chris@42
|
164 && (mb % VL) == 0
|
Chris@42
|
165 && (me % VL) == 0
|
Chris@42
|
166 && (!d->rs || (d->rs == rs))
|
Chris@42
|
167 && (!d->vs || (d->vs == vs))
|
Chris@42
|
168 && (!d->ms || (d->ms == ms))
|
Chris@42
|
169 );
|
Chris@42
|
170 }
|
Chris@42
|
171 EXTERN_CONST(ct_genus, XSIMD(dft_q1bsimd_genus)) = { q1b_okp, VL };
|
Chris@42
|
172
|
Chris@42
|
173 static int q1f_okp(const ct_desc *d,
|
Chris@42
|
174 const R *rio, const R *iio,
|
Chris@42
|
175 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
176 const planner *plnr)
|
Chris@42
|
177 {
|
Chris@42
|
178 return (1
|
Chris@42
|
179 && ALIGNED(rio)
|
Chris@42
|
180 && !NO_SIMDP(plnr)
|
Chris@42
|
181 && SIMD_STRIDE_OK(rs)
|
Chris@42
|
182 && SIMD_STRIDE_OK(vs)
|
Chris@42
|
183 && SIMD_VSTRIDE_OK(ms)
|
Chris@42
|
184 && iio == rio + 1
|
Chris@42
|
185 && (m % VL) == 0
|
Chris@42
|
186 && (mb % VL) == 0
|
Chris@42
|
187 && (me % VL) == 0
|
Chris@42
|
188 && (!d->rs || (d->rs == rs))
|
Chris@42
|
189 && (!d->vs || (d->vs == vs))
|
Chris@42
|
190 && (!d->ms || (d->ms == ms))
|
Chris@42
|
191 );
|
Chris@42
|
192 }
|
Chris@42
|
193 EXTERN_CONST(ct_genus, XSIMD(dft_q1fsimd_genus)) = { q1f_okp, VL };
|
Chris@42
|
194
|
Chris@42
|
195 static int t_okp_common(const ct_desc *d,
|
Chris@42
|
196 const R *rio, const R *iio,
|
Chris@42
|
197 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
198 const planner *plnr)
|
Chris@42
|
199 {
|
Chris@42
|
200 UNUSED(rio); UNUSED(iio);
|
Chris@42
|
201 return (1
|
Chris@42
|
202 && !NO_SIMDP(plnr)
|
Chris@42
|
203 && SIMD_STRIDE_OKA(rs)
|
Chris@42
|
204 && SIMD_VSTRIDE_OKA(ms)
|
Chris@42
|
205 && (m % VL) == 0
|
Chris@42
|
206 && (mb % VL) == 0
|
Chris@42
|
207 && (me % VL) == 0
|
Chris@42
|
208 && (!d->rs || (d->rs == rs))
|
Chris@42
|
209 && (!d->vs || (d->vs == vs))
|
Chris@42
|
210 && (!d->ms || (d->ms == ms))
|
Chris@42
|
211 );
|
Chris@42
|
212 }
|
Chris@42
|
213
|
Chris@42
|
214 static int t_okp_commonu(const ct_desc *d,
|
Chris@42
|
215 const R *rio, const R *iio,
|
Chris@42
|
216 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
217 const planner *plnr)
|
Chris@42
|
218 {
|
Chris@42
|
219 UNUSED(rio); UNUSED(iio); UNUSED(m);
|
Chris@42
|
220 return (1
|
Chris@42
|
221 && !NO_SIMDP(plnr)
|
Chris@42
|
222 && SIMD_STRIDE_OK(rs)
|
Chris@42
|
223 && SIMD_VSTRIDE_OK(ms)
|
Chris@42
|
224 && (mb % VL) == 0
|
Chris@42
|
225 && (me % VL) == 0
|
Chris@42
|
226 && (!d->rs || (d->rs == rs))
|
Chris@42
|
227 && (!d->vs || (d->vs == vs))
|
Chris@42
|
228 && (!d->ms || (d->ms == ms))
|
Chris@42
|
229 );
|
Chris@42
|
230 }
|
Chris@42
|
231
|
Chris@42
|
232 static int t_okp_t1f(const ct_desc *d,
|
Chris@42
|
233 const R *rio, const R *iio,
|
Chris@42
|
234 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
235 const planner *plnr)
|
Chris@42
|
236 {
|
Chris@42
|
237 return t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
238 && iio == rio + 1
|
Chris@42
|
239 && ALIGNEDA(rio);
|
Chris@42
|
240 }
|
Chris@42
|
241
|
Chris@42
|
242 EXTERN_CONST(ct_genus, XSIMD(dft_t1fsimd_genus)) = { t_okp_t1f, VL };
|
Chris@42
|
243
|
Chris@42
|
244 static int t_okp_t1fu(const ct_desc *d,
|
Chris@42
|
245 const R *rio, const R *iio,
|
Chris@42
|
246 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
247 const planner *plnr)
|
Chris@42
|
248 {
|
Chris@42
|
249 return t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
250 && iio == rio + 1
|
Chris@42
|
251 && ALIGNED(rio);
|
Chris@42
|
252 }
|
Chris@42
|
253
|
Chris@42
|
254 EXTERN_CONST(ct_genus, XSIMD(dft_t1fusimd_genus)) = { t_okp_t1fu, VL };
|
Chris@42
|
255
|
Chris@42
|
256 static int t_okp_t1b(const ct_desc *d,
|
Chris@42
|
257 const R *rio, const R *iio,
|
Chris@42
|
258 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
259 const planner *plnr)
|
Chris@42
|
260 {
|
Chris@42
|
261 return t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
262 && rio == iio + 1
|
Chris@42
|
263 && ALIGNEDA(iio);
|
Chris@42
|
264 }
|
Chris@42
|
265
|
Chris@42
|
266 EXTERN_CONST(ct_genus, XSIMD(dft_t1bsimd_genus)) = { t_okp_t1b, VL };
|
Chris@42
|
267
|
Chris@42
|
268 static int t_okp_t1bu(const ct_desc *d,
|
Chris@42
|
269 const R *rio, const R *iio,
|
Chris@42
|
270 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
271 const planner *plnr)
|
Chris@42
|
272 {
|
Chris@42
|
273 return t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
274 && rio == iio + 1
|
Chris@42
|
275 && ALIGNED(iio);
|
Chris@42
|
276 }
|
Chris@42
|
277
|
Chris@42
|
278 EXTERN_CONST(ct_genus, XSIMD(dft_t1busimd_genus)) = { t_okp_t1bu, VL };
|
Chris@42
|
279
|
Chris@42
|
280 /* use t2* codelets only when n = m*radix is small, because
|
Chris@42
|
281 t2* codelets use ~2n twiddle factors (instead of ~n) */
|
Chris@42
|
282 static int small_enough(const ct_desc *d, INT m)
|
Chris@42
|
283 {
|
Chris@42
|
284 return m * d->radix <= 16384;
|
Chris@42
|
285 }
|
Chris@42
|
286
|
Chris@42
|
287 static int t_okp_t2f(const ct_desc *d,
|
Chris@42
|
288 const R *rio, const R *iio,
|
Chris@42
|
289 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
290 const planner *plnr)
|
Chris@42
|
291 {
|
Chris@42
|
292 return t_okp_t1f(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
293 && small_enough(d, m);
|
Chris@42
|
294 }
|
Chris@42
|
295
|
Chris@42
|
296 EXTERN_CONST(ct_genus, XSIMD(dft_t2fsimd_genus)) = { t_okp_t2f, VL };
|
Chris@42
|
297
|
Chris@42
|
298 static int t_okp_t2b(const ct_desc *d,
|
Chris@42
|
299 const R *rio, const R *iio,
|
Chris@42
|
300 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
301 const planner *plnr)
|
Chris@42
|
302 {
|
Chris@42
|
303 return t_okp_t1b(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@42
|
304 && small_enough(d, m);
|
Chris@42
|
305 }
|
Chris@42
|
306
|
Chris@42
|
307 EXTERN_CONST(ct_genus, XSIMD(dft_t2bsimd_genus)) = { t_okp_t2b, VL };
|
Chris@42
|
308
|
Chris@42
|
309 static int ts_okp(const ct_desc *d,
|
Chris@42
|
310 const R *rio, const R *iio,
|
Chris@42
|
311 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@42
|
312 const planner *plnr)
|
Chris@42
|
313 {
|
Chris@42
|
314 UNUSED(rio);
|
Chris@42
|
315 UNUSED(iio);
|
Chris@42
|
316 return (1
|
Chris@42
|
317 && !NO_SIMDP(plnr)
|
Chris@42
|
318 && ALIGNEDA(rio)
|
Chris@42
|
319 && ALIGNEDA(iio)
|
Chris@42
|
320 && SIMD_STRIDE_OKA(rs)
|
Chris@42
|
321 && ms == 1
|
Chris@42
|
322 && (m % (2 * VL)) == 0
|
Chris@42
|
323 && (mb % (2 * VL)) == 0
|
Chris@42
|
324 && (me % (2 * VL)) == 0
|
Chris@42
|
325 && (!d->rs || (d->rs == rs))
|
Chris@42
|
326 && (!d->vs || (d->vs == vs))
|
Chris@42
|
327 && (!d->ms || (d->ms == ms))
|
Chris@42
|
328 );
|
Chris@42
|
329 }
|
Chris@42
|
330
|
Chris@42
|
331 EXTERN_CONST(ct_genus, XSIMD(dft_tssimd_genus)) = { ts_okp, 2 * VL };
|