Chris@10
|
1 /*
|
Chris@10
|
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
|
Chris@10
|
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
|
Chris@10
|
4 *
|
Chris@10
|
5 * This program is free software; you can redistribute it and/or modify
|
Chris@10
|
6 * it under the terms of the GNU General Public License as published by
|
Chris@10
|
7 * the Free Software Foundation; either version 2 of the License, or
|
Chris@10
|
8 * (at your option) any later version.
|
Chris@10
|
9 *
|
Chris@10
|
10 * This program is distributed in the hope that it will be useful,
|
Chris@10
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Chris@10
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Chris@10
|
13 * GNU General Public License for more details.
|
Chris@10
|
14 *
|
Chris@10
|
15 * You should have received a copy of the GNU General Public License
|
Chris@10
|
16 * along with this program; if not, write to the Free Software
|
Chris@10
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Chris@10
|
18 *
|
Chris@10
|
19 */
|
Chris@10
|
20
|
Chris@10
|
21 #include "codelet-dft.h"
|
Chris@10
|
22 #include SIMD_HEADER
|
Chris@10
|
23
|
Chris@10
|
/*
 * EXTERN_CONST(t, x) expands to an extern declaration of the const
 * object x of type t, immediately followed by the start of its
 * definition; the user appends "= { ... };".
 */
#define EXTERN_CONST(t, x) \
     extern const t x;     \
     const t x
|
Chris@10
|
25
|
Chris@10
|
26 static int n1b_okp(const kdft_desc *d,
|
Chris@10
|
27 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@10
|
28 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@10
|
29 const planner *plnr)
|
Chris@10
|
30 {
|
Chris@10
|
31 return (1
|
Chris@10
|
32 && ALIGNED(ii)
|
Chris@10
|
33 && ALIGNED(io)
|
Chris@10
|
34 && !NO_SIMDP(plnr)
|
Chris@10
|
35 && SIMD_STRIDE_OK(is)
|
Chris@10
|
36 && SIMD_STRIDE_OK(os)
|
Chris@10
|
37 && SIMD_VSTRIDE_OK(ivs)
|
Chris@10
|
38 && SIMD_VSTRIDE_OK(ovs)
|
Chris@10
|
39 && ri == ii + 1
|
Chris@10
|
40 && ro == io + 1
|
Chris@10
|
41 && (vl % VL) == 0
|
Chris@10
|
42 && (!d->is || (d->is == is))
|
Chris@10
|
43 && (!d->os || (d->os == os))
|
Chris@10
|
44 && (!d->ivs || (d->ivs == ivs))
|
Chris@10
|
45 && (!d->ovs || (d->ovs == ovs))
|
Chris@10
|
46 );
|
Chris@10
|
47 }
|
Chris@10
|
48
|
Chris@10
|
49 EXTERN_CONST(kdft_genus, XSIMD(dft_n1bsimd_genus)) = { n1b_okp, VL };
|
Chris@10
|
50
|
Chris@10
|
51 static int n1f_okp(const kdft_desc *d,
|
Chris@10
|
52 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@10
|
53 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@10
|
54 const planner *plnr)
|
Chris@10
|
55 {
|
Chris@10
|
56 return (1
|
Chris@10
|
57 && ALIGNED(ri)
|
Chris@10
|
58 && ALIGNED(ro)
|
Chris@10
|
59 && !NO_SIMDP(plnr)
|
Chris@10
|
60 && SIMD_STRIDE_OK(is)
|
Chris@10
|
61 && SIMD_STRIDE_OK(os)
|
Chris@10
|
62 && SIMD_VSTRIDE_OK(ivs)
|
Chris@10
|
63 && SIMD_VSTRIDE_OK(ovs)
|
Chris@10
|
64 && ii == ri + 1
|
Chris@10
|
65 && io == ro + 1
|
Chris@10
|
66 && (vl % VL) == 0
|
Chris@10
|
67 && (!d->is || (d->is == is))
|
Chris@10
|
68 && (!d->os || (d->os == os))
|
Chris@10
|
69 && (!d->ivs || (d->ivs == ivs))
|
Chris@10
|
70 && (!d->ovs || (d->ovs == ovs))
|
Chris@10
|
71 );
|
Chris@10
|
72 }
|
Chris@10
|
73
|
Chris@10
|
74 EXTERN_CONST(kdft_genus, XSIMD(dft_n1fsimd_genus)) = { n1f_okp, VL };
|
Chris@10
|
75
|
Chris@10
|
76 static int n2b_okp(const kdft_desc *d,
|
Chris@10
|
77 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@10
|
78 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@10
|
79 const planner *plnr)
|
Chris@10
|
80 {
|
Chris@10
|
81 return (1
|
Chris@10
|
82 && ALIGNEDA(ii)
|
Chris@10
|
83 && ALIGNEDA(io)
|
Chris@10
|
84 && !NO_SIMDP(plnr)
|
Chris@10
|
85 && SIMD_STRIDE_OKA(is)
|
Chris@10
|
86 && SIMD_VSTRIDE_OKA(ivs)
|
Chris@10
|
87 && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
|
Chris@10
|
88 && SIMD_STRIDE_OKPAIR(ovs)
|
Chris@10
|
89 && ri == ii + 1
|
Chris@10
|
90 && ro == io + 1
|
Chris@10
|
91 && (vl % VL) == 0
|
Chris@10
|
92 && (!d->is || (d->is == is))
|
Chris@10
|
93 && (!d->os || (d->os == os))
|
Chris@10
|
94 && (!d->ivs || (d->ivs == ivs))
|
Chris@10
|
95 && (!d->ovs || (d->ovs == ovs))
|
Chris@10
|
96 );
|
Chris@10
|
97 }
|
Chris@10
|
98
|
Chris@10
|
99 EXTERN_CONST(kdft_genus, XSIMD(dft_n2bsimd_genus)) = { n2b_okp, VL };
|
Chris@10
|
100
|
Chris@10
|
101 static int n2f_okp(const kdft_desc *d,
|
Chris@10
|
102 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@10
|
103 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@10
|
104 const planner *plnr)
|
Chris@10
|
105 {
|
Chris@10
|
106 return (1
|
Chris@10
|
107 && ALIGNEDA(ri)
|
Chris@10
|
108 && ALIGNEDA(ro)
|
Chris@10
|
109 && !NO_SIMDP(plnr)
|
Chris@10
|
110 && SIMD_STRIDE_OKA(is)
|
Chris@10
|
111 && SIMD_VSTRIDE_OKA(ivs)
|
Chris@10
|
112 && SIMD_VSTRIDE_OKA(os) /* os == 2 enforced by codelet */
|
Chris@10
|
113 && SIMD_STRIDE_OKPAIR(ovs)
|
Chris@10
|
114 && ii == ri + 1
|
Chris@10
|
115 && io == ro + 1
|
Chris@10
|
116 && (vl % VL) == 0
|
Chris@10
|
117 && (!d->is || (d->is == is))
|
Chris@10
|
118 && (!d->os || (d->os == os))
|
Chris@10
|
119 && (!d->ivs || (d->ivs == ivs))
|
Chris@10
|
120 && (!d->ovs || (d->ovs == ovs))
|
Chris@10
|
121 );
|
Chris@10
|
122 }
|
Chris@10
|
123
|
Chris@10
|
124 EXTERN_CONST(kdft_genus, XSIMD(dft_n2fsimd_genus)) = { n2f_okp, VL };
|
Chris@10
|
125
|
Chris@10
|
126 static int n2s_okp(const kdft_desc *d,
|
Chris@10
|
127 const R *ri, const R *ii, const R *ro, const R *io,
|
Chris@10
|
128 INT is, INT os, INT vl, INT ivs, INT ovs,
|
Chris@10
|
129 const planner *plnr)
|
Chris@10
|
130 {
|
Chris@10
|
131 return (1
|
Chris@10
|
132 && !NO_SIMDP(plnr)
|
Chris@10
|
133 && ALIGNEDA(ri)
|
Chris@10
|
134 && ALIGNEDA(ii)
|
Chris@10
|
135 && ALIGNEDA(ro)
|
Chris@10
|
136 && ALIGNEDA(io)
|
Chris@10
|
137 && SIMD_STRIDE_OKA(is)
|
Chris@10
|
138 && ivs == 1
|
Chris@10
|
139 && os == 1
|
Chris@10
|
140 && SIMD_STRIDE_OKA(ovs)
|
Chris@10
|
141 && (vl % (2 * VL)) == 0
|
Chris@10
|
142 && (!d->is || (d->is == is))
|
Chris@10
|
143 && (!d->os || (d->os == os))
|
Chris@10
|
144 && (!d->ivs || (d->ivs == ivs))
|
Chris@10
|
145 && (!d->ovs || (d->ovs == ovs))
|
Chris@10
|
146 );
|
Chris@10
|
147 }
|
Chris@10
|
148
|
Chris@10
|
149 EXTERN_CONST(kdft_genus, XSIMD(dft_n2ssimd_genus)) = { n2s_okp, 2 * VL };
|
Chris@10
|
150
|
Chris@10
|
151 static int q1b_okp(const ct_desc *d,
|
Chris@10
|
152 const R *rio, const R *iio,
|
Chris@10
|
153 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
154 const planner *plnr)
|
Chris@10
|
155 {
|
Chris@10
|
156 return (1
|
Chris@10
|
157 && ALIGNED(iio)
|
Chris@10
|
158 && !NO_SIMDP(plnr)
|
Chris@10
|
159 && SIMD_STRIDE_OK(rs)
|
Chris@10
|
160 && SIMD_STRIDE_OK(vs)
|
Chris@10
|
161 && SIMD_VSTRIDE_OK(ms)
|
Chris@10
|
162 && rio == iio + 1
|
Chris@10
|
163 && (m % VL) == 0
|
Chris@10
|
164 && (mb % VL) == 0
|
Chris@10
|
165 && (me % VL) == 0
|
Chris@10
|
166 && (!d->rs || (d->rs == rs))
|
Chris@10
|
167 && (!d->vs || (d->vs == vs))
|
Chris@10
|
168 && (!d->ms || (d->ms == ms))
|
Chris@10
|
169 );
|
Chris@10
|
170 }
|
Chris@10
|
171 EXTERN_CONST(ct_genus, XSIMD(dft_q1bsimd_genus)) = { q1b_okp, VL };
|
Chris@10
|
172
|
Chris@10
|
173 static int q1f_okp(const ct_desc *d,
|
Chris@10
|
174 const R *rio, const R *iio,
|
Chris@10
|
175 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
176 const planner *plnr)
|
Chris@10
|
177 {
|
Chris@10
|
178 return (1
|
Chris@10
|
179 && ALIGNED(rio)
|
Chris@10
|
180 && !NO_SIMDP(plnr)
|
Chris@10
|
181 && SIMD_STRIDE_OK(rs)
|
Chris@10
|
182 && SIMD_STRIDE_OK(vs)
|
Chris@10
|
183 && SIMD_VSTRIDE_OK(ms)
|
Chris@10
|
184 && iio == rio + 1
|
Chris@10
|
185 && (m % VL) == 0
|
Chris@10
|
186 && (mb % VL) == 0
|
Chris@10
|
187 && (me % VL) == 0
|
Chris@10
|
188 && (!d->rs || (d->rs == rs))
|
Chris@10
|
189 && (!d->vs || (d->vs == vs))
|
Chris@10
|
190 && (!d->ms || (d->ms == ms))
|
Chris@10
|
191 );
|
Chris@10
|
192 }
|
Chris@10
|
193 EXTERN_CONST(ct_genus, XSIMD(dft_q1fsimd_genus)) = { q1f_okp, VL };
|
Chris@10
|
194
|
Chris@10
|
195 static int t_okp_common(const ct_desc *d,
|
Chris@10
|
196 const R *rio, const R *iio,
|
Chris@10
|
197 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
198 const planner *plnr)
|
Chris@10
|
199 {
|
Chris@10
|
200 UNUSED(rio); UNUSED(iio);
|
Chris@10
|
201 return (1
|
Chris@10
|
202 && !NO_SIMDP(plnr)
|
Chris@10
|
203 && SIMD_STRIDE_OKA(rs)
|
Chris@10
|
204 && SIMD_VSTRIDE_OKA(ms)
|
Chris@10
|
205 && (m % VL) == 0
|
Chris@10
|
206 && (mb % VL) == 0
|
Chris@10
|
207 && (me % VL) == 0
|
Chris@10
|
208 && (!d->rs || (d->rs == rs))
|
Chris@10
|
209 && (!d->vs || (d->vs == vs))
|
Chris@10
|
210 && (!d->ms || (d->ms == ms))
|
Chris@10
|
211 );
|
Chris@10
|
212 }
|
Chris@10
|
213
|
Chris@10
|
214 static int t_okp_commonu(const ct_desc *d,
|
Chris@10
|
215 const R *rio, const R *iio,
|
Chris@10
|
216 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
217 const planner *plnr)
|
Chris@10
|
218 {
|
Chris@10
|
219 UNUSED(rio); UNUSED(iio); UNUSED(m);
|
Chris@10
|
220 return (1
|
Chris@10
|
221 && !NO_SIMDP(plnr)
|
Chris@10
|
222 && SIMD_STRIDE_OK(rs)
|
Chris@10
|
223 && SIMD_VSTRIDE_OK(ms)
|
Chris@10
|
224 && (mb % VL) == 0
|
Chris@10
|
225 && (me % VL) == 0
|
Chris@10
|
226 && (!d->rs || (d->rs == rs))
|
Chris@10
|
227 && (!d->vs || (d->vs == vs))
|
Chris@10
|
228 && (!d->ms || (d->ms == ms))
|
Chris@10
|
229 );
|
Chris@10
|
230 }
|
Chris@10
|
231
|
Chris@10
|
232 static int t_okp_t1f(const ct_desc *d,
|
Chris@10
|
233 const R *rio, const R *iio,
|
Chris@10
|
234 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
235 const planner *plnr)
|
Chris@10
|
236 {
|
Chris@10
|
237 return t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
238 && iio == rio + 1
|
Chris@10
|
239 && ALIGNEDA(rio);
|
Chris@10
|
240 }
|
Chris@10
|
241
|
Chris@10
|
242 EXTERN_CONST(ct_genus, XSIMD(dft_t1fsimd_genus)) = { t_okp_t1f, VL };
|
Chris@10
|
243
|
Chris@10
|
244 static int t_okp_t1fu(const ct_desc *d,
|
Chris@10
|
245 const R *rio, const R *iio,
|
Chris@10
|
246 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
247 const planner *plnr)
|
Chris@10
|
248 {
|
Chris@10
|
249 return t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
250 && iio == rio + 1
|
Chris@10
|
251 && ALIGNED(rio);
|
Chris@10
|
252 }
|
Chris@10
|
253
|
Chris@10
|
254 EXTERN_CONST(ct_genus, XSIMD(dft_t1fusimd_genus)) = { t_okp_t1fu, VL };
|
Chris@10
|
255
|
Chris@10
|
256 static int t_okp_t1b(const ct_desc *d,
|
Chris@10
|
257 const R *rio, const R *iio,
|
Chris@10
|
258 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
259 const planner *plnr)
|
Chris@10
|
260 {
|
Chris@10
|
261 return t_okp_common(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
262 && rio == iio + 1
|
Chris@10
|
263 && ALIGNEDA(iio);
|
Chris@10
|
264 }
|
Chris@10
|
265
|
Chris@10
|
266 EXTERN_CONST(ct_genus, XSIMD(dft_t1bsimd_genus)) = { t_okp_t1b, VL };
|
Chris@10
|
267
|
Chris@10
|
268 static int t_okp_t1bu(const ct_desc *d,
|
Chris@10
|
269 const R *rio, const R *iio,
|
Chris@10
|
270 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
271 const planner *plnr)
|
Chris@10
|
272 {
|
Chris@10
|
273 return t_okp_commonu(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
274 && rio == iio + 1
|
Chris@10
|
275 && ALIGNED(iio);
|
Chris@10
|
276 }
|
Chris@10
|
277
|
Chris@10
|
278 EXTERN_CONST(ct_genus, XSIMD(dft_t1busimd_genus)) = { t_okp_t1bu, VL };
|
Chris@10
|
279
|
Chris@10
|
280 /* use t2* codelets only when n = m*radix is small, because
|
Chris@10
|
281 t2* codelets use ~2n twiddle factors (instead of ~n) */
|
Chris@10
|
282 static int small_enough(const ct_desc *d, INT m)
|
Chris@10
|
283 {
|
Chris@10
|
284 return m * d->radix <= 16384;
|
Chris@10
|
285 }
|
Chris@10
|
286
|
Chris@10
|
287 static int t_okp_t2f(const ct_desc *d,
|
Chris@10
|
288 const R *rio, const R *iio,
|
Chris@10
|
289 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
290 const planner *plnr)
|
Chris@10
|
291 {
|
Chris@10
|
292 return t_okp_t1f(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
293 && small_enough(d, m);
|
Chris@10
|
294 }
|
Chris@10
|
295
|
Chris@10
|
296 EXTERN_CONST(ct_genus, XSIMD(dft_t2fsimd_genus)) = { t_okp_t2f, VL };
|
Chris@10
|
297
|
Chris@10
|
298 static int t_okp_t2b(const ct_desc *d,
|
Chris@10
|
299 const R *rio, const R *iio,
|
Chris@10
|
300 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
301 const planner *plnr)
|
Chris@10
|
302 {
|
Chris@10
|
303 return t_okp_t1b(d, rio, iio, rs, vs, m, mb, me, ms, plnr)
|
Chris@10
|
304 && small_enough(d, m);
|
Chris@10
|
305 }
|
Chris@10
|
306
|
Chris@10
|
307 EXTERN_CONST(ct_genus, XSIMD(dft_t2bsimd_genus)) = { t_okp_t2b, VL };
|
Chris@10
|
308
|
Chris@10
|
309 static int ts_okp(const ct_desc *d,
|
Chris@10
|
310 const R *rio, const R *iio,
|
Chris@10
|
311 INT rs, INT vs, INT m, INT mb, INT me, INT ms,
|
Chris@10
|
312 const planner *plnr)
|
Chris@10
|
313 {
|
Chris@10
|
314 UNUSED(rio);
|
Chris@10
|
315 UNUSED(iio);
|
Chris@10
|
316 return (1
|
Chris@10
|
317 && !NO_SIMDP(plnr)
|
Chris@10
|
318 && ALIGNEDA(rio)
|
Chris@10
|
319 && ALIGNEDA(iio)
|
Chris@10
|
320 && SIMD_STRIDE_OKA(rs)
|
Chris@10
|
321 && ms == 1
|
Chris@10
|
322 && (m % (2 * VL)) == 0
|
Chris@10
|
323 && (mb % (2 * VL)) == 0
|
Chris@10
|
324 && (me % (2 * VL)) == 0
|
Chris@10
|
325 && (!d->rs || (d->rs == rs))
|
Chris@10
|
326 && (!d->vs || (d->vs == vs))
|
Chris@10
|
327 && (!d->ms || (d->ms == ms))
|
Chris@10
|
328 );
|
Chris@10
|
329 }
|
Chris@10
|
330
|
Chris@10
|
331 EXTERN_CONST(ct_genus, XSIMD(dft_tssimd_genus)) = { ts_okp, 2 * VL };
|