Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/dft/dftw-direct.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 | |
22 #include "ct.h" | |
23 | |
24 typedef struct { | |
25 ct_solver super; | |
26 const ct_desc *desc; | |
27 int bufferedp; | |
28 kdftw k; | |
29 } S; | |
30 | |
31 typedef struct { | |
32 plan_dftw super; | |
33 kdftw k; | |
34 INT r; | |
35 stride rs; | |
36 INT m, ms, v, vs, mb, me, extra_iter; | |
37 stride brs; | |
38 twid *td; | |
39 const S *slv; | |
40 } P; | |
41 | |
42 | |
43 /************************************************************* | |
44 Nonbuffered code | |
45 *************************************************************/ | |
46 static void apply(const plan *ego_, R *rio, R *iio) | |
47 { | |
48 const P *ego = (const P *) ego_; | |
49 INT i; | |
50 ASSERT_ALIGNED_DOUBLE; | |
51 for (i = 0; i < ego->v; ++i, rio += ego->vs, iio += ego->vs) { | |
52 INT mb = ego->mb, ms = ego->ms; | |
53 ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, | |
54 ego->rs, mb, ego->me, ms); | |
55 } | |
56 } | |
57 | |
58 static void apply_extra_iter(const plan *ego_, R *rio, R *iio) | |
59 { | |
60 const P *ego = (const P *) ego_; | |
61 INT i, v = ego->v, vs = ego->vs; | |
62 INT mb = ego->mb, me = ego->me, mm = me - 1, ms = ego->ms; | |
63 ASSERT_ALIGNED_DOUBLE; | |
64 for (i = 0; i < v; ++i, rio += vs, iio += vs) { | |
65 ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, | |
66 ego->rs, mb, mm, ms); | |
67 ego->k(rio + mm*ms, iio + mm*ms, ego->td->W, | |
68 ego->rs, mm, mm+2, 0); | |
69 } | |
70 } | |
71 | |
72 /************************************************************* | |
73 Buffered code | |
74 *************************************************************/ | |
75 static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf) | |
76 { | |
77 INT brs = WS(ego->brs, 1); | |
78 INT rs = WS(ego->rs, 1); | |
79 INT ms = ego->ms; | |
80 | |
81 X(cpy2d_pair_ci)(rA + mb*ms, iA + mb*ms, buf, buf + 1, | |
82 ego->r, rs, brs, | |
83 me - mb, ms, 2); | |
84 ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2); | |
85 X(cpy2d_pair_co)(buf, buf + 1, rA + mb*ms, iA + mb*ms, | |
86 ego->r, brs, rs, | |
87 me - mb, 2, ms); | |
88 } | |
89 | |
90 /* must be even for SIMD alignment; should not be 2^k to avoid | |
91 associativity conflicts */ | |
92 static INT compute_batchsize(INT radix) | |
93 { | |
94 /* round up to multiple of 4 */ | |
95 radix += 3; | |
96 radix &= -4; | |
97 | |
98 return (radix + 2); | |
99 } | |
100 | |
101 static void apply_buf(const plan *ego_, R *rio, R *iio) | |
102 { | |
103 const P *ego = (const P *) ego_; | |
104 INT i, j, v = ego->v, r = ego->r; | |
105 INT batchsz = compute_batchsize(r); | |
106 R *buf; | |
107 INT mb = ego->mb, me = ego->me; | |
108 size_t bufsz = r * batchsz * 2 * sizeof(R); | |
109 | |
110 BUF_ALLOC(R *, buf, bufsz); | |
111 | |
112 for (i = 0; i < v; ++i, rio += ego->vs, iio += ego->vs) { | |
113 for (j = mb; j + batchsz < me; j += batchsz) | |
114 dobatch(ego, rio, iio, j, j + batchsz, buf); | |
115 | |
116 dobatch(ego, rio, iio, j, me, buf); | |
117 } | |
118 | |
119 BUF_FREE(buf, bufsz); | |
120 } | |
121 | |
122 /************************************************************* | |
123 common code | |
124 *************************************************************/ | |
125 static void awake(plan *ego_, enum wakefulness wakefulness) | |
126 { | |
127 P *ego = (P *) ego_; | |
128 | |
129 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw, | |
130 ego->r * ego->m, ego->r, ego->m + ego->extra_iter); | |
131 } | |
132 | |
133 static void destroy(plan *ego_) | |
134 { | |
135 P *ego = (P *) ego_; | |
136 X(stride_destroy)(ego->brs); | |
137 X(stride_destroy)(ego->rs); | |
138 } | |
139 | |
140 static void print(const plan *ego_, printer *p) | |
141 { | |
142 const P *ego = (const P *) ego_; | |
143 const S *slv = ego->slv; | |
144 const ct_desc *e = slv->desc; | |
145 | |
146 if (slv->bufferedp) | |
147 p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")", | |
148 compute_batchsize(ego->r), ego->r, | |
149 X(twiddle_length)(ego->r, e->tw), ego->v, e->nam); | |
150 else | |
151 p->print(p, "(dftw-direct-%D/%D%v \"%s\")", | |
152 ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam); | |
153 } | |
154 | |
155 static int applicable0(const S *ego, | |
156 INT r, INT irs, INT ors, | |
157 INT m, INT ms, | |
158 INT v, INT ivs, INT ovs, | |
159 INT mb, INT me, | |
160 R *rio, R *iio, | |
161 const planner *plnr, INT *extra_iter) | |
162 { | |
163 const ct_desc *e = ego->desc; | |
164 UNUSED(v); | |
165 | |
166 return ( | |
167 1 | |
168 && r == e->radix | |
169 && irs == ors /* in-place along R */ | |
170 && ivs == ovs /* in-place along V */ | |
171 | |
172 /* check for alignment/vector length restrictions */ | |
173 && ((*extra_iter = 0, | |
174 e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr)) | |
175 || | |
176 (*extra_iter = 1, | |
177 (1 | |
178 /* FIXME: require full array, otherwise some threads | |
179 may be extra_iter and other threads won't be. | |
180 Generating the proper twiddle factors is a pain in | |
181 this case */ | |
182 && mb == 0 && me == m | |
183 && e->genus->okp(e, rio, iio, irs, ivs, | |
184 m, mb, me - 1, ms, plnr) | |
185 && e->genus->okp(e, rio, iio, irs, ivs, | |
186 m, me - 1, me + 1, ms, plnr)))) | |
187 | |
188 && (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs, | |
189 m, mb, me - *extra_iter, ms, plnr)) | |
190 | |
191 ); | |
192 } | |
193 | |
194 static int applicable0_buf(const S *ego, | |
195 INT r, INT irs, INT ors, | |
196 INT m, INT ms, | |
197 INT v, INT ivs, INT ovs, | |
198 INT mb, INT me, | |
199 R *rio, R *iio, | |
200 const planner *plnr) | |
201 { | |
202 const ct_desc *e = ego->desc; | |
203 INT batchsz; | |
204 UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio); | |
205 | |
206 return ( | |
207 1 | |
208 && r == e->radix | |
209 && irs == ors /* in-place along R */ | |
210 && ivs == ovs /* in-place along V */ | |
211 | |
212 /* check for alignment/vector length restrictions, both for | |
213 batchsize and for the remainder */ | |
214 && (batchsz = compute_batchsize(r), 1) | |
215 && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0, | |
216 m, mb, mb + batchsz, 2, plnr)) | |
217 && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0, | |
218 m, mb, me, 2, plnr)) | |
219 ); | |
220 } | |
221 | |
222 static int applicable(const S *ego, | |
223 INT r, INT irs, INT ors, | |
224 INT m, INT ms, | |
225 INT v, INT ivs, INT ovs, | |
226 INT mb, INT me, | |
227 R *rio, R *iio, | |
228 const planner *plnr, INT *extra_iter) | |
229 { | |
230 if (ego->bufferedp) { | |
231 *extra_iter = 0; | |
232 if (!applicable0_buf(ego, | |
233 r, irs, ors, m, ms, v, ivs, ovs, mb, me, | |
234 rio, iio, plnr)) | |
235 return 0; | |
236 } else { | |
237 if (!applicable0(ego, | |
238 r, irs, ors, m, ms, v, ivs, ovs, mb, me, | |
239 rio, iio, plnr, extra_iter)) | |
240 return 0; | |
241 } | |
242 | |
243 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16), | |
244 v, m * r, r)) | |
245 return 0; | |
246 | |
247 if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr)) | |
248 return 0; | |
249 | |
250 return 1; | |
251 } | |
252 | |
253 static plan *mkcldw(const ct_solver *ego_, | |
254 INT r, INT irs, INT ors, | |
255 INT m, INT ms, | |
256 INT v, INT ivs, INT ovs, | |
257 INT mstart, INT mcount, | |
258 R *rio, R *iio, | |
259 planner *plnr) | |
260 { | |
261 const S *ego = (const S *) ego_; | |
262 P *pln; | |
263 const ct_desc *e = ego->desc; | |
264 INT extra_iter; | |
265 | |
266 static const plan_adt padt = { | |
267 0, awake, print, destroy | |
268 }; | |
269 | |
270 A(mstart >= 0 && mstart + mcount <= m); | |
271 if (!applicable(ego, | |
272 r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount, | |
273 rio, iio, plnr, &extra_iter)) | |
274 return (plan *)0; | |
275 | |
276 if (ego->bufferedp) { | |
277 pln = MKPLAN_DFTW(P, &padt, apply_buf); | |
278 } else { | |
279 pln = MKPLAN_DFTW(P, &padt, extra_iter ? apply_extra_iter : apply); | |
280 } | |
281 | |
282 pln->k = ego->k; | |
283 pln->rs = X(mkstride)(r, irs); | |
284 pln->td = 0; | |
285 pln->r = r; | |
286 pln->m = m; | |
287 pln->ms = ms; | |
288 pln->v = v; | |
289 pln->vs = ivs; | |
290 pln->mb = mstart; | |
291 pln->me = mstart + mcount; | |
292 pln->slv = ego; | |
293 pln->brs = X(mkstride)(r, 2 * compute_batchsize(r)); | |
294 pln->extra_iter = extra_iter; | |
295 | |
296 X(ops_zero)(&pln->super.super.ops); | |
297 X(ops_madd2)(v * (mcount/e->genus->vl), &e->ops, &pln->super.super.ops); | |
298 | |
299 if (ego->bufferedp) { | |
300 /* 8 load/stores * N * V */ | |
301 pln->super.super.ops.other += 8 * r * mcount * v; | |
302 } | |
303 | |
304 pln->super.super.could_prune_now_p = | |
305 (!ego->bufferedp && r >= 5 && r < 64 && m >= r); | |
306 return &(pln->super.super); | |
307 } | |
308 | |
309 static void regone(planner *plnr, kdftw codelet, | |
310 const ct_desc *desc, int dec, int bufferedp) | |
311 { | |
312 S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0); | |
313 slv->k = codelet; | |
314 slv->desc = desc; | |
315 slv->bufferedp = bufferedp; | |
316 REGISTER_SOLVER(plnr, &(slv->super.super)); | |
317 if (X(mksolver_ct_hook)) { | |
318 slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix, | |
319 dec, mkcldw, 0); | |
320 slv->k = codelet; | |
321 slv->desc = desc; | |
322 slv->bufferedp = bufferedp; | |
323 REGISTER_SOLVER(plnr, &(slv->super.super)); | |
324 } | |
325 } | |
326 | |
327 void X(regsolver_ct_directw)(planner *plnr, kdftw codelet, | |
328 const ct_desc *desc, int dec) | |
329 { | |
330 regone(plnr, codelet, desc, dec, /* bufferedp */ 0); | |
331 regone(plnr, codelet, desc, dec, /* bufferedp */ 1); | |
332 } |