comparison src/fftw-3.3.8/rdft/ct-hc2c-direct.c @ 167:bd3cc4d1df30

Add FFTW 3.3.8 source, and a Linux build
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 19 Nov 2019 14:52:55 +0000
parents
children
comparison
equal deleted inserted replaced
166:cbd6d7e562c7 167:bd3cc4d1df30
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21
22 #include "ct-hc2c.h"
23
24 typedef struct {
25 hc2c_solver super;
26 const hc2c_desc *desc;
27 int bufferedp;
28 khc2c k;
29 } S;
30
31 typedef struct {
32 plan_hc2c super;
33 khc2c k;
34 plan *cld0, *cldm; /* children for 0th and middle butterflies */
35 INT r, m, v, extra_iter;
36 INT ms, vs;
37 stride rs, brs;
38 twid *td;
39 const S *slv;
40 } P;
41
42 /*************************************************************
43 Nonbuffered code
44 *************************************************************/
45 static void apply(const plan *ego_, R *cr, R *ci)
46 {
47 const P *ego = (const P *) ego_;
48 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
49 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
50 INT i, m = ego->m, v = ego->v;
51 INT ms = ego->ms, vs = ego->vs;
52
53 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
54 cld0->apply((plan *) cld0, cr, ci, cr, ci);
55 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
56 ego->td->W, ego->rs, 1, (m+1)/2, ms);
57 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
58 cr + (m/2)*ms, ci + (m/2)*ms);
59 }
60 }
61
62 static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
63 {
64 const P *ego = (const P *) ego_;
65 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
66 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
67 INT i, m = ego->m, v = ego->v;
68 INT ms = ego->ms, vs = ego->vs;
69 INT mm = (m-1)/2;
70
71 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
72 cld0->apply((plan *) cld0, cr, ci, cr, ci);
73
74 /* for 4-way SIMD when (m+1)/2-1 is odd: iterate over an
75 even vector length MM-1, and then execute the last
76 iteration as a 2-vector with vector stride 0. The
77 twiddle factors of the second half of the last iteration
78 are bogus, but we only store the results of the first
79 half. */
80 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
81 ego->td->W, ego->rs, 1, mm, ms);
82 ego->k(cr + mm*ms, ci + mm*ms, cr + (m-mm)*ms, ci + (m-mm)*ms,
83 ego->td->W, ego->rs, mm, mm+2, 0);
84 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
85 cr + (m/2)*ms, ci + (m/2)*ms);
86 }
87
88 }
89
90 /*************************************************************
91 Buffered code
92 *************************************************************/
93
94 /* should not be 2^k to avoid associativity conflicts */
95 static INT compute_batchsize(INT radix)
96 {
97 /* round up to multiple of 4 */
98 radix += 3;
99 radix &= -4;
100
101 return (radix + 2);
102 }
103
104 static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
105 INT mb, INT me, INT extra_iter, R *bufp)
106 {
107 INT b = WS(ego->brs, 1);
108 INT rs = WS(ego->rs, 1);
109 INT ms = ego->ms;
110 R *bufm = bufp + b - 2;
111 INT n = me - mb;
112
113 X(cpy2d_pair_ci)(Rp + mb * ms, Ip + mb * ms, bufp, bufp + 1,
114 ego->r / 2, rs, b,
115 n, ms, 2);
116 X(cpy2d_pair_ci)(Rm - mb * ms, Im - mb * ms, bufm, bufm + 1,
117 ego->r / 2, rs, b,
118 n, -ms, -2);
119
120 if (extra_iter) {
121 /* initialize the extra_iter element to 0. It would be ok
122 to leave it uninitialized, since we transform uninitialized
123 data and ignore the result. However, we want to avoid
124 FP exceptions in case somebody is trapping them. */
125 A(n < compute_batchsize(ego->r));
126 X(zero1d_pair)(bufp + 2*n, bufp + 1 + 2*n, ego->r / 2, b);
127 X(zero1d_pair)(bufm - 2*n, bufm + 1 - 2*n, ego->r / 2, b);
128 }
129
130 ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W,
131 ego->brs, mb, me + extra_iter, 2);
132 X(cpy2d_pair_co)(bufp, bufp + 1, Rp + mb * ms, Ip + mb * ms,
133 ego->r / 2, b, rs,
134 n, 2, ms);
135 X(cpy2d_pair_co)(bufm, bufm + 1, Rm - mb * ms, Im - mb * ms,
136 ego->r / 2, b, rs,
137 n, -2, -ms);
138 }
139
140 static void apply_buf(const plan *ego_, R *cr, R *ci)
141 {
142 const P *ego = (const P *) ego_;
143 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
144 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
145 INT i, j, ms = ego->ms, v = ego->v;
146 INT batchsz = compute_batchsize(ego->r);
147 R *buf;
148 INT mb = 1, me = (ego->m+1) / 2;
149 size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
150
151 BUF_ALLOC(R *, buf, bufsz);
152
153 for (i = 0; i < v; ++i, cr += ego->vs, ci += ego->vs) {
154 R *Rp = cr;
155 R *Ip = ci;
156 R *Rm = cr + ego->m * ms;
157 R *Im = ci + ego->m * ms;
158
159 cld0->apply((plan *) cld0, Rp, Ip, Rp, Ip);
160
161 for (j = mb; j + batchsz < me; j += batchsz)
162 dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);
163
164 dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);
165
166 cldm->apply((plan *) cldm,
167 Rp + me * ms, Ip + me * ms,
168 Rp + me * ms, Ip + me * ms);
169
170 }
171
172 BUF_FREE(buf, bufsz);
173 }
174
175 /*************************************************************
176 common code
177 *************************************************************/
178 static void awake(plan *ego_, enum wakefulness wakefulness)
179 {
180 P *ego = (P *) ego_;
181
182 X(plan_awake)(ego->cld0, wakefulness);
183 X(plan_awake)(ego->cldm, wakefulness);
184 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
185 ego->r * ego->m, ego->r,
186 (ego->m - 1) / 2 + ego->extra_iter);
187 }
188
189 static void destroy(plan *ego_)
190 {
191 P *ego = (P *) ego_;
192 X(plan_destroy_internal)(ego->cld0);
193 X(plan_destroy_internal)(ego->cldm);
194 X(stride_destroy)(ego->rs);
195 X(stride_destroy)(ego->brs);
196 }
197
198 static void print(const plan *ego_, printer *p)
199 {
200 const P *ego = (const P *) ego_;
201 const S *slv = ego->slv;
202 const hc2c_desc *e = slv->desc;
203
204 if (slv->bufferedp)
205 p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
206 compute_batchsize(ego->r),
207 ego->r, X(twiddle_length)(ego->r, e->tw),
208 ego->extra_iter, ego->v, e->nam,
209 ego->cld0, ego->cldm);
210 else
211 p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
212 ego->r, X(twiddle_length)(ego->r, e->tw),
213 ego->extra_iter, ego->v, e->nam,
214 ego->cld0, ego->cldm);
215 }
216
217 static int applicable0(const S *ego, rdft_kind kind,
218 INT r, INT rs,
219 INT m, INT ms,
220 INT v, INT vs,
221 const R *cr, const R *ci,
222 const planner *plnr,
223 INT *extra_iter)
224 {
225 const hc2c_desc *e = ego->desc;
226 UNUSED(v);
227
228 return (
229 1
230 && r == e->radix
231 && kind == e->genus->kind
232
233 /* first v-loop iteration */
234 && ((*extra_iter = 0,
235 e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
236 rs, 1, (m+1)/2, ms, plnr))
237 ||
238 (*extra_iter = 1,
239 ((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
240 rs, 1, (m-1)/2, ms, plnr))
241 &&
242 (e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
243 rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
244
245 /* subsequent v-loop iterations */
246 && (cr += vs, ci += vs, 1)
247
248 && e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
249 rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
250 );
251 }
252
253 static int applicable0_buf(const S *ego, rdft_kind kind,
254 INT r, INT rs,
255 INT m, INT ms,
256 INT v, INT vs,
257 const R *cr, const R *ci,
258 const planner *plnr, INT *extra_iter)
259 {
260 const hc2c_desc *e = ego->desc;
261 INT batchsz, brs;
262 UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
263
264 return (
265 1
266 && r == e->radix
267 && kind == e->genus->kind
268
269 /* ignore cr, ci, use buffer */
270 && (cr = (const R *)0, ci = cr + 1,
271 batchsz = compute_batchsize(r),
272 brs = 4 * batchsz, 1)
273
274 && e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
275 brs, 1, 1+batchsz, 2, plnr)
276
277 && ((*extra_iter = 0,
278 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
279 brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
280 ||
281 (*extra_iter = 1,
282 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
283 brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
284
285 );
286 }
287
288 static int applicable(const S *ego, rdft_kind kind,
289 INT r, INT rs,
290 INT m, INT ms,
291 INT v, INT vs,
292 R *cr, R *ci,
293 const planner *plnr, INT *extra_iter)
294 {
295 if (ego->bufferedp) {
296 if (!applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
297 extra_iter))
298 return 0;
299 } else {
300 if (!applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
301 extra_iter))
302 return 0;
303 }
304
305 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
306 v, m * r, r))
307 return 0;
308
309 return 1;
310 }
311
312 static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
313 INT r, INT rs,
314 INT m, INT ms,
315 INT v, INT vs,
316 R *cr, R *ci,
317 planner *plnr)
318 {
319 const S *ego = (const S *) ego_;
320 P *pln;
321 const hc2c_desc *e = ego->desc;
322 plan *cld0 = 0, *cldm = 0;
323 INT imid = (m / 2) * ms;
324 INT extra_iter;
325
326 static const plan_adt padt = {
327 0, awake, print, destroy
328 };
329
330 if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
331 &extra_iter))
332 return (plan *)0;
333
334 cld0 = X(mkplan_d)(
335 plnr,
336 X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
337 X(mktensor_0d)(),
338 TAINT(cr, vs), TAINT(ci, vs),
339 TAINT(cr, vs), TAINT(ci, vs),
340 kind));
341 if (!cld0) goto nada;
342
343 cldm = X(mkplan_d)(
344 plnr,
345 X(mkproblem_rdft2_d)(((m % 2) ?
346 X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
347 X(mktensor_0d)(),
348 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
349 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
350 kind == R2HC ? R2HCII : HC2RIII));
351 if (!cldm) goto nada;
352
353 if (ego->bufferedp)
354 pln = MKPLAN_HC2C(P, &padt, apply_buf);
355 else
356 pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
357
358 pln->k = ego->k;
359 pln->td = 0;
360 pln->r = r; pln->rs = X(mkstride)(r, rs);
361 pln->m = m; pln->ms = ms;
362 pln->v = v; pln->vs = vs;
363 pln->slv = ego;
364 pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
365 pln->cld0 = cld0;
366 pln->cldm = cldm;
367 pln->extra_iter = extra_iter;
368
369 X(ops_zero)(&pln->super.super.ops);
370 X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
371 &e->ops, &pln->super.super.ops);
372 X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
373 X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
374
375 if (ego->bufferedp)
376 pln->super.super.ops.other += 4 * r * m * v;
377
378 return &(pln->super.super);
379
380 nada:
381 X(plan_destroy_internal)(cld0);
382 X(plan_destroy_internal)(cldm);
383 return 0;
384 }
385
386 static void regone(planner *plnr, khc2c codelet,
387 const hc2c_desc *desc,
388 hc2c_kind hc2ckind,
389 int bufferedp)
390 {
391 S *slv = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
392 slv->k = codelet;
393 slv->desc = desc;
394 slv->bufferedp = bufferedp;
395 REGISTER_SOLVER(plnr, &(slv->super.super));
396 }
397
398 void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
399 const hc2c_desc *desc,
400 hc2c_kind hc2ckind)
401 {
402 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */0);
403 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */1);
404 }