annotate src/fftw-3.3.3/rdft/ct-hc2c-direct.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21
Chris@10 22 #include "ct-hc2c.h"
Chris@10 23
Chris@10 24 typedef struct {
Chris@10 25 hc2c_solver super;
Chris@10 26 const hc2c_desc *desc;
Chris@10 27 int bufferedp;
Chris@10 28 khc2c k;
Chris@10 29 } S;
Chris@10 30
Chris@10 31 typedef struct {
Chris@10 32 plan_hc2c super;
Chris@10 33 khc2c k;
Chris@10 34 plan *cld0, *cldm; /* children for 0th and middle butterflies */
Chris@10 35 INT r, m, v, extra_iter;
Chris@10 36 INT ms, vs;
Chris@10 37 stride rs, brs;
Chris@10 38 twid *td;
Chris@10 39 const S *slv;
Chris@10 40 } P;
Chris@10 41
Chris@10 42 /*************************************************************
Chris@10 43 Nonbuffered code
Chris@10 44 *************************************************************/
Chris@10 45 static void apply(const plan *ego_, R *cr, R *ci)
Chris@10 46 {
Chris@10 47 const P *ego = (const P *) ego_;
Chris@10 48 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@10 49 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@10 50 INT i, m = ego->m, v = ego->v;
Chris@10 51 INT ms = ego->ms, vs = ego->vs;
Chris@10 52
Chris@10 53 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
Chris@10 54 cld0->apply((plan *) cld0, cr, ci, cr, ci);
Chris@10 55 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 56 ego->td->W, ego->rs, 1, (m+1)/2, ms);
Chris@10 57 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
Chris@10 58 cr + (m/2)*ms, ci + (m/2)*ms);
Chris@10 59 }
Chris@10 60 }
Chris@10 61
Chris@10 62 static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
Chris@10 63 {
Chris@10 64 const P *ego = (const P *) ego_;
Chris@10 65 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@10 66 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@10 67 INT i, m = ego->m, v = ego->v;
Chris@10 68 INT ms = ego->ms, vs = ego->vs;
Chris@10 69 INT mm = (m-1)/2;
Chris@10 70
Chris@10 71 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
Chris@10 72 cld0->apply((plan *) cld0, cr, ci, cr, ci);
Chris@10 73
Chris@10 74 /* for 4-way SIMD when (m+1)/2-1 is odd: iterate over an
Chris@10 75 even vector length MM-1, and then execute the last
Chris@10 76 iteration as a 2-vector with vector stride 0. The
Chris@10 77 twiddle factors of the second half of the last iteration
Chris@10 78 are bogus, but we only store the results of the first
Chris@10 79 half. */
Chris@10 80 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 81 ego->td->W, ego->rs, 1, mm, ms);
Chris@10 82 ego->k(cr + mm*ms, ci + mm*ms, cr + (m-mm)*ms, ci + (m-mm)*ms,
Chris@10 83 ego->td->W, ego->rs, mm, mm+2, 0);
Chris@10 84 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
Chris@10 85 cr + (m/2)*ms, ci + (m/2)*ms);
Chris@10 86 }
Chris@10 87
Chris@10 88 }
Chris@10 89
Chris@10 90 /*************************************************************
Chris@10 91 Buffered code
Chris@10 92 *************************************************************/
Chris@10 93
Chris@10 94 /* should not be 2^k to avoid associativity conflicts */
Chris@10 95 static INT compute_batchsize(INT radix)
Chris@10 96 {
Chris@10 97 /* round up to multiple of 4 */
Chris@10 98 radix += 3;
Chris@10 99 radix &= -4;
Chris@10 100
Chris@10 101 return (radix + 2);
Chris@10 102 }
Chris@10 103
Chris@10 104 static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
Chris@10 105 INT mb, INT me, INT extra_iter, R *bufp)
Chris@10 106 {
Chris@10 107 INT b = WS(ego->brs, 1);
Chris@10 108 INT rs = WS(ego->rs, 1);
Chris@10 109 INT ms = ego->ms;
Chris@10 110 R *bufm = bufp + b - 2;
Chris@10 111
Chris@10 112 X(cpy2d_pair_ci)(Rp + mb * ms, Ip + mb * ms, bufp, bufp + 1,
Chris@10 113 ego->r / 2, rs, b,
Chris@10 114 me - mb, ms, 2);
Chris@10 115 X(cpy2d_pair_ci)(Rm - mb * ms, Im - mb * ms, bufm, bufm + 1,
Chris@10 116 ego->r / 2, rs, b,
Chris@10 117 me - mb, -ms, -2);
Chris@10 118 ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W,
Chris@10 119 ego->brs, mb, me + extra_iter, 2);
Chris@10 120 X(cpy2d_pair_co)(bufp, bufp + 1, Rp + mb * ms, Ip + mb * ms,
Chris@10 121 ego->r / 2, b, rs,
Chris@10 122 me - mb, 2, ms);
Chris@10 123 X(cpy2d_pair_co)(bufm, bufm + 1, Rm - mb * ms, Im - mb * ms,
Chris@10 124 ego->r / 2, b, rs,
Chris@10 125 me - mb, -2, -ms);
Chris@10 126 }
Chris@10 127
Chris@10 128 static void apply_buf(const plan *ego_, R *cr, R *ci)
Chris@10 129 {
Chris@10 130 const P *ego = (const P *) ego_;
Chris@10 131 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@10 132 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@10 133 INT i, j, ms = ego->ms, v = ego->v;
Chris@10 134 INT batchsz = compute_batchsize(ego->r);
Chris@10 135 R *buf;
Chris@10 136 INT mb = 1, me = (ego->m+1) / 2;
Chris@10 137 size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
Chris@10 138
Chris@10 139 BUF_ALLOC(R *, buf, bufsz);
Chris@10 140
Chris@10 141 for (i = 0; i < v; ++i, cr += ego->vs, ci += ego->vs) {
Chris@10 142 R *Rp = cr;
Chris@10 143 R *Ip = ci;
Chris@10 144 R *Rm = cr + ego->m * ms;
Chris@10 145 R *Im = ci + ego->m * ms;
Chris@10 146
Chris@10 147 cld0->apply((plan *) cld0, Rp, Ip, Rp, Ip);
Chris@10 148
Chris@10 149 for (j = mb; j + batchsz < me; j += batchsz)
Chris@10 150 dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);
Chris@10 151
Chris@10 152 dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);
Chris@10 153
Chris@10 154 cldm->apply((plan *) cldm,
Chris@10 155 Rp + me * ms, Ip + me * ms,
Chris@10 156 Rp + me * ms, Ip + me * ms);
Chris@10 157
Chris@10 158 }
Chris@10 159
Chris@10 160 BUF_FREE(buf, bufsz);
Chris@10 161 }
Chris@10 162
Chris@10 163 /*************************************************************
Chris@10 164 common code
Chris@10 165 *************************************************************/
Chris@10 166 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@10 167 {
Chris@10 168 P *ego = (P *) ego_;
Chris@10 169
Chris@10 170 X(plan_awake)(ego->cld0, wakefulness);
Chris@10 171 X(plan_awake)(ego->cldm, wakefulness);
Chris@10 172 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
Chris@10 173 ego->r * ego->m, ego->r,
Chris@10 174 (ego->m - 1) / 2 + ego->extra_iter);
Chris@10 175 }
Chris@10 176
Chris@10 177 static void destroy(plan *ego_)
Chris@10 178 {
Chris@10 179 P *ego = (P *) ego_;
Chris@10 180 X(plan_destroy_internal)(ego->cld0);
Chris@10 181 X(plan_destroy_internal)(ego->cldm);
Chris@10 182 X(stride_destroy)(ego->rs);
Chris@10 183 X(stride_destroy)(ego->brs);
Chris@10 184 }
Chris@10 185
Chris@10 186 static void print(const plan *ego_, printer *p)
Chris@10 187 {
Chris@10 188 const P *ego = (const P *) ego_;
Chris@10 189 const S *slv = ego->slv;
Chris@10 190 const hc2c_desc *e = slv->desc;
Chris@10 191
Chris@10 192 if (slv->bufferedp)
Chris@10 193 p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@10 194 compute_batchsize(ego->r),
Chris@10 195 ego->r, X(twiddle_length)(ego->r, e->tw),
Chris@10 196 ego->extra_iter, ego->v, e->nam,
Chris@10 197 ego->cld0, ego->cldm);
Chris@10 198 else
Chris@10 199 p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@10 200 ego->r, X(twiddle_length)(ego->r, e->tw),
Chris@10 201 ego->extra_iter, ego->v, e->nam,
Chris@10 202 ego->cld0, ego->cldm);
Chris@10 203 }
Chris@10 204
Chris@10 205 static int applicable0(const S *ego, rdft_kind kind,
Chris@10 206 INT r, INT rs,
Chris@10 207 INT m, INT ms,
Chris@10 208 INT v, INT vs,
Chris@10 209 const R *cr, const R *ci,
Chris@10 210 const planner *plnr,
Chris@10 211 INT *extra_iter)
Chris@10 212 {
Chris@10 213 const hc2c_desc *e = ego->desc;
Chris@10 214 UNUSED(v);
Chris@10 215
Chris@10 216 return (
Chris@10 217 1
Chris@10 218 && r == e->radix
Chris@10 219 && kind == e->genus->kind
Chris@10 220
Chris@10 221 /* first v-loop iteration */
Chris@10 222 && ((*extra_iter = 0,
Chris@10 223 e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 224 rs, 1, (m+1)/2, ms, plnr))
Chris@10 225 ||
Chris@10 226 (*extra_iter = 1,
Chris@10 227 ((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 228 rs, 1, (m-1)/2, ms, plnr))
Chris@10 229 &&
Chris@10 230 (e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 231 rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
Chris@10 232
Chris@10 233 /* subsequent v-loop iterations */
Chris@10 234 && (cr += vs, ci += vs, 1)
Chris@10 235
Chris@10 236 && e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@10 237 rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
Chris@10 238 );
Chris@10 239 }
Chris@10 240
Chris@10 241 static int applicable0_buf(const S *ego, rdft_kind kind,
Chris@10 242 INT r, INT rs,
Chris@10 243 INT m, INT ms,
Chris@10 244 INT v, INT vs,
Chris@10 245 const R *cr, const R *ci,
Chris@10 246 const planner *plnr, INT *extra_iter)
Chris@10 247 {
Chris@10 248 const hc2c_desc *e = ego->desc;
Chris@10 249 INT batchsz, brs;
Chris@10 250 UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
Chris@10 251
Chris@10 252 return (
Chris@10 253 1
Chris@10 254 && r == e->radix
Chris@10 255 && kind == e->genus->kind
Chris@10 256
Chris@10 257 /* ignore cr, ci, use buffer */
Chris@10 258 && (cr = (const R *)0, ci = cr + 1,
Chris@10 259 batchsz = compute_batchsize(r),
Chris@10 260 brs = 4 * batchsz, 1)
Chris@10 261
Chris@10 262 && e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@10 263 brs, 1, 1+batchsz, 2, plnr)
Chris@10 264
Chris@10 265 && ((*extra_iter = 0,
Chris@10 266 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@10 267 brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
Chris@10 268 ||
Chris@10 269 (*extra_iter = 1,
Chris@10 270 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@10 271 brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
Chris@10 272
Chris@10 273 );
Chris@10 274 }
Chris@10 275
Chris@10 276 static int applicable(const S *ego, rdft_kind kind,
Chris@10 277 INT r, INT rs,
Chris@10 278 INT m, INT ms,
Chris@10 279 INT v, INT vs,
Chris@10 280 R *cr, R *ci,
Chris@10 281 const planner *plnr, INT *extra_iter)
Chris@10 282 {
Chris@10 283 if (ego->bufferedp) {
Chris@10 284 if (!applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@10 285 extra_iter))
Chris@10 286 return 0;
Chris@10 287 } else {
Chris@10 288 if (!applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@10 289 extra_iter))
Chris@10 290 return 0;
Chris@10 291 }
Chris@10 292
Chris@10 293 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
Chris@10 294 v, m * r, r))
Chris@10 295 return 0;
Chris@10 296
Chris@10 297 return 1;
Chris@10 298 }
Chris@10 299
Chris@10 300 static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
Chris@10 301 INT r, INT rs,
Chris@10 302 INT m, INT ms,
Chris@10 303 INT v, INT vs,
Chris@10 304 R *cr, R *ci,
Chris@10 305 planner *plnr)
Chris@10 306 {
Chris@10 307 const S *ego = (const S *) ego_;
Chris@10 308 P *pln;
Chris@10 309 const hc2c_desc *e = ego->desc;
Chris@10 310 plan *cld0 = 0, *cldm = 0;
Chris@10 311 INT imid = (m / 2) * ms;
Chris@10 312 INT extra_iter;
Chris@10 313
Chris@10 314 static const plan_adt padt = {
Chris@10 315 0, awake, print, destroy
Chris@10 316 };
Chris@10 317
Chris@10 318 if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@10 319 &extra_iter))
Chris@10 320 return (plan *)0;
Chris@10 321
Chris@10 322 cld0 = X(mkplan_d)(
Chris@10 323 plnr,
Chris@10 324 X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
Chris@10 325 X(mktensor_0d)(),
Chris@10 326 TAINT(cr, vs), TAINT(ci, vs),
Chris@10 327 TAINT(cr, vs), TAINT(ci, vs),
Chris@10 328 kind));
Chris@10 329 if (!cld0) goto nada;
Chris@10 330
Chris@10 331 cldm = X(mkplan_d)(
Chris@10 332 plnr,
Chris@10 333 X(mkproblem_rdft2_d)(((m % 2) ?
Chris@10 334 X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
Chris@10 335 X(mktensor_0d)(),
Chris@10 336 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
Chris@10 337 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
Chris@10 338 kind == R2HC ? R2HCII : HC2RIII));
Chris@10 339 if (!cldm) goto nada;
Chris@10 340
Chris@10 341 if (ego->bufferedp)
Chris@10 342 pln = MKPLAN_HC2C(P, &padt, apply_buf);
Chris@10 343 else
Chris@10 344 pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
Chris@10 345
Chris@10 346 pln->k = ego->k;
Chris@10 347 pln->td = 0;
Chris@10 348 pln->r = r; pln->rs = X(mkstride)(r, rs);
Chris@10 349 pln->m = m; pln->ms = ms;
Chris@10 350 pln->v = v; pln->vs = vs;
Chris@10 351 pln->slv = ego;
Chris@10 352 pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
Chris@10 353 pln->cld0 = cld0;
Chris@10 354 pln->cldm = cldm;
Chris@10 355 pln->extra_iter = extra_iter;
Chris@10 356
Chris@10 357 X(ops_zero)(&pln->super.super.ops);
Chris@10 358 X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
Chris@10 359 &e->ops, &pln->super.super.ops);
Chris@10 360 X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
Chris@10 361 X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
Chris@10 362
Chris@10 363 if (ego->bufferedp)
Chris@10 364 pln->super.super.ops.other += 4 * r * m * v;
Chris@10 365
Chris@10 366 return &(pln->super.super);
Chris@10 367
Chris@10 368 nada:
Chris@10 369 X(plan_destroy_internal)(cld0);
Chris@10 370 X(plan_destroy_internal)(cldm);
Chris@10 371 return 0;
Chris@10 372 }
Chris@10 373
Chris@10 374 static void regone(planner *plnr, khc2c codelet,
Chris@10 375 const hc2c_desc *desc,
Chris@10 376 hc2c_kind hc2ckind,
Chris@10 377 int bufferedp)
Chris@10 378 {
Chris@10 379 S *slv = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
Chris@10 380 slv->k = codelet;
Chris@10 381 slv->desc = desc;
Chris@10 382 slv->bufferedp = bufferedp;
Chris@10 383 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@10 384 }
Chris@10 385
Chris@10 386 void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
Chris@10 387 const hc2c_desc *desc,
Chris@10 388 hc2c_kind hc2ckind)
Chris@10 389 {
Chris@10 390 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */0);
Chris@10 391 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */1);
Chris@10 392 }