annotate src/fftw-3.3.5/rdft/ct-hc2c-direct.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 #include "ct-hc2c.h"
Chris@42 23
Chris@42 24 typedef struct {
Chris@42 25 hc2c_solver super;
Chris@42 26 const hc2c_desc *desc;
Chris@42 27 int bufferedp;
Chris@42 28 khc2c k;
Chris@42 29 } S;
Chris@42 30
Chris@42 31 typedef struct {
Chris@42 32 plan_hc2c super;
Chris@42 33 khc2c k;
Chris@42 34 plan *cld0, *cldm; /* children for 0th and middle butterflies */
Chris@42 35 INT r, m, v, extra_iter;
Chris@42 36 INT ms, vs;
Chris@42 37 stride rs, brs;
Chris@42 38 twid *td;
Chris@42 39 const S *slv;
Chris@42 40 } P;
Chris@42 41
Chris@42 42 /*************************************************************
Chris@42 43 Nonbuffered code
Chris@42 44 *************************************************************/
Chris@42 45 static void apply(const plan *ego_, R *cr, R *ci)
Chris@42 46 {
Chris@42 47 const P *ego = (const P *) ego_;
Chris@42 48 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@42 49 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@42 50 INT i, m = ego->m, v = ego->v;
Chris@42 51 INT ms = ego->ms, vs = ego->vs;
Chris@42 52
Chris@42 53 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
Chris@42 54 cld0->apply((plan *) cld0, cr, ci, cr, ci);
Chris@42 55 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 56 ego->td->W, ego->rs, 1, (m+1)/2, ms);
Chris@42 57 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
Chris@42 58 cr + (m/2)*ms, ci + (m/2)*ms);
Chris@42 59 }
Chris@42 60 }
Chris@42 61
Chris@42 62 static void apply_extra_iter(const plan *ego_, R *cr, R *ci)
Chris@42 63 {
Chris@42 64 const P *ego = (const P *) ego_;
Chris@42 65 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@42 66 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@42 67 INT i, m = ego->m, v = ego->v;
Chris@42 68 INT ms = ego->ms, vs = ego->vs;
Chris@42 69 INT mm = (m-1)/2;
Chris@42 70
Chris@42 71 for (i = 0; i < v; ++i, cr += vs, ci += vs) {
Chris@42 72 cld0->apply((plan *) cld0, cr, ci, cr, ci);
Chris@42 73
Chris@42 74 /* for 4-way SIMD when (m+1)/2-1 is odd: iterate over an
Chris@42 75 even vector length MM-1, and then execute the last
Chris@42 76 iteration as a 2-vector with vector stride 0. The
Chris@42 77 twiddle factors of the second half of the last iteration
Chris@42 78 are bogus, but we only store the results of the first
Chris@42 79 half. */
Chris@42 80 ego->k(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 81 ego->td->W, ego->rs, 1, mm, ms);
Chris@42 82 ego->k(cr + mm*ms, ci + mm*ms, cr + (m-mm)*ms, ci + (m-mm)*ms,
Chris@42 83 ego->td->W, ego->rs, mm, mm+2, 0);
Chris@42 84 cldm->apply((plan *) cldm, cr + (m/2)*ms, ci + (m/2)*ms,
Chris@42 85 cr + (m/2)*ms, ci + (m/2)*ms);
Chris@42 86 }
Chris@42 87
Chris@42 88 }
Chris@42 89
Chris@42 90 /*************************************************************
Chris@42 91 Buffered code
Chris@42 92 *************************************************************/
Chris@42 93
Chris@42 94 /* should not be 2^k to avoid associativity conflicts */
Chris@42 95 static INT compute_batchsize(INT radix)
Chris@42 96 {
Chris@42 97 /* round up to multiple of 4 */
Chris@42 98 radix += 3;
Chris@42 99 radix &= -4;
Chris@42 100
Chris@42 101 return (radix + 2);
Chris@42 102 }
Chris@42 103
Chris@42 104 static void dobatch(const P *ego, R *Rp, R *Ip, R *Rm, R *Im,
Chris@42 105 INT mb, INT me, INT extra_iter, R *bufp)
Chris@42 106 {
Chris@42 107 INT b = WS(ego->brs, 1);
Chris@42 108 INT rs = WS(ego->rs, 1);
Chris@42 109 INT ms = ego->ms;
Chris@42 110 R *bufm = bufp + b - 2;
Chris@42 111 INT n = me - mb;
Chris@42 112
Chris@42 113 X(cpy2d_pair_ci)(Rp + mb * ms, Ip + mb * ms, bufp, bufp + 1,
Chris@42 114 ego->r / 2, rs, b,
Chris@42 115 n, ms, 2);
Chris@42 116 X(cpy2d_pair_ci)(Rm - mb * ms, Im - mb * ms, bufm, bufm + 1,
Chris@42 117 ego->r / 2, rs, b,
Chris@42 118 n, -ms, -2);
Chris@42 119
Chris@42 120 if (extra_iter) {
Chris@42 121 /* initialize the extra_iter element to 0. It would be ok
Chris@42 122 to leave it uninitialized, since we transform uninitialized
Chris@42 123 data and ignore the result. However, we want to avoid
Chris@42 124 FP exceptions in case somebody is trapping them. */
Chris@42 125 A(n < compute_batchsize(ego->r));
Chris@42 126 X(zero1d_pair)(bufp + 2*n, bufp + 1 + 2*n, ego->r / 2, b);
Chris@42 127 X(zero1d_pair)(bufm - 2*n, bufm + 1 - 2*n, ego->r / 2, b);
Chris@42 128 }
Chris@42 129
Chris@42 130 ego->k(bufp, bufp + 1, bufm, bufm + 1, ego->td->W,
Chris@42 131 ego->brs, mb, me + extra_iter, 2);
Chris@42 132 X(cpy2d_pair_co)(bufp, bufp + 1, Rp + mb * ms, Ip + mb * ms,
Chris@42 133 ego->r / 2, b, rs,
Chris@42 134 n, 2, ms);
Chris@42 135 X(cpy2d_pair_co)(bufm, bufm + 1, Rm - mb * ms, Im - mb * ms,
Chris@42 136 ego->r / 2, b, rs,
Chris@42 137 n, -2, -ms);
Chris@42 138 }
Chris@42 139
Chris@42 140 static void apply_buf(const plan *ego_, R *cr, R *ci)
Chris@42 141 {
Chris@42 142 const P *ego = (const P *) ego_;
Chris@42 143 plan_rdft2 *cld0 = (plan_rdft2 *) ego->cld0;
Chris@42 144 plan_rdft2 *cldm = (plan_rdft2 *) ego->cldm;
Chris@42 145 INT i, j, ms = ego->ms, v = ego->v;
Chris@42 146 INT batchsz = compute_batchsize(ego->r);
Chris@42 147 R *buf;
Chris@42 148 INT mb = 1, me = (ego->m+1) / 2;
Chris@42 149 size_t bufsz = ego->r * batchsz * 2 * sizeof(R);
Chris@42 150
Chris@42 151 BUF_ALLOC(R *, buf, bufsz);
Chris@42 152
Chris@42 153 for (i = 0; i < v; ++i, cr += ego->vs, ci += ego->vs) {
Chris@42 154 R *Rp = cr;
Chris@42 155 R *Ip = ci;
Chris@42 156 R *Rm = cr + ego->m * ms;
Chris@42 157 R *Im = ci + ego->m * ms;
Chris@42 158
Chris@42 159 cld0->apply((plan *) cld0, Rp, Ip, Rp, Ip);
Chris@42 160
Chris@42 161 for (j = mb; j + batchsz < me; j += batchsz)
Chris@42 162 dobatch(ego, Rp, Ip, Rm, Im, j, j + batchsz, 0, buf);
Chris@42 163
Chris@42 164 dobatch(ego, Rp, Ip, Rm, Im, j, me, ego->extra_iter, buf);
Chris@42 165
Chris@42 166 cldm->apply((plan *) cldm,
Chris@42 167 Rp + me * ms, Ip + me * ms,
Chris@42 168 Rp + me * ms, Ip + me * ms);
Chris@42 169
Chris@42 170 }
Chris@42 171
Chris@42 172 BUF_FREE(buf, bufsz);
Chris@42 173 }
Chris@42 174
Chris@42 175 /*************************************************************
Chris@42 176 common code
Chris@42 177 *************************************************************/
Chris@42 178 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@42 179 {
Chris@42 180 P *ego = (P *) ego_;
Chris@42 181
Chris@42 182 X(plan_awake)(ego->cld0, wakefulness);
Chris@42 183 X(plan_awake)(ego->cldm, wakefulness);
Chris@42 184 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
Chris@42 185 ego->r * ego->m, ego->r,
Chris@42 186 (ego->m - 1) / 2 + ego->extra_iter);
Chris@42 187 }
Chris@42 188
Chris@42 189 static void destroy(plan *ego_)
Chris@42 190 {
Chris@42 191 P *ego = (P *) ego_;
Chris@42 192 X(plan_destroy_internal)(ego->cld0);
Chris@42 193 X(plan_destroy_internal)(ego->cldm);
Chris@42 194 X(stride_destroy)(ego->rs);
Chris@42 195 X(stride_destroy)(ego->brs);
Chris@42 196 }
Chris@42 197
Chris@42 198 static void print(const plan *ego_, printer *p)
Chris@42 199 {
Chris@42 200 const P *ego = (const P *) ego_;
Chris@42 201 const S *slv = ego->slv;
Chris@42 202 const hc2c_desc *e = slv->desc;
Chris@42 203
Chris@42 204 if (slv->bufferedp)
Chris@42 205 p->print(p, "(hc2c-directbuf/%D-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@42 206 compute_batchsize(ego->r),
Chris@42 207 ego->r, X(twiddle_length)(ego->r, e->tw),
Chris@42 208 ego->extra_iter, ego->v, e->nam,
Chris@42 209 ego->cld0, ego->cldm);
Chris@42 210 else
Chris@42 211 p->print(p, "(hc2c-direct-%D/%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@42 212 ego->r, X(twiddle_length)(ego->r, e->tw),
Chris@42 213 ego->extra_iter, ego->v, e->nam,
Chris@42 214 ego->cld0, ego->cldm);
Chris@42 215 }
Chris@42 216
Chris@42 217 static int applicable0(const S *ego, rdft_kind kind,
Chris@42 218 INT r, INT rs,
Chris@42 219 INT m, INT ms,
Chris@42 220 INT v, INT vs,
Chris@42 221 const R *cr, const R *ci,
Chris@42 222 const planner *plnr,
Chris@42 223 INT *extra_iter)
Chris@42 224 {
Chris@42 225 const hc2c_desc *e = ego->desc;
Chris@42 226 UNUSED(v);
Chris@42 227
Chris@42 228 return (
Chris@42 229 1
Chris@42 230 && r == e->radix
Chris@42 231 && kind == e->genus->kind
Chris@42 232
Chris@42 233 /* first v-loop iteration */
Chris@42 234 && ((*extra_iter = 0,
Chris@42 235 e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 236 rs, 1, (m+1)/2, ms, plnr))
Chris@42 237 ||
Chris@42 238 (*extra_iter = 1,
Chris@42 239 ((e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 240 rs, 1, (m-1)/2, ms, plnr))
Chris@42 241 &&
Chris@42 242 (e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 243 rs, (m-1)/2, (m-1)/2 + 2, 0, plnr)))))
Chris@42 244
Chris@42 245 /* subsequent v-loop iterations */
Chris@42 246 && (cr += vs, ci += vs, 1)
Chris@42 247
Chris@42 248 && e->genus->okp(cr + ms, ci + ms, cr + (m-1)*ms, ci + (m-1)*ms,
Chris@42 249 rs, 1, (m+1)/2 - *extra_iter, ms, plnr)
Chris@42 250 );
Chris@42 251 }
Chris@42 252
Chris@42 253 static int applicable0_buf(const S *ego, rdft_kind kind,
Chris@42 254 INT r, INT rs,
Chris@42 255 INT m, INT ms,
Chris@42 256 INT v, INT vs,
Chris@42 257 const R *cr, const R *ci,
Chris@42 258 const planner *plnr, INT *extra_iter)
Chris@42 259 {
Chris@42 260 const hc2c_desc *e = ego->desc;
Chris@42 261 INT batchsz, brs;
Chris@42 262 UNUSED(v); UNUSED(rs); UNUSED(ms); UNUSED(vs);
Chris@42 263
Chris@42 264 return (
Chris@42 265 1
Chris@42 266 && r == e->radix
Chris@42 267 && kind == e->genus->kind
Chris@42 268
Chris@42 269 /* ignore cr, ci, use buffer */
Chris@42 270 && (cr = (const R *)0, ci = cr + 1,
Chris@42 271 batchsz = compute_batchsize(r),
Chris@42 272 brs = 4 * batchsz, 1)
Chris@42 273
Chris@42 274 && e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@42 275 brs, 1, 1+batchsz, 2, plnr)
Chris@42 276
Chris@42 277 && ((*extra_iter = 0,
Chris@42 278 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@42 279 brs, 1, 1 + (((m-1)/2) % batchsz), 2, plnr))
Chris@42 280 ||
Chris@42 281 (*extra_iter = 1,
Chris@42 282 e->genus->okp(cr, ci, cr + brs - 2, ci + brs - 2,
Chris@42 283 brs, 1, 1 + 1 + (((m-1)/2) % batchsz), 2, plnr)))
Chris@42 284
Chris@42 285 );
Chris@42 286 }
Chris@42 287
Chris@42 288 static int applicable(const S *ego, rdft_kind kind,
Chris@42 289 INT r, INT rs,
Chris@42 290 INT m, INT ms,
Chris@42 291 INT v, INT vs,
Chris@42 292 R *cr, R *ci,
Chris@42 293 const planner *plnr, INT *extra_iter)
Chris@42 294 {
Chris@42 295 if (ego->bufferedp) {
Chris@42 296 if (!applicable0_buf(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@42 297 extra_iter))
Chris@42 298 return 0;
Chris@42 299 } else {
Chris@42 300 if (!applicable0(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@42 301 extra_iter))
Chris@42 302 return 0;
Chris@42 303 }
Chris@42 304
Chris@42 305 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
Chris@42 306 v, m * r, r))
Chris@42 307 return 0;
Chris@42 308
Chris@42 309 return 1;
Chris@42 310 }
Chris@42 311
Chris@42 312 static plan *mkcldw(const hc2c_solver *ego_, rdft_kind kind,
Chris@42 313 INT r, INT rs,
Chris@42 314 INT m, INT ms,
Chris@42 315 INT v, INT vs,
Chris@42 316 R *cr, R *ci,
Chris@42 317 planner *plnr)
Chris@42 318 {
Chris@42 319 const S *ego = (const S *) ego_;
Chris@42 320 P *pln;
Chris@42 321 const hc2c_desc *e = ego->desc;
Chris@42 322 plan *cld0 = 0, *cldm = 0;
Chris@42 323 INT imid = (m / 2) * ms;
Chris@42 324 INT extra_iter;
Chris@42 325
Chris@42 326 static const plan_adt padt = {
Chris@42 327 0, awake, print, destroy
Chris@42 328 };
Chris@42 329
Chris@42 330 if (!applicable(ego, kind, r, rs, m, ms, v, vs, cr, ci, plnr,
Chris@42 331 &extra_iter))
Chris@42 332 return (plan *)0;
Chris@42 333
Chris@42 334 cld0 = X(mkplan_d)(
Chris@42 335 plnr,
Chris@42 336 X(mkproblem_rdft2_d)(X(mktensor_1d)(r, rs, rs),
Chris@42 337 X(mktensor_0d)(),
Chris@42 338 TAINT(cr, vs), TAINT(ci, vs),
Chris@42 339 TAINT(cr, vs), TAINT(ci, vs),
Chris@42 340 kind));
Chris@42 341 if (!cld0) goto nada;
Chris@42 342
Chris@42 343 cldm = X(mkplan_d)(
Chris@42 344 plnr,
Chris@42 345 X(mkproblem_rdft2_d)(((m % 2) ?
Chris@42 346 X(mktensor_0d)() : X(mktensor_1d)(r, rs, rs) ),
Chris@42 347 X(mktensor_0d)(),
Chris@42 348 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
Chris@42 349 TAINT(cr + imid, vs), TAINT(ci + imid, vs),
Chris@42 350 kind == R2HC ? R2HCII : HC2RIII));
Chris@42 351 if (!cldm) goto nada;
Chris@42 352
Chris@42 353 if (ego->bufferedp)
Chris@42 354 pln = MKPLAN_HC2C(P, &padt, apply_buf);
Chris@42 355 else
Chris@42 356 pln = MKPLAN_HC2C(P, &padt, extra_iter ? apply_extra_iter : apply);
Chris@42 357
Chris@42 358 pln->k = ego->k;
Chris@42 359 pln->td = 0;
Chris@42 360 pln->r = r; pln->rs = X(mkstride)(r, rs);
Chris@42 361 pln->m = m; pln->ms = ms;
Chris@42 362 pln->v = v; pln->vs = vs;
Chris@42 363 pln->slv = ego;
Chris@42 364 pln->brs = X(mkstride)(r, 4 * compute_batchsize(r));
Chris@42 365 pln->cld0 = cld0;
Chris@42 366 pln->cldm = cldm;
Chris@42 367 pln->extra_iter = extra_iter;
Chris@42 368
Chris@42 369 X(ops_zero)(&pln->super.super.ops);
Chris@42 370 X(ops_madd2)(v * (((m - 1) / 2) / e->genus->vl),
Chris@42 371 &e->ops, &pln->super.super.ops);
Chris@42 372 X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
Chris@42 373 X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
Chris@42 374
Chris@42 375 if (ego->bufferedp)
Chris@42 376 pln->super.super.ops.other += 4 * r * m * v;
Chris@42 377
Chris@42 378 return &(pln->super.super);
Chris@42 379
Chris@42 380 nada:
Chris@42 381 X(plan_destroy_internal)(cld0);
Chris@42 382 X(plan_destroy_internal)(cldm);
Chris@42 383 return 0;
Chris@42 384 }
Chris@42 385
Chris@42 386 static void regone(planner *plnr, khc2c codelet,
Chris@42 387 const hc2c_desc *desc,
Chris@42 388 hc2c_kind hc2ckind,
Chris@42 389 int bufferedp)
Chris@42 390 {
Chris@42 391 S *slv = (S *)X(mksolver_hc2c)(sizeof(S), desc->radix, hc2ckind, mkcldw);
Chris@42 392 slv->k = codelet;
Chris@42 393 slv->desc = desc;
Chris@42 394 slv->bufferedp = bufferedp;
Chris@42 395 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@42 396 }
Chris@42 397
Chris@42 398 void X(regsolver_hc2c_direct)(planner *plnr, khc2c codelet,
Chris@42 399 const hc2c_desc *desc,
Chris@42 400 hc2c_kind hc2ckind)
Chris@42 401 {
Chris@42 402 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */0);
Chris@42 403 regone(plnr, codelet, desc, hc2ckind, /* bufferedp */1);
Chris@42 404 }