annotate src/fftw-3.3.3/rdft/hc2hc-direct.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21
Chris@10 22 #include "hc2hc.h"
Chris@10 23
Chris@10 24 typedef struct {
Chris@10 25 hc2hc_solver super;
Chris@10 26 const hc2hc_desc *desc;
Chris@10 27 khc2hc k;
Chris@10 28 int bufferedp;
Chris@10 29 } S;
Chris@10 30
Chris@10 31 typedef struct {
Chris@10 32 plan_hc2hc super;
Chris@10 33 khc2hc k;
Chris@10 34 plan *cld0, *cldm; /* children for 0th and middle butterflies */
Chris@10 35 INT r, m, v;
Chris@10 36 INT ms, vs, mb, me;
Chris@10 37 stride rs, brs;
Chris@10 38 twid *td;
Chris@10 39 const S *slv;
Chris@10 40 } P;
Chris@10 41
Chris@10 42 /*************************************************************
Chris@10 43 Nonbuffered code
Chris@10 44 *************************************************************/
Chris@10 45 static void apply(const plan *ego_, R *IO)
Chris@10 46 {
Chris@10 47 const P *ego = (const P *) ego_;
Chris@10 48 plan_rdft *cld0 = (plan_rdft *) ego->cld0;
Chris@10 49 plan_rdft *cldm = (plan_rdft *) ego->cldm;
Chris@10 50 INT i, m = ego->m, v = ego->v;
Chris@10 51 INT mb = ego->mb, me = ego->me;
Chris@10 52 INT ms = ego->ms, vs = ego->vs;
Chris@10 53
Chris@10 54 for (i = 0; i < v; ++i, IO += vs) {
Chris@10 55 cld0->apply((plan *) cld0, IO, IO);
Chris@10 56 ego->k(IO + ms * mb, IO + (m - mb) * ms,
Chris@10 57 ego->td->W, ego->rs, mb, me, ms);
Chris@10 58 cldm->apply((plan *) cldm, IO + (m/2) * ms, IO + (m/2) * ms);
Chris@10 59 }
Chris@10 60 }
Chris@10 61
Chris@10 62 /*************************************************************
Chris@10 63 Buffered code
Chris@10 64 *************************************************************/
Chris@10 65
Chris@10 66 /* should not be 2^k to avoid associativity conflicts */
Chris@10 67 static INT compute_batchsize(INT radix)
Chris@10 68 {
Chris@10 69 /* round up to multiple of 4 */
Chris@10 70 radix += 3;
Chris@10 71 radix &= -4;
Chris@10 72
Chris@10 73 return (radix + 2);
Chris@10 74 }
Chris@10 75
Chris@10 76 static void dobatch(const P *ego, R *IOp, R *IOm,
Chris@10 77 INT mb, INT me, R *bufp)
Chris@10 78 {
Chris@10 79 INT b = WS(ego->brs, 1);
Chris@10 80 INT rs = WS(ego->rs, 1);
Chris@10 81 INT r = ego->r;
Chris@10 82 INT ms = ego->ms;
Chris@10 83 R *bufm = bufp + b - 1;
Chris@10 84
Chris@10 85 X(cpy2d_ci)(IOp + mb * ms, bufp, r, rs, b, me - mb, ms, 1, 1);
Chris@10 86 X(cpy2d_ci)(IOm - mb * ms, bufm, r, rs, b, me - mb, -ms, -1, 1);
Chris@10 87
Chris@10 88 ego->k(bufp, bufm, ego->td->W, ego->brs, mb, me, 1);
Chris@10 89
Chris@10 90 X(cpy2d_co)(bufp, IOp + mb * ms, r, b, rs, me - mb, 1, ms, 1);
Chris@10 91 X(cpy2d_co)(bufm, IOm - mb * ms, r, b, rs, me - mb, -1, -ms, 1);
Chris@10 92 }
Chris@10 93
Chris@10 94 static void apply_buf(const plan *ego_, R *IO)
Chris@10 95 {
Chris@10 96 const P *ego = (const P *) ego_;
Chris@10 97 plan_rdft *cld0 = (plan_rdft *) ego->cld0;
Chris@10 98 plan_rdft *cldm = (plan_rdft *) ego->cldm;
Chris@10 99 INT i, j, m = ego->m, v = ego->v, r = ego->r;
Chris@10 100 INT mb = ego->mb, me = ego->me, ms = ego->ms;
Chris@10 101 INT batchsz = compute_batchsize(r);
Chris@10 102 R *buf;
Chris@10 103 size_t bufsz = r * batchsz * 2 * sizeof(R);
Chris@10 104
Chris@10 105 BUF_ALLOC(R *, buf, bufsz);
Chris@10 106
Chris@10 107 for (i = 0; i < v; ++i, IO += ego->vs) {
Chris@10 108 R *IOp = IO;
Chris@10 109 R *IOm = IO + m * ms;
Chris@10 110
Chris@10 111 cld0->apply((plan *) cld0, IO, IO);
Chris@10 112
Chris@10 113 for (j = mb; j + batchsz < me; j += batchsz)
Chris@10 114 dobatch(ego, IOp, IOm, j, j + batchsz, buf);
Chris@10 115
Chris@10 116 dobatch(ego, IOp, IOm, j, me, buf);
Chris@10 117
Chris@10 118 cldm->apply((plan *) cldm, IO + ms * (m/2), IO + ms * (m/2));
Chris@10 119 }
Chris@10 120
Chris@10 121 BUF_FREE(buf, bufsz);
Chris@10 122 }
Chris@10 123
Chris@10 124 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@10 125 {
Chris@10 126 P *ego = (P *) ego_;
Chris@10 127
Chris@10 128 X(plan_awake)(ego->cld0, wakefulness);
Chris@10 129 X(plan_awake)(ego->cldm, wakefulness);
Chris@10 130 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
Chris@10 131 ego->r * ego->m, ego->r, (ego->m - 1) / 2);
Chris@10 132 }
Chris@10 133
Chris@10 134 static void destroy(plan *ego_)
Chris@10 135 {
Chris@10 136 P *ego = (P *) ego_;
Chris@10 137 X(plan_destroy_internal)(ego->cld0);
Chris@10 138 X(plan_destroy_internal)(ego->cldm);
Chris@10 139 X(stride_destroy)(ego->rs);
Chris@10 140 X(stride_destroy)(ego->brs);
Chris@10 141 }
Chris@10 142
Chris@10 143 static void print(const plan *ego_, printer *p)
Chris@10 144 {
Chris@10 145 const P *ego = (const P *) ego_;
Chris@10 146 const S *slv = ego->slv;
Chris@10 147 const hc2hc_desc *e = slv->desc;
Chris@10 148 INT batchsz = compute_batchsize(ego->r);
Chris@10 149
Chris@10 150 if (slv->bufferedp)
Chris@10 151 p->print(p, "(hc2hc-directbuf/%D-%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@10 152 batchsz, ego->r, X(twiddle_length)(ego->r, e->tw),
Chris@10 153 ego->v, e->nam, ego->cld0, ego->cldm);
Chris@10 154 else
Chris@10 155 p->print(p, "(hc2hc-direct-%D/%D%v \"%s\"%(%p%)%(%p%))",
Chris@10 156 ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam,
Chris@10 157 ego->cld0, ego->cldm);
Chris@10 158 }
Chris@10 159
Chris@10 160 static int applicable0(const S *ego, rdft_kind kind, INT r)
Chris@10 161 {
Chris@10 162 const hc2hc_desc *e = ego->desc;
Chris@10 163
Chris@10 164 return (1
Chris@10 165 && r == e->radix
Chris@10 166 && kind == e->genus->kind
Chris@10 167 );
Chris@10 168 }
Chris@10 169
Chris@10 170 static int applicable(const S *ego, rdft_kind kind, INT r, INT m, INT v,
Chris@10 171 const planner *plnr)
Chris@10 172 {
Chris@10 173 if (!applicable0(ego, kind, r))
Chris@10 174 return 0;
Chris@10 175
Chris@10 176 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
Chris@10 177 v, m * r, r))
Chris@10 178 return 0;
Chris@10 179
Chris@10 180 return 1;
Chris@10 181 }
Chris@10 182
Chris@10 183 #define CLDMP(m, mstart, mcount) (2 * ((mstart) + (mcount)) == (m) + 2)
Chris@10 184 #define CLD0P(mstart) ((mstart) == 0)
Chris@10 185
Chris@10 186 static plan *mkcldw(const hc2hc_solver *ego_,
Chris@10 187 rdft_kind kind, INT r, INT m, INT ms, INT v, INT vs,
Chris@10 188 INT mstart, INT mcount,
Chris@10 189 R *IO, planner *plnr)
Chris@10 190 {
Chris@10 191 const S *ego = (const S *) ego_;
Chris@10 192 P *pln;
Chris@10 193 const hc2hc_desc *e = ego->desc;
Chris@10 194 plan *cld0 = 0, *cldm = 0;
Chris@10 195 INT imid = (m / 2) * ms;
Chris@10 196 INT rs = m * ms;
Chris@10 197
Chris@10 198 static const plan_adt padt = {
Chris@10 199 0, awake, print, destroy
Chris@10 200 };
Chris@10 201
Chris@10 202 if (!applicable(ego, kind, r, m, v, plnr))
Chris@10 203 return (plan *)0;
Chris@10 204
Chris@10 205 cld0 = X(mkplan_d)(
Chris@10 206 plnr,
Chris@10 207 X(mkproblem_rdft_1_d)((CLD0P(mstart) ?
Chris@10 208 X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
Chris@10 209 X(mktensor_0d)(),
Chris@10 210 TAINT(IO, vs), TAINT(IO, vs),
Chris@10 211 kind));
Chris@10 212 if (!cld0) goto nada;
Chris@10 213
Chris@10 214 cldm = X(mkplan_d)(
Chris@10 215 plnr,
Chris@10 216 X(mkproblem_rdft_1_d)((CLDMP(m, mstart, mcount) ?
Chris@10 217 X(mktensor_1d)(r, rs, rs) : X(mktensor_0d)()),
Chris@10 218 X(mktensor_0d)(),
Chris@10 219 TAINT(IO + imid, vs), TAINT(IO + imid, vs),
Chris@10 220 kind == R2HC ? R2HCII : HC2RIII));
Chris@10 221 if (!cldm) goto nada;
Chris@10 222
Chris@10 223 pln = MKPLAN_HC2HC(P, &padt, ego->bufferedp ? apply_buf : apply);
Chris@10 224
Chris@10 225 pln->k = ego->k;
Chris@10 226 pln->td = 0;
Chris@10 227 pln->r = r; pln->rs = X(mkstride)(r, rs);
Chris@10 228 pln->m = m; pln->ms = ms;
Chris@10 229 pln->v = v; pln->vs = vs;
Chris@10 230 pln->slv = ego;
Chris@10 231 pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
Chris@10 232 pln->cld0 = cld0;
Chris@10 233 pln->cldm = cldm;
Chris@10 234 pln->mb = mstart + CLD0P(mstart);
Chris@10 235 pln->me = mstart + mcount - CLDMP(m, mstart, mcount);
Chris@10 236
Chris@10 237 X(ops_zero)(&pln->super.super.ops);
Chris@10 238 X(ops_madd2)(v * ((pln->me - pln->mb) / e->genus->vl),
Chris@10 239 &e->ops, &pln->super.super.ops);
Chris@10 240 X(ops_madd2)(v, &cld0->ops, &pln->super.super.ops);
Chris@10 241 X(ops_madd2)(v, &cldm->ops, &pln->super.super.ops);
Chris@10 242
Chris@10 243 if (ego->bufferedp)
Chris@10 244 pln->super.super.ops.other += 4 * r * (pln->me - pln->mb) * v;
Chris@10 245
Chris@10 246 pln->super.super.could_prune_now_p =
Chris@10 247 (!ego->bufferedp && r >= 5 && r < 64 && m >= r);
Chris@10 248
Chris@10 249 return &(pln->super.super);
Chris@10 250
Chris@10 251 nada:
Chris@10 252 X(plan_destroy_internal)(cld0);
Chris@10 253 X(plan_destroy_internal)(cldm);
Chris@10 254 return 0;
Chris@10 255 }
Chris@10 256
Chris@10 257 static void regone(planner *plnr, khc2hc codelet, const hc2hc_desc *desc,
Chris@10 258 int bufferedp)
Chris@10 259 {
Chris@10 260 S *slv = (S *)X(mksolver_hc2hc)(sizeof(S), desc->radix, mkcldw);
Chris@10 261 slv->k = codelet;
Chris@10 262 slv->desc = desc;
Chris@10 263 slv->bufferedp = bufferedp;
Chris@10 264 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@10 265 if (X(mksolver_hc2hc_hook)) {
Chris@10 266 slv = (S *)X(mksolver_hc2hc_hook)(sizeof(S), desc->radix, mkcldw);
Chris@10 267 slv->k = codelet;
Chris@10 268 slv->desc = desc;
Chris@10 269 slv->bufferedp = bufferedp;
Chris@10 270 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@10 271 }
Chris@10 272 }
Chris@10 273
Chris@10 274 void X(regsolver_hc2hc_direct)(planner *plnr, khc2hc codelet,
Chris@10 275 const hc2hc_desc *desc)
Chris@10 276 {
Chris@10 277 regone(plnr, codelet, desc, /* bufferedp */0);
Chris@10 278 regone(plnr, codelet, desc, /* bufferedp */1);
Chris@10 279 }