annotate src/fftw-3.3.5/dft/dftw-direct.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 #include "ct.h"
Chris@42 23
Chris@42 24 typedef struct {
Chris@42 25 ct_solver super;
Chris@42 26 const ct_desc *desc;
Chris@42 27 int bufferedp;
Chris@42 28 kdftw k;
Chris@42 29 } S;
Chris@42 30
Chris@42 31 typedef struct {
Chris@42 32 plan_dftw super;
Chris@42 33 kdftw k;
Chris@42 34 INT r;
Chris@42 35 stride rs;
Chris@42 36 INT m, ms, v, vs, mb, me, extra_iter;
Chris@42 37 stride brs;
Chris@42 38 twid *td;
Chris@42 39 const S *slv;
Chris@42 40 } P;
Chris@42 41
Chris@42 42
Chris@42 43 /*************************************************************
Chris@42 44 Nonbuffered code
Chris@42 45 *************************************************************/
Chris@42 46 static void apply(const plan *ego_, R *rio, R *iio)
Chris@42 47 {
Chris@42 48 const P *ego = (const P *) ego_;
Chris@42 49 INT i;
Chris@42 50 ASSERT_ALIGNED_DOUBLE;
Chris@42 51 for (i = 0; i < ego->v; ++i, rio += ego->vs, iio += ego->vs) {
Chris@42 52 INT mb = ego->mb, ms = ego->ms;
Chris@42 53 ego->k(rio + mb*ms, iio + mb*ms, ego->td->W,
Chris@42 54 ego->rs, mb, ego->me, ms);
Chris@42 55 }
Chris@42 56 }
Chris@42 57
Chris@42 58 static void apply_extra_iter(const plan *ego_, R *rio, R *iio)
Chris@42 59 {
Chris@42 60 const P *ego = (const P *) ego_;
Chris@42 61 INT i, v = ego->v, vs = ego->vs;
Chris@42 62 INT mb = ego->mb, me = ego->me, mm = me - 1, ms = ego->ms;
Chris@42 63 ASSERT_ALIGNED_DOUBLE;
Chris@42 64 for (i = 0; i < v; ++i, rio += vs, iio += vs) {
Chris@42 65 ego->k(rio + mb*ms, iio + mb*ms, ego->td->W,
Chris@42 66 ego->rs, mb, mm, ms);
Chris@42 67 ego->k(rio + mm*ms, iio + mm*ms, ego->td->W,
Chris@42 68 ego->rs, mm, mm+2, 0);
Chris@42 69 }
Chris@42 70 }
Chris@42 71
Chris@42 72 /*************************************************************
Chris@42 73 Buffered code
Chris@42 74 *************************************************************/
Chris@42 75 static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf)
Chris@42 76 {
Chris@42 77 INT brs = WS(ego->brs, 1);
Chris@42 78 INT rs = WS(ego->rs, 1);
Chris@42 79 INT ms = ego->ms;
Chris@42 80
Chris@42 81 X(cpy2d_pair_ci)(rA + mb*ms, iA + mb*ms, buf, buf + 1,
Chris@42 82 ego->r, rs, brs,
Chris@42 83 me - mb, ms, 2);
Chris@42 84 ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2);
Chris@42 85 X(cpy2d_pair_co)(buf, buf + 1, rA + mb*ms, iA + mb*ms,
Chris@42 86 ego->r, brs, rs,
Chris@42 87 me - mb, 2, ms);
Chris@42 88 }
Chris@42 89
Chris@42 90 /* must be even for SIMD alignment; should not be 2^k to avoid
Chris@42 91 associativity conflicts */
Chris@42 92 static INT compute_batchsize(INT radix)
Chris@42 93 {
Chris@42 94 /* round up to multiple of 4 */
Chris@42 95 radix += 3;
Chris@42 96 radix &= -4;
Chris@42 97
Chris@42 98 return (radix + 2);
Chris@42 99 }
Chris@42 100
Chris@42 101 static void apply_buf(const plan *ego_, R *rio, R *iio)
Chris@42 102 {
Chris@42 103 const P *ego = (const P *) ego_;
Chris@42 104 INT i, j, v = ego->v, r = ego->r;
Chris@42 105 INT batchsz = compute_batchsize(r);
Chris@42 106 R *buf;
Chris@42 107 INT mb = ego->mb, me = ego->me;
Chris@42 108 size_t bufsz = r * batchsz * 2 * sizeof(R);
Chris@42 109
Chris@42 110 BUF_ALLOC(R *, buf, bufsz);
Chris@42 111
Chris@42 112 for (i = 0; i < v; ++i, rio += ego->vs, iio += ego->vs) {
Chris@42 113 for (j = mb; j + batchsz < me; j += batchsz)
Chris@42 114 dobatch(ego, rio, iio, j, j + batchsz, buf);
Chris@42 115
Chris@42 116 dobatch(ego, rio, iio, j, me, buf);
Chris@42 117 }
Chris@42 118
Chris@42 119 BUF_FREE(buf, bufsz);
Chris@42 120 }
Chris@42 121
Chris@42 122 /*************************************************************
Chris@42 123 common code
Chris@42 124 *************************************************************/
Chris@42 125 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@42 126 {
Chris@42 127 P *ego = (P *) ego_;
Chris@42 128
Chris@42 129 X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
Chris@42 130 ego->r * ego->m, ego->r, ego->m + ego->extra_iter);
Chris@42 131 }
Chris@42 132
Chris@42 133 static void destroy(plan *ego_)
Chris@42 134 {
Chris@42 135 P *ego = (P *) ego_;
Chris@42 136 X(stride_destroy)(ego->brs);
Chris@42 137 X(stride_destroy)(ego->rs);
Chris@42 138 }
Chris@42 139
Chris@42 140 static void print(const plan *ego_, printer *p)
Chris@42 141 {
Chris@42 142 const P *ego = (const P *) ego_;
Chris@42 143 const S *slv = ego->slv;
Chris@42 144 const ct_desc *e = slv->desc;
Chris@42 145
Chris@42 146 if (slv->bufferedp)
Chris@42 147 p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")",
Chris@42 148 compute_batchsize(ego->r), ego->r,
Chris@42 149 X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
Chris@42 150 else
Chris@42 151 p->print(p, "(dftw-direct-%D/%D%v \"%s\")",
Chris@42 152 ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
Chris@42 153 }
Chris@42 154
Chris@42 155 static int applicable0(const S *ego,
Chris@42 156 INT r, INT irs, INT ors,
Chris@42 157 INT m, INT ms,
Chris@42 158 INT v, INT ivs, INT ovs,
Chris@42 159 INT mb, INT me,
Chris@42 160 R *rio, R *iio,
Chris@42 161 const planner *plnr, INT *extra_iter)
Chris@42 162 {
Chris@42 163 const ct_desc *e = ego->desc;
Chris@42 164 UNUSED(v);
Chris@42 165
Chris@42 166 return (
Chris@42 167 1
Chris@42 168 && r == e->radix
Chris@42 169 && irs == ors /* in-place along R */
Chris@42 170 && ivs == ovs /* in-place along V */
Chris@42 171
Chris@42 172 /* check for alignment/vector length restrictions */
Chris@42 173 && ((*extra_iter = 0,
Chris@42 174 e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr))
Chris@42 175 ||
Chris@42 176 (*extra_iter = 1,
Chris@42 177 (1
Chris@42 178 /* FIXME: require full array, otherwise some threads
Chris@42 179 may be extra_iter and other threads won't be.
Chris@42 180 Generating the proper twiddle factors is a pain in
Chris@42 181 this case */
Chris@42 182 && mb == 0 && me == m
Chris@42 183 && e->genus->okp(e, rio, iio, irs, ivs,
Chris@42 184 m, mb, me - 1, ms, plnr)
Chris@42 185 && e->genus->okp(e, rio, iio, irs, ivs,
Chris@42 186 m, me - 1, me + 1, ms, plnr))))
Chris@42 187
Chris@42 188 && (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs,
Chris@42 189 m, mb, me - *extra_iter, ms, plnr))
Chris@42 190
Chris@42 191 );
Chris@42 192 }
Chris@42 193
Chris@42 194 static int applicable0_buf(const S *ego,
Chris@42 195 INT r, INT irs, INT ors,
Chris@42 196 INT m, INT ms,
Chris@42 197 INT v, INT ivs, INT ovs,
Chris@42 198 INT mb, INT me,
Chris@42 199 R *rio, R *iio,
Chris@42 200 const planner *plnr)
Chris@42 201 {
Chris@42 202 const ct_desc *e = ego->desc;
Chris@42 203 INT batchsz;
Chris@42 204 UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio);
Chris@42 205
Chris@42 206 return (
Chris@42 207 1
Chris@42 208 && r == e->radix
Chris@42 209 && irs == ors /* in-place along R */
Chris@42 210 && ivs == ovs /* in-place along V */
Chris@42 211
Chris@42 212 /* check for alignment/vector length restrictions, both for
Chris@42 213 batchsize and for the remainder */
Chris@42 214 && (batchsz = compute_batchsize(r), 1)
Chris@42 215 && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
Chris@42 216 m, mb, mb + batchsz, 2, plnr))
Chris@42 217 && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
Chris@42 218 m, mb, me, 2, plnr))
Chris@42 219 );
Chris@42 220 }
Chris@42 221
Chris@42 222 static int applicable(const S *ego,
Chris@42 223 INT r, INT irs, INT ors,
Chris@42 224 INT m, INT ms,
Chris@42 225 INT v, INT ivs, INT ovs,
Chris@42 226 INT mb, INT me,
Chris@42 227 R *rio, R *iio,
Chris@42 228 const planner *plnr, INT *extra_iter)
Chris@42 229 {
Chris@42 230 if (ego->bufferedp) {
Chris@42 231 *extra_iter = 0;
Chris@42 232 if (!applicable0_buf(ego,
Chris@42 233 r, irs, ors, m, ms, v, ivs, ovs, mb, me,
Chris@42 234 rio, iio, plnr))
Chris@42 235 return 0;
Chris@42 236 } else {
Chris@42 237 if (!applicable0(ego,
Chris@42 238 r, irs, ors, m, ms, v, ivs, ovs, mb, me,
Chris@42 239 rio, iio, plnr, extra_iter))
Chris@42 240 return 0;
Chris@42 241 }
Chris@42 242
Chris@42 243 if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
Chris@42 244 v, m * r, r))
Chris@42 245 return 0;
Chris@42 246
Chris@42 247 if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr))
Chris@42 248 return 0;
Chris@42 249
Chris@42 250 return 1;
Chris@42 251 }
Chris@42 252
Chris@42 253 static plan *mkcldw(const ct_solver *ego_,
Chris@42 254 INT r, INT irs, INT ors,
Chris@42 255 INT m, INT ms,
Chris@42 256 INT v, INT ivs, INT ovs,
Chris@42 257 INT mstart, INT mcount,
Chris@42 258 R *rio, R *iio,
Chris@42 259 planner *plnr)
Chris@42 260 {
Chris@42 261 const S *ego = (const S *) ego_;
Chris@42 262 P *pln;
Chris@42 263 const ct_desc *e = ego->desc;
Chris@42 264 INT extra_iter;
Chris@42 265
Chris@42 266 static const plan_adt padt = {
Chris@42 267 0, awake, print, destroy
Chris@42 268 };
Chris@42 269
Chris@42 270 A(mstart >= 0 && mstart + mcount <= m);
Chris@42 271 if (!applicable(ego,
Chris@42 272 r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
Chris@42 273 rio, iio, plnr, &extra_iter))
Chris@42 274 return (plan *)0;
Chris@42 275
Chris@42 276 if (ego->bufferedp) {
Chris@42 277 pln = MKPLAN_DFTW(P, &padt, apply_buf);
Chris@42 278 } else {
Chris@42 279 pln = MKPLAN_DFTW(P, &padt, extra_iter ? apply_extra_iter : apply);
Chris@42 280 }
Chris@42 281
Chris@42 282 pln->k = ego->k;
Chris@42 283 pln->rs = X(mkstride)(r, irs);
Chris@42 284 pln->td = 0;
Chris@42 285 pln->r = r;
Chris@42 286 pln->m = m;
Chris@42 287 pln->ms = ms;
Chris@42 288 pln->v = v;
Chris@42 289 pln->vs = ivs;
Chris@42 290 pln->mb = mstart;
Chris@42 291 pln->me = mstart + mcount;
Chris@42 292 pln->slv = ego;
Chris@42 293 pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
Chris@42 294 pln->extra_iter = extra_iter;
Chris@42 295
Chris@42 296 X(ops_zero)(&pln->super.super.ops);
Chris@42 297 X(ops_madd2)(v * (mcount/e->genus->vl), &e->ops, &pln->super.super.ops);
Chris@42 298
Chris@42 299 if (ego->bufferedp) {
Chris@42 300 /* 8 load/stores * N * V */
Chris@42 301 pln->super.super.ops.other += 8 * r * mcount * v;
Chris@42 302 }
Chris@42 303
Chris@42 304 pln->super.super.could_prune_now_p =
Chris@42 305 (!ego->bufferedp && r >= 5 && r < 64 && m >= r);
Chris@42 306 return &(pln->super.super);
Chris@42 307 }
Chris@42 308
Chris@42 309 static void regone(planner *plnr, kdftw codelet,
Chris@42 310 const ct_desc *desc, int dec, int bufferedp)
Chris@42 311 {
Chris@42 312 S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
Chris@42 313 slv->k = codelet;
Chris@42 314 slv->desc = desc;
Chris@42 315 slv->bufferedp = bufferedp;
Chris@42 316 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@42 317 if (X(mksolver_ct_hook)) {
Chris@42 318 slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix,
Chris@42 319 dec, mkcldw, 0);
Chris@42 320 slv->k = codelet;
Chris@42 321 slv->desc = desc;
Chris@42 322 slv->bufferedp = bufferedp;
Chris@42 323 REGISTER_SOLVER(plnr, &(slv->super.super));
Chris@42 324 }
Chris@42 325 }
Chris@42 326
Chris@42 327 void X(regsolver_ct_directw)(planner *plnr, kdftw codelet,
Chris@42 328 const ct_desc *desc, int dec)
Chris@42 329 {
Chris@42 330 regone(plnr, codelet, desc, dec, /* bufferedp */ 0);
Chris@42 331 regone(plnr, codelet, desc, dec, /* bufferedp */ 1);
Chris@42 332 }