annotate src/fftw-3.3.8/mpi/dft-rank1.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* Complex DFTs of rank == 1 via six-step algorithm. */
Chris@82 22
Chris@82 23 #include "mpi-dft.h"
Chris@82 24 #include "mpi-transpose.h"
Chris@82 25 #include "dft/dft.h"
Chris@82 26
Chris@82 27 typedef struct {
Chris@82 28 solver super;
Chris@82 29 rdftapply apply; /* apply_ddft_first or apply_ddft_last */
Chris@82 30 int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
Chris@82 31 } S;
Chris@82 32
Chris@82 33 typedef struct {
Chris@82 34 plan_mpi_dft super;
Chris@82 35
Chris@82 36 triggen *t;
Chris@82 37 plan *cldt, *cld_ddft, *cld_dft;
Chris@82 38 INT roff, ioff;
Chris@82 39 int preserve_input;
Chris@82 40 INT vn, xmin, xmax, xs, m, r;
Chris@82 41 } P;
Chris@82 42
Chris@82 43 static void do_twiddle(triggen *t, INT ir, INT m, INT vn, R *xr, R *xi)
Chris@82 44 {
Chris@82 45 void (*rotate)(triggen *, INT, R, R, R *) = t->rotate;
Chris@82 46 INT im, iv;
Chris@82 47 for (im = 0; im < m; ++im)
Chris@82 48 for (iv = 0; iv < vn; ++iv) {
Chris@82 49 /* TODO: modify/inline rotate function
Chris@82 50 so that it can do whole vn vector at once? */
Chris@82 51 R c[2];
Chris@82 52 rotate(t, ir * im, *xr, *xi, c);
Chris@82 53 *xr = c[0]; *xi = c[1];
Chris@82 54 xr += 2; xi += 2;
Chris@82 55 }
Chris@82 56 }
Chris@82 57
Chris@82 58 /* radix-r DFT of size r*m. This is equivalent to an m x r 2d DFT,
Chris@82 59 plus twiddle factors between the size-m and size-r 1d DFTs, where
Chris@82 60 the m dimension is initially distributed. The output is transposed
Chris@82 61 to r x m where the r dimension is distributed.
Chris@82 62
Chris@82 63 This algorithm follows the general sequence:
Chris@82 64 global transpose (m x r -> r x m)
Chris@82 65 DFTs of size m
Chris@82 66 multiply by twiddles + global transpose (r x m -> m x r)
Chris@82 67 DFTs of size r
Chris@82 68 global transpose (m x r -> r x m)
Chris@82 69 where the multiplication by twiddles can come before or after
Chris@82 70 the middle transpose. The first/last transposes are omitted
Chris@82 71 for SCRAMBLED_IN/OUT formats, respectively.
Chris@82 72
Chris@82 73 However, we wish to exploit our dft-rank1-bigvec solver, which
Chris@82 74 solves a vector of distributed DFTs via transpose+dft+transpose.
Chris@82 75 Therefore, we can group *either* the DFTs of size m *or* the
Chris@82 76 DFTs of size r with their surrounding transposes as a single
Chris@82 77 distributed-DFT (ddft) plan. These two variations correspond to
Chris@82 78 apply_ddft_first or apply_ddft_last, respectively.
Chris@82 79 */
Chris@82 80
Chris@82 81 static void apply_ddft_first(const plan *ego_, R *I, R *O)
Chris@82 82 {
Chris@82 83 const P *ego = (const P *) ego_;
Chris@82 84 plan_dft *cld_dft;
Chris@82 85 plan_rdft *cldt, *cld_ddft;
Chris@82 86 INT roff, ioff, im, mmax, ms, r, vn;
Chris@82 87 triggen *t;
Chris@82 88 R *dI, *dO;
Chris@82 89
Chris@82 90 /* distributed size-m DFTs, with output in m x r format */
Chris@82 91 cld_ddft = (plan_rdft *) ego->cld_ddft;
Chris@82 92 cld_ddft->apply(ego->cld_ddft, I, O);
Chris@82 93
Chris@82 94 cldt = (plan_rdft *) ego->cldt;
Chris@82 95 if (ego->preserve_input || !cldt) I = O;
Chris@82 96
Chris@82 97 /* twiddle multiplications, followed by 1d DFTs of size-r */
Chris@82 98 cld_dft = (plan_dft *) ego->cld_dft;
Chris@82 99 roff = ego->roff; ioff = ego->ioff;
Chris@82 100 mmax = ego->xmax; ms = ego->xs;
Chris@82 101 t = ego->t; r = ego->r; vn = ego->vn;
Chris@82 102 dI = O; dO = I;
Chris@82 103 for (im = ego->xmin; im <= mmax; ++im) {
Chris@82 104 do_twiddle(t, im, r, vn, dI+roff, dI+ioff);
Chris@82 105 cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
Chris@82 106 dI += ms; dO += ms;
Chris@82 107 }
Chris@82 108
Chris@82 109 /* final global transpose (m x r -> r x m), if not SCRAMBLED_OUT */
Chris@82 110 if (cldt)
Chris@82 111 cldt->apply((plan *) cldt, I, O);
Chris@82 112 }
Chris@82 113
Chris@82 114 static void apply_ddft_last(const plan *ego_, R *I, R *O)
Chris@82 115 {
Chris@82 116 const P *ego = (const P *) ego_;
Chris@82 117 plan_dft *cld_dft;
Chris@82 118 plan_rdft *cldt, *cld_ddft;
Chris@82 119 INT roff, ioff, ir, rmax, rs, m, vn;
Chris@82 120 triggen *t;
Chris@82 121 R *dI, *dO0, *dO;
Chris@82 122
Chris@82 123 /* initial global transpose (m x r -> r x m), if not SCRAMBLED_IN */
Chris@82 124 cldt = (plan_rdft *) ego->cldt;
Chris@82 125 if (cldt) {
Chris@82 126 cldt->apply((plan *) cldt, I, O);
Chris@82 127 dI = O;
Chris@82 128 }
Chris@82 129 else
Chris@82 130 dI = I;
Chris@82 131 if (ego->preserve_input) dO = O; else dO = I;
Chris@82 132 dO0 = dO;
Chris@82 133
Chris@82 134 /* 1d DFTs of size m, followed by twiddle multiplications */
Chris@82 135 cld_dft = (plan_dft *) ego->cld_dft;
Chris@82 136 roff = ego->roff; ioff = ego->ioff;
Chris@82 137 rmax = ego->xmax; rs = ego->xs;
Chris@82 138 t = ego->t; m = ego->m; vn = ego->vn;
Chris@82 139 for (ir = ego->xmin; ir <= rmax; ++ir) {
Chris@82 140 cld_dft->apply((plan *) cld_dft, dI+roff, dI+ioff, dO+roff, dO+ioff);
Chris@82 141 do_twiddle(t, ir, m, vn, dO+roff, dO+ioff);
Chris@82 142 dI += rs; dO += rs;
Chris@82 143 }
Chris@82 144
Chris@82 145 /* distributed size-r DFTs, with output in r x m format */
Chris@82 146 cld_ddft = (plan_rdft *) ego->cld_ddft;
Chris@82 147 cld_ddft->apply(ego->cld_ddft, dO0, O);
Chris@82 148 }
Chris@82 149
Chris@82 150 static int applicable(const S *ego, const problem *p_,
Chris@82 151 const planner *plnr,
Chris@82 152 INT *r, INT rblock[2], INT mblock[2])
Chris@82 153 {
Chris@82 154 const problem_mpi_dft *p = (const problem_mpi_dft *) p_;
Chris@82 155 int n_pes;
Chris@82 156 MPI_Comm_size(p->comm, &n_pes);
Chris@82 157 return (1
Chris@82 158 && p->sz->rnk == 1
Chris@82 159
Chris@82 160 && ONLY_SCRAMBLEDP(p->flags)
Chris@82 161
Chris@82 162 && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
Chris@82 163 && p->I != p->O))
Chris@82 164
Chris@82 165 && (!(p->flags & SCRAMBLED_IN) || ego->apply == apply_ddft_last)
Chris@82 166 && (!(p->flags & SCRAMBLED_OUT) || ego->apply == apply_ddft_first)
Chris@82 167
Chris@82 168 && (!NO_SLOWP(plnr) /* slow if dft-serial is applicable */
Chris@82 169 || !XM(dft_serial_applicable)(p))
Chris@82 170
Chris@82 171 /* disallow if dft-rank1-bigvec is applicable since the
Chris@82 172 data distribution may be slightly different (ugh!) */
Chris@82 173 && (p->vn < n_pes || p->flags)
Chris@82 174
Chris@82 175 && (*r = XM(choose_radix)(p->sz->dims[0], n_pes,
Chris@82 176 p->flags, p->sign,
Chris@82 177 rblock, mblock))
Chris@82 178
Chris@82 179 /* ddft_first or last has substantial advantages in the
Chris@82 180 bigvec transpositions for the common case where
Chris@82 181 n_pes == n/r or r, respectively */
Chris@82 182 && (!NO_UGLYP(plnr)
Chris@82 183 || !(*r == n_pes && ego->apply == apply_ddft_first)
Chris@82 184 || !(p->sz->dims[0].n / *r == n_pes
Chris@82 185 && ego->apply == apply_ddft_last))
Chris@82 186 );
Chris@82 187 }
Chris@82 188
Chris@82 189 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@82 190 {
Chris@82 191 P *ego = (P *) ego_;
Chris@82 192 X(plan_awake)(ego->cldt, wakefulness);
Chris@82 193 X(plan_awake)(ego->cld_dft, wakefulness);
Chris@82 194 X(plan_awake)(ego->cld_ddft, wakefulness);
Chris@82 195
Chris@82 196 switch (wakefulness) {
Chris@82 197 case SLEEPY:
Chris@82 198 X(triggen_destroy)(ego->t); ego->t = 0;
Chris@82 199 break;
Chris@82 200 default:
Chris@82 201 ego->t = X(mktriggen)(AWAKE_SQRTN_TABLE, ego->r * ego->m);
Chris@82 202 break;
Chris@82 203 }
Chris@82 204 }
Chris@82 205
Chris@82 206 static void destroy(plan *ego_)
Chris@82 207 {
Chris@82 208 P *ego = (P *) ego_;
Chris@82 209 X(plan_destroy_internal)(ego->cldt);
Chris@82 210 X(plan_destroy_internal)(ego->cld_dft);
Chris@82 211 X(plan_destroy_internal)(ego->cld_ddft);
Chris@82 212 }
Chris@82 213
Chris@82 214 static void print(const plan *ego_, printer *p)
Chris@82 215 {
Chris@82 216 const P *ego = (const P *) ego_;
Chris@82 217 p->print(p, "(mpi-dft-rank1/%D%s%s%(%p%)%(%p%)%(%p%))",
Chris@82 218 ego->r,
Chris@82 219 ego->super.apply == apply_ddft_first ? "/first" : "/last",
Chris@82 220 ego->preserve_input==2 ?"/p":"",
Chris@82 221 ego->cld_ddft, ego->cld_dft, ego->cldt);
Chris@82 222 }
Chris@82 223
Chris@82 224 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
Chris@82 225 {
Chris@82 226 const S *ego = (const S *) ego_;
Chris@82 227 const problem_mpi_dft *p;
Chris@82 228 P *pln;
Chris@82 229 plan *cld_dft = 0, *cld_ddft = 0, *cldt = 0;
Chris@82 230 R *ri, *ii, *ro, *io, *I, *O;
Chris@82 231 INT r, rblock[2], m, mblock[2], rp, mp, mpblock[2], mpb;
Chris@82 232 int my_pe, n_pes, preserve_input, ddft_first;
Chris@82 233 dtensor *sz;
Chris@82 234 static const plan_adt padt = {
Chris@82 235 XM(dft_solve), awake, print, destroy
Chris@82 236 };
Chris@82 237
Chris@82 238 UNUSED(ego);
Chris@82 239
Chris@82 240 if (!applicable(ego, p_, plnr, &r, rblock, mblock))
Chris@82 241 return (plan *) 0;
Chris@82 242
Chris@82 243 p = (const problem_mpi_dft *) p_;
Chris@82 244
Chris@82 245 MPI_Comm_rank(p->comm, &my_pe);
Chris@82 246 MPI_Comm_size(p->comm, &n_pes);
Chris@82 247
Chris@82 248 m = p->sz->dims[0].n / r;
Chris@82 249
Chris@82 250 /* some hackery so that we can plan both ddft_first and ddft_last
Chris@82 251 as if they were ddft_first */
Chris@82 252 if ((ddft_first = (ego->apply == apply_ddft_first))) {
Chris@82 253 rp = r; mp = m;
Chris@82 254 mpblock[IB] = mblock[IB]; mpblock[OB] = mblock[OB];
Chris@82 255 mpb = XM(block)(mp, mpblock[OB], my_pe);
Chris@82 256 }
Chris@82 257 else {
Chris@82 258 rp = m; mp = r;
Chris@82 259 mpblock[IB] = rblock[IB]; mpblock[OB] = rblock[OB];
Chris@82 260 mpb = XM(block)(mp, mpblock[IB], my_pe);
Chris@82 261 }
Chris@82 262
Chris@82 263 preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
Chris@82 264
Chris@82 265 sz = XM(mkdtensor)(1);
Chris@82 266 sz->dims[0].n = mp;
Chris@82 267 sz->dims[0].b[IB] = mpblock[IB];
Chris@82 268 sz->dims[0].b[OB] = mpblock[OB];
Chris@82 269 I = (ddft_first || !preserve_input) ? p->I : p->O;
Chris@82 270 O = p->O;
Chris@82 271 cld_ddft = X(mkplan_d)(plnr, XM(mkproblem_dft_d)(sz, rp * p->vn,
Chris@82 272 I, O, p->comm, p->sign,
Chris@82 273 RANK1_BIGVEC_ONLY));
Chris@82 274 if (XM(any_true)(!cld_ddft, p->comm)) goto nada;
Chris@82 275
Chris@82 276 I = TAINT((ddft_first || !p->flags) ? p->O : p->I, rp * p->vn * 2);
Chris@82 277 O = TAINT((preserve_input || (ddft_first && p->flags)) ? p->O : p->I,
Chris@82 278 rp * p->vn * 2);
Chris@82 279 X(extract_reim)(p->sign, I, &ri, &ii);
Chris@82 280 X(extract_reim)(p->sign, O, &ro, &io);
Chris@82 281 cld_dft = X(mkplan_d)(plnr,
Chris@82 282 X(mkproblem_dft_d)(X(mktensor_1d)(rp, p->vn*2,p->vn*2),
Chris@82 283 X(mktensor_1d)(p->vn, 2, 2),
Chris@82 284 ri, ii, ro, io));
Chris@82 285 if (XM(any_true)(!cld_dft, p->comm)) goto nada;
Chris@82 286
Chris@82 287 if (!p->flags) { /* !(SCRAMBLED_IN or SCRAMBLED_OUT) */
Chris@82 288 I = (ddft_first && preserve_input) ? p->O : p->I;
Chris@82 289 O = p->O;
Chris@82 290 cldt = X(mkplan_d)(plnr,
Chris@82 291 XM(mkproblem_transpose)(
Chris@82 292 m, r, p->vn * 2,
Chris@82 293 I, O,
Chris@82 294 ddft_first ? mblock[OB] : mblock[IB],
Chris@82 295 ddft_first ? rblock[OB] : rblock[IB],
Chris@82 296 p->comm, 0));
Chris@82 297 if (XM(any_true)(!cldt, p->comm)) goto nada;
Chris@82 298 }
Chris@82 299
Chris@82 300 pln = MKPLAN_MPI_DFT(P, &padt, ego->apply);
Chris@82 301
Chris@82 302 pln->cld_ddft = cld_ddft;
Chris@82 303 pln->cld_dft = cld_dft;
Chris@82 304 pln->cldt = cldt;
Chris@82 305 pln->preserve_input = preserve_input;
Chris@82 306 X(extract_reim)(p->sign, p->O, &ro, &io);
Chris@82 307 pln->roff = ro - p->O;
Chris@82 308 pln->ioff = io - p->O;
Chris@82 309 pln->vn = p->vn;
Chris@82 310 pln->m = m;
Chris@82 311 pln->r = r;
Chris@82 312 pln->xmin = (ddft_first ? mblock[OB] : rblock[IB]) * my_pe;
Chris@82 313 pln->xmax = pln->xmin + mpb - 1;
Chris@82 314 pln->xs = rp * p->vn * 2;
Chris@82 315 pln->t = 0;
Chris@82 316
Chris@82 317 X(ops_add)(&cld_ddft->ops, &cld_dft->ops, &pln->super.super.ops);
Chris@82 318 if (cldt) X(ops_add2)(&cldt->ops, &pln->super.super.ops);
Chris@82 319 {
Chris@82 320 double n0 = (1 + pln->xmax - pln->xmin) * (mp - 1) * pln->vn;
Chris@82 321 pln->super.super.ops.mul += 8 * n0;
Chris@82 322 pln->super.super.ops.add += 4 * n0;
Chris@82 323 pln->super.super.ops.other += 8 * n0;
Chris@82 324 }
Chris@82 325
Chris@82 326 return &(pln->super.super);
Chris@82 327
Chris@82 328 nada:
Chris@82 329 X(plan_destroy_internal)(cldt);
Chris@82 330 X(plan_destroy_internal)(cld_dft);
Chris@82 331 X(plan_destroy_internal)(cld_ddft);
Chris@82 332 return (plan *) 0;
Chris@82 333 }
Chris@82 334
Chris@82 335 static solver *mksolver(rdftapply apply, int preserve_input)
Chris@82 336 {
Chris@82 337 static const solver_adt sadt = { PROBLEM_MPI_DFT, mkplan, 0 };
Chris@82 338 S *slv = MKSOLVER(S, &sadt);
Chris@82 339 slv->apply = apply;
Chris@82 340 slv->preserve_input = preserve_input;
Chris@82 341 return &(slv->super);
Chris@82 342 }
Chris@82 343
Chris@82 344 void XM(dft_rank1_register)(planner *p)
Chris@82 345 {
Chris@82 346 rdftapply apply[] = { apply_ddft_first, apply_ddft_last };
Chris@82 347 unsigned int iapply;
Chris@82 348 int preserve_input;
Chris@82 349 for (iapply = 0; iapply < sizeof(apply) / sizeof(apply[0]); ++iapply)
Chris@82 350 for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
Chris@82 351 REGISTER_SOLVER(p, mksolver(apply[iapply], preserve_input));
Chris@82 352 }