cannam@95: /* cannam@95: * Copyright (c) 2003, 2007-11 Matteo Frigo cannam@95: * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology cannam@95: * cannam@95: * This program is free software; you can redistribute it and/or modify cannam@95: * it under the terms of the GNU General Public License as published by cannam@95: * the Free Software Foundation; either version 2 of the License, or cannam@95: * (at your option) any later version. cannam@95: * cannam@95: * This program is distributed in the hope that it will be useful, cannam@95: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@95: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@95: * GNU General Public License for more details. cannam@95: * cannam@95: * You should have received a copy of the GNU General Public License cannam@95: * along with this program; if not, write to the Free Software cannam@95: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@95: * cannam@95: */ cannam@95: cannam@95: #include "api.h" cannam@95: #include "fftw3-mpi.h" cannam@95: #include "ifftw-mpi.h" cannam@95: #include "mpi-transpose.h" cannam@95: #include "mpi-dft.h" cannam@95: #include "mpi-rdft.h" cannam@95: #include "mpi-rdft2.h" cannam@95: cannam@95: /* Convert API flags to internal MPI flags. */ cannam@95: #define MPI_FLAGS(f) ((f) >> 27) cannam@95: cannam@95: /*************************************************************************/ cannam@95: cannam@95: static int mpi_inited = 0; cannam@95: cannam@95: static MPI_Comm problem_comm(const problem *p) { cannam@95: switch (p->adt->problem_kind) { cannam@95: case PROBLEM_MPI_DFT: cannam@95: return ((const problem_mpi_dft *) p)->comm; cannam@95: case PROBLEM_MPI_RDFT: cannam@95: return ((const problem_mpi_rdft *) p)->comm; cannam@95: case PROBLEM_MPI_RDFT2: cannam@95: return ((const problem_mpi_rdft2 *) p)->comm; cannam@95: case PROBLEM_MPI_TRANSPOSE: cannam@95: return ((const problem_mpi_transpose *) p)->comm; cannam@95: default: cannam@95: return MPI_COMM_NULL; cannam@95: } cannam@95: } cannam@95: cannam@95: /* used to synchronize cost measurements (timing or estimation) cannam@95: across all processes for an MPI problem, which is critical to cannam@95: ensure that all processes decide to use the same MPI plans cannam@95: (whereas serial plans need not be syncronized). */ cannam@95: static double cost_hook(const problem *p, double t, cost_kind k) cannam@95: { cannam@95: MPI_Comm comm = problem_comm(p); cannam@95: double tsum; cannam@95: if (comm == MPI_COMM_NULL) return t; cannam@95: MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE, cannam@95: k == COST_SUM ? MPI_SUM : MPI_MAX, comm); cannam@95: return tsum; cannam@95: } cannam@95: cannam@95: /* Used to reject wisdom that is not in sync across all processes cannam@95: for an MPI problem, which is critical to ensure that all processes cannam@95: decide to use the same MPI plans. (Even though costs are synchronized, cannam@95: above, out-of-sync wisdom may result from plans being produced cannam@95: by communicators that do not span all processes, either from a cannam@95: user-specified communicator or e.g. from transpose-recurse. */ cannam@95: static int wisdom_ok_hook(const problem *p, flags_t flags) cannam@95: { cannam@95: MPI_Comm comm = problem_comm(p); cannam@95: int eq_me, eq_all; cannam@95: /* unpack flags bitfield, since MPI communications may involve cannam@95: byte-order changes and MPI cannot do this for bit fields */ cannam@95: #if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */ cannam@95: unsigned int f[5]; cannam@95: #else cannam@95: unsigned long f[5]; /* at least 32 bits as per C standard */ cannam@95: #endif cannam@95: cannam@95: if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */ cannam@95: cannam@95: if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */ cannam@95: cannam@95: /* otherwise, check that the flags and solver index are identical cannam@95: on all processes in this problem's communicator. cannam@95: cannam@95: TO DO: possibly we can relax strict equality, but it is cannam@95: critical to ensure that any flags which affect what plan is cannam@95: created (and whether the solver is applicable) are the same, cannam@95: e.g. DESTROY_INPUT, NO_UGLY, etcetera. (If the MPI algorithm cannam@95: differs between processes, deadlocks/crashes generally result.) */ cannam@95: f[0] = flags.l; cannam@95: f[1] = flags.hash_info; cannam@95: f[2] = flags.timelimit_impatience; cannam@95: f[3] = flags.u; cannam@95: f[4] = flags.slvndx; cannam@95: MPI_Bcast(f, 5, cannam@95: SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG, cannam@95: 0, comm); cannam@95: eq_me = f[0] == flags.l && f[1] == flags.hash_info cannam@95: && f[2] == flags.timelimit_impatience cannam@95: && f[3] == flags.u && f[4] == flags.slvndx; cannam@95: MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm); cannam@95: return eq_all; cannam@95: } cannam@95: cannam@95: /* This hook is called when wisdom is not found. The any_true here cannam@95: matches up with the any_true in wisdom_ok_hook, in order to handle cannam@95: the case where some processes had wisdom (and called wisdom_ok_hook) cannam@95: and some processes didn't have wisdom (and called nowisdom_hook). */ cannam@95: static void nowisdom_hook(const problem *p) cannam@95: { cannam@95: MPI_Comm comm = problem_comm(p); cannam@95: if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */ cannam@95: XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */ cannam@95: } cannam@95: cannam@95: /* needed to synchronize planner bogosity flag, in case non-MPI problems cannam@95: on a subset of processes encountered bogus wisdom */ cannam@95: static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p) cannam@95: { cannam@95: MPI_Comm comm = problem_comm(p); cannam@95: if (comm != MPI_COMM_NULL /* an MPI problem */ cannam@95: && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */ cannam@95: return WISDOM_IS_BOGUS; cannam@95: return state; cannam@95: } cannam@95: cannam@95: void XM(init)(void) cannam@95: { cannam@95: if (!mpi_inited) { cannam@95: planner *plnr = X(the_planner)(); cannam@95: plnr->cost_hook = cost_hook; cannam@95: plnr->wisdom_ok_hook = wisdom_ok_hook; cannam@95: plnr->nowisdom_hook = nowisdom_hook; cannam@95: plnr->bogosity_hook = bogosity_hook; cannam@95: XM(conf_standard)(plnr); cannam@95: mpi_inited = 1; cannam@95: } cannam@95: } cannam@95: cannam@95: void XM(cleanup)(void) cannam@95: { cannam@95: X(cleanup)(); cannam@95: mpi_inited = 0; cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: cannam@95: static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0) cannam@95: { cannam@95: dtensor *x = XM(mkdtensor)(rnk); cannam@95: int i; cannam@95: for (i = 0; i < rnk; ++i) { cannam@95: x->dims[i].n = dims0[i].n; cannam@95: x->dims[i].b[IB] = dims0[i].ib; cannam@95: x->dims[i].b[OB] = dims0[i].ob; cannam@95: } cannam@95: return x; cannam@95: } cannam@95: cannam@95: static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes, cannam@95: int rdft2) cannam@95: { cannam@95: dtensor *sz = XM(mkdtensor)(rnk); cannam@95: dtensor *sz0 = mkdtensor_api(rnk, dims0); cannam@95: block_kind k; cannam@95: int i; cannam@95: cannam@95: for (i = 0; i < rnk; ++i) cannam@95: sz->dims[i].n = dims0[i].n; cannam@95: cannam@95: if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1; cannam@95: cannam@95: for (i = 0; i < rnk; ++i) { cannam@95: sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n; cannam@95: sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n; cannam@95: } cannam@95: cannam@95: /* If we haven't used all of the processes yet, and some of the cannam@95: block sizes weren't specified (i.e. 0), then set the cannam@95: unspecified blocks so as to use as many processes as cannam@95: possible with as few distributed dimensions as possible. */ cannam@95: FORALL_BLOCK_KIND(k) { cannam@95: INT nb = XM(num_blocks_total)(sz, k); cannam@95: INT np = n_pes / nb; cannam@95: for (i = 0; i < rnk && np > 1; ++i) cannam@95: if (!sz0->dims[i].b[k]) { cannam@95: sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np); cannam@95: nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]); cannam@95: np = n_pes / nb; cannam@95: } cannam@95: } cannam@95: cannam@95: if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n; cannam@95: cannam@95: /* punt for 1d prime */ cannam@95: if (rnk == 1 && X(is_prime)(sz->dims[0].n)) cannam@95: sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n; cannam@95: cannam@95: XM(dtensor_destroy)(sz0); cannam@95: sz0 = XM(dtensor_canonical)(sz, 0); cannam@95: XM(dtensor_destroy)(sz); cannam@95: return sz0; cannam@95: } cannam@95: cannam@95: /* allocate simple local (serial) dims array corresponding to n[rnk] */ cannam@95: static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n) cannam@95: { cannam@95: XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk, cannam@95: TENSORS); cannam@95: int i; cannam@95: for (i = 0; i < rnk; ++i) cannam@95: dims[i].n = dims[i].ib = dims[i].ob = n[i]; cannam@95: return dims; cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: cannam@95: static void local_size(int my_pe, const dtensor *sz, block_kind k, cannam@95: ptrdiff_t *local_n, ptrdiff_t *local_start) cannam@95: { cannam@95: int i; cannam@95: if (my_pe >= XM(num_blocks_total)(sz, k)) cannam@95: for (i = 0; i < sz->rnk; ++i) cannam@95: local_n[i] = local_start[i] = 0; cannam@95: else { cannam@95: XM(block_coords)(sz, k, my_pe, local_start); cannam@95: for (i = 0; i < sz->rnk; ++i) { cannam@95: local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k], cannam@95: local_start[i]); cannam@95: local_start[i] *= sz->dims[i].b[k]; cannam@95: } cannam@95: } cannam@95: } cannam@95: cannam@95: static INT prod(int rnk, const ptrdiff_t *local_n) cannam@95: { cannam@95: int i; cannam@95: INT N = 1; cannam@95: for (i = 0; i < rnk; ++i) N *= local_n[i]; cannam@95: return N; cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0, cannam@95: ptrdiff_t howmany, MPI_Comm comm, cannam@95: ptrdiff_t *local_n_in, cannam@95: ptrdiff_t *local_start_in, cannam@95: ptrdiff_t *local_n_out, cannam@95: ptrdiff_t *local_start_out, cannam@95: int sign, unsigned flags) cannam@95: { cannam@95: INT N; cannam@95: int my_pe, n_pes, i; cannam@95: dtensor *sz; cannam@95: cannam@95: if (rnk == 0) cannam@95: return howmany; cannam@95: cannam@95: MPI_Comm_rank(comm, &my_pe); cannam@95: MPI_Comm_size(comm, &n_pes); cannam@95: sz = default_sz(rnk, dims0, n_pes, 0); cannam@95: cannam@95: /* Now, we must figure out how much local space the user should cannam@95: allocate (or at least an upper bound). This depends strongly cannam@95: on the exact algorithms we employ...ugh! FIXME: get this info cannam@95: from the solvers somehow? */ cannam@95: N = 1; /* never return zero allocation size */ cannam@95: if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) { cannam@95: INT Nafter; cannam@95: ddim odims[2]; cannam@95: cannam@95: /* dft-rank-geq2-transposed */ cannam@95: odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */ cannam@95: /* we may need extra space for transposed intermediate data */ cannam@95: for (i = 0; i < 2; ++i) cannam@95: if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 && cannam@95: XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) { cannam@95: sz->dims[i].b[IB] cannam@95: = XM(default_block)(sz->dims[i].n, n_pes); cannam@95: sz->dims[1-i].b[IB] = sz->dims[1-i].n; cannam@95: local_size(my_pe, sz, IB, local_n_in, local_start_in); cannam@95: N = X(imax)(N, prod(rnk, local_n_in)); cannam@95: sz->dims[i] = odims[i]; cannam@95: sz->dims[1-i] = odims[1-i]; cannam@95: break; cannam@95: } cannam@95: cannam@95: /* dft-rank-geq2 */ cannam@95: Nafter = howmany; cannam@95: for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n; cannam@95: N = X(imax)(N, (sz->dims[0].n cannam@95: * XM(block)(Nafter, XM(default_block)(Nafter, n_pes), cannam@95: my_pe) + howmany - 1) / howmany); cannam@95: cannam@95: /* dft-rank-geq2 with dimensions swapped */ cannam@95: Nafter = howmany * sz->dims[0].n; cannam@95: for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n; cannam@95: N = X(imax)(N, (sz->dims[1].n cannam@95: * XM(block)(Nafter, XM(default_block)(Nafter, n_pes), cannam@95: my_pe) + howmany - 1) / howmany); cannam@95: } cannam@95: else if (rnk == 1) { cannam@95: if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */ cannam@95: ptrdiff_t n[2], start[2]; cannam@95: dtensor *sz2 = XM(mkdtensor)(2); cannam@95: sz2->dims[0] = sz->dims[0]; cannam@95: sz2->dims[0].b[IB] = sz->dims[0].n; cannam@95: sz2->dims[1].n = sz2->dims[1].b[OB] = howmany; cannam@95: sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes); cannam@95: local_size(my_pe, sz2, IB, n, start); cannam@95: XM(dtensor_destroy)(sz2); cannam@95: N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany); cannam@95: } cannam@95: else { /* dft-rank1 */ cannam@95: INT r, m, rblock[2], mblock[2]; cannam@95: cannam@95: /* Since the 1d transforms are so different, we require cannam@95: the user to call local_size_1d for this case. Ugh. */ cannam@95: CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD); cannam@95: cannam@95: if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign, cannam@95: rblock, mblock))) { cannam@95: m = sz->dims[0].n / r; cannam@95: if (flags & FFTW_MPI_SCRAMBLED_IN) cannam@95: sz->dims[0].b[IB] = rblock[IB] * m; cannam@95: else { /* !SCRAMBLED_IN */ cannam@95: sz->dims[0].b[IB] = r * mblock[IB]; cannam@95: N = X(imax)(N, rblock[IB] * m); cannam@95: } cannam@95: if (flags & FFTW_MPI_SCRAMBLED_OUT) cannam@95: sz->dims[0].b[OB] = r * mblock[OB]; cannam@95: else { /* !SCRAMBLED_OUT */ cannam@95: N = X(imax)(N, r * mblock[OB]); cannam@95: sz->dims[0].b[OB] = rblock[OB] * m; cannam@95: } cannam@95: } cannam@95: } cannam@95: } cannam@95: cannam@95: local_size(my_pe, sz, IB, local_n_in, local_start_in); cannam@95: local_size(my_pe, sz, OB, local_n_out, local_start_out); cannam@95: cannam@95: /* at least, make sure we have enough space to store input & output */ cannam@95: N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out))); cannam@95: cannam@95: XM(dtensor_destroy)(sz); cannam@95: return N * howmany; cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t xblock, ptrdiff_t yblock, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, cannam@95: ptrdiff_t *local_y_start) cannam@95: { cannam@95: ptrdiff_t N; cannam@95: XM(ddim) *dims; cannam@95: ptrdiff_t *local; cannam@95: cannam@95: if (rnk == 0) { cannam@95: *local_nx = *local_ny = 1; cannam@95: *local_x_start = *local_y_start = 0; cannam@95: return howmany; cannam@95: } cannam@95: cannam@95: dims = simple_dims(rnk, n); cannam@95: local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS); cannam@95: cannam@95: /* default 1d block distribution, with transposed output cannam@95: if yblock < n[1] */ cannam@95: dims[0].ib = xblock; cannam@95: if (rnk > 1) { cannam@95: if (yblock < n[1]) cannam@95: dims[1].ob = yblock; cannam@95: else cannam@95: dims[0].ob = xblock; cannam@95: } cannam@95: else cannam@95: dims[0].ob = xblock; /* FIXME: 1d not really supported here cannam@95: since we don't have flags/sign */ cannam@95: cannam@95: N = XM(local_size_guru)(rnk, dims, howmany, comm, cannam@95: local, local + rnk, cannam@95: local + 2*rnk, local + 3*rnk, cannam@95: 0, 0); cannam@95: *local_nx = local[0]; cannam@95: *local_x_start = local[rnk]; cannam@95: if (rnk > 1) { cannam@95: *local_ny = local[2*rnk + 1]; cannam@95: *local_y_start = local[3*rnk + 1]; cannam@95: } cannam@95: else { cannam@95: *local_ny = *local_nx; cannam@95: *local_y_start = *local_x_start; cannam@95: } cannam@95: X(ifree)(local); cannam@95: X(ifree)(dims); cannam@95: return N; cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t xblock, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start) cannam@95: { cannam@95: ptrdiff_t local_ny, local_y_start; cannam@95: return XM(local_size_many_transposed)(rnk, n, howmany, cannam@95: xblock, rnk > 1 cannam@95: ? n[1] : FFTW_MPI_DEFAULT_BLOCK, cannam@95: comm, cannam@95: local_nx, local_x_start, cannam@95: &local_ny, &local_y_start); cannam@95: } cannam@95: cannam@95: cannam@95: ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, cannam@95: ptrdiff_t *local_y_start) cannam@95: { cannam@95: return XM(local_size_many_transposed)(rnk, n, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: comm, cannam@95: local_nx, local_x_start, cannam@95: local_ny, local_y_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start) cannam@95: { cannam@95: return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm, cannam@95: local_nx, local_x_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany, cannam@95: MPI_Comm comm, int sign, unsigned flags, cannam@95: ptrdiff_t *local_nx, ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, ptrdiff_t *local_y_start) cannam@95: { cannam@95: XM(ddim) d; cannam@95: d.n = nx; cannam@95: d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK; cannam@95: return XM(local_size_guru)(1, &d, howmany, comm, cannam@95: local_nx, local_x_start, cannam@95: local_ny, local_y_start, sign, flags); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_1d)(ptrdiff_t nx, cannam@95: MPI_Comm comm, int sign, unsigned flags, cannam@95: ptrdiff_t *local_nx, ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, ptrdiff_t *local_y_start) cannam@95: { cannam@95: return XM(local_size_many_1d)(nx, 1, comm, sign, flags, cannam@95: local_nx, local_x_start, cannam@95: local_ny, local_y_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, cannam@95: ptrdiff_t *local_y_start) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: return XM(local_size_transposed)(2, n, comm, cannam@95: local_nx, local_x_start, cannam@95: local_ny, local_y_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, ptrdiff_t *local_x_start) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: return XM(local_size)(2, n, comm, local_nx, local_x_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny, cannam@95: ptrdiff_t nz, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, cannam@95: ptrdiff_t *local_x_start, cannam@95: ptrdiff_t *local_ny, cannam@95: ptrdiff_t *local_y_start) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: return XM(local_size_transposed)(3, n, comm, cannam@95: local_nx, local_x_start, cannam@95: local_ny, local_y_start); cannam@95: } cannam@95: cannam@95: ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, cannam@95: MPI_Comm comm, cannam@95: ptrdiff_t *local_nx, ptrdiff_t *local_x_start) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: return XM(local_size)(3, n, comm, local_nx, local_x_start); cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: /* Transpose API */ cannam@95: cannam@95: X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t xblock, ptrdiff_t yblock, cannam@95: R *in, R *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: int n_pes; cannam@95: XM(init)(); cannam@95: cannam@95: if (howmany < 0 || xblock < 0 || yblock < 0 || cannam@95: nx <= 0 || ny <= 0) return 0; cannam@95: cannam@95: MPI_Comm_size(comm, &n_pes); cannam@95: if (!xblock) xblock = XM(default_block)(nx, n_pes); cannam@95: if (!yblock) yblock = XM(default_block)(ny, n_pes); cannam@95: if (n_pes < XM(num_blocks)(nx, xblock) cannam@95: || n_pes < XM(num_blocks)(ny, yblock)) cannam@95: return 0; cannam@95: cannam@95: return cannam@95: X(mkapiplan)(FFTW_FORWARD, flags, cannam@95: XM(mkproblem_transpose)(nx, ny, howmany, cannam@95: in, out, xblock, yblock, cannam@95: comm, MPI_FLAGS(flags))); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: cannam@95: { cannam@95: return XM(plan_many_transpose)(nx, ny, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: in, out, comm, flags); cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: /* Complex DFT API */ cannam@95: cannam@95: X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0, cannam@95: ptrdiff_t howmany, cannam@95: C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: int n_pes, i; cannam@95: dtensor *sz; cannam@95: cannam@95: XM(init)(); cannam@95: cannam@95: if (howmany < 0 || rnk < 1) return 0; cannam@95: for (i = 0; i < rnk; ++i) cannam@95: if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0) cannam@95: return 0; cannam@95: cannam@95: MPI_Comm_size(comm, &n_pes); cannam@95: sz = default_sz(rnk, dims0, n_pes, 0); cannam@95: cannam@95: if (XM(num_blocks_total)(sz, IB) > n_pes cannam@95: || XM(num_blocks_total)(sz, OB) > n_pes) { cannam@95: XM(dtensor_destroy)(sz); cannam@95: return 0; cannam@95: } cannam@95: cannam@95: return cannam@95: X(mkapiplan)(sign, flags, cannam@95: XM(mkproblem_dft_d)(sz, howmany, cannam@95: (R *) in, (R *) out, cannam@95: comm, sign, cannam@95: MPI_FLAGS(flags))); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t iblock, ptrdiff_t oblock, cannam@95: C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: XM(ddim) *dims = simple_dims(rnk, n); cannam@95: X(plan) pln; cannam@95: cannam@95: if (rnk == 1) { cannam@95: dims[0].ib = iblock; cannam@95: dims[0].ob = oblock; cannam@95: } cannam@95: else if (rnk > 1) { cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; cannam@95: } cannam@95: cannam@95: pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags); cannam@95: X(ifree)(dims); cannam@95: return pln; cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: return XM(plan_many_dft)(rnk, n, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: in, out, comm, sign, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: return XM(plan_dft)(1, &nx, in, out, comm, sign, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: return XM(plan_dft)(2, n, in, out, comm, sign, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, cannam@95: C *in, C *out, cannam@95: MPI_Comm comm, int sign, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: return XM(plan_dft)(3, n, in, out, comm, sign, flags); cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: /* R2R API */ cannam@95: cannam@95: X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0, cannam@95: ptrdiff_t howmany, cannam@95: R *in, R *out, cannam@95: MPI_Comm comm, const X(r2r_kind) *kind, cannam@95: unsigned flags) cannam@95: { cannam@95: int n_pes, i; cannam@95: dtensor *sz; cannam@95: rdft_kind *k; cannam@95: X(plan) pln; cannam@95: cannam@95: XM(init)(); cannam@95: cannam@95: if (howmany < 0 || rnk < 1) return 0; cannam@95: for (i = 0; i < rnk; ++i) cannam@95: if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0) cannam@95: return 0; cannam@95: cannam@95: k = X(map_r2r_kind)(rnk, kind); cannam@95: cannam@95: MPI_Comm_size(comm, &n_pes); cannam@95: sz = default_sz(rnk, dims0, n_pes, 0); cannam@95: cannam@95: if (XM(num_blocks_total)(sz, IB) > n_pes cannam@95: || XM(num_blocks_total)(sz, OB) > n_pes) { cannam@95: XM(dtensor_destroy)(sz); cannam@95: return 0; cannam@95: } cannam@95: cannam@95: pln = X(mkapiplan)(0, flags, cannam@95: XM(mkproblem_rdft_d)(sz, howmany, cannam@95: in, out, cannam@95: comm, k, MPI_FLAGS(flags))); cannam@95: X(ifree0)(k); cannam@95: return pln; cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t iblock, ptrdiff_t oblock, cannam@95: R *in, R *out, cannam@95: MPI_Comm comm, const X(r2r_kind) *kind, cannam@95: unsigned flags) cannam@95: { cannam@95: XM(ddim) *dims = simple_dims(rnk, n); cannam@95: X(plan) pln; cannam@95: cannam@95: if (rnk == 1) { cannam@95: dims[0].ib = iblock; cannam@95: dims[0].ob = oblock; cannam@95: } cannam@95: else if (rnk > 1) { cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; cannam@95: } cannam@95: cannam@95: pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags); cannam@95: X(ifree)(dims); cannam@95: return pln; cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out, cannam@95: MPI_Comm comm, cannam@95: const X(r2r_kind) *kind, cannam@95: unsigned flags) cannam@95: { cannam@95: return XM(plan_many_r2r)(rnk, n, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: in, out, comm, kind, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out, cannam@95: MPI_Comm comm, cannam@95: X(r2r_kind) kindx, X(r2r_kind) kindy, cannam@95: unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: X(r2r_kind) kind[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: kind[0] = kindx; kind[1] = kindy; cannam@95: return XM(plan_r2r)(2, n, in, out, comm, kind, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, cannam@95: R *in, R *out, cannam@95: MPI_Comm comm, cannam@95: X(r2r_kind) kindx, X(r2r_kind) kindy, cannam@95: X(r2r_kind) kindz, cannam@95: unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: X(r2r_kind) kind[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: kind[0] = kindx; kind[1] = kindy; kind[2] = kindz; cannam@95: return XM(plan_r2r)(3, n, in, out, comm, kind, flags); cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: /* R2C/C2R API */ cannam@95: cannam@95: static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0, cannam@95: ptrdiff_t howmany, cannam@95: R *r, C *c, cannam@95: MPI_Comm comm, rdft_kind kind, unsigned flags) cannam@95: { cannam@95: int n_pes, i; cannam@95: dtensor *sz; cannam@95: R *cr = (R *) c; cannam@95: cannam@95: XM(init)(); cannam@95: cannam@95: if (howmany < 0 || rnk < 2) return 0; cannam@95: for (i = 0; i < rnk; ++i) cannam@95: if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0) cannam@95: return 0; cannam@95: cannam@95: MPI_Comm_size(comm, &n_pes); cannam@95: sz = default_sz(rnk, dims0, n_pes, 1); cannam@95: cannam@95: sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1; cannam@95: if (XM(num_blocks_total)(sz, IB) > n_pes cannam@95: || XM(num_blocks_total)(sz, OB) > n_pes) { cannam@95: XM(dtensor_destroy)(sz); cannam@95: return 0; cannam@95: } cannam@95: sz->dims[rnk-1].n = dims0[rnk-1].n; cannam@95: cannam@95: if (kind == R2HC) cannam@95: return X(mkapiplan)(0, flags, cannam@95: XM(mkproblem_rdft2_d)(sz, howmany, cannam@95: r, cr, comm, R2HC, cannam@95: MPI_FLAGS(flags))); cannam@95: else cannam@95: return X(mkapiplan)(0, flags, cannam@95: XM(mkproblem_rdft2_d)(sz, howmany, cannam@95: cr, r, comm, HC2R, cannam@95: MPI_FLAGS(flags))); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t iblock, ptrdiff_t oblock, cannam@95: R *in, C *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: XM(ddim) *dims = simple_dims(rnk, n); cannam@95: X(plan) pln; cannam@95: cannam@95: if (rnk == 1) { cannam@95: dims[0].ib = iblock; cannam@95: dims[0].ob = oblock; cannam@95: } cannam@95: else if (rnk > 1) { cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; cannam@95: } cannam@95: cannam@95: pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags); cannam@95: X(ifree)(dims); cannam@95: return pln; cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n, cannam@95: ptrdiff_t howmany, cannam@95: ptrdiff_t iblock, ptrdiff_t oblock, cannam@95: C *in, R *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: XM(ddim) *dims = simple_dims(rnk, n); cannam@95: X(plan) pln; cannam@95: cannam@95: if (rnk == 1) { cannam@95: dims[0].ib = iblock; cannam@95: dims[0].ob = oblock; cannam@95: } cannam@95: else if (rnk > 1) { cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; cannam@95: dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; cannam@95: } cannam@95: cannam@95: pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags); cannam@95: X(ifree)(dims); cannam@95: return pln; cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: return XM(plan_many_dft_r2c)(rnk, n, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: in, out, comm, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: return XM(plan_dft_r2c)(2, n, in, out, comm, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, cannam@95: R *in, C *out, MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: return XM(plan_dft_r2c)(3, n, in, out, comm, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: return XM(plan_many_dft_c2r)(rnk, n, 1, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: FFTW_MPI_DEFAULT_BLOCK, cannam@95: in, out, comm, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out, cannam@95: MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[2]; cannam@95: n[0] = nx; n[1] = ny; cannam@95: return XM(plan_dft_c2r)(2, n, in, out, comm, flags); cannam@95: } cannam@95: cannam@95: X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, cannam@95: C *in, R *out, MPI_Comm comm, unsigned flags) cannam@95: { cannam@95: ptrdiff_t n[3]; cannam@95: n[0] = nx; n[1] = ny; n[2] = nz; cannam@95: return XM(plan_dft_c2r)(3, n, in, out, comm, flags); cannam@95: } cannam@95: cannam@95: /*************************************************************************/ cannam@95: /* New-array execute functions */ cannam@95: cannam@95: void XM(execute_dft)(const X(plan) p, C *in, C *out) { cannam@95: /* internally, MPI plans are just rdft plans */ cannam@95: X(execute_r2r)(p, (R*) in, (R*) out); cannam@95: } cannam@95: cannam@95: void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) { cannam@95: /* internally, MPI plans are just rdft plans */ cannam@95: X(execute_r2r)(p, in, (R*) out); cannam@95: } cannam@95: cannam@95: void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) { cannam@95: /* internally, MPI plans are just rdft plans */ cannam@95: X(execute_r2r)(p, (R*) in, out); cannam@95: } cannam@95: cannam@95: void XM(execute_r2r)(const X(plan) p, R *in, R *out) { cannam@95: /* internally, MPI plans are just rdft plans */ cannam@95: X(execute_r2r)(p, in, out); cannam@95: }