cannam@127: /**************************************************************************/ cannam@127: /* NOTE to users: this is the FFTW-MPI self-test and benchmark program. cannam@127: It is probably NOT a good place to learn FFTW usage, since it has a cannam@127: lot of added complexity in order to exercise and test the full API, cannam@127: etcetera. We suggest reading the manual. */ cannam@127: /**************************************************************************/ cannam@127: cannam@127: #include cannam@127: #include cannam@127: #include cannam@127: #include "fftw3-mpi.h" cannam@127: #include "fftw-bench.h" cannam@127: cannam@127: #if defined(BENCHFFT_SINGLE) cannam@127: # define BENCH_MPI_TYPE MPI_FLOAT cannam@127: #elif defined(BENCHFFT_LDOUBLE) cannam@127: # define BENCH_MPI_TYPE MPI_LONG_DOUBLE cannam@127: #elif defined(BENCHFFT_QUAD) cannam@127: # error MPI quad-precision type is unknown cannam@127: #else cannam@127: # define BENCH_MPI_TYPE MPI_DOUBLE cannam@127: #endif cannam@127: cannam@127: #if SIZEOF_PTRDIFF_T == SIZEOF_INT cannam@127: # define FFTW_MPI_PTRDIFF_T MPI_INT cannam@127: #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG cannam@127: # define FFTW_MPI_PTRDIFF_T MPI_LONG cannam@127: #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG cannam@127: # define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG cannam@127: #else cannam@127: # error MPI type for ptrdiff_t is unknown cannam@127: # define FFTW_MPI_PTRDIFF_T MPI_LONG cannam@127: #endif cannam@127: cannam@127: static const char *mkversion(void) { return FFTW(version); } cannam@127: static const char *mkcc(void) { return FFTW(cc); } cannam@127: static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); } cannam@127: static const char *mknproc(void) { cannam@127: static char buf[32]; cannam@127: int ncpus; cannam@127: MPI_Comm_size(MPI_COMM_WORLD, &ncpus); cannam@127: #ifdef HAVE_SNPRINTF cannam@127: snprintf(buf, 32, "%d", ncpus); cannam@127: #else cannam@127: sprintf(buf, "%d", ncpus); cannam@127: #endif cannam@127: return buf; cannam@127: } cannam@127: cannam@127: BEGIN_BENCH_DOC cannam@127: BENCH_DOC("name", "fftw3_mpi") cannam@127: BENCH_DOCF("version", mkversion) cannam@127: BENCH_DOCF("cc", mkcc) cannam@127: BENCH_DOCF("codelet-optim", mkcodelet_optim) cannam@127: BENCH_DOCF("nproc", mknproc) cannam@127: END_BENCH_DOC cannam@127: cannam@127: static int n_pes = 1, my_pe = 0; cannam@127: cannam@127: /* global variables describing the shape of the data and its distribution */ cannam@127: static int rnk; cannam@127: static ptrdiff_t vn, iNtot, oNtot; cannam@127: static ptrdiff_t *local_ni=0, *local_starti=0; cannam@127: static ptrdiff_t *local_no=0, *local_starto=0; cannam@127: static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */ cannam@127: static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */ cannam@127: static ptrdiff_t *istrides = 0, *ostrides = 0; cannam@127: static ptrdiff_t *total_ni=0, *total_no=0; cannam@127: static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */ cannam@127: static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */ cannam@127: cannam@127: static bench_real *local_in = 0, *local_out = 0; cannam@127: static bench_real *all_local_in = 0, *all_local_out = 0; cannam@127: static int all_local_in_alloc = 0, all_local_out_alloc = 0; cannam@127: static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0; cannam@127: cannam@127: static void alloc_rnk(int rnk_) { cannam@127: rnk = rnk_; cannam@127: bench_free(local_ni); cannam@127: if (rnk == 0) cannam@127: local_ni = 0; cannam@127: else cannam@127: local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk cannam@127: * (8 + n_pes * 4)); cannam@127: cannam@127: local_starti = local_ni + rnk; cannam@127: local_no = local_ni + 2 * rnk; cannam@127: local_starto = local_ni + 3 * rnk; cannam@127: istrides = local_ni + 4 * rnk; cannam@127: ostrides = local_ni + 5 * rnk; cannam@127: total_ni = local_ni + 6 * rnk; cannam@127: total_no = local_ni + 7 * rnk; cannam@127: all_local_ni = local_ni + 8 * rnk; cannam@127: all_local_starti = local_ni + (8 + n_pes) * rnk; cannam@127: all_local_no = local_ni + (8 + 2 * n_pes) * rnk; cannam@127: all_local_starto = local_ni + (8 + 3 * n_pes) * rnk; cannam@127: } cannam@127: cannam@127: static void setup_gather_scatter(void) cannam@127: { cannam@127: int i, j; cannam@127: ptrdiff_t off; cannam@127: cannam@127: MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: all_local_ni, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: 0, MPI_COMM_WORLD); cannam@127: MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD); cannam@127: MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: all_local_starti, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: 0, MPI_COMM_WORLD); cannam@127: MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD); cannam@127: cannam@127: MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: all_local_no, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: 0, MPI_COMM_WORLD); cannam@127: MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD); cannam@127: MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: all_local_starto, rnk, FFTW_MPI_PTRDIFF_T, cannam@127: 0, MPI_COMM_WORLD); cannam@127: MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD); cannam@127: cannam@127: off = 0; cannam@127: for (i = 0; i < n_pes; ++i) { cannam@127: ptrdiff_t N = vn; cannam@127: for (j = 0; j < rnk; ++j) cannam@127: N *= all_local_ni[i * rnk + j]; cannam@127: isend_cnt[i] = N; cannam@127: isend_off[i] = off; cannam@127: off += N; cannam@127: } cannam@127: iNtot = off; cannam@127: all_local_in_alloc = 1; cannam@127: cannam@127: istrides[rnk - 1] = vn; cannam@127: for (j = rnk - 2; j >= 0; --j) cannam@127: istrides[j] = total_ni[j + 1] * istrides[j + 1]; cannam@127: cannam@127: off = 0; cannam@127: for (i = 0; i < n_pes; ++i) { cannam@127: ptrdiff_t N = vn; cannam@127: for (j = 0; j < rnk; ++j) cannam@127: N *= all_local_no[i * rnk + j]; cannam@127: orecv_cnt[i] = N; cannam@127: orecv_off[i] = off; cannam@127: off += N; cannam@127: } cannam@127: oNtot = off; cannam@127: all_local_out_alloc = 1; cannam@127: cannam@127: ostrides[rnk - 1] = vn; cannam@127: for (j = rnk - 2; j >= 0; --j) cannam@127: ostrides[j] = total_no[j + 1] * ostrides[j + 1]; cannam@127: } cannam@127: cannam@127: static void copy_block_out(const bench_real *in, cannam@127: int rnk, ptrdiff_t *n, ptrdiff_t *start, cannam@127: ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn, cannam@127: bench_real *out) cannam@127: { cannam@127: ptrdiff_t i; cannam@127: if (rnk == 0) { cannam@127: for (i = 0; i < vn; ++i) cannam@127: out[i] = in[i]; cannam@127: } cannam@127: else if (rnk == 1) { /* this case is just an optimization */ cannam@127: ptrdiff_t j; cannam@127: out += start[0] * os[0]; cannam@127: for (j = 0; j < n[0]; ++j) { cannam@127: for (i = 0; i < vn; ++i) cannam@127: out[i] = in[i]; cannam@127: in += is; cannam@127: out += os[0]; cannam@127: } cannam@127: } cannam@127: else { cannam@127: /* we should do n[0] for locality, but this way is simpler to code */ cannam@127: for (i = 0; i < n[rnk - 1]; ++i) cannam@127: copy_block_out(in + i * is, cannam@127: rnk - 1, n, start, is * n[rnk - 1], os, vn, cannam@127: out + (start[rnk - 1] + i) * os[rnk - 1]); cannam@127: } cannam@127: } cannam@127: cannam@127: static void copy_block_in(bench_real *in, cannam@127: int rnk, ptrdiff_t *n, ptrdiff_t *start, cannam@127: ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn, cannam@127: const bench_real *out) cannam@127: { cannam@127: ptrdiff_t i; cannam@127: if (rnk == 0) { cannam@127: for (i = 0; i < vn; ++i) cannam@127: in[i] = out[i]; cannam@127: } cannam@127: else if (rnk == 1) { /* this case is just an optimization */ cannam@127: ptrdiff_t j; cannam@127: out += start[0] * os[0]; cannam@127: for (j = 0; j < n[0]; ++j) { cannam@127: for (i = 0; i < vn; ++i) cannam@127: in[i] = out[i]; cannam@127: in += is; cannam@127: out += os[0]; cannam@127: } cannam@127: } cannam@127: else { cannam@127: /* we should do n[0] for locality, but this way is simpler to code */ cannam@127: for (i = 0; i < n[rnk - 1]; ++i) cannam@127: copy_block_in(in + i * is, cannam@127: rnk - 1, n, start, is * n[rnk - 1], os, vn, cannam@127: out + (start[rnk - 1] + i) * os[rnk - 1]); cannam@127: } cannam@127: } cannam@127: cannam@127: static void do_scatter_in(bench_real *in) cannam@127: { cannam@127: bench_real *ali; cannam@127: int i; cannam@127: if (all_local_in_alloc) { cannam@127: bench_free(all_local_in); cannam@127: all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real)); cannam@127: all_local_in_alloc = 0; cannam@127: } cannam@127: ali = all_local_in; cannam@127: for (i = 0; i < n_pes; ++i) { cannam@127: copy_block_in(ali, cannam@127: rnk, all_local_ni + i * rnk, cannam@127: all_local_starti + i * rnk, cannam@127: vn, istrides, vn, cannam@127: in); cannam@127: ali += isend_cnt[i]; cannam@127: } cannam@127: MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE, cannam@127: local_in, isend_cnt[my_pe], BENCH_MPI_TYPE, cannam@127: 0, MPI_COMM_WORLD); cannam@127: } cannam@127: cannam@127: static void do_gather_out(bench_real *out) cannam@127: { cannam@127: bench_real *alo; cannam@127: int i; cannam@127: cannam@127: if (all_local_out_alloc) { cannam@127: bench_free(all_local_out); cannam@127: all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real)); cannam@127: all_local_out_alloc = 0; cannam@127: } cannam@127: MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE, cannam@127: all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE, cannam@127: 0, MPI_COMM_WORLD); cannam@127: MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD); cannam@127: alo = all_local_out; cannam@127: for (i = 0; i < n_pes; ++i) { cannam@127: copy_block_out(alo, cannam@127: rnk, all_local_no + i * rnk, cannam@127: all_local_starto + i * rnk, cannam@127: vn, ostrides, vn, cannam@127: out); cannam@127: alo += orecv_cnt[i]; cannam@127: } cannam@127: } cannam@127: cannam@127: static void alloc_local(ptrdiff_t nreal, int inplace) cannam@127: { cannam@127: bench_free(local_in); cannam@127: if (local_out != local_in) bench_free(local_out); cannam@127: local_in = local_out = 0; cannam@127: if (nreal > 0) { cannam@127: ptrdiff_t i; cannam@127: local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real)); cannam@127: if (inplace) cannam@127: local_out = local_in; cannam@127: else cannam@127: local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real)); cannam@127: for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0; cannam@127: } cannam@127: } cannam@127: cannam@127: void after_problem_rcopy_from(bench_problem *p, bench_real *ri) cannam@127: { cannam@127: UNUSED(p); cannam@127: do_scatter_in(ri); cannam@127: if (plan_scramble_in) FFTW(execute)(plan_scramble_in); cannam@127: } cannam@127: cannam@127: void after_problem_rcopy_to(bench_problem *p, bench_real *ro) cannam@127: { cannam@127: UNUSED(p); cannam@127: if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out); cannam@127: do_gather_out(ro); cannam@127: } cannam@127: cannam@127: void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii) cannam@127: { cannam@127: UNUSED(ii); cannam@127: after_problem_rcopy_from(p, ri); cannam@127: } cannam@127: cannam@127: void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io) cannam@127: { cannam@127: UNUSED(io); cannam@127: after_problem_rcopy_to(p, ro); cannam@127: } cannam@127: cannam@127: void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii) cannam@127: { cannam@127: UNUSED(ii); cannam@127: after_problem_rcopy_from(p, ri); cannam@127: } cannam@127: cannam@127: void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io) cannam@127: { cannam@127: UNUSED(io); cannam@127: after_problem_rcopy_to(p, ro); cannam@127: } cannam@127: cannam@127: static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny, cannam@127: ptrdiff_t vn, cannam@127: bench_real *in, bench_real *out) cannam@127: { cannam@127: FFTW(iodim64) hdims[3]; cannam@127: FFTW(r2r_kind) k[3]; cannam@127: FFTW(plan) pln; cannam@127: cannam@127: hdims[0].n = nx; cannam@127: hdims[0].is = ny * vn; cannam@127: hdims[0].os = vn; cannam@127: hdims[1].n = ny; cannam@127: hdims[1].is = vn; cannam@127: hdims[1].os = nx * vn; cannam@127: hdims[2].n = vn; cannam@127: hdims[2].is = 1; cannam@127: hdims[2].os = 1; cannam@127: k[0] = k[1] = k[2] = FFTW_R2HC; cannam@127: pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE); cannam@127: BENCH_ASSERT(pln != 0); cannam@127: return pln; cannam@127: } cannam@127: cannam@127: static int tensor_rowmajor_transposedp(bench_tensor *t) cannam@127: { cannam@127: bench_iodim *d; cannam@127: int i; cannam@127: cannam@127: BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk)); cannam@127: if (t->rnk < 2) cannam@127: return 0; cannam@127: cannam@127: d = t->dims; cannam@127: if (d[0].is != d[1].is * d[1].n cannam@127: || d[0].os != d[1].is cannam@127: || d[1].os != d[0].os * d[0].n) cannam@127: return 0; cannam@127: if (t->rnk > 2 && d[1].is != d[2].is * d[2].n) cannam@127: return 0; cannam@127: for (i = 2; i + 1 < t->rnk; ++i) { cannam@127: d = t->dims + i; cannam@127: if (d[0].is != d[1].is * d[1].n cannam@127: || d[0].os != d[1].os * d[1].n) cannam@127: return 0; cannam@127: } cannam@127: cannam@127: if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os) cannam@127: return 0; cannam@127: return 1; cannam@127: } cannam@127: cannam@127: static int tensor_contiguousp(bench_tensor *t, int s) cannam@127: { cannam@127: return (t->dims[t->rnk-1].is == s cannam@127: && ((tensor_rowmajorp(t) && cannam@127: t->dims[t->rnk-1].is == t->dims[t->rnk-1].os) cannam@127: || tensor_rowmajor_transposedp(t))); cannam@127: } cannam@127: cannam@127: static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags) cannam@127: { cannam@127: FFTW(plan) pln = 0; cannam@127: int i; cannam@127: ptrdiff_t ntot; cannam@127: cannam@127: vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1; cannam@127: cannam@127: if (p->sz->rnk < 1 cannam@127: || p->split cannam@127: || !tensor_contiguousp(p->sz, vn) cannam@127: || tensor_rowmajor_transposedp(p->sz) cannam@127: || p->vecsz->rnk > 1 cannam@127: || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1 cannam@127: || p->vecsz->dims[0].os != 1))) cannam@127: return 0; cannam@127: cannam@127: alloc_rnk(p->sz->rnk); cannam@127: for (i = 0; i < rnk; ++i) { cannam@127: total_ni[i] = total_no[i] = p->sz->dims[i].n; cannam@127: local_ni[i] = local_no[i] = total_ni[i]; cannam@127: local_starti[i] = local_starto[i] = 0; cannam@127: } cannam@127: if (rnk > 1) { cannam@127: ptrdiff_t n, start, nT, startT; cannam@127: ntot = FFTW(mpi_local_size_many_transposed) cannam@127: (p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, cannam@127: MPI_COMM_WORLD, cannam@127: &n, &start, &nT, &startT); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) { cannam@127: local_ni[1] = nT; cannam@127: local_starti[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_ni[0] = n; cannam@127: local_starti[0] = start; cannam@127: } cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) { cannam@127: local_no[1] = nT; cannam@127: local_starto[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_no[0] = n; cannam@127: local_starto[0] = start; cannam@127: } cannam@127: } cannam@127: else if (rnk == 1) { cannam@127: ntot = FFTW(mpi_local_size_many_1d) cannam@127: (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags, cannam@127: local_ni, local_starti, local_no, local_starto); cannam@127: } cannam@127: alloc_local(ntot * 2, p->in == p->out); cannam@127: cannam@127: pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: (FFTW(complex) *) local_in, cannam@127: (FFTW(complex) *) local_out, cannam@127: MPI_COMM_WORLD, p->sign, flags); cannam@127: cannam@127: vn *= 2; cannam@127: cannam@127: if (rnk > 1) { cannam@127: ptrdiff_t nrest = 1; cannam@127: for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n; cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) cannam@127: plan_scramble_in = mkplan_transpose_local( cannam@127: p->sz->dims[0].n, local_ni[1], vn * nrest, cannam@127: local_in, local_in); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) cannam@127: plan_unscramble_out = mkplan_transpose_local( cannam@127: local_no[1], p->sz->dims[0].n, vn * nrest, cannam@127: local_out, local_out); cannam@127: } cannam@127: cannam@127: return pln; cannam@127: } cannam@127: cannam@127: static int tensor_real_contiguousp(bench_tensor *t, int sign, int s) cannam@127: { cannam@127: return (t->dims[t->rnk-1].is == s cannam@127: && ((tensor_real_rowmajorp(t, sign, 1) && cannam@127: t->dims[t->rnk-1].is == t->dims[t->rnk-1].os))); cannam@127: } cannam@127: cannam@127: static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags) cannam@127: { cannam@127: FFTW(plan) pln = 0; cannam@127: int i; cannam@127: ptrdiff_t ntot; cannam@127: cannam@127: vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1; cannam@127: cannam@127: if (p->sz->rnk < 2 cannam@127: || p->split cannam@127: || !tensor_real_contiguousp(p->sz, p->sign, vn) cannam@127: || tensor_rowmajor_transposedp(p->sz) cannam@127: || p->vecsz->rnk > 1 cannam@127: || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1 cannam@127: || p->vecsz->dims[0].os != 1))) cannam@127: return 0; cannam@127: cannam@127: alloc_rnk(p->sz->rnk); cannam@127: for (i = 0; i < rnk; ++i) { cannam@127: total_ni[i] = total_no[i] = p->sz->dims[i].n; cannam@127: local_ni[i] = local_no[i] = total_ni[i]; cannam@127: local_starti[i] = local_starto[i] = 0; cannam@127: } cannam@127: local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1] cannam@127: = p->sz->dims[rnk-1].n / 2 + 1; cannam@127: { cannam@127: ptrdiff_t n, start, nT, startT; cannam@127: ntot = FFTW(mpi_local_size_many_transposed) cannam@127: (p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, cannam@127: MPI_COMM_WORLD, cannam@127: &n, &start, &nT, &startT); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) { cannam@127: local_ni[1] = nT; cannam@127: local_starti[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_ni[0] = n; cannam@127: local_starti[0] = start; cannam@127: } cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) { cannam@127: local_no[1] = nT; cannam@127: local_starto[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_no[0] = n; cannam@127: local_starto[0] = start; cannam@127: } cannam@127: } cannam@127: alloc_local(ntot * 2, p->in == p->out); cannam@127: cannam@127: total_ni[rnk - 1] = p->sz->dims[rnk - 1].n; cannam@127: if (p->sign < 0) cannam@127: pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: local_in, cannam@127: (FFTW(complex) *) local_out, cannam@127: MPI_COMM_WORLD, flags); cannam@127: else cannam@127: pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: (FFTW(complex) *) local_in, cannam@127: local_out, cannam@127: MPI_COMM_WORLD, flags); cannam@127: cannam@127: total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1; cannam@127: vn *= 2; cannam@127: cannam@127: { cannam@127: ptrdiff_t nrest = 1; cannam@127: for (i = 2; i < rnk; ++i) nrest *= total_ni[i]; cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) cannam@127: plan_scramble_in = mkplan_transpose_local( cannam@127: total_ni[0], local_ni[1], vn * nrest, cannam@127: local_in, local_in); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) cannam@127: plan_unscramble_out = mkplan_transpose_local( cannam@127: local_no[1], total_ni[0], vn * nrest, cannam@127: local_out, local_out); cannam@127: } cannam@127: cannam@127: return pln; cannam@127: } cannam@127: cannam@127: static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags) cannam@127: { cannam@127: ptrdiff_t ntot, nx, ny; cannam@127: int ix=0, iy=1, i; cannam@127: const bench_iodim *d = p->vecsz->dims; cannam@127: FFTW(plan) pln; cannam@127: cannam@127: if (p->vecsz->rnk == 3) { cannam@127: for (i = 0; i < 3; ++i) cannam@127: if (d[i].is == 1 && d[i].os == 1) { cannam@127: vn = d[i].n; cannam@127: ix = (i + 1) % 3; cannam@127: iy = (i + 2) % 3; cannam@127: break; cannam@127: } cannam@127: if (i == 3) return 0; cannam@127: } cannam@127: else { cannam@127: vn = 1; cannam@127: ix = 0; cannam@127: iy = 1; cannam@127: } cannam@127: cannam@127: if (d[ix].is == d[iy].n * vn && d[ix].os == vn cannam@127: && d[iy].os == d[ix].n * vn && d[iy].is == vn) { cannam@127: nx = d[ix].n; cannam@127: ny = d[iy].n; cannam@127: } cannam@127: else if (d[iy].is == d[ix].n * vn && d[iy].os == vn cannam@127: && d[ix].os == d[iy].n * vn && d[ix].is == vn) { cannam@127: nx = d[iy].n; cannam@127: ny = d[ix].n; cannam@127: } cannam@127: else cannam@127: return 0; cannam@127: cannam@127: alloc_rnk(2); cannam@127: ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD, cannam@127: &local_ni[0], cannam@127: &local_starti[0], cannam@127: &local_no[0], cannam@127: &local_starto[0]); cannam@127: local_ni[1] = ny; cannam@127: local_starti[1] = 0; cannam@127: local_no[1] = nx; cannam@127: local_starto[1] = 0; cannam@127: total_ni[0] = nx; total_ni[1] = ny; cannam@127: total_no[1] = nx; total_no[0] = ny; cannam@127: alloc_local(ntot, p->in == p->out); cannam@127: cannam@127: pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: local_in, local_out, cannam@127: MPI_COMM_WORLD, flags); cannam@127: cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) cannam@127: plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn, cannam@127: local_in, local_in); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) cannam@127: plan_unscramble_out = mkplan_transpose_local cannam@127: (nx, local_no[0], vn, local_out, local_out); cannam@127: cannam@127: #if 0 cannam@127: if (pln && vn == 1) { cannam@127: int i, j; cannam@127: bench_real *ri = (bench_real *) p->in; cannam@127: bench_real *ro = (bench_real *) p->out; cannam@127: if (!ri || !ro) return pln; cannam@127: setup_gather_scatter(); cannam@127: for (i = 0; i < nx * ny; ++i) cannam@127: ri[i] = i; cannam@127: after_problem_rcopy_from(p, ri); cannam@127: FFTW(execute)(pln); cannam@127: after_problem_rcopy_to(p, ro); cannam@127: if (my_pe == 0) { cannam@127: for (i = 0; i < nx; ++i) { cannam@127: for (j = 0; j < ny; ++j) cannam@127: printf(" %3g", ro[j * nx + i]); cannam@127: printf("\n"); cannam@127: } cannam@127: } cannam@127: } cannam@127: #endif cannam@127: cannam@127: return pln; cannam@127: } cannam@127: cannam@127: static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags) cannam@127: { cannam@127: FFTW(plan) pln = 0; cannam@127: int i; cannam@127: ptrdiff_t ntot; cannam@127: FFTW(r2r_kind) *k; cannam@127: cannam@127: if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1)) cannam@127: && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3) cannam@127: return mkplan_transpose(p, flags); cannam@127: cannam@127: vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1; cannam@127: cannam@127: if (p->sz->rnk < 1 cannam@127: || p->split cannam@127: || !tensor_contiguousp(p->sz, vn) cannam@127: || tensor_rowmajor_transposedp(p->sz) cannam@127: || p->vecsz->rnk > 1 cannam@127: || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1 cannam@127: || p->vecsz->dims[0].os != 1))) cannam@127: return 0; cannam@127: cannam@127: alloc_rnk(p->sz->rnk); cannam@127: for (i = 0; i < rnk; ++i) { cannam@127: total_ni[i] = total_no[i] = p->sz->dims[i].n; cannam@127: local_ni[i] = local_no[i] = total_ni[i]; cannam@127: local_starti[i] = local_starto[i] = 0; cannam@127: } cannam@127: if (rnk > 1) { cannam@127: ptrdiff_t n, start, nT, startT; cannam@127: ntot = FFTW(mpi_local_size_many_transposed) cannam@127: (p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, cannam@127: MPI_COMM_WORLD, cannam@127: &n, &start, &nT, &startT); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) { cannam@127: local_ni[1] = nT; cannam@127: local_starti[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_ni[0] = n; cannam@127: local_starti[0] = start; cannam@127: } cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) { cannam@127: local_no[1] = nT; cannam@127: local_starto[1] = startT; cannam@127: } cannam@127: else { cannam@127: local_no[0] = n; cannam@127: local_starto[0] = start; cannam@127: } cannam@127: } cannam@127: else if (rnk == 1) { cannam@127: ntot = FFTW(mpi_local_size_many_1d) cannam@127: (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags, cannam@127: local_ni, local_starti, local_no, local_starto); cannam@127: } cannam@127: alloc_local(ntot, p->in == p->out); cannam@127: cannam@127: k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk); cannam@127: for (i = 0; i < p->sz->rnk; ++i) cannam@127: switch (p->k[i]) { cannam@127: case R2R_R2HC: k[i] = FFTW_R2HC; break; cannam@127: case R2R_HC2R: k[i] = FFTW_HC2R; break; cannam@127: case R2R_DHT: k[i] = FFTW_DHT; break; cannam@127: case R2R_REDFT00: k[i] = FFTW_REDFT00; break; cannam@127: case R2R_REDFT01: k[i] = FFTW_REDFT01; break; cannam@127: case R2R_REDFT10: k[i] = FFTW_REDFT10; break; cannam@127: case R2R_REDFT11: k[i] = FFTW_REDFT11; break; cannam@127: case R2R_RODFT00: k[i] = FFTW_RODFT00; break; cannam@127: case R2R_RODFT01: k[i] = FFTW_RODFT01; break; cannam@127: case R2R_RODFT10: k[i] = FFTW_RODFT10; break; cannam@127: case R2R_RODFT11: k[i] = FFTW_RODFT11; break; cannam@127: default: BENCH_ASSERT(0); cannam@127: } cannam@127: cannam@127: pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: FFTW_MPI_DEFAULT_BLOCK, cannam@127: local_in, local_out, cannam@127: MPI_COMM_WORLD, k, flags); cannam@127: bench_free(k); cannam@127: cannam@127: if (rnk > 1) { cannam@127: ptrdiff_t nrest = 1; cannam@127: for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n; cannam@127: if (flags & FFTW_MPI_TRANSPOSED_IN) cannam@127: plan_scramble_in = mkplan_transpose_local( cannam@127: p->sz->dims[0].n, local_ni[1], vn * nrest, cannam@127: local_in, local_in); cannam@127: if (flags & FFTW_MPI_TRANSPOSED_OUT) cannam@127: plan_unscramble_out = mkplan_transpose_local( cannam@127: local_no[1], p->sz->dims[0].n, vn * nrest, cannam@127: local_out, local_out); cannam@127: } cannam@127: cannam@127: return pln; cannam@127: } cannam@127: cannam@127: FFTW(plan) mkplan(bench_problem *p, unsigned flags) cannam@127: { cannam@127: FFTW(plan) pln = 0; cannam@127: FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0; cannam@127: FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0; cannam@127: if (p->scrambled_in) { cannam@127: if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) cannam@127: flags |= FFTW_MPI_SCRAMBLED_IN; cannam@127: else cannam@127: flags |= FFTW_MPI_TRANSPOSED_IN; cannam@127: } cannam@127: if (p->scrambled_out) { cannam@127: if (p->sz->rnk == 1 && p->sz->dims[0].n != 1) cannam@127: flags |= FFTW_MPI_SCRAMBLED_OUT; cannam@127: else cannam@127: flags |= FFTW_MPI_TRANSPOSED_OUT; cannam@127: } cannam@127: switch (p->kind) { cannam@127: case PROBLEM_COMPLEX: cannam@127: pln =mkplan_complex(p, flags); cannam@127: break; cannam@127: case PROBLEM_REAL: cannam@127: pln = mkplan_real(p, flags); cannam@127: break; cannam@127: case PROBLEM_R2R: cannam@127: pln = mkplan_r2r(p, flags); cannam@127: break; cannam@127: default: BENCH_ASSERT(0); cannam@127: } cannam@127: if (pln) setup_gather_scatter(); cannam@127: return pln; cannam@127: } cannam@127: cannam@127: void main_init(int *argc, char ***argv) cannam@127: { cannam@127: #ifdef HAVE_SMP cannam@127: # if MPI_VERSION >= 2 /* for MPI_Init_thread */ cannam@127: int provided; cannam@127: MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided); cannam@127: threads_ok = provided >= MPI_THREAD_FUNNELED; cannam@127: # else cannam@127: MPI_Init(argc, argv); cannam@127: threads_ok = 0; cannam@127: # endif cannam@127: #else cannam@127: MPI_Init(argc, argv); cannam@127: #endif cannam@127: MPI_Comm_rank(MPI_COMM_WORLD, &my_pe); cannam@127: MPI_Comm_size(MPI_COMM_WORLD, &n_pes); cannam@127: if (my_pe != 0) verbose = -999; cannam@127: no_speed_allocation = 1; /* so we can benchmark transforms > memory */ cannam@127: always_pad_real = 1; /* out-of-place real transforms are padded */ cannam@127: isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes); cannam@127: isend_off = (int *) bench_malloc(sizeof(int) * n_pes); cannam@127: orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes); cannam@127: orecv_off = (int *) bench_malloc(sizeof(int) * n_pes); cannam@127: cannam@127: /* init_threads must be called before any other FFTW function, cannam@127: including mpi_init, because it has to register the threads hooks cannam@127: before the planner is initalized */ cannam@127: #ifdef HAVE_SMP cannam@127: if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); } cannam@127: #endif cannam@127: FFTW(mpi_init)(); cannam@127: } cannam@127: cannam@127: void initial_cleanup(void) cannam@127: { cannam@127: alloc_rnk(0); cannam@127: alloc_local(0, 0); cannam@127: bench_free(all_local_in); all_local_in = 0; cannam@127: bench_free(all_local_out); all_local_out = 0; cannam@127: bench_free(isend_off); isend_off = 0; cannam@127: bench_free(isend_cnt); isend_cnt = 0; cannam@127: bench_free(orecv_off); orecv_off = 0; cannam@127: bench_free(orecv_cnt); orecv_cnt = 0; cannam@127: FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0; cannam@127: FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0; cannam@127: } cannam@127: cannam@127: void final_cleanup(void) cannam@127: { cannam@127: MPI_Finalize(); cannam@127: } cannam@127: cannam@127: void bench_exit(int status) cannam@127: { cannam@127: MPI_Abort(MPI_COMM_WORLD, status); cannam@127: } cannam@127: cannam@127: double bench_cost_postprocess(double cost) cannam@127: { cannam@127: double cost_max; cannam@127: MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); cannam@127: return cost_max; cannam@127: } cannam@127: cannam@127: cannam@127: int import_wisdom(FILE *f) cannam@127: { cannam@127: int success = 1, sall; cannam@127: if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f); cannam@127: FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD); cannam@127: MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD); cannam@127: return sall; cannam@127: } cannam@127: cannam@127: void export_wisdom(FILE *f) cannam@127: { cannam@127: FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD); cannam@127: if (my_pe == 0) FFTW(export_wisdom_to_file)(f); cannam@127: }