annotate src/fftw-3.3.5/mpi/mpi-bench.c @ 169:223a55898ab9 tip default

Add null config files
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 02 Mar 2020 14:03:47 +0000
parents 7867fa7e1b6b
children
rev   line source
cannam@127 1 /**************************************************************************/
cannam@127 2 /* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
cannam@127 3 It is probably NOT a good place to learn FFTW usage, since it has a
cannam@127 4 lot of added complexity in order to exercise and test the full API,
cannam@127 5 etcetera. We suggest reading the manual. */
cannam@127 6 /**************************************************************************/
cannam@127 7
cannam@127 8 #include <math.h>
cannam@127 9 #include <stdio.h>
cannam@127 10 #include <string.h>
cannam@127 11 #include "fftw3-mpi.h"
cannam@127 12 #include "fftw-bench.h"
cannam@127 13
cannam@127 14 #if defined(BENCHFFT_SINGLE)
cannam@127 15 # define BENCH_MPI_TYPE MPI_FLOAT
cannam@127 16 #elif defined(BENCHFFT_LDOUBLE)
cannam@127 17 # define BENCH_MPI_TYPE MPI_LONG_DOUBLE
cannam@127 18 #elif defined(BENCHFFT_QUAD)
cannam@127 19 # error MPI quad-precision type is unknown
cannam@127 20 #else
cannam@127 21 # define BENCH_MPI_TYPE MPI_DOUBLE
cannam@127 22 #endif
cannam@127 23
cannam@127 24 #if SIZEOF_PTRDIFF_T == SIZEOF_INT
cannam@127 25 # define FFTW_MPI_PTRDIFF_T MPI_INT
cannam@127 26 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
cannam@127 27 # define FFTW_MPI_PTRDIFF_T MPI_LONG
cannam@127 28 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
cannam@127 29 # define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
cannam@127 30 #else
cannam@127 31 # error MPI type for ptrdiff_t is unknown
cannam@127 32 # define FFTW_MPI_PTRDIFF_T MPI_LONG
cannam@127 33 #endif
cannam@127 34
cannam@127 35 static const char *mkversion(void) { return FFTW(version); }
cannam@127 36 static const char *mkcc(void) { return FFTW(cc); }
cannam@127 37 static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
cannam@127 38 static const char *mknproc(void) {
cannam@127 39 static char buf[32];
cannam@127 40 int ncpus;
cannam@127 41 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
cannam@127 42 #ifdef HAVE_SNPRINTF
cannam@127 43 snprintf(buf, 32, "%d", ncpus);
cannam@127 44 #else
cannam@127 45 sprintf(buf, "%d", ncpus);
cannam@127 46 #endif
cannam@127 47 return buf;
cannam@127 48 }
cannam@127 49
cannam@127 50 BEGIN_BENCH_DOC
cannam@127 51 BENCH_DOC("name", "fftw3_mpi")
cannam@127 52 BENCH_DOCF("version", mkversion)
cannam@127 53 BENCH_DOCF("cc", mkcc)
cannam@127 54 BENCH_DOCF("codelet-optim", mkcodelet_optim)
cannam@127 55 BENCH_DOCF("nproc", mknproc)
cannam@127 56 END_BENCH_DOC
cannam@127 57
cannam@127 58 static int n_pes = 1, my_pe = 0;
cannam@127 59
cannam@127 60 /* global variables describing the shape of the data and its distribution */
cannam@127 61 static int rnk;
cannam@127 62 static ptrdiff_t vn, iNtot, oNtot;
cannam@127 63 static ptrdiff_t *local_ni=0, *local_starti=0;
cannam@127 64 static ptrdiff_t *local_no=0, *local_starto=0;
cannam@127 65 static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
cannam@127 66 static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
cannam@127 67 static ptrdiff_t *istrides = 0, *ostrides = 0;
cannam@127 68 static ptrdiff_t *total_ni=0, *total_no=0;
cannam@127 69 static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
cannam@127 70 static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
cannam@127 71
cannam@127 72 static bench_real *local_in = 0, *local_out = 0;
cannam@127 73 static bench_real *all_local_in = 0, *all_local_out = 0;
cannam@127 74 static int all_local_in_alloc = 0, all_local_out_alloc = 0;
cannam@127 75 static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
cannam@127 76
cannam@127 77 static void alloc_rnk(int rnk_) {
cannam@127 78 rnk = rnk_;
cannam@127 79 bench_free(local_ni);
cannam@127 80 if (rnk == 0)
cannam@127 81 local_ni = 0;
cannam@127 82 else
cannam@127 83 local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
cannam@127 84 * (8 + n_pes * 4));
cannam@127 85
cannam@127 86 local_starti = local_ni + rnk;
cannam@127 87 local_no = local_ni + 2 * rnk;
cannam@127 88 local_starto = local_ni + 3 * rnk;
cannam@127 89 istrides = local_ni + 4 * rnk;
cannam@127 90 ostrides = local_ni + 5 * rnk;
cannam@127 91 total_ni = local_ni + 6 * rnk;
cannam@127 92 total_no = local_ni + 7 * rnk;
cannam@127 93 all_local_ni = local_ni + 8 * rnk;
cannam@127 94 all_local_starti = local_ni + (8 + n_pes) * rnk;
cannam@127 95 all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
cannam@127 96 all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
cannam@127 97 }
cannam@127 98
cannam@127 99 static void setup_gather_scatter(void)
cannam@127 100 {
cannam@127 101 int i, j;
cannam@127 102 ptrdiff_t off;
cannam@127 103
cannam@127 104 MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 105 all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 106 0, MPI_COMM_WORLD);
cannam@127 107 MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
cannam@127 108 MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 109 all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 110 0, MPI_COMM_WORLD);
cannam@127 111 MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
cannam@127 112
cannam@127 113 MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 114 all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 115 0, MPI_COMM_WORLD);
cannam@127 116 MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
cannam@127 117 MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 118 all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
cannam@127 119 0, MPI_COMM_WORLD);
cannam@127 120 MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
cannam@127 121
cannam@127 122 off = 0;
cannam@127 123 for (i = 0; i < n_pes; ++i) {
cannam@127 124 ptrdiff_t N = vn;
cannam@127 125 for (j = 0; j < rnk; ++j)
cannam@127 126 N *= all_local_ni[i * rnk + j];
cannam@127 127 isend_cnt[i] = N;
cannam@127 128 isend_off[i] = off;
cannam@127 129 off += N;
cannam@127 130 }
cannam@127 131 iNtot = off;
cannam@127 132 all_local_in_alloc = 1;
cannam@127 133
cannam@127 134 istrides[rnk - 1] = vn;
cannam@127 135 for (j = rnk - 2; j >= 0; --j)
cannam@127 136 istrides[j] = total_ni[j + 1] * istrides[j + 1];
cannam@127 137
cannam@127 138 off = 0;
cannam@127 139 for (i = 0; i < n_pes; ++i) {
cannam@127 140 ptrdiff_t N = vn;
cannam@127 141 for (j = 0; j < rnk; ++j)
cannam@127 142 N *= all_local_no[i * rnk + j];
cannam@127 143 orecv_cnt[i] = N;
cannam@127 144 orecv_off[i] = off;
cannam@127 145 off += N;
cannam@127 146 }
cannam@127 147 oNtot = off;
cannam@127 148 all_local_out_alloc = 1;
cannam@127 149
cannam@127 150 ostrides[rnk - 1] = vn;
cannam@127 151 for (j = rnk - 2; j >= 0; --j)
cannam@127 152 ostrides[j] = total_no[j + 1] * ostrides[j + 1];
cannam@127 153 }
cannam@127 154
cannam@127 155 static void copy_block_out(const bench_real *in,
cannam@127 156 int rnk, ptrdiff_t *n, ptrdiff_t *start,
cannam@127 157 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
cannam@127 158 bench_real *out)
cannam@127 159 {
cannam@127 160 ptrdiff_t i;
cannam@127 161 if (rnk == 0) {
cannam@127 162 for (i = 0; i < vn; ++i)
cannam@127 163 out[i] = in[i];
cannam@127 164 }
cannam@127 165 else if (rnk == 1) { /* this case is just an optimization */
cannam@127 166 ptrdiff_t j;
cannam@127 167 out += start[0] * os[0];
cannam@127 168 for (j = 0; j < n[0]; ++j) {
cannam@127 169 for (i = 0; i < vn; ++i)
cannam@127 170 out[i] = in[i];
cannam@127 171 in += is;
cannam@127 172 out += os[0];
cannam@127 173 }
cannam@127 174 }
cannam@127 175 else {
cannam@127 176 /* we should do n[0] for locality, but this way is simpler to code */
cannam@127 177 for (i = 0; i < n[rnk - 1]; ++i)
cannam@127 178 copy_block_out(in + i * is,
cannam@127 179 rnk - 1, n, start, is * n[rnk - 1], os, vn,
cannam@127 180 out + (start[rnk - 1] + i) * os[rnk - 1]);
cannam@127 181 }
cannam@127 182 }
cannam@127 183
cannam@127 184 static void copy_block_in(bench_real *in,
cannam@127 185 int rnk, ptrdiff_t *n, ptrdiff_t *start,
cannam@127 186 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
cannam@127 187 const bench_real *out)
cannam@127 188 {
cannam@127 189 ptrdiff_t i;
cannam@127 190 if (rnk == 0) {
cannam@127 191 for (i = 0; i < vn; ++i)
cannam@127 192 in[i] = out[i];
cannam@127 193 }
cannam@127 194 else if (rnk == 1) { /* this case is just an optimization */
cannam@127 195 ptrdiff_t j;
cannam@127 196 out += start[0] * os[0];
cannam@127 197 for (j = 0; j < n[0]; ++j) {
cannam@127 198 for (i = 0; i < vn; ++i)
cannam@127 199 in[i] = out[i];
cannam@127 200 in += is;
cannam@127 201 out += os[0];
cannam@127 202 }
cannam@127 203 }
cannam@127 204 else {
cannam@127 205 /* we should do n[0] for locality, but this way is simpler to code */
cannam@127 206 for (i = 0; i < n[rnk - 1]; ++i)
cannam@127 207 copy_block_in(in + i * is,
cannam@127 208 rnk - 1, n, start, is * n[rnk - 1], os, vn,
cannam@127 209 out + (start[rnk - 1] + i) * os[rnk - 1]);
cannam@127 210 }
cannam@127 211 }
cannam@127 212
cannam@127 213 static void do_scatter_in(bench_real *in)
cannam@127 214 {
cannam@127 215 bench_real *ali;
cannam@127 216 int i;
cannam@127 217 if (all_local_in_alloc) {
cannam@127 218 bench_free(all_local_in);
cannam@127 219 all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
cannam@127 220 all_local_in_alloc = 0;
cannam@127 221 }
cannam@127 222 ali = all_local_in;
cannam@127 223 for (i = 0; i < n_pes; ++i) {
cannam@127 224 copy_block_in(ali,
cannam@127 225 rnk, all_local_ni + i * rnk,
cannam@127 226 all_local_starti + i * rnk,
cannam@127 227 vn, istrides, vn,
cannam@127 228 in);
cannam@127 229 ali += isend_cnt[i];
cannam@127 230 }
cannam@127 231 MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
cannam@127 232 local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
cannam@127 233 0, MPI_COMM_WORLD);
cannam@127 234 }
cannam@127 235
cannam@127 236 static void do_gather_out(bench_real *out)
cannam@127 237 {
cannam@127 238 bench_real *alo;
cannam@127 239 int i;
cannam@127 240
cannam@127 241 if (all_local_out_alloc) {
cannam@127 242 bench_free(all_local_out);
cannam@127 243 all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
cannam@127 244 all_local_out_alloc = 0;
cannam@127 245 }
cannam@127 246 MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
cannam@127 247 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
cannam@127 248 0, MPI_COMM_WORLD);
cannam@127 249 MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
cannam@127 250 alo = all_local_out;
cannam@127 251 for (i = 0; i < n_pes; ++i) {
cannam@127 252 copy_block_out(alo,
cannam@127 253 rnk, all_local_no + i * rnk,
cannam@127 254 all_local_starto + i * rnk,
cannam@127 255 vn, ostrides, vn,
cannam@127 256 out);
cannam@127 257 alo += orecv_cnt[i];
cannam@127 258 }
cannam@127 259 }
cannam@127 260
cannam@127 261 static void alloc_local(ptrdiff_t nreal, int inplace)
cannam@127 262 {
cannam@127 263 bench_free(local_in);
cannam@127 264 if (local_out != local_in) bench_free(local_out);
cannam@127 265 local_in = local_out = 0;
cannam@127 266 if (nreal > 0) {
cannam@127 267 ptrdiff_t i;
cannam@127 268 local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
cannam@127 269 if (inplace)
cannam@127 270 local_out = local_in;
cannam@127 271 else
cannam@127 272 local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
cannam@127 273 for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
cannam@127 274 }
cannam@127 275 }
cannam@127 276
cannam@127 277 void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
cannam@127 278 {
cannam@127 279 UNUSED(p);
cannam@127 280 do_scatter_in(ri);
cannam@127 281 if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
cannam@127 282 }
cannam@127 283
cannam@127 284 void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
cannam@127 285 {
cannam@127 286 UNUSED(p);
cannam@127 287 if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
cannam@127 288 do_gather_out(ro);
cannam@127 289 }
cannam@127 290
cannam@127 291 void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
cannam@127 292 {
cannam@127 293 UNUSED(ii);
cannam@127 294 after_problem_rcopy_from(p, ri);
cannam@127 295 }
cannam@127 296
cannam@127 297 void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
cannam@127 298 {
cannam@127 299 UNUSED(io);
cannam@127 300 after_problem_rcopy_to(p, ro);
cannam@127 301 }
cannam@127 302
cannam@127 303 void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
cannam@127 304 {
cannam@127 305 UNUSED(ii);
cannam@127 306 after_problem_rcopy_from(p, ri);
cannam@127 307 }
cannam@127 308
cannam@127 309 void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
cannam@127 310 {
cannam@127 311 UNUSED(io);
cannam@127 312 after_problem_rcopy_to(p, ro);
cannam@127 313 }
cannam@127 314
cannam@127 315 static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
cannam@127 316 ptrdiff_t vn,
cannam@127 317 bench_real *in, bench_real *out)
cannam@127 318 {
cannam@127 319 FFTW(iodim64) hdims[3];
cannam@127 320 FFTW(r2r_kind) k[3];
cannam@127 321 FFTW(plan) pln;
cannam@127 322
cannam@127 323 hdims[0].n = nx;
cannam@127 324 hdims[0].is = ny * vn;
cannam@127 325 hdims[0].os = vn;
cannam@127 326 hdims[1].n = ny;
cannam@127 327 hdims[1].is = vn;
cannam@127 328 hdims[1].os = nx * vn;
cannam@127 329 hdims[2].n = vn;
cannam@127 330 hdims[2].is = 1;
cannam@127 331 hdims[2].os = 1;
cannam@127 332 k[0] = k[1] = k[2] = FFTW_R2HC;
cannam@127 333 pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
cannam@127 334 BENCH_ASSERT(pln != 0);
cannam@127 335 return pln;
cannam@127 336 }
cannam@127 337
cannam@127 338 static int tensor_rowmajor_transposedp(bench_tensor *t)
cannam@127 339 {
cannam@127 340 bench_iodim *d;
cannam@127 341 int i;
cannam@127 342
cannam@127 343 BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
cannam@127 344 if (t->rnk < 2)
cannam@127 345 return 0;
cannam@127 346
cannam@127 347 d = t->dims;
cannam@127 348 if (d[0].is != d[1].is * d[1].n
cannam@127 349 || d[0].os != d[1].is
cannam@127 350 || d[1].os != d[0].os * d[0].n)
cannam@127 351 return 0;
cannam@127 352 if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
cannam@127 353 return 0;
cannam@127 354 for (i = 2; i + 1 < t->rnk; ++i) {
cannam@127 355 d = t->dims + i;
cannam@127 356 if (d[0].is != d[1].is * d[1].n
cannam@127 357 || d[0].os != d[1].os * d[1].n)
cannam@127 358 return 0;
cannam@127 359 }
cannam@127 360
cannam@127 361 if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
cannam@127 362 return 0;
cannam@127 363 return 1;
cannam@127 364 }
cannam@127 365
cannam@127 366 static int tensor_contiguousp(bench_tensor *t, int s)
cannam@127 367 {
cannam@127 368 return (t->dims[t->rnk-1].is == s
cannam@127 369 && ((tensor_rowmajorp(t) &&
cannam@127 370 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
cannam@127 371 || tensor_rowmajor_transposedp(t)));
cannam@127 372 }
cannam@127 373
cannam@127 374 static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
cannam@127 375 {
cannam@127 376 FFTW(plan) pln = 0;
cannam@127 377 int i;
cannam@127 378 ptrdiff_t ntot;
cannam@127 379
cannam@127 380 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
cannam@127 381
cannam@127 382 if (p->sz->rnk < 1
cannam@127 383 || p->split
cannam@127 384 || !tensor_contiguousp(p->sz, vn)
cannam@127 385 || tensor_rowmajor_transposedp(p->sz)
cannam@127 386 || p->vecsz->rnk > 1
cannam@127 387 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
cannam@127 388 || p->vecsz->dims[0].os != 1)))
cannam@127 389 return 0;
cannam@127 390
cannam@127 391 alloc_rnk(p->sz->rnk);
cannam@127 392 for (i = 0; i < rnk; ++i) {
cannam@127 393 total_ni[i] = total_no[i] = p->sz->dims[i].n;
cannam@127 394 local_ni[i] = local_no[i] = total_ni[i];
cannam@127 395 local_starti[i] = local_starto[i] = 0;
cannam@127 396 }
cannam@127 397 if (rnk > 1) {
cannam@127 398 ptrdiff_t n, start, nT, startT;
cannam@127 399 ntot = FFTW(mpi_local_size_many_transposed)
cannam@127 400 (p->sz->rnk, total_ni, vn,
cannam@127 401 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
cannam@127 402 MPI_COMM_WORLD,
cannam@127 403 &n, &start, &nT, &startT);
cannam@127 404 if (flags & FFTW_MPI_TRANSPOSED_IN) {
cannam@127 405 local_ni[1] = nT;
cannam@127 406 local_starti[1] = startT;
cannam@127 407 }
cannam@127 408 else {
cannam@127 409 local_ni[0] = n;
cannam@127 410 local_starti[0] = start;
cannam@127 411 }
cannam@127 412 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
cannam@127 413 local_no[1] = nT;
cannam@127 414 local_starto[1] = startT;
cannam@127 415 }
cannam@127 416 else {
cannam@127 417 local_no[0] = n;
cannam@127 418 local_starto[0] = start;
cannam@127 419 }
cannam@127 420 }
cannam@127 421 else if (rnk == 1) {
cannam@127 422 ntot = FFTW(mpi_local_size_many_1d)
cannam@127 423 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
cannam@127 424 local_ni, local_starti, local_no, local_starto);
cannam@127 425 }
cannam@127 426 alloc_local(ntot * 2, p->in == p->out);
cannam@127 427
cannam@127 428 pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
cannam@127 429 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 430 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 431 (FFTW(complex) *) local_in,
cannam@127 432 (FFTW(complex) *) local_out,
cannam@127 433 MPI_COMM_WORLD, p->sign, flags);
cannam@127 434
cannam@127 435 vn *= 2;
cannam@127 436
cannam@127 437 if (rnk > 1) {
cannam@127 438 ptrdiff_t nrest = 1;
cannam@127 439 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
cannam@127 440 if (flags & FFTW_MPI_TRANSPOSED_IN)
cannam@127 441 plan_scramble_in = mkplan_transpose_local(
cannam@127 442 p->sz->dims[0].n, local_ni[1], vn * nrest,
cannam@127 443 local_in, local_in);
cannam@127 444 if (flags & FFTW_MPI_TRANSPOSED_OUT)
cannam@127 445 plan_unscramble_out = mkplan_transpose_local(
cannam@127 446 local_no[1], p->sz->dims[0].n, vn * nrest,
cannam@127 447 local_out, local_out);
cannam@127 448 }
cannam@127 449
cannam@127 450 return pln;
cannam@127 451 }
cannam@127 452
cannam@127 453 static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
cannam@127 454 {
cannam@127 455 return (t->dims[t->rnk-1].is == s
cannam@127 456 && ((tensor_real_rowmajorp(t, sign, 1) &&
cannam@127 457 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
cannam@127 458 }
cannam@127 459
cannam@127 460 static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
cannam@127 461 {
cannam@127 462 FFTW(plan) pln = 0;
cannam@127 463 int i;
cannam@127 464 ptrdiff_t ntot;
cannam@127 465
cannam@127 466 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
cannam@127 467
cannam@127 468 if (p->sz->rnk < 2
cannam@127 469 || p->split
cannam@127 470 || !tensor_real_contiguousp(p->sz, p->sign, vn)
cannam@127 471 || tensor_rowmajor_transposedp(p->sz)
cannam@127 472 || p->vecsz->rnk > 1
cannam@127 473 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
cannam@127 474 || p->vecsz->dims[0].os != 1)))
cannam@127 475 return 0;
cannam@127 476
cannam@127 477 alloc_rnk(p->sz->rnk);
cannam@127 478 for (i = 0; i < rnk; ++i) {
cannam@127 479 total_ni[i] = total_no[i] = p->sz->dims[i].n;
cannam@127 480 local_ni[i] = local_no[i] = total_ni[i];
cannam@127 481 local_starti[i] = local_starto[i] = 0;
cannam@127 482 }
cannam@127 483 local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
cannam@127 484 = p->sz->dims[rnk-1].n / 2 + 1;
cannam@127 485 {
cannam@127 486 ptrdiff_t n, start, nT, startT;
cannam@127 487 ntot = FFTW(mpi_local_size_many_transposed)
cannam@127 488 (p->sz->rnk, total_ni, vn,
cannam@127 489 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
cannam@127 490 MPI_COMM_WORLD,
cannam@127 491 &n, &start, &nT, &startT);
cannam@127 492 if (flags & FFTW_MPI_TRANSPOSED_IN) {
cannam@127 493 local_ni[1] = nT;
cannam@127 494 local_starti[1] = startT;
cannam@127 495 }
cannam@127 496 else {
cannam@127 497 local_ni[0] = n;
cannam@127 498 local_starti[0] = start;
cannam@127 499 }
cannam@127 500 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
cannam@127 501 local_no[1] = nT;
cannam@127 502 local_starto[1] = startT;
cannam@127 503 }
cannam@127 504 else {
cannam@127 505 local_no[0] = n;
cannam@127 506 local_starto[0] = start;
cannam@127 507 }
cannam@127 508 }
cannam@127 509 alloc_local(ntot * 2, p->in == p->out);
cannam@127 510
cannam@127 511 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
cannam@127 512 if (p->sign < 0)
cannam@127 513 pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
cannam@127 514 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 515 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 516 local_in,
cannam@127 517 (FFTW(complex) *) local_out,
cannam@127 518 MPI_COMM_WORLD, flags);
cannam@127 519 else
cannam@127 520 pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
cannam@127 521 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 522 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 523 (FFTW(complex) *) local_in,
cannam@127 524 local_out,
cannam@127 525 MPI_COMM_WORLD, flags);
cannam@127 526
cannam@127 527 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
cannam@127 528 vn *= 2;
cannam@127 529
cannam@127 530 {
cannam@127 531 ptrdiff_t nrest = 1;
cannam@127 532 for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
cannam@127 533 if (flags & FFTW_MPI_TRANSPOSED_IN)
cannam@127 534 plan_scramble_in = mkplan_transpose_local(
cannam@127 535 total_ni[0], local_ni[1], vn * nrest,
cannam@127 536 local_in, local_in);
cannam@127 537 if (flags & FFTW_MPI_TRANSPOSED_OUT)
cannam@127 538 plan_unscramble_out = mkplan_transpose_local(
cannam@127 539 local_no[1], total_ni[0], vn * nrest,
cannam@127 540 local_out, local_out);
cannam@127 541 }
cannam@127 542
cannam@127 543 return pln;
cannam@127 544 }
cannam@127 545
cannam@127 546 static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
cannam@127 547 {
cannam@127 548 ptrdiff_t ntot, nx, ny;
cannam@127 549 int ix=0, iy=1, i;
cannam@127 550 const bench_iodim *d = p->vecsz->dims;
cannam@127 551 FFTW(plan) pln;
cannam@127 552
cannam@127 553 if (p->vecsz->rnk == 3) {
cannam@127 554 for (i = 0; i < 3; ++i)
cannam@127 555 if (d[i].is == 1 && d[i].os == 1) {
cannam@127 556 vn = d[i].n;
cannam@127 557 ix = (i + 1) % 3;
cannam@127 558 iy = (i + 2) % 3;
cannam@127 559 break;
cannam@127 560 }
cannam@127 561 if (i == 3) return 0;
cannam@127 562 }
cannam@127 563 else {
cannam@127 564 vn = 1;
cannam@127 565 ix = 0;
cannam@127 566 iy = 1;
cannam@127 567 }
cannam@127 568
cannam@127 569 if (d[ix].is == d[iy].n * vn && d[ix].os == vn
cannam@127 570 && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
cannam@127 571 nx = d[ix].n;
cannam@127 572 ny = d[iy].n;
cannam@127 573 }
cannam@127 574 else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
cannam@127 575 && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
cannam@127 576 nx = d[iy].n;
cannam@127 577 ny = d[ix].n;
cannam@127 578 }
cannam@127 579 else
cannam@127 580 return 0;
cannam@127 581
cannam@127 582 alloc_rnk(2);
cannam@127 583 ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
cannam@127 584 &local_ni[0],
cannam@127 585 &local_starti[0],
cannam@127 586 &local_no[0],
cannam@127 587 &local_starto[0]);
cannam@127 588 local_ni[1] = ny;
cannam@127 589 local_starti[1] = 0;
cannam@127 590 local_no[1] = nx;
cannam@127 591 local_starto[1] = 0;
cannam@127 592 total_ni[0] = nx; total_ni[1] = ny;
cannam@127 593 total_no[1] = nx; total_no[0] = ny;
cannam@127 594 alloc_local(ntot, p->in == p->out);
cannam@127 595
cannam@127 596 pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
cannam@127 597 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 598 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 599 local_in, local_out,
cannam@127 600 MPI_COMM_WORLD, flags);
cannam@127 601
cannam@127 602 if (flags & FFTW_MPI_TRANSPOSED_IN)
cannam@127 603 plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
cannam@127 604 local_in, local_in);
cannam@127 605 if (flags & FFTW_MPI_TRANSPOSED_OUT)
cannam@127 606 plan_unscramble_out = mkplan_transpose_local
cannam@127 607 (nx, local_no[0], vn, local_out, local_out);
cannam@127 608
cannam@127 609 #if 0
cannam@127 610 if (pln && vn == 1) {
cannam@127 611 int i, j;
cannam@127 612 bench_real *ri = (bench_real *) p->in;
cannam@127 613 bench_real *ro = (bench_real *) p->out;
cannam@127 614 if (!ri || !ro) return pln;
cannam@127 615 setup_gather_scatter();
cannam@127 616 for (i = 0; i < nx * ny; ++i)
cannam@127 617 ri[i] = i;
cannam@127 618 after_problem_rcopy_from(p, ri);
cannam@127 619 FFTW(execute)(pln);
cannam@127 620 after_problem_rcopy_to(p, ro);
cannam@127 621 if (my_pe == 0) {
cannam@127 622 for (i = 0; i < nx; ++i) {
cannam@127 623 for (j = 0; j < ny; ++j)
cannam@127 624 printf(" %3g", ro[j * nx + i]);
cannam@127 625 printf("\n");
cannam@127 626 }
cannam@127 627 }
cannam@127 628 }
cannam@127 629 #endif
cannam@127 630
cannam@127 631 return pln;
cannam@127 632 }
cannam@127 633
cannam@127 634 static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
cannam@127 635 {
cannam@127 636 FFTW(plan) pln = 0;
cannam@127 637 int i;
cannam@127 638 ptrdiff_t ntot;
cannam@127 639 FFTW(r2r_kind) *k;
cannam@127 640
cannam@127 641 if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
cannam@127 642 && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
cannam@127 643 return mkplan_transpose(p, flags);
cannam@127 644
cannam@127 645 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
cannam@127 646
cannam@127 647 if (p->sz->rnk < 1
cannam@127 648 || p->split
cannam@127 649 || !tensor_contiguousp(p->sz, vn)
cannam@127 650 || tensor_rowmajor_transposedp(p->sz)
cannam@127 651 || p->vecsz->rnk > 1
cannam@127 652 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
cannam@127 653 || p->vecsz->dims[0].os != 1)))
cannam@127 654 return 0;
cannam@127 655
cannam@127 656 alloc_rnk(p->sz->rnk);
cannam@127 657 for (i = 0; i < rnk; ++i) {
cannam@127 658 total_ni[i] = total_no[i] = p->sz->dims[i].n;
cannam@127 659 local_ni[i] = local_no[i] = total_ni[i];
cannam@127 660 local_starti[i] = local_starto[i] = 0;
cannam@127 661 }
cannam@127 662 if (rnk > 1) {
cannam@127 663 ptrdiff_t n, start, nT, startT;
cannam@127 664 ntot = FFTW(mpi_local_size_many_transposed)
cannam@127 665 (p->sz->rnk, total_ni, vn,
cannam@127 666 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
cannam@127 667 MPI_COMM_WORLD,
cannam@127 668 &n, &start, &nT, &startT);
cannam@127 669 if (flags & FFTW_MPI_TRANSPOSED_IN) {
cannam@127 670 local_ni[1] = nT;
cannam@127 671 local_starti[1] = startT;
cannam@127 672 }
cannam@127 673 else {
cannam@127 674 local_ni[0] = n;
cannam@127 675 local_starti[0] = start;
cannam@127 676 }
cannam@127 677 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
cannam@127 678 local_no[1] = nT;
cannam@127 679 local_starto[1] = startT;
cannam@127 680 }
cannam@127 681 else {
cannam@127 682 local_no[0] = n;
cannam@127 683 local_starto[0] = start;
cannam@127 684 }
cannam@127 685 }
cannam@127 686 else if (rnk == 1) {
cannam@127 687 ntot = FFTW(mpi_local_size_many_1d)
cannam@127 688 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
cannam@127 689 local_ni, local_starti, local_no, local_starto);
cannam@127 690 }
cannam@127 691 alloc_local(ntot, p->in == p->out);
cannam@127 692
cannam@127 693 k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
cannam@127 694 for (i = 0; i < p->sz->rnk; ++i)
cannam@127 695 switch (p->k[i]) {
cannam@127 696 case R2R_R2HC: k[i] = FFTW_R2HC; break;
cannam@127 697 case R2R_HC2R: k[i] = FFTW_HC2R; break;
cannam@127 698 case R2R_DHT: k[i] = FFTW_DHT; break;
cannam@127 699 case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
cannam@127 700 case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
cannam@127 701 case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
cannam@127 702 case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
cannam@127 703 case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
cannam@127 704 case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
cannam@127 705 case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
cannam@127 706 case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
cannam@127 707 default: BENCH_ASSERT(0);
cannam@127 708 }
cannam@127 709
cannam@127 710 pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
cannam@127 711 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 712 FFTW_MPI_DEFAULT_BLOCK,
cannam@127 713 local_in, local_out,
cannam@127 714 MPI_COMM_WORLD, k, flags);
cannam@127 715 bench_free(k);
cannam@127 716
cannam@127 717 if (rnk > 1) {
cannam@127 718 ptrdiff_t nrest = 1;
cannam@127 719 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
cannam@127 720 if (flags & FFTW_MPI_TRANSPOSED_IN)
cannam@127 721 plan_scramble_in = mkplan_transpose_local(
cannam@127 722 p->sz->dims[0].n, local_ni[1], vn * nrest,
cannam@127 723 local_in, local_in);
cannam@127 724 if (flags & FFTW_MPI_TRANSPOSED_OUT)
cannam@127 725 plan_unscramble_out = mkplan_transpose_local(
cannam@127 726 local_no[1], p->sz->dims[0].n, vn * nrest,
cannam@127 727 local_out, local_out);
cannam@127 728 }
cannam@127 729
cannam@127 730 return pln;
cannam@127 731 }
cannam@127 732
cannam@127 733 FFTW(plan) mkplan(bench_problem *p, unsigned flags)
cannam@127 734 {
cannam@127 735 FFTW(plan) pln = 0;
cannam@127 736 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
cannam@127 737 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
cannam@127 738 if (p->scrambled_in) {
cannam@127 739 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
cannam@127 740 flags |= FFTW_MPI_SCRAMBLED_IN;
cannam@127 741 else
cannam@127 742 flags |= FFTW_MPI_TRANSPOSED_IN;
cannam@127 743 }
cannam@127 744 if (p->scrambled_out) {
cannam@127 745 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
cannam@127 746 flags |= FFTW_MPI_SCRAMBLED_OUT;
cannam@127 747 else
cannam@127 748 flags |= FFTW_MPI_TRANSPOSED_OUT;
cannam@127 749 }
cannam@127 750 switch (p->kind) {
cannam@127 751 case PROBLEM_COMPLEX:
cannam@127 752 pln =mkplan_complex(p, flags);
cannam@127 753 break;
cannam@127 754 case PROBLEM_REAL:
cannam@127 755 pln = mkplan_real(p, flags);
cannam@127 756 break;
cannam@127 757 case PROBLEM_R2R:
cannam@127 758 pln = mkplan_r2r(p, flags);
cannam@127 759 break;
cannam@127 760 default: BENCH_ASSERT(0);
cannam@127 761 }
cannam@127 762 if (pln) setup_gather_scatter();
cannam@127 763 return pln;
cannam@127 764 }
cannam@127 765
cannam@127 766 void main_init(int *argc, char ***argv)
cannam@127 767 {
cannam@127 768 #ifdef HAVE_SMP
cannam@127 769 # if MPI_VERSION >= 2 /* for MPI_Init_thread */
cannam@127 770 int provided;
cannam@127 771 MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
cannam@127 772 threads_ok = provided >= MPI_THREAD_FUNNELED;
cannam@127 773 # else
cannam@127 774 MPI_Init(argc, argv);
cannam@127 775 threads_ok = 0;
cannam@127 776 # endif
cannam@127 777 #else
cannam@127 778 MPI_Init(argc, argv);
cannam@127 779 #endif
cannam@127 780 MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
cannam@127 781 MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
cannam@127 782 if (my_pe != 0) verbose = -999;
cannam@127 783 no_speed_allocation = 1; /* so we can benchmark transforms > memory */
cannam@127 784 always_pad_real = 1; /* out-of-place real transforms are padded */
cannam@127 785 isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
cannam@127 786 isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
cannam@127 787 orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
cannam@127 788 orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
cannam@127 789
cannam@127 790 /* init_threads must be called before any other FFTW function,
cannam@127 791 including mpi_init, because it has to register the threads hooks
cannam@127 792 before the planner is initalized */
cannam@127 793 #ifdef HAVE_SMP
cannam@127 794 if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
cannam@127 795 #endif
cannam@127 796 FFTW(mpi_init)();
cannam@127 797 }
cannam@127 798
cannam@127 799 void initial_cleanup(void)
cannam@127 800 {
cannam@127 801 alloc_rnk(0);
cannam@127 802 alloc_local(0, 0);
cannam@127 803 bench_free(all_local_in); all_local_in = 0;
cannam@127 804 bench_free(all_local_out); all_local_out = 0;
cannam@127 805 bench_free(isend_off); isend_off = 0;
cannam@127 806 bench_free(isend_cnt); isend_cnt = 0;
cannam@127 807 bench_free(orecv_off); orecv_off = 0;
cannam@127 808 bench_free(orecv_cnt); orecv_cnt = 0;
cannam@127 809 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
cannam@127 810 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
cannam@127 811 }
cannam@127 812
cannam@127 813 void final_cleanup(void)
cannam@127 814 {
cannam@127 815 MPI_Finalize();
cannam@127 816 }
cannam@127 817
cannam@127 818 void bench_exit(int status)
cannam@127 819 {
cannam@127 820 MPI_Abort(MPI_COMM_WORLD, status);
cannam@127 821 }
cannam@127 822
cannam@127 823 double bench_cost_postprocess(double cost)
cannam@127 824 {
cannam@127 825 double cost_max;
cannam@127 826 MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
cannam@127 827 return cost_max;
cannam@127 828 }
cannam@127 829
cannam@127 830
cannam@127 831 int import_wisdom(FILE *f)
cannam@127 832 {
cannam@127 833 int success = 1, sall;
cannam@127 834 if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
cannam@127 835 FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
cannam@127 836 MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
cannam@127 837 return sall;
cannam@127 838 }
cannam@127 839
cannam@127 840 void export_wisdom(FILE *f)
cannam@127 841 {
cannam@127 842 FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
cannam@127 843 if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
cannam@127 844 }