annotate src/fftw-3.3.8/mpi/mpi-bench.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /**************************************************************************/
Chris@82 2 /* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
Chris@82 3 It is probably NOT a good place to learn FFTW usage, since it has a
Chris@82 4 lot of added complexity in order to exercise and test the full API,
Chris@82 5 etcetera. We suggest reading the manual. */
Chris@82 6 /**************************************************************************/
Chris@82 7
Chris@82 8 #include <math.h>
Chris@82 9 #include <stdio.h>
Chris@82 10 #include <string.h>
Chris@82 11 #include "fftw3-mpi.h"
Chris@82 12 #include "tests/fftw-bench.h"
Chris@82 13
Chris@82 14 #if defined(BENCHFFT_SINGLE)
Chris@82 15 # define BENCH_MPI_TYPE MPI_FLOAT
Chris@82 16 #elif defined(BENCHFFT_LDOUBLE)
Chris@82 17 # define BENCH_MPI_TYPE MPI_LONG_DOUBLE
Chris@82 18 #elif defined(BENCHFFT_QUAD)
Chris@82 19 # error MPI quad-precision type is unknown
Chris@82 20 #else
Chris@82 21 # define BENCH_MPI_TYPE MPI_DOUBLE
Chris@82 22 #endif
Chris@82 23
Chris@82 24 #if SIZEOF_PTRDIFF_T == SIZEOF_INT
Chris@82 25 # define FFTW_MPI_PTRDIFF_T MPI_INT
Chris@82 26 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
Chris@82 27 # define FFTW_MPI_PTRDIFF_T MPI_LONG
Chris@82 28 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
Chris@82 29 # define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
Chris@82 30 #else
Chris@82 31 # error MPI type for ptrdiff_t is unknown
Chris@82 32 # define FFTW_MPI_PTRDIFF_T MPI_LONG
Chris@82 33 #endif
Chris@82 34
Chris@82 35 static const char *mkversion(void) { return FFTW(version); }
Chris@82 36 static const char *mkcc(void) { return FFTW(cc); }
Chris@82 37 static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
Chris@82 38 static const char *mknproc(void) {
Chris@82 39 static char buf[32];
Chris@82 40 int ncpus;
Chris@82 41 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
Chris@82 42 #ifdef HAVE_SNPRINTF
Chris@82 43 snprintf(buf, 32, "%d", ncpus);
Chris@82 44 #else
Chris@82 45 sprintf(buf, "%d", ncpus);
Chris@82 46 #endif
Chris@82 47 return buf;
Chris@82 48 }
Chris@82 49
Chris@82 50 BEGIN_BENCH_DOC
Chris@82 51 BENCH_DOC("name", "fftw3_mpi")
Chris@82 52 BENCH_DOCF("version", mkversion)
Chris@82 53 BENCH_DOCF("cc", mkcc)
Chris@82 54 BENCH_DOCF("codelet-optim", mkcodelet_optim)
Chris@82 55 BENCH_DOCF("nproc", mknproc)
Chris@82 56 END_BENCH_DOC
Chris@82 57
Chris@82 58 static int n_pes = 1, my_pe = 0;
Chris@82 59
Chris@82 60 /* global variables describing the shape of the data and its distribution */
Chris@82 61 static int rnk;
Chris@82 62 static ptrdiff_t vn, iNtot, oNtot;
Chris@82 63 static ptrdiff_t *local_ni=0, *local_starti=0;
Chris@82 64 static ptrdiff_t *local_no=0, *local_starto=0;
Chris@82 65 static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
Chris@82 66 static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
Chris@82 67 static ptrdiff_t *istrides = 0, *ostrides = 0;
Chris@82 68 static ptrdiff_t *total_ni=0, *total_no=0;
Chris@82 69 static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
Chris@82 70 static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
Chris@82 71
Chris@82 72 static bench_real *local_in = 0, *local_out = 0;
Chris@82 73 static bench_real *all_local_in = 0, *all_local_out = 0;
Chris@82 74 static int all_local_in_alloc = 0, all_local_out_alloc = 0;
Chris@82 75 static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
Chris@82 76
Chris@82 77 static void alloc_rnk(int rnk_) {
Chris@82 78 rnk = rnk_;
Chris@82 79 bench_free(local_ni);
Chris@82 80 if (rnk == 0)
Chris@82 81 local_ni = 0;
Chris@82 82 else
Chris@82 83 local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
Chris@82 84 * (8 + n_pes * 4));
Chris@82 85
Chris@82 86 local_starti = local_ni + rnk;
Chris@82 87 local_no = local_ni + 2 * rnk;
Chris@82 88 local_starto = local_ni + 3 * rnk;
Chris@82 89 istrides = local_ni + 4 * rnk;
Chris@82 90 ostrides = local_ni + 5 * rnk;
Chris@82 91 total_ni = local_ni + 6 * rnk;
Chris@82 92 total_no = local_ni + 7 * rnk;
Chris@82 93 all_local_ni = local_ni + 8 * rnk;
Chris@82 94 all_local_starti = local_ni + (8 + n_pes) * rnk;
Chris@82 95 all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
Chris@82 96 all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
Chris@82 97 }
Chris@82 98
Chris@82 99 static void setup_gather_scatter(void)
Chris@82 100 {
Chris@82 101 int i, j;
Chris@82 102 ptrdiff_t off;
Chris@82 103
Chris@82 104 MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 105 all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 106 0, MPI_COMM_WORLD);
Chris@82 107 MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
Chris@82 108 MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 109 all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 110 0, MPI_COMM_WORLD);
Chris@82 111 MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
Chris@82 112
Chris@82 113 MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 114 all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 115 0, MPI_COMM_WORLD);
Chris@82 116 MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
Chris@82 117 MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 118 all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
Chris@82 119 0, MPI_COMM_WORLD);
Chris@82 120 MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
Chris@82 121
Chris@82 122 off = 0;
Chris@82 123 for (i = 0; i < n_pes; ++i) {
Chris@82 124 ptrdiff_t N = vn;
Chris@82 125 for (j = 0; j < rnk; ++j)
Chris@82 126 N *= all_local_ni[i * rnk + j];
Chris@82 127 isend_cnt[i] = N;
Chris@82 128 isend_off[i] = off;
Chris@82 129 off += N;
Chris@82 130 }
Chris@82 131 iNtot = off;
Chris@82 132 all_local_in_alloc = 1;
Chris@82 133
Chris@82 134 istrides[rnk - 1] = vn;
Chris@82 135 for (j = rnk - 2; j >= 0; --j)
Chris@82 136 istrides[j] = total_ni[j + 1] * istrides[j + 1];
Chris@82 137
Chris@82 138 off = 0;
Chris@82 139 for (i = 0; i < n_pes; ++i) {
Chris@82 140 ptrdiff_t N = vn;
Chris@82 141 for (j = 0; j < rnk; ++j)
Chris@82 142 N *= all_local_no[i * rnk + j];
Chris@82 143 orecv_cnt[i] = N;
Chris@82 144 orecv_off[i] = off;
Chris@82 145 off += N;
Chris@82 146 }
Chris@82 147 oNtot = off;
Chris@82 148 all_local_out_alloc = 1;
Chris@82 149
Chris@82 150 ostrides[rnk - 1] = vn;
Chris@82 151 for (j = rnk - 2; j >= 0; --j)
Chris@82 152 ostrides[j] = total_no[j + 1] * ostrides[j + 1];
Chris@82 153 }
Chris@82 154
Chris@82 155 static void copy_block_out(const bench_real *in,
Chris@82 156 int rnk, ptrdiff_t *n, ptrdiff_t *start,
Chris@82 157 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
Chris@82 158 bench_real *out)
Chris@82 159 {
Chris@82 160 ptrdiff_t i;
Chris@82 161 if (rnk == 0) {
Chris@82 162 for (i = 0; i < vn; ++i)
Chris@82 163 out[i] = in[i];
Chris@82 164 }
Chris@82 165 else if (rnk == 1) { /* this case is just an optimization */
Chris@82 166 ptrdiff_t j;
Chris@82 167 out += start[0] * os[0];
Chris@82 168 for (j = 0; j < n[0]; ++j) {
Chris@82 169 for (i = 0; i < vn; ++i)
Chris@82 170 out[i] = in[i];
Chris@82 171 in += is;
Chris@82 172 out += os[0];
Chris@82 173 }
Chris@82 174 }
Chris@82 175 else {
Chris@82 176 /* we should do n[0] for locality, but this way is simpler to code */
Chris@82 177 for (i = 0; i < n[rnk - 1]; ++i)
Chris@82 178 copy_block_out(in + i * is,
Chris@82 179 rnk - 1, n, start, is * n[rnk - 1], os, vn,
Chris@82 180 out + (start[rnk - 1] + i) * os[rnk - 1]);
Chris@82 181 }
Chris@82 182 }
Chris@82 183
Chris@82 184 static void copy_block_in(bench_real *in,
Chris@82 185 int rnk, ptrdiff_t *n, ptrdiff_t *start,
Chris@82 186 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
Chris@82 187 const bench_real *out)
Chris@82 188 {
Chris@82 189 ptrdiff_t i;
Chris@82 190 if (rnk == 0) {
Chris@82 191 for (i = 0; i < vn; ++i)
Chris@82 192 in[i] = out[i];
Chris@82 193 }
Chris@82 194 else if (rnk == 1) { /* this case is just an optimization */
Chris@82 195 ptrdiff_t j;
Chris@82 196 out += start[0] * os[0];
Chris@82 197 for (j = 0; j < n[0]; ++j) {
Chris@82 198 for (i = 0; i < vn; ++i)
Chris@82 199 in[i] = out[i];
Chris@82 200 in += is;
Chris@82 201 out += os[0];
Chris@82 202 }
Chris@82 203 }
Chris@82 204 else {
Chris@82 205 /* we should do n[0] for locality, but this way is simpler to code */
Chris@82 206 for (i = 0; i < n[rnk - 1]; ++i)
Chris@82 207 copy_block_in(in + i * is,
Chris@82 208 rnk - 1, n, start, is * n[rnk - 1], os, vn,
Chris@82 209 out + (start[rnk - 1] + i) * os[rnk - 1]);
Chris@82 210 }
Chris@82 211 }
Chris@82 212
Chris@82 213 static void do_scatter_in(bench_real *in)
Chris@82 214 {
Chris@82 215 bench_real *ali;
Chris@82 216 int i;
Chris@82 217 if (all_local_in_alloc) {
Chris@82 218 bench_free(all_local_in);
Chris@82 219 all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
Chris@82 220 all_local_in_alloc = 0;
Chris@82 221 }
Chris@82 222 ali = all_local_in;
Chris@82 223 for (i = 0; i < n_pes; ++i) {
Chris@82 224 copy_block_in(ali,
Chris@82 225 rnk, all_local_ni + i * rnk,
Chris@82 226 all_local_starti + i * rnk,
Chris@82 227 vn, istrides, vn,
Chris@82 228 in);
Chris@82 229 ali += isend_cnt[i];
Chris@82 230 }
Chris@82 231 MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
Chris@82 232 local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
Chris@82 233 0, MPI_COMM_WORLD);
Chris@82 234 }
Chris@82 235
Chris@82 236 static void do_gather_out(bench_real *out)
Chris@82 237 {
Chris@82 238 bench_real *alo;
Chris@82 239 int i;
Chris@82 240
Chris@82 241 if (all_local_out_alloc) {
Chris@82 242 bench_free(all_local_out);
Chris@82 243 all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
Chris@82 244 all_local_out_alloc = 0;
Chris@82 245 }
Chris@82 246 MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
Chris@82 247 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
Chris@82 248 0, MPI_COMM_WORLD);
Chris@82 249 MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
Chris@82 250 alo = all_local_out;
Chris@82 251 for (i = 0; i < n_pes; ++i) {
Chris@82 252 copy_block_out(alo,
Chris@82 253 rnk, all_local_no + i * rnk,
Chris@82 254 all_local_starto + i * rnk,
Chris@82 255 vn, ostrides, vn,
Chris@82 256 out);
Chris@82 257 alo += orecv_cnt[i];
Chris@82 258 }
Chris@82 259 }
Chris@82 260
Chris@82 261 static void alloc_local(ptrdiff_t nreal, int inplace)
Chris@82 262 {
Chris@82 263 bench_free(local_in);
Chris@82 264 if (local_out != local_in) bench_free(local_out);
Chris@82 265 local_in = local_out = 0;
Chris@82 266 if (nreal > 0) {
Chris@82 267 ptrdiff_t i;
Chris@82 268 local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
Chris@82 269 if (inplace)
Chris@82 270 local_out = local_in;
Chris@82 271 else
Chris@82 272 local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
Chris@82 273 for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
Chris@82 274 }
Chris@82 275 }
Chris@82 276
Chris@82 277 void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
Chris@82 278 {
Chris@82 279 UNUSED(p);
Chris@82 280 do_scatter_in(ri);
Chris@82 281 if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
Chris@82 282 }
Chris@82 283
Chris@82 284 void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
Chris@82 285 {
Chris@82 286 UNUSED(p);
Chris@82 287 if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
Chris@82 288 do_gather_out(ro);
Chris@82 289 }
Chris@82 290
Chris@82 291 void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
Chris@82 292 {
Chris@82 293 UNUSED(ii);
Chris@82 294 after_problem_rcopy_from(p, ri);
Chris@82 295 }
Chris@82 296
Chris@82 297 void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
Chris@82 298 {
Chris@82 299 UNUSED(io);
Chris@82 300 after_problem_rcopy_to(p, ro);
Chris@82 301 }
Chris@82 302
Chris@82 303 void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
Chris@82 304 {
Chris@82 305 UNUSED(ii);
Chris@82 306 after_problem_rcopy_from(p, ri);
Chris@82 307 }
Chris@82 308
Chris@82 309 void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
Chris@82 310 {
Chris@82 311 UNUSED(io);
Chris@82 312 after_problem_rcopy_to(p, ro);
Chris@82 313 }
Chris@82 314
Chris@82 315 static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
Chris@82 316 ptrdiff_t vn,
Chris@82 317 bench_real *in, bench_real *out)
Chris@82 318 {
Chris@82 319 FFTW(iodim64) hdims[3];
Chris@82 320 FFTW(r2r_kind) k[3];
Chris@82 321 FFTW(plan) pln;
Chris@82 322
Chris@82 323 hdims[0].n = nx;
Chris@82 324 hdims[0].is = ny * vn;
Chris@82 325 hdims[0].os = vn;
Chris@82 326 hdims[1].n = ny;
Chris@82 327 hdims[1].is = vn;
Chris@82 328 hdims[1].os = nx * vn;
Chris@82 329 hdims[2].n = vn;
Chris@82 330 hdims[2].is = 1;
Chris@82 331 hdims[2].os = 1;
Chris@82 332 k[0] = k[1] = k[2] = FFTW_R2HC;
Chris@82 333 pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
Chris@82 334 BENCH_ASSERT(pln != 0);
Chris@82 335 return pln;
Chris@82 336 }
Chris@82 337
Chris@82 338 static int tensor_rowmajor_transposedp(bench_tensor *t)
Chris@82 339 {
Chris@82 340 bench_iodim *d;
Chris@82 341 int i;
Chris@82 342
Chris@82 343 BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
Chris@82 344 if (t->rnk < 2)
Chris@82 345 return 0;
Chris@82 346
Chris@82 347 d = t->dims;
Chris@82 348 if (d[0].is != d[1].is * d[1].n
Chris@82 349 || d[0].os != d[1].is
Chris@82 350 || d[1].os != d[0].os * d[0].n)
Chris@82 351 return 0;
Chris@82 352 if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
Chris@82 353 return 0;
Chris@82 354 for (i = 2; i + 1 < t->rnk; ++i) {
Chris@82 355 d = t->dims + i;
Chris@82 356 if (d[0].is != d[1].is * d[1].n
Chris@82 357 || d[0].os != d[1].os * d[1].n)
Chris@82 358 return 0;
Chris@82 359 }
Chris@82 360
Chris@82 361 if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
Chris@82 362 return 0;
Chris@82 363 return 1;
Chris@82 364 }
Chris@82 365
Chris@82 366 static int tensor_contiguousp(bench_tensor *t, int s)
Chris@82 367 {
Chris@82 368 return (t->dims[t->rnk-1].is == s
Chris@82 369 && ((tensor_rowmajorp(t) &&
Chris@82 370 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
Chris@82 371 || tensor_rowmajor_transposedp(t)));
Chris@82 372 }
Chris@82 373
Chris@82 374 static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
Chris@82 375 {
Chris@82 376 FFTW(plan) pln = 0;
Chris@82 377 int i;
Chris@82 378 ptrdiff_t ntot;
Chris@82 379
Chris@82 380 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
Chris@82 381
Chris@82 382 if (p->sz->rnk < 1
Chris@82 383 || p->split
Chris@82 384 || !tensor_contiguousp(p->sz, vn)
Chris@82 385 || tensor_rowmajor_transposedp(p->sz)
Chris@82 386 || p->vecsz->rnk > 1
Chris@82 387 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
Chris@82 388 || p->vecsz->dims[0].os != 1)))
Chris@82 389 return 0;
Chris@82 390
Chris@82 391 alloc_rnk(p->sz->rnk);
Chris@82 392 for (i = 0; i < rnk; ++i) {
Chris@82 393 total_ni[i] = total_no[i] = p->sz->dims[i].n;
Chris@82 394 local_ni[i] = local_no[i] = total_ni[i];
Chris@82 395 local_starti[i] = local_starto[i] = 0;
Chris@82 396 }
Chris@82 397 if (rnk > 1) {
Chris@82 398 ptrdiff_t n, start, nT, startT;
Chris@82 399 ntot = FFTW(mpi_local_size_many_transposed)
Chris@82 400 (p->sz->rnk, total_ni, vn,
Chris@82 401 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
Chris@82 402 MPI_COMM_WORLD,
Chris@82 403 &n, &start, &nT, &startT);
Chris@82 404 if (flags & FFTW_MPI_TRANSPOSED_IN) {
Chris@82 405 local_ni[1] = nT;
Chris@82 406 local_starti[1] = startT;
Chris@82 407 }
Chris@82 408 else {
Chris@82 409 local_ni[0] = n;
Chris@82 410 local_starti[0] = start;
Chris@82 411 }
Chris@82 412 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
Chris@82 413 local_no[1] = nT;
Chris@82 414 local_starto[1] = startT;
Chris@82 415 }
Chris@82 416 else {
Chris@82 417 local_no[0] = n;
Chris@82 418 local_starto[0] = start;
Chris@82 419 }
Chris@82 420 }
Chris@82 421 else if (rnk == 1) {
Chris@82 422 ntot = FFTW(mpi_local_size_many_1d)
Chris@82 423 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
Chris@82 424 local_ni, local_starti, local_no, local_starto);
Chris@82 425 }
Chris@82 426 alloc_local(ntot * 2, p->in == p->out);
Chris@82 427
Chris@82 428 pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
Chris@82 429 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 430 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 431 (FFTW(complex) *) local_in,
Chris@82 432 (FFTW(complex) *) local_out,
Chris@82 433 MPI_COMM_WORLD, p->sign, flags);
Chris@82 434
Chris@82 435 vn *= 2;
Chris@82 436
Chris@82 437 if (rnk > 1) {
Chris@82 438 ptrdiff_t nrest = 1;
Chris@82 439 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
Chris@82 440 if (flags & FFTW_MPI_TRANSPOSED_IN)
Chris@82 441 plan_scramble_in = mkplan_transpose_local(
Chris@82 442 p->sz->dims[0].n, local_ni[1], vn * nrest,
Chris@82 443 local_in, local_in);
Chris@82 444 if (flags & FFTW_MPI_TRANSPOSED_OUT)
Chris@82 445 plan_unscramble_out = mkplan_transpose_local(
Chris@82 446 local_no[1], p->sz->dims[0].n, vn * nrest,
Chris@82 447 local_out, local_out);
Chris@82 448 }
Chris@82 449
Chris@82 450 return pln;
Chris@82 451 }
Chris@82 452
Chris@82 453 static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
Chris@82 454 {
Chris@82 455 return (t->dims[t->rnk-1].is == s
Chris@82 456 && ((tensor_real_rowmajorp(t, sign, 1) &&
Chris@82 457 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
Chris@82 458 }
Chris@82 459
Chris@82 460 static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
Chris@82 461 {
Chris@82 462 FFTW(plan) pln = 0;
Chris@82 463 int i;
Chris@82 464 ptrdiff_t ntot;
Chris@82 465
Chris@82 466 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
Chris@82 467
Chris@82 468 if (p->sz->rnk < 2
Chris@82 469 || p->split
Chris@82 470 || !tensor_real_contiguousp(p->sz, p->sign, vn)
Chris@82 471 || tensor_rowmajor_transposedp(p->sz)
Chris@82 472 || p->vecsz->rnk > 1
Chris@82 473 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
Chris@82 474 || p->vecsz->dims[0].os != 1)))
Chris@82 475 return 0;
Chris@82 476
Chris@82 477 alloc_rnk(p->sz->rnk);
Chris@82 478 for (i = 0; i < rnk; ++i) {
Chris@82 479 total_ni[i] = total_no[i] = p->sz->dims[i].n;
Chris@82 480 local_ni[i] = local_no[i] = total_ni[i];
Chris@82 481 local_starti[i] = local_starto[i] = 0;
Chris@82 482 }
Chris@82 483 local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
Chris@82 484 = p->sz->dims[rnk-1].n / 2 + 1;
Chris@82 485 {
Chris@82 486 ptrdiff_t n, start, nT, startT;
Chris@82 487 ntot = FFTW(mpi_local_size_many_transposed)
Chris@82 488 (p->sz->rnk, total_ni, vn,
Chris@82 489 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
Chris@82 490 MPI_COMM_WORLD,
Chris@82 491 &n, &start, &nT, &startT);
Chris@82 492 if (flags & FFTW_MPI_TRANSPOSED_IN) {
Chris@82 493 local_ni[1] = nT;
Chris@82 494 local_starti[1] = startT;
Chris@82 495 }
Chris@82 496 else {
Chris@82 497 local_ni[0] = n;
Chris@82 498 local_starti[0] = start;
Chris@82 499 }
Chris@82 500 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
Chris@82 501 local_no[1] = nT;
Chris@82 502 local_starto[1] = startT;
Chris@82 503 }
Chris@82 504 else {
Chris@82 505 local_no[0] = n;
Chris@82 506 local_starto[0] = start;
Chris@82 507 }
Chris@82 508 }
Chris@82 509 alloc_local(ntot * 2, p->in == p->out);
Chris@82 510
Chris@82 511 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
Chris@82 512 if (p->sign < 0)
Chris@82 513 pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
Chris@82 514 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 515 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 516 local_in,
Chris@82 517 (FFTW(complex) *) local_out,
Chris@82 518 MPI_COMM_WORLD, flags);
Chris@82 519 else
Chris@82 520 pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
Chris@82 521 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 522 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 523 (FFTW(complex) *) local_in,
Chris@82 524 local_out,
Chris@82 525 MPI_COMM_WORLD, flags);
Chris@82 526
Chris@82 527 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
Chris@82 528 vn *= 2;
Chris@82 529
Chris@82 530 {
Chris@82 531 ptrdiff_t nrest = 1;
Chris@82 532 for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
Chris@82 533 if (flags & FFTW_MPI_TRANSPOSED_IN)
Chris@82 534 plan_scramble_in = mkplan_transpose_local(
Chris@82 535 total_ni[0], local_ni[1], vn * nrest,
Chris@82 536 local_in, local_in);
Chris@82 537 if (flags & FFTW_MPI_TRANSPOSED_OUT)
Chris@82 538 plan_unscramble_out = mkplan_transpose_local(
Chris@82 539 local_no[1], total_ni[0], vn * nrest,
Chris@82 540 local_out, local_out);
Chris@82 541 }
Chris@82 542
Chris@82 543 return pln;
Chris@82 544 }
Chris@82 545
Chris@82 546 static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
Chris@82 547 {
Chris@82 548 ptrdiff_t ntot, nx, ny;
Chris@82 549 int ix=0, iy=1, i;
Chris@82 550 const bench_iodim *d = p->vecsz->dims;
Chris@82 551 FFTW(plan) pln;
Chris@82 552
Chris@82 553 if (p->vecsz->rnk == 3) {
Chris@82 554 for (i = 0; i < 3; ++i)
Chris@82 555 if (d[i].is == 1 && d[i].os == 1) {
Chris@82 556 vn = d[i].n;
Chris@82 557 ix = (i + 1) % 3;
Chris@82 558 iy = (i + 2) % 3;
Chris@82 559 break;
Chris@82 560 }
Chris@82 561 if (i == 3) return 0;
Chris@82 562 }
Chris@82 563 else {
Chris@82 564 vn = 1;
Chris@82 565 ix = 0;
Chris@82 566 iy = 1;
Chris@82 567 }
Chris@82 568
Chris@82 569 if (d[ix].is == d[iy].n * vn && d[ix].os == vn
Chris@82 570 && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
Chris@82 571 nx = d[ix].n;
Chris@82 572 ny = d[iy].n;
Chris@82 573 }
Chris@82 574 else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
Chris@82 575 && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
Chris@82 576 nx = d[iy].n;
Chris@82 577 ny = d[ix].n;
Chris@82 578 }
Chris@82 579 else
Chris@82 580 return 0;
Chris@82 581
Chris@82 582 alloc_rnk(2);
Chris@82 583 ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
Chris@82 584 &local_ni[0],
Chris@82 585 &local_starti[0],
Chris@82 586 &local_no[0],
Chris@82 587 &local_starto[0]);
Chris@82 588 local_ni[1] = ny;
Chris@82 589 local_starti[1] = 0;
Chris@82 590 local_no[1] = nx;
Chris@82 591 local_starto[1] = 0;
Chris@82 592 total_ni[0] = nx; total_ni[1] = ny;
Chris@82 593 total_no[1] = nx; total_no[0] = ny;
Chris@82 594 alloc_local(ntot, p->in == p->out);
Chris@82 595
Chris@82 596 pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
Chris@82 597 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 598 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 599 local_in, local_out,
Chris@82 600 MPI_COMM_WORLD, flags);
Chris@82 601
Chris@82 602 if (flags & FFTW_MPI_TRANSPOSED_IN)
Chris@82 603 plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
Chris@82 604 local_in, local_in);
Chris@82 605 if (flags & FFTW_MPI_TRANSPOSED_OUT)
Chris@82 606 plan_unscramble_out = mkplan_transpose_local
Chris@82 607 (nx, local_no[0], vn, local_out, local_out);
Chris@82 608
Chris@82 609 #if 0
Chris@82 610 if (pln && vn == 1) {
Chris@82 611 int i, j;
Chris@82 612 bench_real *ri = (bench_real *) p->in;
Chris@82 613 bench_real *ro = (bench_real *) p->out;
Chris@82 614 if (!ri || !ro) return pln;
Chris@82 615 setup_gather_scatter();
Chris@82 616 for (i = 0; i < nx * ny; ++i)
Chris@82 617 ri[i] = i;
Chris@82 618 after_problem_rcopy_from(p, ri);
Chris@82 619 FFTW(execute)(pln);
Chris@82 620 after_problem_rcopy_to(p, ro);
Chris@82 621 if (my_pe == 0) {
Chris@82 622 for (i = 0; i < nx; ++i) {
Chris@82 623 for (j = 0; j < ny; ++j)
Chris@82 624 printf(" %3g", ro[j * nx + i]);
Chris@82 625 printf("\n");
Chris@82 626 }
Chris@82 627 }
Chris@82 628 }
Chris@82 629 #endif
Chris@82 630
Chris@82 631 return pln;
Chris@82 632 }
Chris@82 633
Chris@82 634 static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
Chris@82 635 {
Chris@82 636 FFTW(plan) pln = 0;
Chris@82 637 int i;
Chris@82 638 ptrdiff_t ntot;
Chris@82 639 FFTW(r2r_kind) *k;
Chris@82 640
Chris@82 641 if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
Chris@82 642 && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
Chris@82 643 return mkplan_transpose(p, flags);
Chris@82 644
Chris@82 645 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
Chris@82 646
Chris@82 647 if (p->sz->rnk < 1
Chris@82 648 || p->split
Chris@82 649 || !tensor_contiguousp(p->sz, vn)
Chris@82 650 || tensor_rowmajor_transposedp(p->sz)
Chris@82 651 || p->vecsz->rnk > 1
Chris@82 652 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
Chris@82 653 || p->vecsz->dims[0].os != 1)))
Chris@82 654 return 0;
Chris@82 655
Chris@82 656 alloc_rnk(p->sz->rnk);
Chris@82 657 for (i = 0; i < rnk; ++i) {
Chris@82 658 total_ni[i] = total_no[i] = p->sz->dims[i].n;
Chris@82 659 local_ni[i] = local_no[i] = total_ni[i];
Chris@82 660 local_starti[i] = local_starto[i] = 0;
Chris@82 661 }
Chris@82 662 if (rnk > 1) {
Chris@82 663 ptrdiff_t n, start, nT, startT;
Chris@82 664 ntot = FFTW(mpi_local_size_many_transposed)
Chris@82 665 (p->sz->rnk, total_ni, vn,
Chris@82 666 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
Chris@82 667 MPI_COMM_WORLD,
Chris@82 668 &n, &start, &nT, &startT);
Chris@82 669 if (flags & FFTW_MPI_TRANSPOSED_IN) {
Chris@82 670 local_ni[1] = nT;
Chris@82 671 local_starti[1] = startT;
Chris@82 672 }
Chris@82 673 else {
Chris@82 674 local_ni[0] = n;
Chris@82 675 local_starti[0] = start;
Chris@82 676 }
Chris@82 677 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
Chris@82 678 local_no[1] = nT;
Chris@82 679 local_starto[1] = startT;
Chris@82 680 }
Chris@82 681 else {
Chris@82 682 local_no[0] = n;
Chris@82 683 local_starto[0] = start;
Chris@82 684 }
Chris@82 685 }
Chris@82 686 else if (rnk == 1) {
Chris@82 687 ntot = FFTW(mpi_local_size_many_1d)
Chris@82 688 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
Chris@82 689 local_ni, local_starti, local_no, local_starto);
Chris@82 690 }
Chris@82 691 alloc_local(ntot, p->in == p->out);
Chris@82 692
Chris@82 693 k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
Chris@82 694 for (i = 0; i < p->sz->rnk; ++i)
Chris@82 695 switch (p->k[i]) {
Chris@82 696 case R2R_R2HC: k[i] = FFTW_R2HC; break;
Chris@82 697 case R2R_HC2R: k[i] = FFTW_HC2R; break;
Chris@82 698 case R2R_DHT: k[i] = FFTW_DHT; break;
Chris@82 699 case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
Chris@82 700 case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
Chris@82 701 case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
Chris@82 702 case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
Chris@82 703 case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
Chris@82 704 case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
Chris@82 705 case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
Chris@82 706 case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
Chris@82 707 default: BENCH_ASSERT(0);
Chris@82 708 }
Chris@82 709
Chris@82 710 pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
Chris@82 711 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 712 FFTW_MPI_DEFAULT_BLOCK,
Chris@82 713 local_in, local_out,
Chris@82 714 MPI_COMM_WORLD, k, flags);
Chris@82 715 bench_free(k);
Chris@82 716
Chris@82 717 if (rnk > 1) {
Chris@82 718 ptrdiff_t nrest = 1;
Chris@82 719 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
Chris@82 720 if (flags & FFTW_MPI_TRANSPOSED_IN)
Chris@82 721 plan_scramble_in = mkplan_transpose_local(
Chris@82 722 p->sz->dims[0].n, local_ni[1], vn * nrest,
Chris@82 723 local_in, local_in);
Chris@82 724 if (flags & FFTW_MPI_TRANSPOSED_OUT)
Chris@82 725 plan_unscramble_out = mkplan_transpose_local(
Chris@82 726 local_no[1], p->sz->dims[0].n, vn * nrest,
Chris@82 727 local_out, local_out);
Chris@82 728 }
Chris@82 729
Chris@82 730 return pln;
Chris@82 731 }
Chris@82 732
Chris@82 733 FFTW(plan) mkplan(bench_problem *p, unsigned flags)
Chris@82 734 {
Chris@82 735 FFTW(plan) pln = 0;
Chris@82 736 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
Chris@82 737 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
Chris@82 738 if (p->scrambled_in) {
Chris@82 739 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
Chris@82 740 flags |= FFTW_MPI_SCRAMBLED_IN;
Chris@82 741 else
Chris@82 742 flags |= FFTW_MPI_TRANSPOSED_IN;
Chris@82 743 }
Chris@82 744 if (p->scrambled_out) {
Chris@82 745 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
Chris@82 746 flags |= FFTW_MPI_SCRAMBLED_OUT;
Chris@82 747 else
Chris@82 748 flags |= FFTW_MPI_TRANSPOSED_OUT;
Chris@82 749 }
Chris@82 750 switch (p->kind) {
Chris@82 751 case PROBLEM_COMPLEX:
Chris@82 752 pln =mkplan_complex(p, flags);
Chris@82 753 break;
Chris@82 754 case PROBLEM_REAL:
Chris@82 755 pln = mkplan_real(p, flags);
Chris@82 756 break;
Chris@82 757 case PROBLEM_R2R:
Chris@82 758 pln = mkplan_r2r(p, flags);
Chris@82 759 break;
Chris@82 760 default: BENCH_ASSERT(0);
Chris@82 761 }
Chris@82 762 if (pln) setup_gather_scatter();
Chris@82 763 return pln;
Chris@82 764 }
Chris@82 765
Chris@82 766 void main_init(int *argc, char ***argv)
Chris@82 767 {
Chris@82 768 #ifdef HAVE_SMP
Chris@82 769 # if MPI_VERSION >= 2 /* for MPI_Init_thread */
Chris@82 770 int provided;
Chris@82 771 MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
Chris@82 772 threads_ok = provided >= MPI_THREAD_FUNNELED;
Chris@82 773 # else
Chris@82 774 MPI_Init(argc, argv);
Chris@82 775 threads_ok = 0;
Chris@82 776 # endif
Chris@82 777 #else
Chris@82 778 MPI_Init(argc, argv);
Chris@82 779 #endif
Chris@82 780 MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
Chris@82 781 MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
Chris@82 782 if (my_pe != 0) verbose = -999;
Chris@82 783 no_speed_allocation = 1; /* so we can benchmark transforms > memory */
Chris@82 784 always_pad_real = 1; /* out-of-place real transforms are padded */
Chris@82 785 isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
Chris@82 786 isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
Chris@82 787 orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
Chris@82 788 orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
Chris@82 789
Chris@82 790 /* init_threads must be called before any other FFTW function,
Chris@82 791 including mpi_init, because it has to register the threads hooks
Chris@82 792 before the planner is initalized */
Chris@82 793 #ifdef HAVE_SMP
Chris@82 794 if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
Chris@82 795 #endif
Chris@82 796 FFTW(mpi_init)();
Chris@82 797 }
Chris@82 798
Chris@82 799 void initial_cleanup(void)
Chris@82 800 {
Chris@82 801 alloc_rnk(0);
Chris@82 802 alloc_local(0, 0);
Chris@82 803 bench_free(all_local_in); all_local_in = 0;
Chris@82 804 bench_free(all_local_out); all_local_out = 0;
Chris@82 805 bench_free(isend_off); isend_off = 0;
Chris@82 806 bench_free(isend_cnt); isend_cnt = 0;
Chris@82 807 bench_free(orecv_off); orecv_off = 0;
Chris@82 808 bench_free(orecv_cnt); orecv_cnt = 0;
Chris@82 809 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
Chris@82 810 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
Chris@82 811 }
Chris@82 812
Chris@82 813 void final_cleanup(void)
Chris@82 814 {
Chris@82 815 MPI_Finalize();
Chris@82 816 }
Chris@82 817
Chris@82 818 void bench_exit(int status)
Chris@82 819 {
Chris@82 820 MPI_Abort(MPI_COMM_WORLD, status);
Chris@82 821 }
Chris@82 822
Chris@82 823 double bench_cost_postprocess(double cost)
Chris@82 824 {
Chris@82 825 double cost_max;
Chris@82 826 MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
Chris@82 827 return cost_max;
Chris@82 828 }
Chris@82 829
Chris@82 830
Chris@82 831 int import_wisdom(FILE *f)
Chris@82 832 {
Chris@82 833 int success = 1, sall;
Chris@82 834 if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
Chris@82 835 FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
Chris@82 836 MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
Chris@82 837 return sall;
Chris@82 838 }
Chris@82 839
Chris@82 840 void export_wisdom(FILE *f)
Chris@82 841 {
Chris@82 842 FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
Chris@82 843 if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
Chris@82 844 }