src/fftw-3.3.3/mpi/mpi-bench.c @ 10:37bf6b4a2645

Add FFTW3

author: Chris Cannam
date:   Wed, 20 Mar 2013 15:35:50 +0000
/**************************************************************************/
/* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
   It is probably NOT a good place to learn FFTW usage, since it has a
   lot of added complexity in order to exercise and test the full API,
   etcetera.  We suggest reading the manual. */
/**************************************************************************/
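
/* For reference only: a minimal sketch of typical FFTW-MPI usage, not taken
   from this benchmark.  It assumes double precision and an in-place 2-d
   complex DFT of some hypothetical size N0 x N1; see the manual for the
   authoritative version:

       fftw_mpi_init();
       ptrdiff_t local_n0, local_0_start;
       ptrdiff_t alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
                                                      &local_n0, &local_0_start);
       fftw_complex *data = fftw_alloc_complex(alloc_local);
       fftw_plan plan = fftw_mpi_plan_dft_2d(N0, N1, data, data,
                                             MPI_COMM_WORLD,
                                             FFTW_FORWARD, FFTW_ESTIMATE);
       (fill rows local_0_start .. local_0_start + local_n0 - 1 of data)
       fftw_execute(plan);
       fftw_destroy_plan(plan);
       fftw_free(data);
       fftw_mpi_cleanup();
*/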

#include <math.h>
#include <stdio.h>
#include <string.h>
#include "fftw3-mpi.h"
#include "fftw-bench.h"

#if defined(BENCHFFT_SINGLE)
# define BENCH_MPI_TYPE MPI_FLOAT
#elif defined(BENCHFFT_LDOUBLE)
# define BENCH_MPI_TYPE MPI_LONG_DOUBLE
#elif defined(BENCHFFT_QUAD)
# error MPI quad-precision type is unknown
#else
# define BENCH_MPI_TYPE MPI_DOUBLE
#endif

#if SIZEOF_PTRDIFF_T == SIZEOF_INT
# define FFTW_MPI_PTRDIFF_T MPI_INT
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
# define FFTW_MPI_PTRDIFF_T MPI_LONG
#elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
# define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
#else
# error MPI type for ptrdiff_t is unknown
# define FFTW_MPI_PTRDIFF_T MPI_LONG
#endif

static const char *mkversion(void) { return FFTW(version); }
static const char *mkcc(void) { return FFTW(cc); }
static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
static const char *mknproc(void) {
     static char buf[32];
     int ncpus;
     MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
#ifdef HAVE_SNPRINTF
     snprintf(buf, 32, "%d", ncpus);
#else
     sprintf(buf, "%d", ncpus);
#endif
     return buf;
}

BEGIN_BENCH_DOC
BENCH_DOC("name", "fftw3_mpi")
BENCH_DOCF("version", mkversion)
BENCH_DOCF("cc", mkcc)
BENCH_DOCF("codelet-optim", mkcodelet_optim)
BENCH_DOCF("nproc", mknproc)
END_BENCH_DOC

static int n_pes = 1, my_pe = 0;

/* global variables describing the shape of the data and its distribution */
static int rnk;
static ptrdiff_t vn, iNtot, oNtot;
static ptrdiff_t *local_ni = 0, *local_starti = 0;
static ptrdiff_t *local_no = 0, *local_starto = 0;
static ptrdiff_t *all_local_ni = 0, *all_local_starti = 0; /* n_pes x rnk arrays */
static ptrdiff_t *all_local_no = 0, *all_local_starto = 0; /* n_pes x rnk arrays */
static ptrdiff_t *istrides = 0, *ostrides = 0;
static ptrdiff_t *total_ni = 0, *total_no = 0;
static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */

static bench_real *local_in = 0, *local_out = 0;
static bench_real *all_local_in = 0, *all_local_out = 0;
static int all_local_in_alloc = 0, all_local_out_alloc = 0;
static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;

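/* Set the global rank and (re)allocate a single ptrdiff_t block that the
   pointers above alias: per-dimension local/total sizes, starts, and strides,
   plus the n_pes x rnk tables of every process's block sizes and offsets. */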
static void alloc_rnk(int rnk_) {
     rnk = rnk_;
     bench_free(local_ni);
     if (rnk == 0)
          local_ni = 0;
     else
          local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
                                                * (8 + n_pes * 4));

     local_starti = local_ni + rnk;
     local_no = local_ni + 2 * rnk;
     local_starto = local_ni + 3 * rnk;
     istrides = local_ni + 4 * rnk;
     ostrides = local_ni + 5 * rnk;
     total_ni = local_ni + 6 * rnk;
     total_no = local_ni + 7 * rnk;
     all_local_ni = local_ni + 8 * rnk;
     all_local_starti = local_ni + (8 + n_pes) * rnk;
     all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
     all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
}

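/* Exchange the local block geometry between all processes and derive the
   counts/offsets needed by MPI_Scatterv (input) and MPI_Gatherv (output),
   the total element counts iNtot/oNtot, and the row-major strides of the
   full input and output arrays. */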
static void setup_gather_scatter(void)
{
     int i, j;
     ptrdiff_t off;

     MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
                all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
                0, MPI_COMM_WORLD);
     MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
     MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
                all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
                0, MPI_COMM_WORLD);
     MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);

     MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
                all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
                0, MPI_COMM_WORLD);
     MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
     MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
                all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
                0, MPI_COMM_WORLD);
     MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);

     off = 0;
     for (i = 0; i < n_pes; ++i) {
          ptrdiff_t N = vn;
          for (j = 0; j < rnk; ++j)
               N *= all_local_ni[i * rnk + j];
          isend_cnt[i] = N;
          isend_off[i] = off;
          off += N;
     }
     iNtot = off;
     all_local_in_alloc = 1;

     istrides[rnk - 1] = vn;
     for (j = rnk - 2; j >= 0; --j)
          istrides[j] = total_ni[j + 1] * istrides[j + 1];

     off = 0;
     for (i = 0; i < n_pes; ++i) {
          ptrdiff_t N = vn;
          for (j = 0; j < rnk; ++j)
               N *= all_local_no[i * rnk + j];
          orecv_cnt[i] = N;
          orecv_off[i] = off;
          off += N;
     }
     oNtot = off;
     all_local_out_alloc = 1;

     ostrides[rnk - 1] = vn;
     for (j = rnk - 2; j >= 0; --j)
          ostrides[j] = total_no[j + 1] * ostrides[j + 1];
}

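/* Copy a process's contiguous local block `in' (shape n[], offset start[],
   vn scalars per point) into its place in the full row-major array `out'
   with strides os[], recursing over the last dimension. */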
static void copy_block_out(const bench_real *in,
                           int rnk, ptrdiff_t *n, ptrdiff_t *start,
                           ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
                           bench_real *out)
{
     ptrdiff_t i;
     if (rnk == 0) {
          for (i = 0; i < vn; ++i)
               out[i] = in[i];
     }
     else if (rnk == 1) { /* this case is just an optimization */
          ptrdiff_t j;
          out += start[0] * os[0];
          for (j = 0; j < n[0]; ++j) {
               for (i = 0; i < vn; ++i)
                    out[i] = in[i];
               in += is;
               out += os[0];
          }
     }
     else {
          /* we should do n[0] for locality, but this way is simpler to code */
          for (i = 0; i < n[rnk - 1]; ++i)
               copy_block_out(in + i * is,
                              rnk - 1, n, start, is * n[rnk - 1], os, vn,
                              out + (start[rnk - 1] + i) * os[rnk - 1]);
     }
}

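/* Inverse of copy_block_out: fill the contiguous local block `in' from its
   place in the full row-major array `out'. */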
static void copy_block_in(bench_real *in,
                          int rnk, ptrdiff_t *n, ptrdiff_t *start,
                          ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
                          const bench_real *out)
{
     ptrdiff_t i;
     if (rnk == 0) {
          for (i = 0; i < vn; ++i)
               in[i] = out[i];
     }
     else if (rnk == 1) { /* this case is just an optimization */
          ptrdiff_t j;
          out += start[0] * os[0];
          for (j = 0; j < n[0]; ++j) {
               for (i = 0; i < vn; ++i)
                    in[i] = out[i];
               in += is;
               out += os[0];
          }
     }
     else {
          /* we should do n[0] for locality, but this way is simpler to code */
          for (i = 0; i < n[rnk - 1]; ++i)
               copy_block_in(in + i * is,
                             rnk - 1, n, start, is * n[rnk - 1], os, vn,
                             out + (start[rnk - 1] + i) * os[rnk - 1]);
     }
}

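/* Repack the full input array into per-process contiguous blocks in
   all_local_in (reallocated if the distribution changed) and MPI_Scatterv
   each block from rank 0 into that process's local_in. */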
static void do_scatter_in(bench_real *in)
{
     bench_real *ali;
     int i;
     if (all_local_in_alloc) {
          bench_free(all_local_in);
          all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
          all_local_in_alloc = 0;
     }
     ali = all_local_in;
     for (i = 0; i < n_pes; ++i) {
          copy_block_in(ali,
                        rnk, all_local_ni + i * rnk,
                        all_local_starti + i * rnk,
                        vn, istrides, vn,
                        in);
          ali += isend_cnt[i];
     }
     MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
                  local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
                  0, MPI_COMM_WORLD);
}

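/* MPI_Gatherv every process's local_out block to rank 0, broadcast the
   assembled buffer, and unpack each block into its place in the full
   output array. */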
static void do_gather_out(bench_real *out)
{
     bench_real *alo;
     int i;

     if (all_local_out_alloc) {
          bench_free(all_local_out);
          all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
          all_local_out_alloc = 0;
     }
     MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
                 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
                 0, MPI_COMM_WORLD);
     MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
     alo = all_local_out;
     for (i = 0; i < n_pes; ++i) {
          copy_block_out(alo,
                         rnk, all_local_no + i * rnk,
                         all_local_starto + i * rnk,
                         vn, ostrides, vn,
                         out);
          alo += orecv_cnt[i];
     }
}

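/* (Re)allocate the local input/output buffers with nreal elements each
   (shared when the transform is in place), initialized to zero. */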
static void alloc_local(ptrdiff_t nreal, int inplace)
{
     bench_free(local_in);
     if (local_out != local_in) bench_free(local_out);
     local_in = local_out = 0;
     if (nreal > 0) {
          ptrdiff_t i;
          local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
          if (inplace)
               local_out = local_in;
          else
               local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
          for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
     }
}

void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
{
     UNUSED(p);
     do_scatter_in(ri);
     if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
}

void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
{
     UNUSED(p);
     if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
     do_gather_out(ro);
}

void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
{
     UNUSED(ii);
     after_problem_rcopy_from(p, ri);
}

void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
{
     UNUSED(io);
     after_problem_rcopy_to(p, ro);
}

void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
{
     UNUSED(ii);
     after_problem_rcopy_from(p, ri);
}

void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
{
     UNUSED(io);
     after_problem_rcopy_to(p, ro);
}

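/* Build a guru64 r2r plan that performs a local nx x ny transpose of
   vn-element tuples; used to convert between the normal layout and the
   FFTW_MPI_TRANSPOSED_IN/OUT data layouts. */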
static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
                                         ptrdiff_t vn,
                                         bench_real *in, bench_real *out)
{
     FFTW(iodim64) hdims[3];
     FFTW(r2r_kind) k[3];
     FFTW(plan) pln;

     hdims[0].n = nx;
     hdims[0].is = ny * vn;
     hdims[0].os = vn;
     hdims[1].n = ny;
     hdims[1].is = vn;
     hdims[1].os = nx * vn;
     hdims[2].n = vn;
     hdims[2].is = 1;
     hdims[2].os = 1;
     k[0] = k[1] = k[2] = FFTW_R2HC;
     pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
     BENCH_ASSERT(pln != 0);
     return pln;
}

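/* Return 1 if tensor t is row-major except that its first two dimensions
   are transposed, as in the FFTW_MPI transposed layouts. */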
static int tensor_rowmajor_transposedp(bench_tensor *t)
{
     bench_iodim *d;
     int i;

     BENCH_ASSERT(FINITE_RNK(t->rnk));
     if (t->rnk < 2)
          return 0;

     d = t->dims;
     if (d[0].is != d[1].is * d[1].n
         || d[0].os != d[1].is
         || d[1].os != d[0].os * d[0].n)
          return 0;
     if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
          return 0;
     for (i = 2; i + 1 < t->rnk; ++i) {
          d = t->dims + i;
          if (d[0].is != d[1].is * d[1].n
              || d[0].os != d[1].os * d[1].n)
               return 0;
     }

     if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
          return 0;
     return 1;
}

static int tensor_contiguousp(bench_tensor *t, int s)
{
     return (t->dims[t->rnk-1].is == s
             && ((tensor_rowmajorp(t) &&
                  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
                 || tensor_rowmajor_transposedp(t)));
}

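/* Create a distributed complex-DFT plan: reject layouts the MPI interface
   cannot express, compute the local data distribution, allocate the local
   buffers, build the FFTW(mpi_plan_many_dft) plan, and add local transpose
   plans when the transposed-in/out formats are requested. */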
static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     int i;
     ptrdiff_t ntot;

     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;

     if (p->sz->rnk < 1
         || p->split
         || !tensor_contiguousp(p->sz, vn)
         || tensor_rowmajor_transposedp(p->sz)
         || p->vecsz->rnk > 1
         || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
                                    || p->vecsz->dims[0].os != 1)))
          return 0;

     alloc_rnk(p->sz->rnk);
     for (i = 0; i < rnk; ++i) {
          total_ni[i] = total_no[i] = p->sz->dims[i].n;
          local_ni[i] = local_no[i] = total_ni[i];
          local_starti[i] = local_starto[i] = 0;
     }
     if (rnk > 1) {
          ptrdiff_t n, start, nT, startT;
          ntot = FFTW(mpi_local_size_many_transposed)
               (p->sz->rnk, total_ni, vn,
                FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
                MPI_COMM_WORLD,
                &n, &start, &nT, &startT);
          if (flags & FFTW_MPI_TRANSPOSED_IN) {
               local_ni[1] = nT;
               local_starti[1] = startT;
          }
          else {
               local_ni[0] = n;
               local_starti[0] = start;
          }
          if (flags & FFTW_MPI_TRANSPOSED_OUT) {
               local_no[1] = nT;
               local_starto[1] = startT;
          }
          else {
               local_no[0] = n;
               local_starto[0] = start;
          }
     }
     else if (rnk == 1) {
          ntot = FFTW(mpi_local_size_many_1d)
               (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
                local_ni, local_starti, local_no, local_starto);
     }
     alloc_local(ntot * 2, p->in == p->out);

     pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   (FFTW(complex) *) local_in,
                                   (FFTW(complex) *) local_out,
                                   MPI_COMM_WORLD, p->sign, flags);

     vn *= 2;

     if (rnk > 1) {
          ptrdiff_t nrest = 1;
          for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
          if (flags & FFTW_MPI_TRANSPOSED_IN)
               plan_scramble_in = mkplan_transpose_local(
                    p->sz->dims[0].n, local_ni[1], vn * nrest,
                    local_in, local_in);
          if (flags & FFTW_MPI_TRANSPOSED_OUT)
               plan_unscramble_out = mkplan_transpose_local(
                    local_no[1], p->sz->dims[0].n, vn * nrest,
                    local_out, local_out);
     }

     return pln;
}

static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
{
     return (t->dims[t->rnk-1].is == s
             && ((tensor_real_rowmajorp(t, sign, 1) &&
                  t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
}

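/* Create a distributed r2c/c2r plan (rank >= 2 only), accounting for the
   n/2+1 complex elements in the last dimension; otherwise analogous to
   mkplan_complex. */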
static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     int i;
     ptrdiff_t ntot;

     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;

     if (p->sz->rnk < 2
         || p->split
         || !tensor_real_contiguousp(p->sz, p->sign, vn)
         || tensor_rowmajor_transposedp(p->sz)
         || p->vecsz->rnk > 1
         || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
                                    || p->vecsz->dims[0].os != 1)))
          return 0;

     alloc_rnk(p->sz->rnk);
     for (i = 0; i < rnk; ++i) {
          total_ni[i] = total_no[i] = p->sz->dims[i].n;
          local_ni[i] = local_no[i] = total_ni[i];
          local_starti[i] = local_starto[i] = 0;
     }
     local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
          = p->sz->dims[rnk-1].n / 2 + 1;
     {
          ptrdiff_t n, start, nT, startT;
          ntot = FFTW(mpi_local_size_many_transposed)
               (p->sz->rnk, total_ni, vn,
                FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
                MPI_COMM_WORLD,
                &n, &start, &nT, &startT);
          if (flags & FFTW_MPI_TRANSPOSED_IN) {
               local_ni[1] = nT;
               local_starti[1] = startT;
          }
          else {
               local_ni[0] = n;
               local_starti[0] = start;
          }
          if (flags & FFTW_MPI_TRANSPOSED_OUT) {
               local_no[1] = nT;
               local_starto[1] = startT;
          }
          else {
               local_no[0] = n;
               local_starto[0] = start;
          }
     }
     alloc_local(ntot * 2, p->in == p->out);

     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
     if (p->sign < 0)
          pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
                                            FFTW_MPI_DEFAULT_BLOCK,
                                            FFTW_MPI_DEFAULT_BLOCK,
                                            local_in,
                                            (FFTW(complex) *) local_out,
                                            MPI_COMM_WORLD, flags);
     else
          pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
                                            FFTW_MPI_DEFAULT_BLOCK,
                                            FFTW_MPI_DEFAULT_BLOCK,
                                            (FFTW(complex) *) local_in,
                                            local_out,
                                            MPI_COMM_WORLD, flags);

     total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
     vn *= 2;

     {
          ptrdiff_t nrest = 1;
          for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
          if (flags & FFTW_MPI_TRANSPOSED_IN)
               plan_scramble_in = mkplan_transpose_local(
                    total_ni[0], local_ni[1], vn * nrest,
                    local_in, local_in);
          if (flags & FFTW_MPI_TRANSPOSED_OUT)
               plan_unscramble_out = mkplan_transpose_local(
                    local_no[1], total_ni[0], vn * nrest,
                    local_out, local_out);
     }

     return pln;
}

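/* Treat a size-1 transform with a rank-2 or rank-3 vector size as a global
   matrix transposition: identify the tuple dimension and the nx x ny layout,
   compute the local distribution, and build an FFTW(mpi_plan_many_transpose)
   plan. */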
static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
{
     ptrdiff_t ntot, nx, ny;
     int ix=0, iy=1, i;
     const bench_iodim *d = p->vecsz->dims;
     FFTW(plan) pln;

     if (p->vecsz->rnk == 3) {
          for (i = 0; i < 3; ++i)
               if (d[i].is == 1 && d[i].os == 1) {
                    vn = d[i].n;
                    ix = (i + 1) % 3;
                    iy = (i + 2) % 3;
                    break;
               }
          if (i == 3) return 0;
     }
     else {
          vn = 1;
          ix = 0;
          iy = 1;
     }

     if (d[ix].is == d[iy].n * vn && d[ix].os == vn
         && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
          nx = d[ix].n;
          ny = d[iy].n;
     }
     else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
              && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
          nx = d[iy].n;
          ny = d[ix].n;
     }
     else
          return 0;

     alloc_rnk(2);
     ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
                                                    &local_ni[0],
                                                    &local_starti[0],
                                                    &local_no[0],
                                                    &local_starto[0]);
     local_ni[1] = ny;
     local_starti[1] = 0;
     local_no[1] = nx;
     local_starto[1] = 0;
     total_ni[0] = nx; total_ni[1] = ny;
     total_no[1] = nx; total_no[0] = ny;
     alloc_local(ntot, p->in == p->out);

     pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
                                         FFTW_MPI_DEFAULT_BLOCK,
                                         FFTW_MPI_DEFAULT_BLOCK,
                                         local_in, local_out,
                                         MPI_COMM_WORLD, flags);

     if (flags & FFTW_MPI_TRANSPOSED_IN)
          plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
                                                    local_in, local_in);
     if (flags & FFTW_MPI_TRANSPOSED_OUT)
          plan_unscramble_out = mkplan_transpose_local
               (nx, local_no[0], vn, local_out, local_out);

#if 0
     if (pln && vn == 1) {
          int i, j;
          bench_real *ri = (bench_real *) p->in;
          bench_real *ro = (bench_real *) p->out;
          if (!ri || !ro) return pln;
          setup_gather_scatter();
          for (i = 0; i < nx * ny; ++i)
               ri[i] = i;
          after_problem_rcopy_from(p, ri);
          FFTW(execute)(pln);
          after_problem_rcopy_to(p, ro);
          if (my_pe == 0) {
               for (i = 0; i < nx; ++i) {
                    for (j = 0; j < ny; ++j)
                         printf(" %3g", ro[j * nx + i]);
                    printf("\n");
               }
          }
     }
#endif

     return pln;
}

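/* Create a distributed r2r plan, dispatching pure transpositions to
   mkplan_transpose and mapping the benchmark's r2r kinds onto FFTW's. */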
static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     int i;
     ptrdiff_t ntot;
     FFTW(r2r_kind) *k;

     if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
         && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
          return mkplan_transpose(p, flags);

     vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;

     if (p->sz->rnk < 1
         || p->split
         || !tensor_contiguousp(p->sz, vn)
         || tensor_rowmajor_transposedp(p->sz)
         || p->vecsz->rnk > 1
         || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
                                    || p->vecsz->dims[0].os != 1)))
          return 0;

     alloc_rnk(p->sz->rnk);
     for (i = 0; i < rnk; ++i) {
          total_ni[i] = total_no[i] = p->sz->dims[i].n;
          local_ni[i] = local_no[i] = total_ni[i];
          local_starti[i] = local_starto[i] = 0;
     }
     if (rnk > 1) {
          ptrdiff_t n, start, nT, startT;
          ntot = FFTW(mpi_local_size_many_transposed)
               (p->sz->rnk, total_ni, vn,
                FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
                MPI_COMM_WORLD,
                &n, &start, &nT, &startT);
          if (flags & FFTW_MPI_TRANSPOSED_IN) {
               local_ni[1] = nT;
               local_starti[1] = startT;
          }
          else {
               local_ni[0] = n;
               local_starti[0] = start;
          }
          if (flags & FFTW_MPI_TRANSPOSED_OUT) {
               local_no[1] = nT;
               local_starto[1] = startT;
          }
          else {
               local_no[0] = n;
               local_starto[0] = start;
          }
     }
     else if (rnk == 1) {
          ntot = FFTW(mpi_local_size_many_1d)
               (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
                local_ni, local_starti, local_no, local_starto);
     }
     alloc_local(ntot, p->in == p->out);

     k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
     for (i = 0; i < p->sz->rnk; ++i)
          switch (p->k[i]) {
              case R2R_R2HC: k[i] = FFTW_R2HC; break;
              case R2R_HC2R: k[i] = FFTW_HC2R; break;
              case R2R_DHT: k[i] = FFTW_DHT; break;
              case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
              case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
              case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
              case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
              case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
              case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
              case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
              case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
              default: BENCH_ASSERT(0);
          }

     pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   FFTW_MPI_DEFAULT_BLOCK,
                                   local_in, local_out,
                                   MPI_COMM_WORLD, k, flags);
     bench_free(k);

     if (rnk > 1) {
          ptrdiff_t nrest = 1;
          for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
          if (flags & FFTW_MPI_TRANSPOSED_IN)
               plan_scramble_in = mkplan_transpose_local(
                    p->sz->dims[0].n, local_ni[1], vn * nrest,
                    local_in, local_in);
          if (flags & FFTW_MPI_TRANSPOSED_OUT)
               plan_unscramble_out = mkplan_transpose_local(
                    local_no[1], p->sz->dims[0].n, vn * nrest,
                    local_out, local_out);
     }

     return pln;
}

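/* Top-level plan dispatcher for the benchmark: reset any leftover local
   transpose plans, translate scrambled-in/out requests into FFTW's
   SCRAMBLED or TRANSPOSED flags, create the appropriate plan, and set up
   the gather/scatter metadata on success. */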
FFTW(plan) mkplan(bench_problem *p, unsigned flags)
{
     FFTW(plan) pln = 0;
     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
     if (p->scrambled_in) {
          if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
               flags |= FFTW_MPI_SCRAMBLED_IN;
          else
               flags |= FFTW_MPI_TRANSPOSED_IN;
     }
     if (p->scrambled_out) {
          if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
               flags |= FFTW_MPI_SCRAMBLED_OUT;
          else
               flags |= FFTW_MPI_TRANSPOSED_OUT;
     }
     switch (p->kind) {
          case PROBLEM_COMPLEX:
               pln = mkplan_complex(p, flags);
               break;
          case PROBLEM_REAL:
               pln = mkplan_real(p, flags);
               break;
          case PROBLEM_R2R:
               pln = mkplan_r2r(p, flags);
               break;
          default: BENCH_ASSERT(0);
     }
     if (pln) setup_gather_scatter();
     return pln;
}

void main_init(int *argc, char ***argv)
{
#ifdef HAVE_SMP
# if MPI_VERSION >= 2 /* for MPI_Init_thread */
     int provided;
     MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
     threads_ok = provided >= MPI_THREAD_FUNNELED;
# else
     MPI_Init(argc, argv);
     threads_ok = 0;
# endif
#else
     MPI_Init(argc, argv);
#endif
     MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
     MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
     if (my_pe != 0) verbose = -999;
     no_speed_allocation = 1; /* so we can benchmark transforms > memory */
     always_pad_real = 1; /* out-of-place real transforms are padded */
     isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
     isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
     orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
     orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);

     /* init_threads must be called before any other FFTW function,
        including mpi_init, because it has to register the threads hooks
        before the planner is initialized */
#ifdef HAVE_SMP
     if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
#endif
     FFTW(mpi_init)();
}

void initial_cleanup(void)
{
     alloc_rnk(0);
     alloc_local(0, 0);
     bench_free(all_local_in); all_local_in = 0;
     bench_free(all_local_out); all_local_out = 0;
     bench_free(isend_off); isend_off = 0;
     bench_free(isend_cnt); isend_cnt = 0;
     bench_free(orecv_off); orecv_off = 0;
     bench_free(orecv_cnt); orecv_cnt = 0;
     FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
     FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
}

void final_cleanup(void)
{
     MPI_Finalize();
}

void bench_exit(int status)
{
     MPI_Abort(MPI_COMM_WORLD, status);
}

double bench_cost_postprocess(double cost)
{
     double cost_max;
     MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
     return cost_max;
}


int import_wisdom(FILE *f)
{
     int success = 1, sall;
     if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
     FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
     MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
     return sall;
}

void export_wisdom(FILE *f)
{
     FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
     if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
}