cannam@167
|
1 /**************************************************************************/
|
cannam@167
|
2 /* NOTE to users: this is the FFTW-MPI self-test and benchmark program.
|
cannam@167
|
3 It is probably NOT a good place to learn FFTW usage, since it has a
|
cannam@167
|
4 lot of added complexity in order to exercise and test the full API,
|
cannam@167
|
5 etcetera. We suggest reading the manual. */
|
cannam@167
|
6 /**************************************************************************/
|
cannam@167
|
7
|
cannam@167
|
8 #include <math.h>
|
cannam@167
|
9 #include <stdio.h>
|
cannam@167
|
10 #include <string.h>
|
cannam@167
|
11 #include "fftw3-mpi.h"
|
cannam@167
|
12 #include "tests/fftw-bench.h"
|
cannam@167
|
13
|
cannam@167
|
14 #if defined(BENCHFFT_SINGLE)
|
cannam@167
|
15 # define BENCH_MPI_TYPE MPI_FLOAT
|
cannam@167
|
16 #elif defined(BENCHFFT_LDOUBLE)
|
cannam@167
|
17 # define BENCH_MPI_TYPE MPI_LONG_DOUBLE
|
cannam@167
|
18 #elif defined(BENCHFFT_QUAD)
|
cannam@167
|
19 # error MPI quad-precision type is unknown
|
cannam@167
|
20 #else
|
cannam@167
|
21 # define BENCH_MPI_TYPE MPI_DOUBLE
|
cannam@167
|
22 #endif
|
cannam@167
|
23
|
cannam@167
|
24 #if SIZEOF_PTRDIFF_T == SIZEOF_INT
|
cannam@167
|
25 # define FFTW_MPI_PTRDIFF_T MPI_INT
|
cannam@167
|
26 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG
|
cannam@167
|
27 # define FFTW_MPI_PTRDIFF_T MPI_LONG
|
cannam@167
|
28 #elif SIZEOF_PTRDIFF_T == SIZEOF_LONG_LONG
|
cannam@167
|
29 # define FFTW_MPI_PTRDIFF_T MPI_LONG_LONG
|
cannam@167
|
30 #else
|
cannam@167
|
31 # error MPI type for ptrdiff_t is unknown
|
cannam@167
|
32 # define FFTW_MPI_PTRDIFF_T MPI_LONG
|
cannam@167
|
33 #endif
|
cannam@167
|
34
|
cannam@167
|
35 static const char *mkversion(void) { return FFTW(version); }
|
cannam@167
|
36 static const char *mkcc(void) { return FFTW(cc); }
|
cannam@167
|
37 static const char *mkcodelet_optim(void) { return FFTW(codelet_optim); }
|
cannam@167
|
38 static const char *mknproc(void) {
|
cannam@167
|
39 static char buf[32];
|
cannam@167
|
40 int ncpus;
|
cannam@167
|
41 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
|
cannam@167
|
42 #ifdef HAVE_SNPRINTF
|
cannam@167
|
43 snprintf(buf, 32, "%d", ncpus);
|
cannam@167
|
44 #else
|
cannam@167
|
45 sprintf(buf, "%d", ncpus);
|
cannam@167
|
46 #endif
|
cannam@167
|
47 return buf;
|
cannam@167
|
48 }
|
cannam@167
|
49
|
cannam@167
|
50 BEGIN_BENCH_DOC
|
cannam@167
|
51 BENCH_DOC("name", "fftw3_mpi")
|
cannam@167
|
52 BENCH_DOCF("version", mkversion)
|
cannam@167
|
53 BENCH_DOCF("cc", mkcc)
|
cannam@167
|
54 BENCH_DOCF("codelet-optim", mkcodelet_optim)
|
cannam@167
|
55 BENCH_DOCF("nproc", mknproc)
|
cannam@167
|
56 END_BENCH_DOC
|
cannam@167
|
57
|
cannam@167
|
58 static int n_pes = 1, my_pe = 0;
|
cannam@167
|
59
|
cannam@167
|
60 /* global variables describing the shape of the data and its distribution */
|
cannam@167
|
61 static int rnk;
|
cannam@167
|
62 static ptrdiff_t vn, iNtot, oNtot;
|
cannam@167
|
63 static ptrdiff_t *local_ni=0, *local_starti=0;
|
cannam@167
|
64 static ptrdiff_t *local_no=0, *local_starto=0;
|
cannam@167
|
65 static ptrdiff_t *all_local_ni=0, *all_local_starti=0; /* n_pes x rnk arrays */
|
cannam@167
|
66 static ptrdiff_t *all_local_no=0, *all_local_starto=0; /* n_pes x rnk arrays */
|
cannam@167
|
67 static ptrdiff_t *istrides = 0, *ostrides = 0;
|
cannam@167
|
68 static ptrdiff_t *total_ni=0, *total_no=0;
|
cannam@167
|
69 static int *isend_cnt = 0, *isend_off = 0; /* for MPI_Scatterv */
|
cannam@167
|
70 static int *orecv_cnt = 0, *orecv_off = 0; /* for MPI_Gatherv */
|
cannam@167
|
71
|
cannam@167
|
72 static bench_real *local_in = 0, *local_out = 0;
|
cannam@167
|
73 static bench_real *all_local_in = 0, *all_local_out = 0;
|
cannam@167
|
74 static int all_local_in_alloc = 0, all_local_out_alloc = 0;
|
cannam@167
|
75 static FFTW(plan) plan_scramble_in = 0, plan_unscramble_out = 0;
|
cannam@167
|
76
|
cannam@167
|
77 static void alloc_rnk(int rnk_) {
|
cannam@167
|
78 rnk = rnk_;
|
cannam@167
|
79 bench_free(local_ni);
|
cannam@167
|
80 if (rnk == 0)
|
cannam@167
|
81 local_ni = 0;
|
cannam@167
|
82 else
|
cannam@167
|
83 local_ni = (ptrdiff_t *) bench_malloc(sizeof(ptrdiff_t) * rnk
|
cannam@167
|
84 * (8 + n_pes * 4));
|
cannam@167
|
85
|
cannam@167
|
86 local_starti = local_ni + rnk;
|
cannam@167
|
87 local_no = local_ni + 2 * rnk;
|
cannam@167
|
88 local_starto = local_ni + 3 * rnk;
|
cannam@167
|
89 istrides = local_ni + 4 * rnk;
|
cannam@167
|
90 ostrides = local_ni + 5 * rnk;
|
cannam@167
|
91 total_ni = local_ni + 6 * rnk;
|
cannam@167
|
92 total_no = local_ni + 7 * rnk;
|
cannam@167
|
93 all_local_ni = local_ni + 8 * rnk;
|
cannam@167
|
94 all_local_starti = local_ni + (8 + n_pes) * rnk;
|
cannam@167
|
95 all_local_no = local_ni + (8 + 2 * n_pes) * rnk;
|
cannam@167
|
96 all_local_starto = local_ni + (8 + 3 * n_pes) * rnk;
|
cannam@167
|
97 }
|
cannam@167
|
98
|
cannam@167
|
99 static void setup_gather_scatter(void)
|
cannam@167
|
100 {
|
cannam@167
|
101 int i, j;
|
cannam@167
|
102 ptrdiff_t off;
|
cannam@167
|
103
|
cannam@167
|
104 MPI_Gather(local_ni, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
105 all_local_ni, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
106 0, MPI_COMM_WORLD);
|
cannam@167
|
107 MPI_Bcast(all_local_ni, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
cannam@167
|
108 MPI_Gather(local_starti, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
109 all_local_starti, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
110 0, MPI_COMM_WORLD);
|
cannam@167
|
111 MPI_Bcast(all_local_starti, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
cannam@167
|
112
|
cannam@167
|
113 MPI_Gather(local_no, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
114 all_local_no, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
115 0, MPI_COMM_WORLD);
|
cannam@167
|
116 MPI_Bcast(all_local_no, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
cannam@167
|
117 MPI_Gather(local_starto, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
118 all_local_starto, rnk, FFTW_MPI_PTRDIFF_T,
|
cannam@167
|
119 0, MPI_COMM_WORLD);
|
cannam@167
|
120 MPI_Bcast(all_local_starto, rnk*n_pes, FFTW_MPI_PTRDIFF_T, 0, MPI_COMM_WORLD);
|
cannam@167
|
121
|
cannam@167
|
122 off = 0;
|
cannam@167
|
123 for (i = 0; i < n_pes; ++i) {
|
cannam@167
|
124 ptrdiff_t N = vn;
|
cannam@167
|
125 for (j = 0; j < rnk; ++j)
|
cannam@167
|
126 N *= all_local_ni[i * rnk + j];
|
cannam@167
|
127 isend_cnt[i] = N;
|
cannam@167
|
128 isend_off[i] = off;
|
cannam@167
|
129 off += N;
|
cannam@167
|
130 }
|
cannam@167
|
131 iNtot = off;
|
cannam@167
|
132 all_local_in_alloc = 1;
|
cannam@167
|
133
|
cannam@167
|
134 istrides[rnk - 1] = vn;
|
cannam@167
|
135 for (j = rnk - 2; j >= 0; --j)
|
cannam@167
|
136 istrides[j] = total_ni[j + 1] * istrides[j + 1];
|
cannam@167
|
137
|
cannam@167
|
138 off = 0;
|
cannam@167
|
139 for (i = 0; i < n_pes; ++i) {
|
cannam@167
|
140 ptrdiff_t N = vn;
|
cannam@167
|
141 for (j = 0; j < rnk; ++j)
|
cannam@167
|
142 N *= all_local_no[i * rnk + j];
|
cannam@167
|
143 orecv_cnt[i] = N;
|
cannam@167
|
144 orecv_off[i] = off;
|
cannam@167
|
145 off += N;
|
cannam@167
|
146 }
|
cannam@167
|
147 oNtot = off;
|
cannam@167
|
148 all_local_out_alloc = 1;
|
cannam@167
|
149
|
cannam@167
|
150 ostrides[rnk - 1] = vn;
|
cannam@167
|
151 for (j = rnk - 2; j >= 0; --j)
|
cannam@167
|
152 ostrides[j] = total_no[j + 1] * ostrides[j + 1];
|
cannam@167
|
153 }
|
cannam@167
|
154
|
cannam@167
|
155 static void copy_block_out(const bench_real *in,
|
cannam@167
|
156 int rnk, ptrdiff_t *n, ptrdiff_t *start,
|
cannam@167
|
157 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
|
cannam@167
|
158 bench_real *out)
|
cannam@167
|
159 {
|
cannam@167
|
160 ptrdiff_t i;
|
cannam@167
|
161 if (rnk == 0) {
|
cannam@167
|
162 for (i = 0; i < vn; ++i)
|
cannam@167
|
163 out[i] = in[i];
|
cannam@167
|
164 }
|
cannam@167
|
165 else if (rnk == 1) { /* this case is just an optimization */
|
cannam@167
|
166 ptrdiff_t j;
|
cannam@167
|
167 out += start[0] * os[0];
|
cannam@167
|
168 for (j = 0; j < n[0]; ++j) {
|
cannam@167
|
169 for (i = 0; i < vn; ++i)
|
cannam@167
|
170 out[i] = in[i];
|
cannam@167
|
171 in += is;
|
cannam@167
|
172 out += os[0];
|
cannam@167
|
173 }
|
cannam@167
|
174 }
|
cannam@167
|
175 else {
|
cannam@167
|
176 /* we should do n[0] for locality, but this way is simpler to code */
|
cannam@167
|
177 for (i = 0; i < n[rnk - 1]; ++i)
|
cannam@167
|
178 copy_block_out(in + i * is,
|
cannam@167
|
179 rnk - 1, n, start, is * n[rnk - 1], os, vn,
|
cannam@167
|
180 out + (start[rnk - 1] + i) * os[rnk - 1]);
|
cannam@167
|
181 }
|
cannam@167
|
182 }
|
cannam@167
|
183
|
cannam@167
|
184 static void copy_block_in(bench_real *in,
|
cannam@167
|
185 int rnk, ptrdiff_t *n, ptrdiff_t *start,
|
cannam@167
|
186 ptrdiff_t is, ptrdiff_t *os, ptrdiff_t vn,
|
cannam@167
|
187 const bench_real *out)
|
cannam@167
|
188 {
|
cannam@167
|
189 ptrdiff_t i;
|
cannam@167
|
190 if (rnk == 0) {
|
cannam@167
|
191 for (i = 0; i < vn; ++i)
|
cannam@167
|
192 in[i] = out[i];
|
cannam@167
|
193 }
|
cannam@167
|
194 else if (rnk == 1) { /* this case is just an optimization */
|
cannam@167
|
195 ptrdiff_t j;
|
cannam@167
|
196 out += start[0] * os[0];
|
cannam@167
|
197 for (j = 0; j < n[0]; ++j) {
|
cannam@167
|
198 for (i = 0; i < vn; ++i)
|
cannam@167
|
199 in[i] = out[i];
|
cannam@167
|
200 in += is;
|
cannam@167
|
201 out += os[0];
|
cannam@167
|
202 }
|
cannam@167
|
203 }
|
cannam@167
|
204 else {
|
cannam@167
|
205 /* we should do n[0] for locality, but this way is simpler to code */
|
cannam@167
|
206 for (i = 0; i < n[rnk - 1]; ++i)
|
cannam@167
|
207 copy_block_in(in + i * is,
|
cannam@167
|
208 rnk - 1, n, start, is * n[rnk - 1], os, vn,
|
cannam@167
|
209 out + (start[rnk - 1] + i) * os[rnk - 1]);
|
cannam@167
|
210 }
|
cannam@167
|
211 }
|
cannam@167
|
212
|
cannam@167
|
213 static void do_scatter_in(bench_real *in)
|
cannam@167
|
214 {
|
cannam@167
|
215 bench_real *ali;
|
cannam@167
|
216 int i;
|
cannam@167
|
217 if (all_local_in_alloc) {
|
cannam@167
|
218 bench_free(all_local_in);
|
cannam@167
|
219 all_local_in = (bench_real*) bench_malloc(iNtot*sizeof(bench_real));
|
cannam@167
|
220 all_local_in_alloc = 0;
|
cannam@167
|
221 }
|
cannam@167
|
222 ali = all_local_in;
|
cannam@167
|
223 for (i = 0; i < n_pes; ++i) {
|
cannam@167
|
224 copy_block_in(ali,
|
cannam@167
|
225 rnk, all_local_ni + i * rnk,
|
cannam@167
|
226 all_local_starti + i * rnk,
|
cannam@167
|
227 vn, istrides, vn,
|
cannam@167
|
228 in);
|
cannam@167
|
229 ali += isend_cnt[i];
|
cannam@167
|
230 }
|
cannam@167
|
231 MPI_Scatterv(all_local_in, isend_cnt, isend_off, BENCH_MPI_TYPE,
|
cannam@167
|
232 local_in, isend_cnt[my_pe], BENCH_MPI_TYPE,
|
cannam@167
|
233 0, MPI_COMM_WORLD);
|
cannam@167
|
234 }
|
cannam@167
|
235
|
cannam@167
|
236 static void do_gather_out(bench_real *out)
|
cannam@167
|
237 {
|
cannam@167
|
238 bench_real *alo;
|
cannam@167
|
239 int i;
|
cannam@167
|
240
|
cannam@167
|
241 if (all_local_out_alloc) {
|
cannam@167
|
242 bench_free(all_local_out);
|
cannam@167
|
243 all_local_out = (bench_real*) bench_malloc(oNtot*sizeof(bench_real));
|
cannam@167
|
244 all_local_out_alloc = 0;
|
cannam@167
|
245 }
|
cannam@167
|
246 MPI_Gatherv(local_out, orecv_cnt[my_pe], BENCH_MPI_TYPE,
|
cannam@167
|
247 all_local_out, orecv_cnt, orecv_off, BENCH_MPI_TYPE,
|
cannam@167
|
248 0, MPI_COMM_WORLD);
|
cannam@167
|
249 MPI_Bcast(all_local_out, oNtot, BENCH_MPI_TYPE, 0, MPI_COMM_WORLD);
|
cannam@167
|
250 alo = all_local_out;
|
cannam@167
|
251 for (i = 0; i < n_pes; ++i) {
|
cannam@167
|
252 copy_block_out(alo,
|
cannam@167
|
253 rnk, all_local_no + i * rnk,
|
cannam@167
|
254 all_local_starto + i * rnk,
|
cannam@167
|
255 vn, ostrides, vn,
|
cannam@167
|
256 out);
|
cannam@167
|
257 alo += orecv_cnt[i];
|
cannam@167
|
258 }
|
cannam@167
|
259 }
|
cannam@167
|
260
|
cannam@167
|
261 static void alloc_local(ptrdiff_t nreal, int inplace)
|
cannam@167
|
262 {
|
cannam@167
|
263 bench_free(local_in);
|
cannam@167
|
264 if (local_out != local_in) bench_free(local_out);
|
cannam@167
|
265 local_in = local_out = 0;
|
cannam@167
|
266 if (nreal > 0) {
|
cannam@167
|
267 ptrdiff_t i;
|
cannam@167
|
268 local_in = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
|
cannam@167
|
269 if (inplace)
|
cannam@167
|
270 local_out = local_in;
|
cannam@167
|
271 else
|
cannam@167
|
272 local_out = (bench_real*) bench_malloc(nreal * sizeof(bench_real));
|
cannam@167
|
273 for (i = 0; i < nreal; ++i) local_in[i] = local_out[i] = 0.0;
|
cannam@167
|
274 }
|
cannam@167
|
275 }
|
cannam@167
|
276
|
cannam@167
|
277 void after_problem_rcopy_from(bench_problem *p, bench_real *ri)
|
cannam@167
|
278 {
|
cannam@167
|
279 UNUSED(p);
|
cannam@167
|
280 do_scatter_in(ri);
|
cannam@167
|
281 if (plan_scramble_in) FFTW(execute)(plan_scramble_in);
|
cannam@167
|
282 }
|
cannam@167
|
283
|
cannam@167
|
284 void after_problem_rcopy_to(bench_problem *p, bench_real *ro)
|
cannam@167
|
285 {
|
cannam@167
|
286 UNUSED(p);
|
cannam@167
|
287 if (plan_unscramble_out) FFTW(execute)(plan_unscramble_out);
|
cannam@167
|
288 do_gather_out(ro);
|
cannam@167
|
289 }
|
cannam@167
|
290
|
cannam@167
|
291 void after_problem_ccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
|
cannam@167
|
292 {
|
cannam@167
|
293 UNUSED(ii);
|
cannam@167
|
294 after_problem_rcopy_from(p, ri);
|
cannam@167
|
295 }
|
cannam@167
|
296
|
cannam@167
|
297 void after_problem_ccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
|
cannam@167
|
298 {
|
cannam@167
|
299 UNUSED(io);
|
cannam@167
|
300 after_problem_rcopy_to(p, ro);
|
cannam@167
|
301 }
|
cannam@167
|
302
|
cannam@167
|
303 void after_problem_hccopy_from(bench_problem *p, bench_real *ri, bench_real *ii)
|
cannam@167
|
304 {
|
cannam@167
|
305 UNUSED(ii);
|
cannam@167
|
306 after_problem_rcopy_from(p, ri);
|
cannam@167
|
307 }
|
cannam@167
|
308
|
cannam@167
|
309 void after_problem_hccopy_to(bench_problem *p, bench_real *ro, bench_real *io)
|
cannam@167
|
310 {
|
cannam@167
|
311 UNUSED(io);
|
cannam@167
|
312 after_problem_rcopy_to(p, ro);
|
cannam@167
|
313 }
|
cannam@167
|
314
|
cannam@167
|
315 static FFTW(plan) mkplan_transpose_local(ptrdiff_t nx, ptrdiff_t ny,
|
cannam@167
|
316 ptrdiff_t vn,
|
cannam@167
|
317 bench_real *in, bench_real *out)
|
cannam@167
|
318 {
|
cannam@167
|
319 FFTW(iodim64) hdims[3];
|
cannam@167
|
320 FFTW(r2r_kind) k[3];
|
cannam@167
|
321 FFTW(plan) pln;
|
cannam@167
|
322
|
cannam@167
|
323 hdims[0].n = nx;
|
cannam@167
|
324 hdims[0].is = ny * vn;
|
cannam@167
|
325 hdims[0].os = vn;
|
cannam@167
|
326 hdims[1].n = ny;
|
cannam@167
|
327 hdims[1].is = vn;
|
cannam@167
|
328 hdims[1].os = nx * vn;
|
cannam@167
|
329 hdims[2].n = vn;
|
cannam@167
|
330 hdims[2].is = 1;
|
cannam@167
|
331 hdims[2].os = 1;
|
cannam@167
|
332 k[0] = k[1] = k[2] = FFTW_R2HC;
|
cannam@167
|
333 pln = FFTW(plan_guru64_r2r)(0, 0, 3, hdims, in, out, k, FFTW_ESTIMATE);
|
cannam@167
|
334 BENCH_ASSERT(pln != 0);
|
cannam@167
|
335 return pln;
|
cannam@167
|
336 }
|
cannam@167
|
337
|
cannam@167
|
338 static int tensor_rowmajor_transposedp(bench_tensor *t)
|
cannam@167
|
339 {
|
cannam@167
|
340 bench_iodim *d;
|
cannam@167
|
341 int i;
|
cannam@167
|
342
|
cannam@167
|
343 BENCH_ASSERT(BENCH_FINITE_RNK(t->rnk));
|
cannam@167
|
344 if (t->rnk < 2)
|
cannam@167
|
345 return 0;
|
cannam@167
|
346
|
cannam@167
|
347 d = t->dims;
|
cannam@167
|
348 if (d[0].is != d[1].is * d[1].n
|
cannam@167
|
349 || d[0].os != d[1].is
|
cannam@167
|
350 || d[1].os != d[0].os * d[0].n)
|
cannam@167
|
351 return 0;
|
cannam@167
|
352 if (t->rnk > 2 && d[1].is != d[2].is * d[2].n)
|
cannam@167
|
353 return 0;
|
cannam@167
|
354 for (i = 2; i + 1 < t->rnk; ++i) {
|
cannam@167
|
355 d = t->dims + i;
|
cannam@167
|
356 if (d[0].is != d[1].is * d[1].n
|
cannam@167
|
357 || d[0].os != d[1].os * d[1].n)
|
cannam@167
|
358 return 0;
|
cannam@167
|
359 }
|
cannam@167
|
360
|
cannam@167
|
361 if (t->rnk > 2 && t->dims[t->rnk-1].is != t->dims[t->rnk-1].os)
|
cannam@167
|
362 return 0;
|
cannam@167
|
363 return 1;
|
cannam@167
|
364 }
|
cannam@167
|
365
|
cannam@167
|
366 static int tensor_contiguousp(bench_tensor *t, int s)
|
cannam@167
|
367 {
|
cannam@167
|
368 return (t->dims[t->rnk-1].is == s
|
cannam@167
|
369 && ((tensor_rowmajorp(t) &&
|
cannam@167
|
370 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)
|
cannam@167
|
371 || tensor_rowmajor_transposedp(t)));
|
cannam@167
|
372 }
|
cannam@167
|
373
|
cannam@167
|
374 static FFTW(plan) mkplan_complex(bench_problem *p, unsigned flags)
|
cannam@167
|
375 {
|
cannam@167
|
376 FFTW(plan) pln = 0;
|
cannam@167
|
377 int i;
|
cannam@167
|
378 ptrdiff_t ntot;
|
cannam@167
|
379
|
cannam@167
|
380 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
cannam@167
|
381
|
cannam@167
|
382 if (p->sz->rnk < 1
|
cannam@167
|
383 || p->split
|
cannam@167
|
384 || !tensor_contiguousp(p->sz, vn)
|
cannam@167
|
385 || tensor_rowmajor_transposedp(p->sz)
|
cannam@167
|
386 || p->vecsz->rnk > 1
|
cannam@167
|
387 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
cannam@167
|
388 || p->vecsz->dims[0].os != 1)))
|
cannam@167
|
389 return 0;
|
cannam@167
|
390
|
cannam@167
|
391 alloc_rnk(p->sz->rnk);
|
cannam@167
|
392 for (i = 0; i < rnk; ++i) {
|
cannam@167
|
393 total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
cannam@167
|
394 local_ni[i] = local_no[i] = total_ni[i];
|
cannam@167
|
395 local_starti[i] = local_starto[i] = 0;
|
cannam@167
|
396 }
|
cannam@167
|
397 if (rnk > 1) {
|
cannam@167
|
398 ptrdiff_t n, start, nT, startT;
|
cannam@167
|
399 ntot = FFTW(mpi_local_size_many_transposed)
|
cannam@167
|
400 (p->sz->rnk, total_ni, vn,
|
cannam@167
|
401 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
402 MPI_COMM_WORLD,
|
cannam@167
|
403 &n, &start, &nT, &startT);
|
cannam@167
|
404 if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
cannam@167
|
405 local_ni[1] = nT;
|
cannam@167
|
406 local_starti[1] = startT;
|
cannam@167
|
407 }
|
cannam@167
|
408 else {
|
cannam@167
|
409 local_ni[0] = n;
|
cannam@167
|
410 local_starti[0] = start;
|
cannam@167
|
411 }
|
cannam@167
|
412 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
cannam@167
|
413 local_no[1] = nT;
|
cannam@167
|
414 local_starto[1] = startT;
|
cannam@167
|
415 }
|
cannam@167
|
416 else {
|
cannam@167
|
417 local_no[0] = n;
|
cannam@167
|
418 local_starto[0] = start;
|
cannam@167
|
419 }
|
cannam@167
|
420 }
|
cannam@167
|
421 else if (rnk == 1) {
|
cannam@167
|
422 ntot = FFTW(mpi_local_size_many_1d)
|
cannam@167
|
423 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
|
cannam@167
|
424 local_ni, local_starti, local_no, local_starto);
|
cannam@167
|
425 }
|
cannam@167
|
426 alloc_local(ntot * 2, p->in == p->out);
|
cannam@167
|
427
|
cannam@167
|
428 pln = FFTW(mpi_plan_many_dft)(p->sz->rnk, total_ni, vn,
|
cannam@167
|
429 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
430 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
431 (FFTW(complex) *) local_in,
|
cannam@167
|
432 (FFTW(complex) *) local_out,
|
cannam@167
|
433 MPI_COMM_WORLD, p->sign, flags);
|
cannam@167
|
434
|
cannam@167
|
435 vn *= 2;
|
cannam@167
|
436
|
cannam@167
|
437 if (rnk > 1) {
|
cannam@167
|
438 ptrdiff_t nrest = 1;
|
cannam@167
|
439 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
|
cannam@167
|
440 if (flags & FFTW_MPI_TRANSPOSED_IN)
|
cannam@167
|
441 plan_scramble_in = mkplan_transpose_local(
|
cannam@167
|
442 p->sz->dims[0].n, local_ni[1], vn * nrest,
|
cannam@167
|
443 local_in, local_in);
|
cannam@167
|
444 if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
cannam@167
|
445 plan_unscramble_out = mkplan_transpose_local(
|
cannam@167
|
446 local_no[1], p->sz->dims[0].n, vn * nrest,
|
cannam@167
|
447 local_out, local_out);
|
cannam@167
|
448 }
|
cannam@167
|
449
|
cannam@167
|
450 return pln;
|
cannam@167
|
451 }
|
cannam@167
|
452
|
cannam@167
|
453 static int tensor_real_contiguousp(bench_tensor *t, int sign, int s)
|
cannam@167
|
454 {
|
cannam@167
|
455 return (t->dims[t->rnk-1].is == s
|
cannam@167
|
456 && ((tensor_real_rowmajorp(t, sign, 1) &&
|
cannam@167
|
457 t->dims[t->rnk-1].is == t->dims[t->rnk-1].os)));
|
cannam@167
|
458 }
|
cannam@167
|
459
|
cannam@167
|
460 static FFTW(plan) mkplan_real(bench_problem *p, unsigned flags)
|
cannam@167
|
461 {
|
cannam@167
|
462 FFTW(plan) pln = 0;
|
cannam@167
|
463 int i;
|
cannam@167
|
464 ptrdiff_t ntot;
|
cannam@167
|
465
|
cannam@167
|
466 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
cannam@167
|
467
|
cannam@167
|
468 if (p->sz->rnk < 2
|
cannam@167
|
469 || p->split
|
cannam@167
|
470 || !tensor_real_contiguousp(p->sz, p->sign, vn)
|
cannam@167
|
471 || tensor_rowmajor_transposedp(p->sz)
|
cannam@167
|
472 || p->vecsz->rnk > 1
|
cannam@167
|
473 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
cannam@167
|
474 || p->vecsz->dims[0].os != 1)))
|
cannam@167
|
475 return 0;
|
cannam@167
|
476
|
cannam@167
|
477 alloc_rnk(p->sz->rnk);
|
cannam@167
|
478 for (i = 0; i < rnk; ++i) {
|
cannam@167
|
479 total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
cannam@167
|
480 local_ni[i] = local_no[i] = total_ni[i];
|
cannam@167
|
481 local_starti[i] = local_starto[i] = 0;
|
cannam@167
|
482 }
|
cannam@167
|
483 local_ni[rnk-1] = local_no[rnk-1] = total_ni[rnk-1] = total_no[rnk-1]
|
cannam@167
|
484 = p->sz->dims[rnk-1].n / 2 + 1;
|
cannam@167
|
485 {
|
cannam@167
|
486 ptrdiff_t n, start, nT, startT;
|
cannam@167
|
487 ntot = FFTW(mpi_local_size_many_transposed)
|
cannam@167
|
488 (p->sz->rnk, total_ni, vn,
|
cannam@167
|
489 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
490 MPI_COMM_WORLD,
|
cannam@167
|
491 &n, &start, &nT, &startT);
|
cannam@167
|
492 if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
cannam@167
|
493 local_ni[1] = nT;
|
cannam@167
|
494 local_starti[1] = startT;
|
cannam@167
|
495 }
|
cannam@167
|
496 else {
|
cannam@167
|
497 local_ni[0] = n;
|
cannam@167
|
498 local_starti[0] = start;
|
cannam@167
|
499 }
|
cannam@167
|
500 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
cannam@167
|
501 local_no[1] = nT;
|
cannam@167
|
502 local_starto[1] = startT;
|
cannam@167
|
503 }
|
cannam@167
|
504 else {
|
cannam@167
|
505 local_no[0] = n;
|
cannam@167
|
506 local_starto[0] = start;
|
cannam@167
|
507 }
|
cannam@167
|
508 }
|
cannam@167
|
509 alloc_local(ntot * 2, p->in == p->out);
|
cannam@167
|
510
|
cannam@167
|
511 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n;
|
cannam@167
|
512 if (p->sign < 0)
|
cannam@167
|
513 pln = FFTW(mpi_plan_many_dft_r2c)(p->sz->rnk, total_ni, vn,
|
cannam@167
|
514 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
515 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
516 local_in,
|
cannam@167
|
517 (FFTW(complex) *) local_out,
|
cannam@167
|
518 MPI_COMM_WORLD, flags);
|
cannam@167
|
519 else
|
cannam@167
|
520 pln = FFTW(mpi_plan_many_dft_c2r)(p->sz->rnk, total_ni, vn,
|
cannam@167
|
521 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
522 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
523 (FFTW(complex) *) local_in,
|
cannam@167
|
524 local_out,
|
cannam@167
|
525 MPI_COMM_WORLD, flags);
|
cannam@167
|
526
|
cannam@167
|
527 total_ni[rnk - 1] = p->sz->dims[rnk - 1].n / 2 + 1;
|
cannam@167
|
528 vn *= 2;
|
cannam@167
|
529
|
cannam@167
|
530 {
|
cannam@167
|
531 ptrdiff_t nrest = 1;
|
cannam@167
|
532 for (i = 2; i < rnk; ++i) nrest *= total_ni[i];
|
cannam@167
|
533 if (flags & FFTW_MPI_TRANSPOSED_IN)
|
cannam@167
|
534 plan_scramble_in = mkplan_transpose_local(
|
cannam@167
|
535 total_ni[0], local_ni[1], vn * nrest,
|
cannam@167
|
536 local_in, local_in);
|
cannam@167
|
537 if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
cannam@167
|
538 plan_unscramble_out = mkplan_transpose_local(
|
cannam@167
|
539 local_no[1], total_ni[0], vn * nrest,
|
cannam@167
|
540 local_out, local_out);
|
cannam@167
|
541 }
|
cannam@167
|
542
|
cannam@167
|
543 return pln;
|
cannam@167
|
544 }
|
cannam@167
|
545
|
cannam@167
|
546 static FFTW(plan) mkplan_transpose(bench_problem *p, unsigned flags)
|
cannam@167
|
547 {
|
cannam@167
|
548 ptrdiff_t ntot, nx, ny;
|
cannam@167
|
549 int ix=0, iy=1, i;
|
cannam@167
|
550 const bench_iodim *d = p->vecsz->dims;
|
cannam@167
|
551 FFTW(plan) pln;
|
cannam@167
|
552
|
cannam@167
|
553 if (p->vecsz->rnk == 3) {
|
cannam@167
|
554 for (i = 0; i < 3; ++i)
|
cannam@167
|
555 if (d[i].is == 1 && d[i].os == 1) {
|
cannam@167
|
556 vn = d[i].n;
|
cannam@167
|
557 ix = (i + 1) % 3;
|
cannam@167
|
558 iy = (i + 2) % 3;
|
cannam@167
|
559 break;
|
cannam@167
|
560 }
|
cannam@167
|
561 if (i == 3) return 0;
|
cannam@167
|
562 }
|
cannam@167
|
563 else {
|
cannam@167
|
564 vn = 1;
|
cannam@167
|
565 ix = 0;
|
cannam@167
|
566 iy = 1;
|
cannam@167
|
567 }
|
cannam@167
|
568
|
cannam@167
|
569 if (d[ix].is == d[iy].n * vn && d[ix].os == vn
|
cannam@167
|
570 && d[iy].os == d[ix].n * vn && d[iy].is == vn) {
|
cannam@167
|
571 nx = d[ix].n;
|
cannam@167
|
572 ny = d[iy].n;
|
cannam@167
|
573 }
|
cannam@167
|
574 else if (d[iy].is == d[ix].n * vn && d[iy].os == vn
|
cannam@167
|
575 && d[ix].os == d[iy].n * vn && d[ix].is == vn) {
|
cannam@167
|
576 nx = d[iy].n;
|
cannam@167
|
577 ny = d[ix].n;
|
cannam@167
|
578 }
|
cannam@167
|
579 else
|
cannam@167
|
580 return 0;
|
cannam@167
|
581
|
cannam@167
|
582 alloc_rnk(2);
|
cannam@167
|
583 ntot = vn * FFTW(mpi_local_size_2d_transposed)(nx, ny, MPI_COMM_WORLD,
|
cannam@167
|
584 &local_ni[0],
|
cannam@167
|
585 &local_starti[0],
|
cannam@167
|
586 &local_no[0],
|
cannam@167
|
587 &local_starto[0]);
|
cannam@167
|
588 local_ni[1] = ny;
|
cannam@167
|
589 local_starti[1] = 0;
|
cannam@167
|
590 local_no[1] = nx;
|
cannam@167
|
591 local_starto[1] = 0;
|
cannam@167
|
592 total_ni[0] = nx; total_ni[1] = ny;
|
cannam@167
|
593 total_no[1] = nx; total_no[0] = ny;
|
cannam@167
|
594 alloc_local(ntot, p->in == p->out);
|
cannam@167
|
595
|
cannam@167
|
596 pln = FFTW(mpi_plan_many_transpose)(nx, ny, vn,
|
cannam@167
|
597 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
598 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
599 local_in, local_out,
|
cannam@167
|
600 MPI_COMM_WORLD, flags);
|
cannam@167
|
601
|
cannam@167
|
602 if (flags & FFTW_MPI_TRANSPOSED_IN)
|
cannam@167
|
603 plan_scramble_in = mkplan_transpose_local(local_ni[0], ny, vn,
|
cannam@167
|
604 local_in, local_in);
|
cannam@167
|
605 if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
cannam@167
|
606 plan_unscramble_out = mkplan_transpose_local
|
cannam@167
|
607 (nx, local_no[0], vn, local_out, local_out);
|
cannam@167
|
608
|
cannam@167
|
609 #if 0
|
cannam@167
|
610 if (pln && vn == 1) {
|
cannam@167
|
611 int i, j;
|
cannam@167
|
612 bench_real *ri = (bench_real *) p->in;
|
cannam@167
|
613 bench_real *ro = (bench_real *) p->out;
|
cannam@167
|
614 if (!ri || !ro) return pln;
|
cannam@167
|
615 setup_gather_scatter();
|
cannam@167
|
616 for (i = 0; i < nx * ny; ++i)
|
cannam@167
|
617 ri[i] = i;
|
cannam@167
|
618 after_problem_rcopy_from(p, ri);
|
cannam@167
|
619 FFTW(execute)(pln);
|
cannam@167
|
620 after_problem_rcopy_to(p, ro);
|
cannam@167
|
621 if (my_pe == 0) {
|
cannam@167
|
622 for (i = 0; i < nx; ++i) {
|
cannam@167
|
623 for (j = 0; j < ny; ++j)
|
cannam@167
|
624 printf(" %3g", ro[j * nx + i]);
|
cannam@167
|
625 printf("\n");
|
cannam@167
|
626 }
|
cannam@167
|
627 }
|
cannam@167
|
628 }
|
cannam@167
|
629 #endif
|
cannam@167
|
630
|
cannam@167
|
631 return pln;
|
cannam@167
|
632 }
|
cannam@167
|
633
|
cannam@167
|
634 static FFTW(plan) mkplan_r2r(bench_problem *p, unsigned flags)
|
cannam@167
|
635 {
|
cannam@167
|
636 FFTW(plan) pln = 0;
|
cannam@167
|
637 int i;
|
cannam@167
|
638 ptrdiff_t ntot;
|
cannam@167
|
639 FFTW(r2r_kind) *k;
|
cannam@167
|
640
|
cannam@167
|
641 if ((p->sz->rnk == 0 || (p->sz->rnk == 1 && p->sz->dims[0].n == 1))
|
cannam@167
|
642 && p->vecsz->rnk >= 2 && p->vecsz->rnk <= 3)
|
cannam@167
|
643 return mkplan_transpose(p, flags);
|
cannam@167
|
644
|
cannam@167
|
645 vn = p->vecsz->rnk == 1 ? p->vecsz->dims[0].n : 1;
|
cannam@167
|
646
|
cannam@167
|
647 if (p->sz->rnk < 1
|
cannam@167
|
648 || p->split
|
cannam@167
|
649 || !tensor_contiguousp(p->sz, vn)
|
cannam@167
|
650 || tensor_rowmajor_transposedp(p->sz)
|
cannam@167
|
651 || p->vecsz->rnk > 1
|
cannam@167
|
652 || (p->vecsz->rnk == 1 && (p->vecsz->dims[0].is != 1
|
cannam@167
|
653 || p->vecsz->dims[0].os != 1)))
|
cannam@167
|
654 return 0;
|
cannam@167
|
655
|
cannam@167
|
656 alloc_rnk(p->sz->rnk);
|
cannam@167
|
657 for (i = 0; i < rnk; ++i) {
|
cannam@167
|
658 total_ni[i] = total_no[i] = p->sz->dims[i].n;
|
cannam@167
|
659 local_ni[i] = local_no[i] = total_ni[i];
|
cannam@167
|
660 local_starti[i] = local_starto[i] = 0;
|
cannam@167
|
661 }
|
cannam@167
|
662 if (rnk > 1) {
|
cannam@167
|
663 ptrdiff_t n, start, nT, startT;
|
cannam@167
|
664 ntot = FFTW(mpi_local_size_many_transposed)
|
cannam@167
|
665 (p->sz->rnk, total_ni, vn,
|
cannam@167
|
666 FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
667 MPI_COMM_WORLD,
|
cannam@167
|
668 &n, &start, &nT, &startT);
|
cannam@167
|
669 if (flags & FFTW_MPI_TRANSPOSED_IN) {
|
cannam@167
|
670 local_ni[1] = nT;
|
cannam@167
|
671 local_starti[1] = startT;
|
cannam@167
|
672 }
|
cannam@167
|
673 else {
|
cannam@167
|
674 local_ni[0] = n;
|
cannam@167
|
675 local_starti[0] = start;
|
cannam@167
|
676 }
|
cannam@167
|
677 if (flags & FFTW_MPI_TRANSPOSED_OUT) {
|
cannam@167
|
678 local_no[1] = nT;
|
cannam@167
|
679 local_starto[1] = startT;
|
cannam@167
|
680 }
|
cannam@167
|
681 else {
|
cannam@167
|
682 local_no[0] = n;
|
cannam@167
|
683 local_starto[0] = start;
|
cannam@167
|
684 }
|
cannam@167
|
685 }
|
cannam@167
|
686 else if (rnk == 1) {
|
cannam@167
|
687 ntot = FFTW(mpi_local_size_many_1d)
|
cannam@167
|
688 (total_ni[0], vn, MPI_COMM_WORLD, p->sign, flags,
|
cannam@167
|
689 local_ni, local_starti, local_no, local_starto);
|
cannam@167
|
690 }
|
cannam@167
|
691 alloc_local(ntot, p->in == p->out);
|
cannam@167
|
692
|
cannam@167
|
693 k = (FFTW(r2r_kind) *) bench_malloc(sizeof(FFTW(r2r_kind)) * p->sz->rnk);
|
cannam@167
|
694 for (i = 0; i < p->sz->rnk; ++i)
|
cannam@167
|
695 switch (p->k[i]) {
|
cannam@167
|
696 case R2R_R2HC: k[i] = FFTW_R2HC; break;
|
cannam@167
|
697 case R2R_HC2R: k[i] = FFTW_HC2R; break;
|
cannam@167
|
698 case R2R_DHT: k[i] = FFTW_DHT; break;
|
cannam@167
|
699 case R2R_REDFT00: k[i] = FFTW_REDFT00; break;
|
cannam@167
|
700 case R2R_REDFT01: k[i] = FFTW_REDFT01; break;
|
cannam@167
|
701 case R2R_REDFT10: k[i] = FFTW_REDFT10; break;
|
cannam@167
|
702 case R2R_REDFT11: k[i] = FFTW_REDFT11; break;
|
cannam@167
|
703 case R2R_RODFT00: k[i] = FFTW_RODFT00; break;
|
cannam@167
|
704 case R2R_RODFT01: k[i] = FFTW_RODFT01; break;
|
cannam@167
|
705 case R2R_RODFT10: k[i] = FFTW_RODFT10; break;
|
cannam@167
|
706 case R2R_RODFT11: k[i] = FFTW_RODFT11; break;
|
cannam@167
|
707 default: BENCH_ASSERT(0);
|
cannam@167
|
708 }
|
cannam@167
|
709
|
cannam@167
|
710 pln = FFTW(mpi_plan_many_r2r)(p->sz->rnk, total_ni, vn,
|
cannam@167
|
711 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
712 FFTW_MPI_DEFAULT_BLOCK,
|
cannam@167
|
713 local_in, local_out,
|
cannam@167
|
714 MPI_COMM_WORLD, k, flags);
|
cannam@167
|
715 bench_free(k);
|
cannam@167
|
716
|
cannam@167
|
717 if (rnk > 1) {
|
cannam@167
|
718 ptrdiff_t nrest = 1;
|
cannam@167
|
719 for (i = 2; i < rnk; ++i) nrest *= p->sz->dims[i].n;
|
cannam@167
|
720 if (flags & FFTW_MPI_TRANSPOSED_IN)
|
cannam@167
|
721 plan_scramble_in = mkplan_transpose_local(
|
cannam@167
|
722 p->sz->dims[0].n, local_ni[1], vn * nrest,
|
cannam@167
|
723 local_in, local_in);
|
cannam@167
|
724 if (flags & FFTW_MPI_TRANSPOSED_OUT)
|
cannam@167
|
725 plan_unscramble_out = mkplan_transpose_local(
|
cannam@167
|
726 local_no[1], p->sz->dims[0].n, vn * nrest,
|
cannam@167
|
727 local_out, local_out);
|
cannam@167
|
728 }
|
cannam@167
|
729
|
cannam@167
|
730 return pln;
|
cannam@167
|
731 }
|
cannam@167
|
732
|
cannam@167
|
733 FFTW(plan) mkplan(bench_problem *p, unsigned flags)
|
cannam@167
|
734 {
|
cannam@167
|
735 FFTW(plan) pln = 0;
|
cannam@167
|
736 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
|
cannam@167
|
737 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
|
cannam@167
|
738 if (p->scrambled_in) {
|
cannam@167
|
739 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
|
cannam@167
|
740 flags |= FFTW_MPI_SCRAMBLED_IN;
|
cannam@167
|
741 else
|
cannam@167
|
742 flags |= FFTW_MPI_TRANSPOSED_IN;
|
cannam@167
|
743 }
|
cannam@167
|
744 if (p->scrambled_out) {
|
cannam@167
|
745 if (p->sz->rnk == 1 && p->sz->dims[0].n != 1)
|
cannam@167
|
746 flags |= FFTW_MPI_SCRAMBLED_OUT;
|
cannam@167
|
747 else
|
cannam@167
|
748 flags |= FFTW_MPI_TRANSPOSED_OUT;
|
cannam@167
|
749 }
|
cannam@167
|
750 switch (p->kind) {
|
cannam@167
|
751 case PROBLEM_COMPLEX:
|
cannam@167
|
752 pln =mkplan_complex(p, flags);
|
cannam@167
|
753 break;
|
cannam@167
|
754 case PROBLEM_REAL:
|
cannam@167
|
755 pln = mkplan_real(p, flags);
|
cannam@167
|
756 break;
|
cannam@167
|
757 case PROBLEM_R2R:
|
cannam@167
|
758 pln = mkplan_r2r(p, flags);
|
cannam@167
|
759 break;
|
cannam@167
|
760 default: BENCH_ASSERT(0);
|
cannam@167
|
761 }
|
cannam@167
|
762 if (pln) setup_gather_scatter();
|
cannam@167
|
763 return pln;
|
cannam@167
|
764 }
|
cannam@167
|
765
|
cannam@167
|
766 void main_init(int *argc, char ***argv)
|
cannam@167
|
767 {
|
cannam@167
|
768 #ifdef HAVE_SMP
|
cannam@167
|
769 # if MPI_VERSION >= 2 /* for MPI_Init_thread */
|
cannam@167
|
770 int provided;
|
cannam@167
|
771 MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, &provided);
|
cannam@167
|
772 threads_ok = provided >= MPI_THREAD_FUNNELED;
|
cannam@167
|
773 # else
|
cannam@167
|
774 MPI_Init(argc, argv);
|
cannam@167
|
775 threads_ok = 0;
|
cannam@167
|
776 # endif
|
cannam@167
|
777 #else
|
cannam@167
|
778 MPI_Init(argc, argv);
|
cannam@167
|
779 #endif
|
cannam@167
|
780 MPI_Comm_rank(MPI_COMM_WORLD, &my_pe);
|
cannam@167
|
781 MPI_Comm_size(MPI_COMM_WORLD, &n_pes);
|
cannam@167
|
782 if (my_pe != 0) verbose = -999;
|
cannam@167
|
783 no_speed_allocation = 1; /* so we can benchmark transforms > memory */
|
cannam@167
|
784 always_pad_real = 1; /* out-of-place real transforms are padded */
|
cannam@167
|
785 isend_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
|
cannam@167
|
786 isend_off = (int *) bench_malloc(sizeof(int) * n_pes);
|
cannam@167
|
787 orecv_cnt = (int *) bench_malloc(sizeof(int) * n_pes);
|
cannam@167
|
788 orecv_off = (int *) bench_malloc(sizeof(int) * n_pes);
|
cannam@167
|
789
|
cannam@167
|
790 /* init_threads must be called before any other FFTW function,
|
cannam@167
|
791 including mpi_init, because it has to register the threads hooks
|
cannam@167
|
792 before the planner is initalized */
|
cannam@167
|
793 #ifdef HAVE_SMP
|
cannam@167
|
794 if (threads_ok) { BENCH_ASSERT(FFTW(init_threads)()); }
|
cannam@167
|
795 #endif
|
cannam@167
|
796 FFTW(mpi_init)();
|
cannam@167
|
797 }
|
cannam@167
|
798
|
cannam@167
|
799 void initial_cleanup(void)
|
cannam@167
|
800 {
|
cannam@167
|
801 alloc_rnk(0);
|
cannam@167
|
802 alloc_local(0, 0);
|
cannam@167
|
803 bench_free(all_local_in); all_local_in = 0;
|
cannam@167
|
804 bench_free(all_local_out); all_local_out = 0;
|
cannam@167
|
805 bench_free(isend_off); isend_off = 0;
|
cannam@167
|
806 bench_free(isend_cnt); isend_cnt = 0;
|
cannam@167
|
807 bench_free(orecv_off); orecv_off = 0;
|
cannam@167
|
808 bench_free(orecv_cnt); orecv_cnt = 0;
|
cannam@167
|
809 FFTW(destroy_plan)(plan_scramble_in); plan_scramble_in = 0;
|
cannam@167
|
810 FFTW(destroy_plan)(plan_unscramble_out); plan_unscramble_out = 0;
|
cannam@167
|
811 }
|
cannam@167
|
812
|
cannam@167
|
813 void final_cleanup(void)
|
cannam@167
|
814 {
|
cannam@167
|
815 MPI_Finalize();
|
cannam@167
|
816 }
|
cannam@167
|
817
|
cannam@167
|
818 void bench_exit(int status)
|
cannam@167
|
819 {
|
cannam@167
|
820 MPI_Abort(MPI_COMM_WORLD, status);
|
cannam@167
|
821 }
|
cannam@167
|
822
|
cannam@167
|
823 double bench_cost_postprocess(double cost)
|
cannam@167
|
824 {
|
cannam@167
|
825 double cost_max;
|
cannam@167
|
826 MPI_Allreduce(&cost, &cost_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
cannam@167
|
827 return cost_max;
|
cannam@167
|
828 }
|
cannam@167
|
829
|
cannam@167
|
830
|
cannam@167
|
831 int import_wisdom(FILE *f)
|
cannam@167
|
832 {
|
cannam@167
|
833 int success = 1, sall;
|
cannam@167
|
834 if (my_pe == 0) success = FFTW(import_wisdom_from_file)(f);
|
cannam@167
|
835 FFTW(mpi_broadcast_wisdom)(MPI_COMM_WORLD);
|
cannam@167
|
836 MPI_Allreduce(&success, &sall, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
|
cannam@167
|
837 return sall;
|
cannam@167
|
838 }
|
cannam@167
|
839
|
cannam@167
|
840 void export_wisdom(FILE *f)
|
cannam@167
|
841 {
|
cannam@167
|
842 FFTW(mpi_gather_wisdom)(MPI_COMM_WORLD);
|
cannam@167
|
843 if (my_pe == 0) FFTW(export_wisdom_to_file)(f);
|
cannam@167
|
844 }
|