sv-dependency-builds: src/fftw-3.3.8/mpi/api.c @ changeset 82:d0c2a83c1364
(compared with parent 81:7029a4916348)
Add FFTW 3.3.8 source, and a Linux build

author | Chris Cannam
---|---
date | Tue, 19 Nov 2019 14:52:55 +0000
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "api/api.h"
#include "fftw3-mpi.h"
#include "ifftw-mpi.h"
#include "mpi-transpose.h"
#include "mpi-dft.h"
#include "mpi-rdft.h"
#include "mpi-rdft2.h"

/* Convert API flags to internal MPI flags. */
#define MPI_FLAGS(f) ((f) >> 27)

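/* The MPI-specific planner flags live in the top bits of the 32-bit
   flags word: fftw3-mpi.h defines FFTW_MPI_SCRAMBLED_IN as (1U << 27),
   FFTW_MPI_SCRAMBLED_OUT as (1U << 28), FFTW_MPI_TRANSPOSED_IN as
   (1U << 29), and FFTW_MPI_TRANSPOSED_OUT as (1U << 30), so shifting
   right by 27 extracts exactly the MPI bits.  For example:

       MPI_FLAGS(FFTW_MPI_SCRAMBLED_IN)   == 1   // bit 27 -> bit 0
       MPI_FLAGS(FFTW_MPI_TRANSPOSED_OUT) == 8   // bit 30 -> bit 3

   Ordinary planner flags such as FFTW_ESTIMATE occupy the low bits
   and are discarded by the shift. */
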
/*************************************************************************/

static int mpi_inited = 0;

static MPI_Comm problem_comm(const problem *p) {
     switch (p->adt->problem_kind) {
         case PROBLEM_MPI_DFT:
              return ((const problem_mpi_dft *) p)->comm;
         case PROBLEM_MPI_RDFT:
              return ((const problem_mpi_rdft *) p)->comm;
         case PROBLEM_MPI_RDFT2:
              return ((const problem_mpi_rdft2 *) p)->comm;
         case PROBLEM_MPI_TRANSPOSE:
              return ((const problem_mpi_transpose *) p)->comm;
         default:
              return MPI_COMM_NULL;
     }
}

/* used to synchronize cost measurements (timing or estimation)
   across all processes for an MPI problem, which is critical to
   ensure that all processes decide to use the same MPI plans
   (whereas serial plans need not be synchronized). */
static double cost_hook(const problem *p, double t, cost_kind k)
{
     MPI_Comm comm = problem_comm(p);
     double tsum;
     if (comm == MPI_COMM_NULL) return t;
     MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE,
                   k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
     return tsum;
}

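/* A worked example of the reduction above: if four processes measure
   times 1.0, 2.0, 1.5, and 3.0 for the same MPI problem, a COST_MAX
   reduction makes every rank see 3.0, so all ranks score candidate
   plans identically and select the same winner.  Without this,
   rank-dependent timing noise could lead different ranks to choose
   different collective algorithms, which would deadlock at execution. */
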
/* Used to reject wisdom that is not in sync across all processes
   for an MPI problem, which is critical to ensure that all processes
   decide to use the same MPI plans.  (Even though costs are synchronized,
   above, out-of-sync wisdom may result from plans being produced
   by communicators that do not span all processes, either from a
   user-specified communicator or e.g. from transpose-recurse.) */
static int wisdom_ok_hook(const problem *p, flags_t flags)
{
     MPI_Comm comm = problem_comm(p);
     int eq_me, eq_all;
     /* unpack flags bitfield, since MPI communications may involve
        byte-order changes and MPI cannot do this for bit fields */
#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
     unsigned int f[5];
#else
     unsigned long f[5]; /* at least 32 bits as per C standard */
#endif

     if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */

     if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */

     /* otherwise, check that the flags and solver index are identical
        on all processes in this problem's communicator.

        TO DO: possibly we can relax strict equality, but it is
        critical to ensure that any flags which affect what plan is
        created (and whether the solver is applicable) are the same,
        e.g. DESTROY_INPUT, NO_UGLY, etcetera.  (If the MPI algorithm
        differs between processes, deadlocks/crashes generally result.) */
     f[0] = flags.l;
     f[1] = flags.hash_info;
     f[2] = flags.timelimit_impatience;
     f[3] = flags.u;
     f[4] = flags.slvndx;
     MPI_Bcast(f, 5,
               SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
               0, comm);
     eq_me = f[0] == flags.l && f[1] == flags.hash_info
          && f[2] == flags.timelimit_impatience
          && f[3] == flags.u && f[4] == flags.slvndx;
     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
     return eq_all;
}

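/* The broadcast/compare/reduce sequence above is a general MPI idiom
   for checking that a value is identical on every rank: rank 0
   broadcasts its copy, each rank compares locally, and a logical-AND
   allreduce combines the verdicts.  A minimal standalone sketch of the
   same pattern (illustrative only, not part of this file's API):

       static int all_ranks_agree(int value, MPI_Comm comm)
       {
            int ref = value, mine, all;
            MPI_Bcast(&ref, 1, MPI_INT, 0, comm);  // rank 0's copy
            mine = (ref == value);                 // local verdict
            MPI_Allreduce(&mine, &all, 1, MPI_INT, MPI_LAND, comm);
            return all;                            // 1 iff unanimous
       }
*/
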
/* This hook is called when wisdom is not found.  The any_true here
   matches up with the any_true in wisdom_ok_hook, in order to handle
   the case where some processes had wisdom (and called wisdom_ok_hook)
   and some processes didn't have wisdom (and called nowisdom_hook). */
static void nowisdom_hook(const problem *p)
{
     MPI_Comm comm = problem_comm(p);
     if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
     XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
}

/* needed to synchronize planner bogosity flag, in case non-MPI problems
   on a subset of processes encountered bogus wisdom */
static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
{
     MPI_Comm comm = problem_comm(p);
     if (comm != MPI_COMM_NULL /* an MPI problem */
         && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
          return WISDOM_IS_BOGUS;
     return state;
}

void XM(init)(void)
{
     if (!mpi_inited) {
          planner *plnr = X(the_planner)();
          plnr->cost_hook = cost_hook;
          plnr->wisdom_ok_hook = wisdom_ok_hook;
          plnr->nowisdom_hook = nowisdom_hook;
          plnr->bogosity_hook = bogosity_hook;
          XM(conf_standard)(plnr);
          mpi_inited = 1;
     }
}

void XM(cleanup)(void)
{
     X(cleanup)();
     mpi_inited = 0;
}

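/* XM(init) and XM(cleanup) are the public fftw_mpi_init() and
   fftw_mpi_cleanup() entry points (XM prepends the precision-specific
   MPI prefix).  A minimal caller sketch, assuming a double-precision
   build:

       #include <fftw3-mpi.h>

       int main(int argc, char **argv)
       {
            MPI_Init(&argc, &argv);
            fftw_mpi_init();      // installs the planner hooks above

            // ... plan, execute, and destroy transforms ...

            fftw_mpi_cleanup();   // also calls the serial X(cleanup)
            MPI_Finalize();
            return 0;
       }
*/
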
/*************************************************************************/

static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
{
     dtensor *x = XM(mkdtensor)(rnk);
     int i;
     for (i = 0; i < rnk; ++i) {
          x->dims[i].n = dims0[i].n;
          x->dims[i].b[IB] = dims0[i].ib;
          x->dims[i].b[OB] = dims0[i].ob;
     }
     return x;
}

static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
                           int rdft2)
{
     dtensor *sz = XM(mkdtensor)(rnk);
     dtensor *sz0 = mkdtensor_api(rnk, dims0);
     block_kind k;
     int i;

     for (i = 0; i < rnk; ++i)
          sz->dims[i].n = dims0[i].n;

     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;

     for (i = 0; i < rnk; ++i) {
          sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
          sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
     }

     /* If we haven't used all of the processes yet, and some of the
        block sizes weren't specified (i.e. 0), then set the
        unspecified blocks so as to use as many processes as
        possible with as few distributed dimensions as possible. */
     FORALL_BLOCK_KIND(k) {
          INT nb = XM(num_blocks_total)(sz, k);
          INT np = n_pes / nb;
          for (i = 0; i < rnk && np > 1; ++i)
               if (!sz0->dims[i].b[k]) {
                    sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
                    nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
                    np = n_pes / nb;
               }
     }

     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;

     /* punt for 1d prime */
     if (rnk == 1 && X(is_prime)(sz->dims[0].n))
          sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;

     XM(dtensor_destroy)(sz0);
     sz0 = XM(dtensor_canonical)(sz, 0);
     XM(dtensor_destroy)(sz);
     return sz0;
}

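/* A worked example of the block-defaulting loop above, assuming
   XM(default_block)(n, np) is the ceiling division (n + np - 1) / np
   from ifftw-mpi.h: for n = 100 on 4 processes the default block is
   25, giving four equal slabs; for n = 10 on 4 processes it is 3,
   giving slabs of 3, 3, 3, and 1.  Because the loop fills unspecified
   blocks starting from dimension 0 and stops once np reaches 1, a
   multi-dimensional problem is distributed along as few leading
   dimensions as suffice to occupy all processes. */
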
/* allocate simple local (serial) dims array corresponding to n[rnk] */
static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
{
     XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
                                          TENSORS);
     int i;
     for (i = 0; i < rnk; ++i)
          dims[i].n = dims[i].ib = dims[i].ob = n[i];
     return dims;
}

/*************************************************************************/

static void local_size(int my_pe, const dtensor *sz, block_kind k,
                       ptrdiff_t *local_n, ptrdiff_t *local_start)
{
     int i;
     if (my_pe >= XM(num_blocks_total)(sz, k))
          for (i = 0; i < sz->rnk; ++i)
               local_n[i] = local_start[i] = 0;
     else {
          XM(block_coords)(sz, k, my_pe, local_start);
          for (i = 0; i < sz->rnk; ++i) {
               local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
                                      local_start[i]);
               local_start[i] *= sz->dims[i].b[k];
          }
     }
}

static INT prod(int rnk, const ptrdiff_t *local_n)
{
     int i;
     INT N = 1;
     for (i = 0; i < rnk; ++i) N *= local_n[i];
     return N;
}

ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
                              ptrdiff_t howmany, MPI_Comm comm,
                              ptrdiff_t *local_n_in,
                              ptrdiff_t *local_start_in,
                              ptrdiff_t *local_n_out,
                              ptrdiff_t *local_start_out,
                              int sign, unsigned flags)
{
     INT N;
     int my_pe, n_pes, i;
     dtensor *sz;

     if (rnk == 0)
          return howmany;

     MPI_Comm_rank(comm, &my_pe);
     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     /* Now, we must figure out how much local space the user should
        allocate (or at least an upper bound).  This depends strongly
        on the exact algorithms we employ...ugh!  FIXME: get this info
        from the solvers somehow? */
     N = 1; /* never return zero allocation size */
     if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
          INT Nafter;
          ddim odims[2];

          /* dft-rank-geq2-transposed */
          odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
          /* we may need extra space for transposed intermediate data */
          for (i = 0; i < 2; ++i)
               if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
                   XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
                    sz->dims[i].b[IB]
                         = XM(default_block)(sz->dims[i].n, n_pes);
                    sz->dims[1-i].b[IB] = sz->dims[1-i].n;
                    local_size(my_pe, sz, IB, local_n_in, local_start_in);
                    N = X(imax)(N, prod(rnk, local_n_in));
                    sz->dims[i] = odims[i];
                    sz->dims[1-i] = odims[1-i];
                    break;
               }

          /* dft-rank-geq2 */
          Nafter = howmany;
          for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
          N = X(imax)(N, (sz->dims[0].n
                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
                                      my_pe) + howmany - 1) / howmany);

          /* dft-rank-geq2 with dimensions swapped */
          Nafter = howmany * sz->dims[0].n;
          for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
          N = X(imax)(N, (sz->dims[1].n
                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
                                      my_pe) + howmany - 1) / howmany);
     }
     else if (rnk == 1) {
          if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
               ptrdiff_t n[2], start[2];
               dtensor *sz2 = XM(mkdtensor)(2);
               sz2->dims[0] = sz->dims[0];
               sz2->dims[0].b[IB] = sz->dims[0].n;
               sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
               sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
               local_size(my_pe, sz2, IB, n, start);
               XM(dtensor_destroy)(sz2);
               N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
          }
          else { /* dft-rank1 */
               INT r, m, rblock[2], mblock[2];

               /* Since the 1d transforms are so different, we require
                  the user to call local_size_1d for this case.  Ugh. */
               CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);

               if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
                                         rblock, mblock))) {
                    m = sz->dims[0].n / r;
                    if (flags & FFTW_MPI_SCRAMBLED_IN)
                         sz->dims[0].b[IB] = rblock[IB] * m;
                    else { /* !SCRAMBLED_IN */
                         sz->dims[0].b[IB] = r * mblock[IB];
                         N = X(imax)(N, rblock[IB] * m);
                    }
                    if (flags & FFTW_MPI_SCRAMBLED_OUT)
                         sz->dims[0].b[OB] = r * mblock[OB];
                    else { /* !SCRAMBLED_OUT */
                         N = X(imax)(N, r * mblock[OB]);
                         sz->dims[0].b[OB] = rblock[OB] * m;
                    }
               }
          }
     }

     local_size(my_pe, sz, IB, local_n_in, local_start_in);
     local_size(my_pe, sz, OB, local_n_out, local_start_out);

     /* at least, make sure we have enough space to store input & output */
     N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));

     XM(dtensor_destroy)(sz);
     return N * howmany;
}

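/* Note on the contract of local_size_guru: the returned N * howmany is
   an upper bound on the number of elements this process must allocate,
   and may exceed both prod(local_n_in) and prod(local_n_out), because
   intermediate steps (e.g. the transposed layouts probed above) can
   need more scratch than either the input or the output slab.  Callers
   should allocate the returned amount rather than recomputing a size
   from local_n alone. */
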
ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
                                         ptrdiff_t howmany,
                                         ptrdiff_t xblock, ptrdiff_t yblock,
                                         MPI_Comm comm,
                                         ptrdiff_t *local_nx,
                                         ptrdiff_t *local_x_start,
                                         ptrdiff_t *local_ny,
                                         ptrdiff_t *local_y_start)
{
     ptrdiff_t N;
     XM(ddim) *dims;
     ptrdiff_t *local;

     if (rnk == 0) {
          *local_nx = *local_ny = 1;
          *local_x_start = *local_y_start = 0;
          return howmany;
     }

     dims = simple_dims(rnk, n);
     local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);

     /* default 1d block distribution, with transposed output
        if yblock < n[1] */
     dims[0].ib = xblock;
     if (rnk > 1) {
          if (yblock < n[1])
               dims[1].ob = yblock;
          else
               dims[0].ob = xblock;
     }
     else
          dims[0].ob = xblock; /* FIXME: 1d not really supported here
                                  since we don't have flags/sign */

     N = XM(local_size_guru)(rnk, dims, howmany, comm,
                             local, local + rnk,
                             local + 2*rnk, local + 3*rnk,
                             0, 0);
     *local_nx = local[0];
     *local_x_start = local[rnk];
     if (rnk > 1) {
          *local_ny = local[2*rnk + 1];
          *local_y_start = local[3*rnk + 1];
     }
     else {
          *local_ny = *local_nx;
          *local_y_start = *local_x_start;
     }
     X(ifree)(local);
     X(ifree)(dims);
     return N;
}

ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t xblock,
                              MPI_Comm comm,
                              ptrdiff_t *local_nx,
                              ptrdiff_t *local_x_start)
{
     ptrdiff_t local_ny, local_y_start;
     return XM(local_size_many_transposed)(rnk, n, howmany,
                                           xblock, rnk > 1
                                           ? n[1] : FFTW_MPI_DEFAULT_BLOCK,
                                           comm,
                                           local_nx, local_x_start,
                                           &local_ny, &local_y_start);
}

ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
                                    MPI_Comm comm,
                                    ptrdiff_t *local_nx,
                                    ptrdiff_t *local_x_start,
                                    ptrdiff_t *local_ny,
                                    ptrdiff_t *local_y_start)
{
     return XM(local_size_many_transposed)(rnk, n, 1,
                                           FFTW_MPI_DEFAULT_BLOCK,
                                           FFTW_MPI_DEFAULT_BLOCK,
                                           comm,
                                           local_nx, local_x_start,
                                           local_ny, local_y_start);
}

ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
                         MPI_Comm comm,
                         ptrdiff_t *local_nx,
                         ptrdiff_t *local_x_start)
{
     return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
                                local_nx, local_x_start);
}

ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany,
                                 MPI_Comm comm, int sign, unsigned flags,
                                 ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
                                 ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
     XM(ddim) d;
     d.n = nx;
     d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
     return XM(local_size_guru)(1, &d, howmany, comm,
                                local_nx, local_x_start,
                                local_ny, local_y_start, sign, flags);
}

ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
                            MPI_Comm comm, int sign, unsigned flags,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
                            ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
     return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
                                   local_nx, local_x_start,
                                   local_ny, local_y_start);
}

ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
                                       MPI_Comm comm,
                                       ptrdiff_t *local_nx,
                                       ptrdiff_t *local_x_start,
                                       ptrdiff_t *local_ny,
                                       ptrdiff_t *local_y_start)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(local_size_transposed)(2, n, comm,
                                      local_nx, local_x_start,
                                      local_ny, local_y_start);
}

ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(local_size)(2, n, comm, local_nx, local_x_start);
}

ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
                                       ptrdiff_t nz,
                                       MPI_Comm comm,
                                       ptrdiff_t *local_nx,
                                       ptrdiff_t *local_x_start,
                                       ptrdiff_t *local_ny,
                                       ptrdiff_t *local_y_start)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(local_size_transposed)(3, n, comm,
                                      local_nx, local_x_start,
                                      local_ny, local_y_start);
}

ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            MPI_Comm comm,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(local_size)(3, n, comm, local_nx, local_x_start);
}

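/* A minimal caller sketch for the local-size API above, in the
   canonical slab-decomposition pattern from the FFTW MPI manual
   (double precision, complex data):

       ptrdiff_t local_n0, local_0_start, alloc_local;
       fftw_complex *data;

       alloc_local = fftw_mpi_local_size_2d(nx, ny, MPI_COMM_WORLD,
                                            &local_n0, &local_0_start);
       data = fftw_alloc_complex(alloc_local);

       // this rank owns rows [local_0_start, local_0_start + local_n0)
       // of the nx-by-ny array, each row holding ny complex values

   fftw_alloc_complex is the documented allocator that guarantees the
   alignment FFTW's SIMD code paths expect. */
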
/*************************************************************************/
/* Transpose API */

X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny,
                                ptrdiff_t howmany,
                                ptrdiff_t xblock, ptrdiff_t yblock,
                                R *in, R *out,
                                MPI_Comm comm, unsigned flags)
{
     int n_pes;
     XM(init)();

     if (howmany < 0 || xblock < 0 || yblock < 0 ||
         nx <= 0 || ny <= 0) return 0;

     MPI_Comm_size(comm, &n_pes);
     if (!xblock) xblock = XM(default_block)(nx, n_pes);
     if (!yblock) yblock = XM(default_block)(ny, n_pes);
     if (n_pes < XM(num_blocks)(nx, xblock)
         || n_pes < XM(num_blocks)(ny, yblock))
          return 0;

     return
          X(mkapiplan)(FFTW_FORWARD, flags,
                       XM(mkproblem_transpose)(nx, ny, howmany,
                                               in, out, xblock, yblock,
                                               comm, MPI_FLAGS(flags)));
}

X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
                           MPI_Comm comm, unsigned flags)
{
     return XM(plan_many_transpose)(nx, ny, 1,
                                    FFTW_MPI_DEFAULT_BLOCK,
                                    FFTW_MPI_DEFAULT_BLOCK,
                                    in, out, comm, flags);
}

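/* Sketch of transposing a distributed nx-by-ny real matrix in place
   via the public names (double precision); the sizing must come from
   the *_transposed variant so input rows and output columns are
   blocked consistently:

       ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
       ptrdiff_t alloc = fftw_mpi_local_size_2d_transposed(
            nx, ny, MPI_COMM_WORLD,
            &local_n0, &local_0_start, &local_n1, &local_1_start);
       double *a = fftw_alloc_real(alloc);
       fftw_plan p = fftw_mpi_plan_transpose(nx, ny, a, a,
                                             MPI_COMM_WORLD, FFTW_MEASURE);
       fftw_execute(p);  // a now holds the ny-by-nx transpose,
                         // distributed along its first dimension
*/
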
/*************************************************************************/
/* Complex DFT API */

X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
                          ptrdiff_t howmany,
                          C *in, C *out,
                          MPI_Comm comm, int sign, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;

     XM(init)();

     if (howmany < 0 || rnk < 1) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }

     return
          X(mkapiplan)(sign, flags,
                       XM(mkproblem_dft_d)(sz, howmany,
                                           (R *) in, (R *) out,
                                           comm, sign,
                                           MPI_FLAGS(flags)));
}

X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
                          ptrdiff_t howmany,
                          ptrdiff_t iblock, ptrdiff_t oblock,
                          C *in, C *out,
                          MPI_Comm comm, int sign, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = XM(plan_guru_dft)(rnk, dims, howmany, in, out, comm, sign, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
                     MPI_Comm comm, int sign, unsigned flags)
{
     return XM(plan_many_dft)(rnk, n, 1,
                              FFTW_MPI_DEFAULT_BLOCK,
                              FFTW_MPI_DEFAULT_BLOCK,
                              in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft)(2, n, in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                        C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft)(3, n, in, out, comm, sign, flags);
}

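/* End-to-end sketch tying the DFT planners to the local-size API; this
   is essentially the 2d example from the FFTW MPI manual (double
   precision, in-place):

       fftw_mpi_init();
       alloc_local = fftw_mpi_local_size_2d(nx, ny, MPI_COMM_WORLD,
                                            &local_n0, &local_0_start);
       data = fftw_alloc_complex(alloc_local);

       // ... initialize this rank's local_n0 rows of data ...

       plan = fftw_mpi_plan_dft_2d(nx, ny, data, data, MPI_COMM_WORLD,
                                   FFTW_FORWARD, FFTW_MEASURE);
       fftw_execute(plan);
       fftw_destroy_plan(plan);

   Planning is collective: every rank in the communicator must call the
   planner with the same arguments, and the hooks installed by XM(init)
   keep the ranks' planning decisions in lock-step. */
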
/*************************************************************************/
/* R2R API */

X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
                          ptrdiff_t howmany,
                          R *in, R *out,
                          MPI_Comm comm, const X(r2r_kind) *kind,
                          unsigned flags)
{
     int n_pes, i;
     dtensor *sz;
     rdft_kind *k;
     X(plan) pln;

     XM(init)();

     if (howmany < 0 || rnk < 1) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     k = X(map_r2r_kind)(rnk, kind);

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }

     pln = X(mkapiplan)(0, flags,
                        XM(mkproblem_rdft_d)(sz, howmany,
                                             in, out,
                                             comm, k, MPI_FLAGS(flags)));
     X(ifree0)(k);
     return pln;
}

X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
                          ptrdiff_t howmany,
                          ptrdiff_t iblock, ptrdiff_t oblock,
                          R *in, R *out,
                          MPI_Comm comm, const X(r2r_kind) *kind,
                          unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = XM(plan_guru_r2r)(rnk, dims, howmany, in, out, comm, kind, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
                     MPI_Comm comm,
                     const X(r2r_kind) *kind,
                     unsigned flags)
{
     return XM(plan_many_r2r)(rnk, n, 1,
                              FFTW_MPI_DEFAULT_BLOCK,
                              FFTW_MPI_DEFAULT_BLOCK,
                              in, out, comm, kind, flags);
}

X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
                        MPI_Comm comm,
                        X(r2r_kind) kindx, X(r2r_kind) kindy,
                        unsigned flags)
{
     ptrdiff_t n[2];
     X(r2r_kind) kind[2];
     n[0] = nx; n[1] = ny;
     kind[0] = kindx; kind[1] = kindy;
     return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
}

X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                        R *in, R *out,
                        MPI_Comm comm,
                        X(r2r_kind) kindx, X(r2r_kind) kindy,
                        X(r2r_kind) kindz,
                        unsigned flags)
{
     ptrdiff_t n[3];
     X(r2r_kind) kind[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
     return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
}

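/* The dims[0 != (flags & ...)] indexing used by the plan_many_*
   wrappers here and above is a compact selector: the comparison yields
   0 or 1, so with FFTW_MPI_TRANSPOSED_IN set the user's input block
   size applies to dimension 1 (the transposed layout), and otherwise
   to dimension 0; FFTW_MPI_TRANSPOSED_OUT does the same on the output
   side. */
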
/*************************************************************************/
/* R2C/C2R API */

static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
                               ptrdiff_t howmany,
                               R *r, C *c,
                               MPI_Comm comm, rdft_kind kind, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;
     R *cr = (R *) c;

     XM(init)();

     if (howmany < 0 || rnk < 2) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 1);

     sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }
     sz->dims[rnk-1].n = dims0[rnk-1].n;

     if (kind == R2HC)
          return X(mkapiplan)(0, flags,
                              XM(mkproblem_rdft2_d)(sz, howmany,
                                                    r, cr, comm, R2HC,
                                                    MPI_FLAGS(flags)));
     else
          return X(mkapiplan)(0, flags,
                              XM(mkproblem_rdft2_d)(sz, howmany,
                                                    cr, r, comm, HC2R,
                                                    MPI_FLAGS(flags)));
}

X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t iblock, ptrdiff_t oblock,
                              R *in, C *out,
                              MPI_Comm comm, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = plan_guru_rdft2(rnk, dims, howmany, in, out, comm, R2HC, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t iblock, ptrdiff_t oblock,
                              C *in, R *out,
                              MPI_Comm comm, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = plan_guru_rdft2(rnk, dims, howmany, out, in, comm, HC2R, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
                         MPI_Comm comm, unsigned flags)
{
     return XM(plan_many_dft_r2c)(rnk, n, 1,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  in, out, comm, flags);
}

X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
                            MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            R *in, C *out, MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
                         MPI_Comm comm, unsigned flags)
{
     return XM(plan_many_dft_c2r)(rnk, n, 1,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
                            MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            C *in, R *out, MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
}

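/* Sizing sketch for the r2c/c2r planners, following the FFTW MPI
   manual's conventions: the local-size routines are called with the
   last dimension replaced by ny/2+1 (the number of complex outputs),
   and the real array is allocated with 2*alloc_local values to cover
   padding and intermediate storage:

       alloc_local = fftw_mpi_local_size_2d(nx, ny/2 + 1, MPI_COMM_WORLD,
                                            &local_n0, &local_0_start);
       rin  = fftw_alloc_real(2 * alloc_local);
       cout = fftw_alloc_complex(alloc_local);
       plan = fftw_mpi_plan_dft_r2c_2d(nx, ny, rin, cout,
                                       MPI_COMM_WORLD, FFTW_MEASURE);
*/
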
/*************************************************************************/
/* New-array execute functions */

void XM(execute_dft)(const X(plan) p, C *in, C *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, (R *) in, (R *) out);
}

void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, in, (R *) out);
}

void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, (R *) in, out);
}

void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, in, out);
}
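
/* The casts above are safe because the API complex type is an array of
   two reals (interleaved real/imaginary, matching fftw_complex), so a
   C array reinterprets directly as an R array of twice the length.
   The usual new-array execute caveat from the FFTW documentation also
   applies here: the new arrays must match the size, alignment, and
   in-place/out-of-place disposition of the arrays the plan was
   created with. */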