/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "api.h"
#include "fftw3-mpi.h"
#include "ifftw-mpi.h"
#include "mpi-transpose.h"
#include "mpi-dft.h"
#include "mpi-rdft.h"
#include "mpi-rdft2.h"

/* Convert API flags to internal MPI flags. */
#define MPI_FLAGS(f) ((f) >> 27)
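
/* Illustrative note, added for clarity (not in the original source): the
   public FFTW_MPI_* planner flags occupy the high bits of the flags word
   -- e.g. FFTW_MPI_SCRAMBLED_IN is defined as (1U << 27) in fftw3-mpi.h --
   so shifting right by 27 maps them onto the low-order bits used by the
   internal MPI solvers; for instance MPI_FLAGS(FFTW_MPI_SCRAMBLED_IN) == 1.
   See fftw3-mpi.h and ifftw-mpi.h for the authoritative bit assignments. */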

/*************************************************************************/

static int mpi_inited = 0;

static MPI_Comm problem_comm(const problem *p) {
     switch (p->adt->problem_kind) {
         case PROBLEM_MPI_DFT:
              return ((const problem_mpi_dft *) p)->comm;
         case PROBLEM_MPI_RDFT:
              return ((const problem_mpi_rdft *) p)->comm;
         case PROBLEM_MPI_RDFT2:
              return ((const problem_mpi_rdft2 *) p)->comm;
         case PROBLEM_MPI_TRANSPOSE:
              return ((const problem_mpi_transpose *) p)->comm;
         default:
              return MPI_COMM_NULL;
     }
}

/* used to synchronize cost measurements (timing or estimation)
   across all processes for an MPI problem, which is critical to
   ensure that all processes decide to use the same MPI plans
   (whereas serial plans need not be synchronized). */
static double cost_hook(const problem *p, double t, cost_kind k)
{
     MPI_Comm comm = problem_comm(p);
     double tsum;
     if (comm == MPI_COMM_NULL) return t;
     MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE,
                   k == COST_SUM ? MPI_SUM : MPI_MAX, comm);
     return tsum;
}

/* Used to reject wisdom that is not in sync across all processes
   for an MPI problem, which is critical to ensure that all processes
   decide to use the same MPI plans.  (Even though costs are synchronized,
   above, out-of-sync wisdom may result from plans being produced
   by communicators that do not span all processes, either from a
   user-specified communicator or e.g. from transpose-recurse.) */
static int wisdom_ok_hook(const problem *p, flags_t flags)
{
     MPI_Comm comm = problem_comm(p);
     int eq_me, eq_all;
     /* unpack flags bitfield, since MPI communications may involve
        byte-order changes and MPI cannot do this for bit fields */
#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
     unsigned int f[5];
#else
     unsigned long f[5]; /* at least 32 bits as per C standard */
#endif

     if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */

     if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */

     /* otherwise, check that the flags and solver index are identical
        on all processes in this problem's communicator.

        TO DO: possibly we can relax strict equality, but it is
        critical to ensure that any flags which affect what plan is
        created (and whether the solver is applicable) are the same,
        e.g. DESTROY_INPUT, NO_UGLY, etcetera.  (If the MPI algorithm
        differs between processes, deadlocks/crashes generally result.) */
     f[0] = flags.l;
     f[1] = flags.hash_info;
     f[2] = flags.timelimit_impatience;
     f[3] = flags.u;
     f[4] = flags.slvndx;
     MPI_Bcast(f, 5,
               SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
               0, comm);
     eq_me = f[0] == flags.l && f[1] == flags.hash_info
          && f[2] == flags.timelimit_impatience
          && f[3] == flags.u && f[4] == flags.slvndx;
     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
     return eq_all;
}

/* This hook is called when wisdom is not found.  The any_true here
   matches up with the any_true in wisdom_ok_hook, in order to handle
   the case where some processes had wisdom (and called wisdom_ok_hook)
   and some processes didn't have wisdom (and called nowisdom_hook). */
static void nowisdom_hook(const problem *p)
{
     MPI_Comm comm = problem_comm(p);
     if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
     XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
}

/* needed to synchronize planner bogosity flag, in case non-MPI problems
   on a subset of processes encountered bogus wisdom */
static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p)
{
     MPI_Comm comm = problem_comm(p);
     if (comm != MPI_COMM_NULL /* an MPI problem */
         && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */
          return WISDOM_IS_BOGUS;
     return state;
}

void XM(init)(void)
{
     if (!mpi_inited) {
          planner *plnr = X(the_planner)();
          plnr->cost_hook = cost_hook;
          plnr->wisdom_ok_hook = wisdom_ok_hook;
          plnr->nowisdom_hook = nowisdom_hook;
          plnr->bogosity_hook = bogosity_hook;
          XM(conf_standard)(plnr);
          mpi_inited = 1;
     }
}

void XM(cleanup)(void)
{
     X(cleanup)();
     mpi_inited = 0;
}

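/* Usage note, added for illustration (not part of the original file): with
   the usual prefix macros, XM(init) and XM(cleanup) are the public entry
   points fftw_mpi_init() and fftw_mpi_cleanup() (fftwf_/fftwl_ variants in
   the other precisions).  A typical program brackets all FFTW-MPI calls
   with them:

       MPI_Init(&argc, &argv);
       fftw_mpi_init();
       ...create, execute, and destroy plans...
       fftw_mpi_cleanup();
       MPI_Finalize();
*/
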
/*************************************************************************/

static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0)
{
     dtensor *x = XM(mkdtensor)(rnk);
     int i;
     for (i = 0; i < rnk; ++i) {
          x->dims[i].n = dims0[i].n;
          x->dims[i].b[IB] = dims0[i].ib;
          x->dims[i].b[OB] = dims0[i].ob;
     }
     return x;
}

static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
                           int rdft2)
{
     dtensor *sz = XM(mkdtensor)(rnk);
     dtensor *sz0 = mkdtensor_api(rnk, dims0);
     block_kind k;
     int i;

     for (i = 0; i < rnk; ++i)
          sz->dims[i].n = dims0[i].n;

     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;

     for (i = 0; i < rnk; ++i) {
          sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
          sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
     }

     /* If we haven't used all of the processes yet, and some of the
        block sizes weren't specified (i.e. 0), then set the
        unspecified blocks so as to use as many processes as
        possible with as few distributed dimensions as possible. */
     FORALL_BLOCK_KIND(k) {
          INT nb = XM(num_blocks_total)(sz, k);
          INT np = n_pes / nb;
          for (i = 0; i < rnk && np > 1; ++i)
               if (!sz0->dims[i].b[k]) {
                    sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
                    nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
                    np = n_pes / nb;
               }
     }

     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;

     /* punt for 1d prime */
     if (rnk == 1 && X(is_prime)(sz->dims[0].n))
          sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;

     XM(dtensor_destroy)(sz0);
     sz0 = XM(dtensor_canonical)(sz, 0);
     XM(dtensor_destroy)(sz);
     return sz0;
}
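
/* Worked example, added for illustration (assuming XM(default_block) is the
   usual ceiling division of n by n_pes, as implemented in block.c): for a
   100 x 200 problem on 7 processes with no blocks specified, the loop above
   sets dims[0].b to default_block(100, 7) = 15, giving 7 blocks of the
   first dimension; this already uses every process (np drops to 1), so the
   second dimension is left undistributed. */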

/* allocate simple local (serial) dims array corresponding to n[rnk] */
static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n)
{
     XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk,
                                          TENSORS);
     int i;
     for (i = 0; i < rnk; ++i)
          dims[i].n = dims[i].ib = dims[i].ob = n[i];
     return dims;
}

/*************************************************************************/

static void local_size(int my_pe, const dtensor *sz, block_kind k,
                       ptrdiff_t *local_n, ptrdiff_t *local_start)
{
     int i;
     if (my_pe >= XM(num_blocks_total)(sz, k))
          for (i = 0; i < sz->rnk; ++i)
               local_n[i] = local_start[i] = 0;
     else {
          XM(block_coords)(sz, k, my_pe, local_start);
          for (i = 0; i < sz->rnk; ++i) {
               local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k],
                                      local_start[i]);
               local_start[i] *= sz->dims[i].b[k];
          }
     }
}

static INT prod(int rnk, const ptrdiff_t *local_n)
{
     int i;
     INT N = 1;
     for (i = 0; i < rnk; ++i) N *= local_n[i];
     return N;
}

ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
                              ptrdiff_t howmany, MPI_Comm comm,
                              ptrdiff_t *local_n_in,
                              ptrdiff_t *local_start_in,
                              ptrdiff_t *local_n_out,
                              ptrdiff_t *local_start_out,
                              int sign, unsigned flags)
{
     INT N;
     int my_pe, n_pes, i;
     dtensor *sz;

     if (rnk == 0)
          return howmany;

     MPI_Comm_rank(comm, &my_pe);
     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     /* Now, we must figure out how much local space the user should
        allocate (or at least an upper bound).  This depends strongly
        on the exact algorithms we employ...ugh!  FIXME: get this info
        from the solvers somehow? */
     N = 1; /* never return zero allocation size */
     if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
          INT Nafter;
          ddim odims[2];

          /* dft-rank-geq2-transposed */
          odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
          /* we may need extra space for transposed intermediate data */
          for (i = 0; i < 2; ++i)
               if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
                   XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
                    sz->dims[i].b[IB]
                         = XM(default_block)(sz->dims[i].n, n_pes);
                    sz->dims[1-i].b[IB] = sz->dims[1-i].n;
                    local_size(my_pe, sz, IB, local_n_in, local_start_in);
                    N = X(imax)(N, prod(rnk, local_n_in));
                    sz->dims[i] = odims[i];
                    sz->dims[1-i] = odims[1-i];
                    break;
               }

          /* dft-rank-geq2 */
          Nafter = howmany;
          for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
          N = X(imax)(N, (sz->dims[0].n
                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
                                      my_pe) + howmany - 1) / howmany);

          /* dft-rank-geq2 with dimensions swapped */
          Nafter = howmany * sz->dims[0].n;
          for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
          N = X(imax)(N, (sz->dims[1].n
                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
                                      my_pe) + howmany - 1) / howmany);
     }
     else if (rnk == 1) {
          if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
               ptrdiff_t n[2], start[2];
               dtensor *sz2 = XM(mkdtensor)(2);
               sz2->dims[0] = sz->dims[0];
               sz2->dims[0].b[IB] = sz->dims[0].n;
               sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
               sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
               local_size(my_pe, sz2, IB, n, start);
               XM(dtensor_destroy)(sz2);
               N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
          }
          else { /* dft-rank1 */
               INT r, m, rblock[2], mblock[2];

               /* Since the 1d transforms are so different, we require
                  the user to call local_size_1d for this case.  Ugh. */
               CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);

               if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
                                         rblock, mblock))) {
                    m = sz->dims[0].n / r;
                    if (flags & FFTW_MPI_SCRAMBLED_IN)
                         sz->dims[0].b[IB] = rblock[IB] * m;
                    else { /* !SCRAMBLED_IN */
                         sz->dims[0].b[IB] = r * mblock[IB];
                         N = X(imax)(N, rblock[IB] * m);
                    }
                    if (flags & FFTW_MPI_SCRAMBLED_OUT)
                         sz->dims[0].b[OB] = r * mblock[OB];
                    else { /* !SCRAMBLED_OUT */
                         N = X(imax)(N, r * mblock[OB]);
                         sz->dims[0].b[OB] = rblock[OB] * m;
                    }
               }
          }
     }

     local_size(my_pe, sz, IB, local_n_in, local_start_in);
     local_size(my_pe, sz, OB, local_n_out, local_start_out);

     /* at a minimum, make sure we have enough space to store input & output */
     N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));

     XM(dtensor_destroy)(sz);
     return N * howmany;
}

ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
                                         ptrdiff_t howmany,
                                         ptrdiff_t xblock, ptrdiff_t yblock,
                                         MPI_Comm comm,
                                         ptrdiff_t *local_nx,
                                         ptrdiff_t *local_x_start,
                                         ptrdiff_t *local_ny,
                                         ptrdiff_t *local_y_start)
{
     ptrdiff_t N;
     XM(ddim) *dims;
     ptrdiff_t *local;

     if (rnk == 0) {
          *local_nx = *local_ny = 1;
          *local_x_start = *local_y_start = 0;
          return howmany;
     }

     dims = simple_dims(rnk, n);
     local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);

     /* default 1d block distribution, with transposed output
        if yblock < n[1] */
     dims[0].ib = xblock;
     if (rnk > 1) {
          if (yblock < n[1])
               dims[1].ob = yblock;
          else
               dims[0].ob = xblock;
     }
     else
          dims[0].ob = xblock; /* FIXME: 1d not really supported here
                                  since we don't have flags/sign */

     N = XM(local_size_guru)(rnk, dims, howmany, comm,
                             local, local + rnk,
                             local + 2*rnk, local + 3*rnk,
                             0, 0);
     *local_nx = local[0];
     *local_x_start = local[rnk];
     if (rnk > 1) {
          *local_ny = local[2*rnk + 1];
          *local_y_start = local[3*rnk + 1];
     }
     else {
          *local_ny = *local_nx;
          *local_y_start = *local_x_start;
     }
     X(ifree)(local);
     X(ifree)(dims);
     return N;
}

ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t xblock,
                              MPI_Comm comm,
                              ptrdiff_t *local_nx,
                              ptrdiff_t *local_x_start)
{
     ptrdiff_t local_ny, local_y_start;
     return XM(local_size_many_transposed)(rnk, n, howmany,
                                           xblock, rnk > 1
                                           ? n[1] : FFTW_MPI_DEFAULT_BLOCK,
                                           comm,
                                           local_nx, local_x_start,
                                           &local_ny, &local_y_start);
}


ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n,
                                    MPI_Comm comm,
                                    ptrdiff_t *local_nx,
                                    ptrdiff_t *local_x_start,
                                    ptrdiff_t *local_ny,
                                    ptrdiff_t *local_y_start)
{
     return XM(local_size_many_transposed)(rnk, n, 1,
                                           FFTW_MPI_DEFAULT_BLOCK,
                                           FFTW_MPI_DEFAULT_BLOCK,
                                           comm,
                                           local_nx, local_x_start,
                                           local_ny, local_y_start);
}

ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n,
                         MPI_Comm comm,
                         ptrdiff_t *local_nx,
                         ptrdiff_t *local_x_start)
{
     return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm,
                                local_nx, local_x_start);
}

ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany,
                                 MPI_Comm comm, int sign, unsigned flags,
                                 ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
                                 ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
     XM(ddim) d;
     d.n = nx;
     d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK;
     return XM(local_size_guru)(1, &d, howmany, comm,
                                local_nx, local_x_start,
                                local_ny, local_y_start, sign, flags);
}

ptrdiff_t XM(local_size_1d)(ptrdiff_t nx,
                            MPI_Comm comm, int sign, unsigned flags,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start,
                            ptrdiff_t *local_ny, ptrdiff_t *local_y_start)
{
     return XM(local_size_many_1d)(nx, 1, comm, sign, flags,
                                   local_nx, local_x_start,
                                   local_ny, local_y_start);
}

ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
                                       MPI_Comm comm,
                                       ptrdiff_t *local_nx,
                                       ptrdiff_t *local_x_start,
                                       ptrdiff_t *local_ny,
                                       ptrdiff_t *local_y_start)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(local_size_transposed)(2, n, comm,
                                      local_nx, local_x_start,
                                      local_ny, local_y_start);
}

ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(local_size)(2, n, comm, local_nx, local_x_start);
}

ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny,
                                       ptrdiff_t nz,
                                       MPI_Comm comm,
                                       ptrdiff_t *local_nx,
                                       ptrdiff_t *local_x_start,
                                       ptrdiff_t *local_ny,
                                       ptrdiff_t *local_y_start)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(local_size_transposed)(3, n, comm,
                                      local_nx, local_x_start,
                                      local_ny, local_y_start);
}

ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            MPI_Comm comm,
                            ptrdiff_t *local_nx, ptrdiff_t *local_x_start)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(local_size)(3, n, comm, local_nx, local_x_start);
}
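
/* Usage sketch, added for illustration (not part of the original file; it
   mirrors the documented FFTW-MPI pattern).  The local_size functions above
   tell each process how much to allocate and which slab of the first
   dimension it owns, e.g. for a 2d complex transform:

       ptrdiff_t alloc_local, local_n0, local_0_start;
       fftw_complex *data;

       alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
                                            &local_n0, &local_0_start);
       data = fftw_alloc_complex(alloc_local);
       // this process owns rows local_0_start..local_0_start+local_n0-1

   The return value may exceed local_n0 * N1 because some algorithms need
   extra intermediate space (see local_size_guru above), so allocate
   alloc_local elements rather than computing the product yourself. */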

/*************************************************************************/
/* Transpose API */

X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny,
                                ptrdiff_t howmany,
                                ptrdiff_t xblock, ptrdiff_t yblock,
                                R *in, R *out,
                                MPI_Comm comm, unsigned flags)
{
     int n_pes;
     XM(init)();

     if (howmany < 0 || xblock < 0 || yblock < 0 ||
         nx <= 0 || ny <= 0) return 0;

     MPI_Comm_size(comm, &n_pes);
     if (!xblock) xblock = XM(default_block)(nx, n_pes);
     if (!yblock) yblock = XM(default_block)(ny, n_pes);
     if (n_pes < XM(num_blocks)(nx, xblock)
         || n_pes < XM(num_blocks)(ny, yblock))
          return 0;

     return
          X(mkapiplan)(FFTW_FORWARD, flags,
                       XM(mkproblem_transpose)(nx, ny, howmany,
                                               in, out, xblock, yblock,
                                               comm, MPI_FLAGS(flags)));
}

X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
                           MPI_Comm comm, unsigned flags)

{
     return XM(plan_many_transpose)(nx, ny, 1,
                                    FFTW_MPI_DEFAULT_BLOCK,
                                    FFTW_MPI_DEFAULT_BLOCK,
                                    in, out, comm, flags);
}
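
/* Illustrative example, added for clarity (not part of the original file):
   transposing a distributed N0 x N1 real matrix in place, using the same
   allocation convention as the local_size functions:

       plan = fftw_mpi_plan_transpose(N0, N1, data, data,
                                      MPI_COMM_WORLD, FFTW_MEASURE);
       fftw_execute(plan);

   The input is distributed across processes along its first dimension and
   the transposed output along the first dimension of the result (i.e. the
   columns of the original matrix). */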

/*************************************************************************/
/* Complex DFT API */

X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
                          ptrdiff_t howmany,
                          C *in, C *out,
                          MPI_Comm comm, int sign, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;

     XM(init)();

     if (howmany < 0 || rnk < 1) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }

     return
          X(mkapiplan)(sign, flags,
                       XM(mkproblem_dft_d)(sz, howmany,
                                           (R *) in, (R *) out,
                                           comm, sign,
                                           MPI_FLAGS(flags)));
}

X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n,
                          ptrdiff_t howmany,
                          ptrdiff_t iblock, ptrdiff_t oblock,
                          C *in, C *out,
                          MPI_Comm comm, int sign, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = XM(plan_guru_dft)(rnk, dims, howmany, in, out, comm, sign, flags);
     X(ifree)(dims);
     return pln;
}
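
/* Note added for clarity (not in the original): the index expressions
   dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)] and
   dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)] select dims[1] when the
   corresponding flag is set, so the user-supplied iblock/oblock then
   describe the distribution of the second dimension -- the one that is
   split across processes in a transposed input/output layout. */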

X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out,
                     MPI_Comm comm, int sign, unsigned flags)
{
     return XM(plan_many_dft)(rnk, n, 1,
                              FFTW_MPI_DEFAULT_BLOCK,
                              FFTW_MPI_DEFAULT_BLOCK,
                              in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     return XM(plan_dft)(1, &nx, in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft)(2, n, in, out, comm, sign, flags);
}

X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                        C *in, C *out,
                        MPI_Comm comm, int sign, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft)(3, n, in, out, comm, sign, flags);
}
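
/* Illustrative continuation of the local_size sketch above (not part of the
   original file): once the distributed array is allocated, planning and
   execution look just like the serial API:

       plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
                                   FFTW_FORWARD, FFTW_MEASURE);
       fftw_execute(plan);
       fftw_destroy_plan(plan);

   All processes in the communicator must make these calls collectively,
   with the same transform parameters. */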

/*************************************************************************/
/* R2R API */

X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0,
                          ptrdiff_t howmany,
                          R *in, R *out,
                          MPI_Comm comm, const X(r2r_kind) *kind,
                          unsigned flags)
{
     int n_pes, i;
     dtensor *sz;
     rdft_kind *k;
     X(plan) pln;

     XM(init)();

     if (howmany < 0 || rnk < 1) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     k = X(map_r2r_kind)(rnk, kind);

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }

     pln = X(mkapiplan)(0, flags,
                        XM(mkproblem_rdft_d)(sz, howmany,
                                             in, out,
                                             comm, k, MPI_FLAGS(flags)));
     X(ifree0)(k);
     return pln;
}

X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n,
                          ptrdiff_t howmany,
                          ptrdiff_t iblock, ptrdiff_t oblock,
                          R *in, R *out,
                          MPI_Comm comm, const X(r2r_kind) *kind,
                          unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = XM(plan_guru_r2r)(rnk, dims, howmany, in, out, comm, kind, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out,
                     MPI_Comm comm,
                     const X(r2r_kind) *kind,
                     unsigned flags)
{
     return XM(plan_many_r2r)(rnk, n, 1,
                              FFTW_MPI_DEFAULT_BLOCK,
                              FFTW_MPI_DEFAULT_BLOCK,
                              in, out, comm, kind, flags);
}

X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,
                        MPI_Comm comm,
                        X(r2r_kind) kindx, X(r2r_kind) kindy,
                        unsigned flags)
{
     ptrdiff_t n[2];
     X(r2r_kind) kind[2];
     n[0] = nx; n[1] = ny;
     kind[0] = kindx; kind[1] = kindy;
     return XM(plan_r2r)(2, n, in, out, comm, kind, flags);
}

X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                        R *in, R *out,
                        MPI_Comm comm,
                        X(r2r_kind) kindx, X(r2r_kind) kindy,
                        X(r2r_kind) kindz,
                        unsigned flags)
{
     ptrdiff_t n[3];
     X(r2r_kind) kind[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     kind[0] = kindx; kind[1] = kindy; kind[2] = kindz;
     return XM(plan_r2r)(3, n, in, out, comm, kind, flags);
}
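
/* Illustrative example, added for clarity (not part of the original file):
   a distributed 2d DCT-II along both dimensions, analogous to the
   complex-DFT case, using the same r2r kind constants as the serial API:

       plan = fftw_mpi_plan_r2r_2d(N0, N1, data, data, MPI_COMM_WORLD,
                                   FFTW_REDFT10, FFTW_REDFT10,
                                   FFTW_MEASURE);
*/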

/*************************************************************************/
/* R2C/C2R API */

static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
                               ptrdiff_t howmany,
                               R *r, C *c,
                               MPI_Comm comm, rdft_kind kind, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;
     R *cr = (R *) c;

     XM(init)();

     if (howmany < 0 || rnk < 2) return 0;
     for (i = 0; i < rnk; ++i)
          if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
               return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 1);

     sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
     if (XM(num_blocks_total)(sz, IB) > n_pes
         || XM(num_blocks_total)(sz, OB) > n_pes) {
          XM(dtensor_destroy)(sz);
          return 0;
     }
     sz->dims[rnk-1].n = dims0[rnk-1].n;

     if (kind == R2HC)
          return X(mkapiplan)(0, flags,
                              XM(mkproblem_rdft2_d)(sz, howmany,
                                                    r, cr, comm, R2HC,
                                                    MPI_FLAGS(flags)));
     else
          return X(mkapiplan)(0, flags,
                              XM(mkproblem_rdft2_d)(sz, howmany,
                                                    cr, r, comm, HC2R,
                                                    MPI_FLAGS(flags)));
}
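
/* Note added for clarity (not in the original): the last dimension is
   temporarily set to n/2+1, the length of the complex (half-spectrum) side
   of the r2c/c2r transform, so that the block-count check above is made
   against the complex data layout; it is restored to the full logical size
   n before the problem is created. */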

X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t iblock, ptrdiff_t oblock,
                              R *in, C *out,
                              MPI_Comm comm, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = plan_guru_rdft2(rnk, dims, howmany, in, out, comm, R2HC, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n,
                              ptrdiff_t howmany,
                              ptrdiff_t iblock, ptrdiff_t oblock,
                              C *in, R *out,
                              MPI_Comm comm, unsigned flags)
{
     XM(ddim) *dims = simple_dims(rnk, n);
     X(plan) pln;

     if (rnk == 1) {
          dims[0].ib = iblock;
          dims[0].ob = oblock;
     }
     else if (rnk > 1) {
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock;
          dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock;
     }

     pln = plan_guru_rdft2(rnk, dims, howmany, out, in, comm, HC2R, flags);
     X(ifree)(dims);
     return pln;
}

X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out,
                         MPI_Comm comm, unsigned flags)
{
     return XM(plan_many_dft_r2c)(rnk, n, 1,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  in, out, comm, flags);
}

X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out,
                            MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft_r2c)(2, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            R *in, C *out, MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft_r2c)(3, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out,
                         MPI_Comm comm, unsigned flags)
{
     return XM(plan_many_dft_c2r)(rnk, n, 1,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  FFTW_MPI_DEFAULT_BLOCK,
                                  in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out,
                            MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[2];
     n[0] = nx; n[1] = ny;
     return XM(plan_dft_c2r)(2, n, in, out, comm, flags);
}

X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz,
                            C *in, R *out, MPI_Comm comm, unsigned flags)
{
     ptrdiff_t n[3];
     n[0] = nx; n[1] = ny; n[2] = nz;
     return XM(plan_dft_c2r)(3, n, in, out, comm, flags);
}

/*************************************************************************/
/* New-array execute functions */

void XM(execute_dft)(const X(plan) p, C *in, C *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, (R*) in, (R*) out);
}

void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, in, (R*) out);
}

void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, (R*) in, out);
}

void XM(execute_r2r)(const X(plan) p, R *in, R *out) {
     /* internally, MPI plans are just rdft plans */
     X(execute_r2r)(p, in, out);
}
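
/* Usage note, added for illustration (not part of the original file): these
   new-array execute functions apply an existing MPI plan to different
   arrays, e.g.

       fftw_mpi_execute_dft(plan, new_in, new_out);

   As with the serial new-array execute interface, the new arrays should be
   allocated like the originals (same local sizes and preferably the same
   alignment, e.g. via fftw_alloc_complex) for the plan to apply correctly. */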
|