Chris@42: /* Chris@42: * Copyright (c) 2003, 2007-14 Matteo Frigo Chris@42: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology Chris@42: * Chris@42: * This program is free software; you can redistribute it and/or modify Chris@42: * it under the terms of the GNU General Public License as published by Chris@42: * the Free Software Foundation; either version 2 of the License, or Chris@42: * (at your option) any later version. Chris@42: * Chris@42: * This program is distributed in the hope that it will be useful, Chris@42: * but WITHOUT ANY WARRANTY; without even the implied warranty of Chris@42: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Chris@42: * GNU General Public License for more details. Chris@42: * Chris@42: * You should have received a copy of the GNU General Public License Chris@42: * along with this program; if not, write to the Free Software Chris@42: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Chris@42: * Chris@42: */ Chris@42: Chris@42: #include "ifftw-mpi.h" Chris@42: Chris@42: /* Return the radix r for a 1d MPI transform of a distributed dimension d, Chris@42: with the given flags and transform size. That is, decomposes d.n Chris@42: as r * m, Cooley-Tukey style. Also computes the block sizes rblock Chris@42: and mblock. Returns 0 if such a decomposition is not feasible. Chris@42: This is unfortunately somewhat complicated. Chris@42: Chris@42: A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c): Chris@42: Chris@42: d.n is initially distributed as an m x r array with block size mblock[IB]. Chris@42: Then it is internally transposed to an r x m array with block size Chris@42: rblock[IB]. Then it is internally transposed to m x r again with block Chris@42: size mblock[OB]. Finally, it is transposed to r x m with block size Chris@42: rblock[IB]. Chris@42: Chris@42: If flags & SCRAMBLED_IN, then the first transpose is skipped (the array Chris@42: starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose Chris@42: is skipped (the array ends up as m x r). To make sure the forward Chris@42: and backward transforms use the same "scrambling" format, we swap r Chris@42: and m when sign != FFT_SIGN. Chris@42: Chris@42: There are some downsides to this, especially in the case where Chris@42: either m or r is not divisible by n_pes. For one thing, it means Chris@42: that in general we can't use the same block size for the input and Chris@42: output. For another thing, it means that we can't in general honor Chris@42: a user's "requested" block sizes in d.b[]. Therefore, for simplicity, Chris@42: we simply ignore d.b[] for now. Chris@42: */ Chris@42: INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign, Chris@42: INT rblock[2], INT mblock[2]) Chris@42: { Chris@42: INT r, m; Chris@42: Chris@42: UNUSED(flags); /* we would need this if we paid attention to d.b[*] */ Chris@42: Chris@42: /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes. Chris@42: This not only ensures that the input (the m dimension) is Chris@42: equally distributed if possible, and at the r dimension is Chris@42: maximally equally distributed (if d.n/n_pes >= n_pes), it also Chris@42: makes one of the local transpositions in the algorithm Chris@42: trivial. */ Chris@42: if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/ Chris@42: && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */) Chris@42: r = d.n / n_pes; Chris@42: else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */ Chris@42: for (r = X(isqrt)(d.n); d.n % r != 0; ++r) Chris@42: ; Chris@42: } Chris@42: if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */ Chris@42: Chris@42: if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */ Chris@42: m = r; Chris@42: r = d.n / m; Chris@42: } Chris@42: else Chris@42: m = d.n / r; Chris@42: Chris@42: rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes); Chris@42: mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes); Chris@42: Chris@42: return r; Chris@42: }