cannam@127: /* cannam@127: * Copyright (c) 2003, 2007-14 Matteo Frigo cannam@127: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology cannam@127: * cannam@127: * This program is free software; you can redistribute it and/or modify cannam@127: * it under the terms of the GNU General Public License as published by cannam@127: * the Free Software Foundation; either version 2 of the License, or cannam@127: * (at your option) any later version. cannam@127: * cannam@127: * This program is distributed in the hope that it will be useful, cannam@127: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@127: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@127: * GNU General Public License for more details. cannam@127: * cannam@127: * You should have received a copy of the GNU General Public License cannam@127: * along with this program; if not, write to the Free Software cannam@127: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@127: * cannam@127: */ cannam@127: cannam@127: #include "ifftw-mpi.h" cannam@127: cannam@127: /* Return the radix r for a 1d MPI transform of a distributed dimension d, cannam@127: with the given flags and transform size. That is, decomposes d.n cannam@127: as r * m, Cooley-Tukey style. Also computes the block sizes rblock cannam@127: and mblock. Returns 0 if such a decomposition is not feasible. cannam@127: This is unfortunately somewhat complicated. cannam@127: cannam@127: A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c): cannam@127: cannam@127: d.n is initially distributed as an m x r array with block size mblock[IB]. cannam@127: Then it is internally transposed to an r x m array with block size cannam@127: rblock[IB]. Then it is internally transposed to m x r again with block cannam@127: size mblock[OB]. Finally, it is transposed to r x m with block size cannam@127: rblock[IB]. cannam@127: cannam@127: If flags & SCRAMBLED_IN, then the first transpose is skipped (the array cannam@127: starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose cannam@127: is skipped (the array ends up as m x r). To make sure the forward cannam@127: and backward transforms use the same "scrambling" format, we swap r cannam@127: and m when sign != FFT_SIGN. cannam@127: cannam@127: There are some downsides to this, especially in the case where cannam@127: either m or r is not divisible by n_pes. For one thing, it means cannam@127: that in general we can't use the same block size for the input and cannam@127: output. For another thing, it means that we can't in general honor cannam@127: a user's "requested" block sizes in d.b[]. Therefore, for simplicity, cannam@127: we simply ignore d.b[] for now. cannam@127: */ cannam@127: INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign, cannam@127: INT rblock[2], INT mblock[2]) cannam@127: { cannam@127: INT r, m; cannam@127: cannam@127: UNUSED(flags); /* we would need this if we paid attention to d.b[*] */ cannam@127: cannam@127: /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes. cannam@127: This not only ensures that the input (the m dimension) is cannam@127: equally distributed if possible, and at the r dimension is cannam@127: maximally equally distributed (if d.n/n_pes >= n_pes), it also cannam@127: makes one of the local transpositions in the algorithm cannam@127: trivial. */ cannam@127: if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/ cannam@127: && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */) cannam@127: r = d.n / n_pes; cannam@127: else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */ cannam@127: for (r = X(isqrt)(d.n); d.n % r != 0; ++r) cannam@127: ; cannam@127: } cannam@127: if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */ cannam@127: cannam@127: if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */ cannam@127: m = r; cannam@127: r = d.n / m; cannam@127: } cannam@127: else cannam@127: m = d.n / r; cannam@127: cannam@127: rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes); cannam@127: mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes); cannam@127: cannam@127: return r; cannam@127: }