annotate src/fftw-3.3.8/mpi/choose-radix.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 #include "ifftw-mpi.h"
Chris@82 22
Chris@82 23 /* Return the radix r for a 1d MPI transform of a distributed dimension d,
Chris@82 24 with the given flags and transform size. That is, decomposes d.n
Chris@82 25 as r * m, Cooley-Tukey style. Also computes the block sizes rblock
Chris@82 26 and mblock. Returns 0 if such a decomposition is not feasible.
Chris@82 27 This is unfortunately somewhat complicated.
Chris@82 28
Chris@82 29 A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
Chris@82 30
Chris@82 31 d.n is initially distributed as an m x r array with block size mblock[IB].
Chris@82 32 Then it is internally transposed to an r x m array with block size
Chris@82 33 rblock[IB]. Then it is internally transposed to m x r again with block
Chris@82 34 size mblock[OB]. Finally, it is transposed to r x m with block size
Chris@82 35 rblock[IB].
Chris@82 36
Chris@82 37 If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
Chris@82 38 starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose
Chris@82 39 is skipped (the array ends up as m x r). To make sure the forward
Chris@82 40 and backward transforms use the same "scrambling" format, we swap r
Chris@82 41 and m when sign != FFT_SIGN.
Chris@82 42
Chris@82 43 There are some downsides to this, especially in the case where
Chris@82 44 either m or r is not divisible by n_pes. For one thing, it means
Chris@82 45 that in general we can't use the same block size for the input and
Chris@82 46 output. For another thing, it means that we can't in general honor
Chris@82 47 a user's "requested" block sizes in d.b[]. Therefore, for simplicity,
Chris@82 48 we simply ignore d.b[] for now.
Chris@82 49 */
Chris@82 50 INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
Chris@82 51 INT rblock[2], INT mblock[2])
Chris@82 52 {
Chris@82 53 INT r, m;
Chris@82 54
Chris@82 55 UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
Chris@82 56
Chris@82 57 /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
Chris@82 58 This not only ensures that the input (the m dimension) is
Chris@82 59 equally distributed if possible, and at the r dimension is
Chris@82 60 maximally equally distributed (if d.n/n_pes >= n_pes), it also
Chris@82 61 makes one of the local transpositions in the algorithm
Chris@82 62 trivial. */
Chris@82 63 if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
Chris@82 64 && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
Chris@82 65 r = d.n / n_pes;
Chris@82 66 else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
Chris@82 67 for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
Chris@82 68 ;
Chris@82 69 }
Chris@82 70 if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
Chris@82 71
Chris@82 72 if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
Chris@82 73 m = r;
Chris@82 74 r = d.n / m;
Chris@82 75 }
Chris@82 76 else
Chris@82 77 m = d.n / r;
Chris@82 78
Chris@82 79 rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
Chris@82 80 mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
Chris@82 81
Chris@82 82 return r;
Chris@82 83 }