Mercurial > hg > sv-dependency-builds
diff src/fftw-3.3.3/mpi/choose-radix.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/fftw-3.3.3/mpi/choose-radix.c Wed Mar 20 15:35:50 2013 +0000 @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "ifftw-mpi.h" + +/* Return the radix r for a 1d MPI transform of a distributed dimension d, + with the given flags and transform size. That is, decomposes d.n + as r * m, Cooley-Tukey style. Also computes the block sizes rblock + and mblock. Returns 0 if such a decomposition is not feasible. + This is unfortunately somewhat complicated. + + A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c): + + d.n is initially distributed as an m x r array with block size mblock[IB]. + Then it is internally transposed to an r x m array with block size + rblock[IB]. Then it is internally transposed to m x r again with block + size mblock[OB]. Finally, it is transposed to r x m with block size + rblock[IB]. + + If flags & SCRAMBLED_IN, then the first transpose is skipped (the array + starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose + is skipped (the array ends up as m x r). To make sure the forward + and backward transforms use the same "scrambling" format, we swap r + and m when sign != FFT_SIGN. + + There are some downsides to this, especially in the case where + either m or r is not divisible by n_pes. For one thing, it means + that in general we can't use the same block size for the input and + output. For another thing, it means that we can't in general honor + a user's "requested" block sizes in d.b[]. Therefore, for simplicity, + we simply ignore d.b[] for now. +*/ +INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign, + INT rblock[2], INT mblock[2]) +{ + INT r, m; + + UNUSED(flags); /* we would need this if we paid attention to d.b[*] */ + + /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes. + This not only ensures that the input (the m dimension) is + equally distributed if possible, and at the r dimension is + maximally equally distributed (if d.n/n_pes >= n_pes), it also + makes one of the local transpositions in the algorithm + trivial. */ + if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/ + && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */) + r = d.n / n_pes; + else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */ + for (r = X(isqrt)(d.n); d.n % r != 0; ++r) + ; + } + if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */ + + if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */ + m = r; + r = d.n / m; + } + else + m = d.n / r; + + rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes); + mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes); + + return r; +}