comparison src/fftw-3.3.3/mpi/choose-radix.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 #include "ifftw-mpi.h"
22
23 /* Return the radix r for a 1d MPI transform of a distributed dimension d,
24 with the given flags and transform size. That is, decomposes d.n
25 as r * m, Cooley-Tukey style. Also computes the block sizes rblock
26 and mblock. Returns 0 if such a decomposition is not feasible.
27 This is unfortunately somewhat complicated.
28
29 A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
30
31 d.n is initially distributed as an m x r array with block size mblock[IB].
32 Then it is internally transposed to an r x m array with block size
33 rblock[IB]. Then it is internally transposed to m x r again with block
34 size mblock[OB]. Finally, it is transposed to r x m with block size
35 rblock[IB].
36
37 If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
38 starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose
39 is skipped (the array ends up as m x r). To make sure the forward
40 and backward transforms use the same "scrambling" format, we swap r
41 and m when sign != FFT_SIGN.
42
43 There are some downsides to this, especially in the case where
44 either m or r is not divisible by n_pes. For one thing, it means
45 that in general we can't use the same block size for the input and
46 output. For another thing, it means that we can't in general honor
47 a user's "requested" block sizes in d.b[]. Therefore, for simplicity,
48 we simply ignore d.b[] for now.
49 */
50 INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
51 INT rblock[2], INT mblock[2])
52 {
53 INT r, m;
54
55 UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
56
57 /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
58 This not only ensures that the input (the m dimension) is
59 equally distributed if possible, and at the r dimension is
60 maximally equally distributed (if d.n/n_pes >= n_pes), it also
61 makes one of the local transpositions in the algorithm
62 trivial. */
63 if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
64 && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
65 r = d.n / n_pes;
66 else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
67 for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
68 ;
69 }
70 if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
71
72 if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
73 m = r;
74 r = d.n / m;
75 }
76 else
77 m = d.n / r;
78
79 rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
80 mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
81
82 return r;
83 }