src/fftw-3.3.3/mpi/transpose-recurse.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* Recursive "radix-r" distributed transpose, which breaks a transpose
   over p processes into p/r transposes over r processes plus r
   transposes over p/r processes.  If performed recursively, this
   produces a total of O(p log p) messages vs. O(p^2) messages for a
   direct approach.

   However, this is not necessarily an improvement.  The total size of
   all the messages is actually increased from O(N) to O(N log p)
   where N is the total data size.  Also, the amount of local data
   rearrangement is increased.  So, it's not clear, a priori, what the
   best algorithm will be, and we'll leave it to the planner.  (In
   theory and practice, it looks like this becomes advantageous for
   large p, in the limit where the message sizes are small and
   latency-dominated.)
*/
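
/* A rough illustration of the message-count tradeoff: for p = 64
   processes, a direct transpose exchanges on the order of p^2 = 4096
   messages; a single level of radix r = 8 recursion replaces it with
   8 + 8 = 16 transposes over 8 processes each, i.e. roughly
   16 * 8^2 = 1024 messages; and recursing all the way down with radix 2
   gives on the order of p log2(p) = 384 messages, with each level
   adding its own local rearrangement pass. */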

#include "mpi-transpose.h"
#include <string.h>

typedef struct {
     solver super;
     int (*radix)(int np);
     const char *nam;
     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
} S;

typedef struct {
     plan_mpi_transpose super;

     plan *cld1, *cldtr, *cldtm;
     int preserve_input;

     int r; /* "radix" */
     const char *nam;
} P;
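
/* cld1 is the purely local data rearrangement; cldtr and cldtm are the
   recursive transposes over the radix-r and np/r-process subgroups,
   respectively (see mkplan below). */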

static void apply(const plan *ego_, R *I, R *O)
{
     const P *ego = (const P *) ego_;
     plan_rdft *cld1, *cldtr, *cldtm;

     cld1 = (plan_rdft *) ego->cld1;
     if (cld1) cld1->apply((plan *) cld1, I, O);

     if (ego->preserve_input) I = O;

     cldtr = (plan_rdft *) ego->cldtr;
     if (cldtr) cldtr->apply((plan *) cldtr, O, I);

     cldtm = (plan_rdft *) ego->cldtm;
     if (cldtm) cldtm->apply((plan *) cldtm, I, O);
}

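/* radix ~ sqrt(np): the smallest divisor of np that is >= floor(sqrt(np)) */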
static int radix_sqrt(int np)
{
     int r;
     for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
          ;
     return r;
}

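/* radix = smallest prime factor of np, or 0 (not applicable) when that
   factor is already >= sqrt(np), in which case it would presumably just
   duplicate the radix chosen by radix_sqrt */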
static int radix_first(int np)
{
     int r = (int) (X(first_divisor)(np));
     return (r >= (int) (X(isqrt)(np)) ? 0 : r);
}

/* the local allocated space on process pe required for the given transpose
   dimensions and block sizes */
static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
{
     return X(imax)(XM(block)(nx, block, pe) * ny,
                    nx * XM(block)(ny, tblock, pe));
}

/* check whether the recursive transposes fit within the space
   that must have been allocated on each process for this transpose;
   this must be modified if the subdivision in mkplan is changed! */
static int enough_space(INT nx, INT ny, INT block, INT tblock,
                        int r, int n_pes)
{
     int pe;
     int m = n_pes / r;
     for (pe = 0; pe < n_pes; ++pe) {
          INT space = transpose_space(nx, ny, block, tblock, pe);
          INT b1 = XM(block)(nx, r * block, pe / r);
          INT b2 = XM(block)(ny, m * tblock, pe % r);
          if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
              || transpose_space(nx, b2, r*block, tblock, pe / r) > space)
               return 0;
     }
     return 1;
}

/* In theory, transpose-recurse becomes advantageous for message sizes
   below some minimum, assuming that the time is dominated by
   communications.  In practice, we want to constrain the minimum
   message size for transpose-recurse to keep the planning time down.
   I've set this conservatively according to some simple experiments
   on a Cray XT3 where the crossover message size was 128, although on
   a larger-latency machine the crossover will be larger. */
#define SMALL_MESSAGE 2048

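/* applicability: ny must be divided evenly among the processes, the radix
   must satisfy 1 < r < n_pes, and the recursive sub-transposes must fit
   in the space already allocated; the planner flags further constrain
   memory use and, unless slow plans are allowed, restrict this solver to
   small (latency-dominated) messages. */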
static int applicable(const S *ego, const problem *p_,
                      const planner *plnr, int *r)
{
     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
     int n_pes;
     MPI_Comm_size(p->comm, &n_pes);
     return (1
             && p->tblock * n_pes == p->ny
             && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
                                          && p->I != p->O))
             && (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
             && enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
             && (!CONSERVE_MEMORYP(plnr) || *r > 8
                 || !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
             && (!NO_SLOWP(plnr) ||
                 (p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
             && ONLY_TRANSPOSEDP(p->flags)
          );
}

static void awake(plan *ego_, enum wakefulness wakefulness)
{
     P *ego = (P *) ego_;
     X(plan_awake)(ego->cld1, wakefulness);
     X(plan_awake)(ego->cldtr, wakefulness);
     X(plan_awake)(ego->cldtm, wakefulness);
}

static void destroy(plan *ego_)
{
     P *ego = (P *) ego_;
     X(plan_destroy_internal)(ego->cldtm);
     X(plan_destroy_internal)(ego->cldtr);
     X(plan_destroy_internal)(ego->cld1);
}

static void print(const plan *ego_, printer *p)
{
     const P *ego = (const P *) ego_;
     p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
              ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
              ego->cld1, ego->cldtr, ego->cldtm);
}

static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
     const S *ego = (const S *) ego_;
     const problem_mpi_transpose *p;
     P *pln;
     plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
     R *I, *O;
     int me, np, r, m;
     INT b;
     MPI_Comm comm2;
     static const plan_adt padt = {
          XM(transpose_solve), awake, print, destroy
     };

     UNUSED(ego);

     if (!applicable(ego, p_, plnr, &r))
          return (plan *) 0;

     p = (const problem_mpi_transpose *) p_;

     MPI_Comm_size(p->comm, &np);
     MPI_Comm_rank(p->comm, &me);
     m = np / r;
     A(r * m == np);

     I = p->I; O = p->O;

     b = XM(block)(p->nx, p->block, me);
     A(p->tblock * np == p->ny); /* this is currently required for cld1 */
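
     /* cld1 below is a purely local rearrangement (a rank-0 rdft problem
        whose tensor strides encode the permutation); the three cases
        differ in how the TRANSPOSED_IN ordering of the input is handled. */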
     if (p->flags & TRANSPOSED_IN) {
          /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
          INT vn = p->vn * b * p->tblock;
          cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_3d)
                                                     (m, r*vn, vn,
                                                      r, vn, m*vn,
                                                      vn, 1, 1),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
          /* b x m x r x bt x vn -> r x m x bt x b x vn */
          INT vn = p->vn;
          INT bt = p->tblock;
          cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_5d)
                                                     (b, m*r*bt*vn, vn,
                                                      m, r*bt*vn, bt*b*vn,
                                                      r, bt*vn, m*bt*b*vn,
                                                      bt, vn, b*vn,
                                                      vn, 1, 1),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     else { /* TRANSPOSED_IN permutation must be separate for in-place */
          /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
          INT vn = p->vn * p->tblock;
          cld1 = X(mkplan_f_d)(plnr,
                               X(mkproblem_rdft_0_d)(X(mktensor_4d)
                                                     (m, r*vn, vn,
                                                      r, vn, m*vn,
                                                      vn, 1, 1,
                                                      b, np*vn, np*vn),
                                                     I, O),
                               0, 0, NO_SLOW);
     }
     if (XM(any_true)(!cld1, p->comm)) goto nada;

     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;

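     /* first recursive transpose: among the r processes in each group of
        consecutive ranks (those sharing the same me / r) */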
     b = XM(block)(p->nx, r * p->block, me / r);
     MPI_Comm_split(p->comm, me / r, me, &comm2);
     if (b)
          cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
                              (b, p->ny, p->vn,
                               O, I, p->block, m * p->tblock, comm2,
                               p->I != p->O
                               ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
     MPI_Comm_free(&comm2);
     if (XM(any_true)(b && !cldtr, p->comm)) goto nada;

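     /* second recursive transpose: among the m = np/r processes that
        share the same me % r */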
     b = XM(block)(p->ny, m * p->tblock, me % r);
     MPI_Comm_split(p->comm, me % r, me, &comm2);
     if (b)
          cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
                              (p->nx, b, p->vn,
                               I, O, r * p->block, p->tblock, comm2,
                               TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
     MPI_Comm_free(&comm2);
     if (XM(any_true)(b && !cldtm, p->comm)) goto nada;

     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);

     pln->cld1 = cld1;
     pln->cldtr = cldtr;
     pln->cldtm = cldtm;
     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
     pln->r = r;
     pln->nam = ego->nam;

     pln->super.super.ops = cld1->ops;
     if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
     if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);

     return &(pln->super.super);

 nada:
     X(plan_destroy_internal)(cldtm);
     X(plan_destroy_internal)(cldtr);
     X(plan_destroy_internal)(cld1);
     return (plan *) 0;
}

static solver *mksolver(int preserve_input,
                        int (*radix)(int np), const char *nam)
{
     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
     S *slv = MKSOLVER(S, &sadt);
     slv->preserve_input = preserve_input;
     slv->radix = radix;
     slv->nam = nam;
     return &(slv->super);
}

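/* register both radix choices ("sqrt" and "first"), each with and
   without the preserve_input constraint */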
void XM(transpose_recurse_register)(planner *p)
{
     int preserve_input;
     for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
          REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
          REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
     }
}