Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/mpi/transpose-recurse.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /* Recursive "radix-r" distributed transpose, which breaks a transpose | |
22 over p processes into p/r transposes over r processes plus r | |
23 transposes over p/r processes. If performed recursively, this | |
24 produces a total of O(p log p) messages vs. O(p^2) messages for a | |
25 direct approach. | |
26 | |
27 However, this is not necessarily an improvement. The total size of | |
28 all the messages is actually increased from O(N) to O(N log p) | |
29 where N is the total data size. Also, the amount of local data | |
30 rearrangement is increased. So, it's not clear, a priori, what the | |
31 best algorithm will be, and we'll leave it to the planner. (In | |
32 theory and practice, it looks like this becomes advantageous for | |
33 large p, in the limit where the message sizes are small and | |
34 latency-dominated.) | |
35 */ | |
36 | |
37 #include "mpi-transpose.h" | |
38 #include <string.h> | |
39 | |
40 typedef struct { | |
41 solver super; | |
42 int (*radix)(int np); | |
43 const char *nam; | |
44 int preserve_input; /* preserve input even if DESTROY_INPUT was passed */ | |
45 } S; | |
46 | |
47 typedef struct { | |
48 plan_mpi_transpose super; | |
49 | |
50 plan *cld1, *cldtr, *cldtm; | |
51 int preserve_input; | |
52 | |
53 int r; /* "radix" */ | |
54 const char *nam; | |
55 } P; | |
56 | |
57 static void apply(const plan *ego_, R *I, R *O) | |
58 { | |
59 const P *ego = (const P *) ego_; | |
60 plan_rdft *cld1, *cldtr, *cldtm; | |
61 | |
62 cld1 = (plan_rdft *) ego->cld1; | |
63 if (cld1) cld1->apply((plan *) cld1, I, O); | |
64 | |
65 if (ego->preserve_input) I = O; | |
66 | |
67 cldtr = (plan_rdft *) ego->cldtr; | |
68 if (cldtr) cldtr->apply((plan *) cldtr, O, I); | |
69 | |
70 cldtm = (plan_rdft *) ego->cldtm; | |
71 if (cldtm) cldtm->apply((plan *) cldtm, I, O); | |
72 } | |
73 | |
74 static int radix_sqrt(int np) | |
75 { | |
76 int r; | |
77 for (r = (int) (X(isqrt)(np)); np % r != 0; ++r) | |
78 ; | |
79 return r; | |
80 } | |
81 | |
82 static int radix_first(int np) | |
83 { | |
84 int r = (int) (X(first_divisor)(np)); | |
85 return (r >= (int) (X(isqrt)(np)) ? 0 : r); | |
86 } | |
87 | |
88 /* the local allocated space on process pe required for the given transpose | |
89 dimensions and block sizes */ | |
90 static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe) | |
91 { | |
92 return X(imax)(XM(block)(nx, block, pe) * ny, | |
93 nx * XM(block)(ny, tblock, pe)); | |
94 } | |
95 | |
96 /* check whether the recursive transposes fit within the space | |
97 that must have been allocated on each process for this transpose; | |
98 this must be modified if the subdivision in mkplan is changed! */ | |
99 static int enough_space(INT nx, INT ny, INT block, INT tblock, | |
100 int r, int n_pes) | |
101 { | |
102 int pe; | |
103 int m = n_pes / r; | |
104 for (pe = 0; pe < n_pes; ++pe) { | |
105 INT space = transpose_space(nx, ny, block, tblock, pe); | |
106 INT b1 = XM(block)(nx, r * block, pe / r); | |
107 INT b2 = XM(block)(ny, m * tblock, pe % r); | |
108 if (transpose_space(b1, ny, block, m*tblock, pe % r) > space | |
109 || transpose_space(nx, b2, r*block, tblock, pe / r) > space) | |
110 return 0; | |
111 } | |
112 return 1; | |
113 } | |
114 | |
115 /* In theory, transpose-recurse becomes advantageous for message sizes | |
116 below some minimum, assuming that the time is dominated by | |
117 communications. In practice, we want to constrain the minimum | |
118 message size for transpose-recurse to keep the planning time down. | |
119 I've set this conservatively according to some simple experiments | |
120 on a Cray XT3 where the crossover message size was 128, although on | |
121 a larger-latency machine the crossover will be larger. */ | |
122 #define SMALL_MESSAGE 2048 | |
123 | |
124 static int applicable(const S *ego, const problem *p_, | |
125 const planner *plnr, int *r) | |
126 { | |
127 const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_; | |
128 int n_pes; | |
129 MPI_Comm_size(p->comm, &n_pes); | |
130 return (1 | |
131 && p->tblock * n_pes == p->ny | |
132 && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr) | |
133 && p->I != p->O)) | |
134 && (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1 | |
135 && enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes) | |
136 && (!CONSERVE_MEMORYP(plnr) || *r > 8 | |
137 || !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r)) | |
138 && (!NO_SLOWP(plnr) || | |
139 (p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE) | |
140 && ONLY_TRANSPOSEDP(p->flags) | |
141 ); | |
142 } | |
143 | |
144 static void awake(plan *ego_, enum wakefulness wakefulness) | |
145 { | |
146 P *ego = (P *) ego_; | |
147 X(plan_awake)(ego->cld1, wakefulness); | |
148 X(plan_awake)(ego->cldtr, wakefulness); | |
149 X(plan_awake)(ego->cldtm, wakefulness); | |
150 } | |
151 | |
152 static void destroy(plan *ego_) | |
153 { | |
154 P *ego = (P *) ego_; | |
155 X(plan_destroy_internal)(ego->cldtm); | |
156 X(plan_destroy_internal)(ego->cldtr); | |
157 X(plan_destroy_internal)(ego->cld1); | |
158 } | |
159 | |
160 static void print(const plan *ego_, printer *p) | |
161 { | |
162 const P *ego = (const P *) ego_; | |
163 p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))", | |
164 ego->nam, ego->r, ego->preserve_input==2 ?"/p":"", | |
165 ego->cld1, ego->cldtr, ego->cldtm); | |
166 } | |
167 | |
168 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) | |
169 { | |
170 const S *ego = (const S *) ego_; | |
171 const problem_mpi_transpose *p; | |
172 P *pln; | |
173 plan *cld1 = 0, *cldtr = 0, *cldtm = 0; | |
174 R *I, *O; | |
175 int me, np, r, m; | |
176 INT b; | |
177 MPI_Comm comm2; | |
178 static const plan_adt padt = { | |
179 XM(transpose_solve), awake, print, destroy | |
180 }; | |
181 | |
182 UNUSED(ego); | |
183 | |
184 if (!applicable(ego, p_, plnr, &r)) | |
185 return (plan *) 0; | |
186 | |
187 p = (const problem_mpi_transpose *) p_; | |
188 | |
189 MPI_Comm_size(p->comm, &np); | |
190 MPI_Comm_rank(p->comm, &me); | |
191 m = np / r; | |
192 A(r * m == np); | |
193 | |
194 I = p->I; O = p->O; | |
195 | |
196 b = XM(block)(p->nx, p->block, me); | |
197 A(p->tblock * np == p->ny); /* this is currently required for cld1 */ | |
198 if (p->flags & TRANSPOSED_IN) { | |
199 /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */ | |
200 INT vn = p->vn * b * p->tblock; | |
201 cld1 = X(mkplan_f_d)(plnr, | |
202 X(mkproblem_rdft_0_d)(X(mktensor_3d) | |
203 (m, r*vn, vn, | |
204 r, vn, m*vn, | |
205 vn, 1, 1), | |
206 I, O), | |
207 0, 0, NO_SLOW); | |
208 } | |
209 else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */ | |
210 /* b x m x r x bt x vn -> r x m x bt x b x vn */ | |
211 INT vn = p->vn; | |
212 INT bt = p->tblock; | |
213 cld1 = X(mkplan_f_d)(plnr, | |
214 X(mkproblem_rdft_0_d)(X(mktensor_5d) | |
215 (b, m*r*bt*vn, vn, | |
216 m, r*bt*vn, bt*b*vn, | |
217 r, bt*vn, m*bt*b*vn, | |
218 bt, vn, b*vn, | |
219 vn, 1, 1), | |
220 I, O), | |
221 0, 0, NO_SLOW); | |
222 } | |
223 else { /* TRANSPOSED_IN permutation must be separate for in-place */ | |
224 /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */ | |
225 INT vn = p->vn * p->tblock; | |
226 cld1 = X(mkplan_f_d)(plnr, | |
227 X(mkproblem_rdft_0_d)(X(mktensor_4d) | |
228 (m, r*vn, vn, | |
229 r, vn, m*vn, | |
230 vn, 1, 1, | |
231 b, np*vn, np*vn), | |
232 I, O), | |
233 0, 0, NO_SLOW); | |
234 } | |
235 if (XM(any_true)(!cld1, p->comm)) goto nada; | |
236 | |
237 if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O; | |
238 | |
239 b = XM(block)(p->nx, r * p->block, me / r); | |
240 MPI_Comm_split(p->comm, me / r, me, &comm2); | |
241 if (b) | |
242 cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose) | |
243 (b, p->ny, p->vn, | |
244 O, I, p->block, m * p->tblock, comm2, | |
245 p->I != p->O | |
246 ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN))); | |
247 MPI_Comm_free(&comm2); | |
248 if (XM(any_true)(b && !cldtr, p->comm)) goto nada; | |
249 | |
250 b = XM(block)(p->ny, m * p->tblock, me % r); | |
251 MPI_Comm_split(p->comm, me % r, me, &comm2); | |
252 if (b) | |
253 cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose) | |
254 (p->nx, b, p->vn, | |
255 I, O, r * p->block, p->tblock, comm2, | |
256 TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT))); | |
257 MPI_Comm_free(&comm2); | |
258 if (XM(any_true)(b && !cldtm, p->comm)) goto nada; | |
259 | |
260 pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply); | |
261 | |
262 pln->cld1 = cld1; | |
263 pln->cldtr = cldtr; | |
264 pln->cldtm = cldtm; | |
265 pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr); | |
266 pln->r = r; | |
267 pln->nam = ego->nam; | |
268 | |
269 pln->super.super.ops = cld1->ops; | |
270 if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops); | |
271 if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops); | |
272 | |
273 return &(pln->super.super); | |
274 | |
275 nada: | |
276 X(plan_destroy_internal)(cldtm); | |
277 X(plan_destroy_internal)(cldtr); | |
278 X(plan_destroy_internal)(cld1); | |
279 return (plan *) 0; | |
280 } | |
281 | |
282 static solver *mksolver(int preserve_input, | |
283 int (*radix)(int np), const char *nam) | |
284 { | |
285 static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 }; | |
286 S *slv = MKSOLVER(S, &sadt); | |
287 slv->preserve_input = preserve_input; | |
288 slv->radix = radix; | |
289 slv->nam = nam; | |
290 return &(slv->super); | |
291 } | |
292 | |
293 void XM(transpose_recurse_register)(planner *p) | |
294 { | |
295 int preserve_input; | |
296 for (preserve_input = 0; preserve_input <= 1; ++preserve_input) { | |
297 REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt")); | |
298 REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first")); | |
299 } | |
300 } |