Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/mpi/testsched.c @ 10:37bf6b4a2645
Add FFTW3
author | Chris Cannam |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:c0fb53affa76 | 10:37bf6b4a2645 |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
3 * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 /**********************************************************************/ | |
22 /* This is a modified and combined version of the sched.c and | |
23 test_sched.c files shipped with FFTW 2, written to implement and | |
24 test various all-to-all communications scheduling patterns. | |
25 | |
26 It is not used in FFTW 3, but I keep it around in case we ever want | |
27 to play with this again or to change algorithms. In particular, I | |
28 used it to implement and test the fill1_comm_sched routine in | |
29 transpose-pairwise.c, which allows us to create a schedule for one | |
30 process at a time and is much more compact than the FFTW 2 code. | |
31 | |
32 Note that the scheduling algorithm is somewhat modified from that | |
33 of FFTW 2. Originally, I thought that one "stall" in the schedule | |
34 was unavoidable for odd numbers of processes, since this is the | |
35 case for the soccer-timetabling problem. However, because of the | |
36 self-communication step, we can use the self-communication to fill | |
37 in the stalls. (Thanks to Ralf Wildenhues for pointing this out.) | |
38 This greatly simplifies the process re-sorting algorithm. */ | |
39 | |
40 /**********************************************************************/ | |
41 | |
42 #include <stdio.h> | |
43 #include <stdlib.h> | |
44 | |
45 /* This file contains routines to compute communications schedules for | |
46 all-to-all communications (complete exchanges) that are performed | |
47 in-place. (That is, the block that processor x sends to processor | |
48 y gets replaced on processor x by a block received from processor y.) | |
49 | |
50 A schedule, int **sched, is a two-dimensional array where | |
51 sched[pe][i] is the processor that pe expects to exchange a message | |
52 with on the i-th step of the exchange. sched[pe][i] == -1 for the | |
53 i after the last exchange scheduled on pe. | |
54 | |
55 Here, processors (pe's, for processing elements), are numbered from | |
56 0 to npes-1. | |
57 | |
58 There are a couple of constraints that a schedule should satisfy | |
59 (besides the obvious one that every processor has to communicate | |
60 with every other processor exactly once). | |
61 | |
62 * First, and most importantly, there must be no deadlocks. | |
63 | |
64 * Second, we would like to overlap communications as much as possible, | |
65 so that all exchanges occur in parallel. It turns out that perfect | |
66 overlap is possible for all number of processes (npes). | |
67 | |
68 It turns out that this scheduling problem is actually well-studied, | |
69 and good solutions are known. The problem is known as a | |
70 "time-tabling" problem, and is specifically the problem of | |
71 scheduling a sports competition (where n teams must compete exactly | |
72 once with every other team). The problem is discussed and | |
73 algorithms are presented in: | |
74 | |
75 [1] J. A. M. Schreuder, "Constructing Timetables for Sport | |
76 Competitions," Mathematical Programming Study 13, pp. 58-67 (1980). | |
77 | |
78 [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint | |
79 Logic Programming," Proc. of 12th Europ. Conf. on | |
80 Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996). | |
81 http://hermes.dis.uniromal.it/~aschaerf/publications.html | |
82 | |
83 (These people actually impose a lot of additional constraints that | |
84 we don't care about, so they are solving harder problems. [1] gives | |
85 a simple enough algorithm for our purposes, though.) | |
86 | |
87 In the timetabling problem, N teams can all play one another in N-1 | |
88 steps if N is even, and N steps if N is odd. Here, however, | |
89 there is a "self-communication" step (a team must also "play itself") | |
90 and so we can always make an optimal N-step schedule regardless of N. | |
91 | |
92 However, we have to do more: for a particular processor, the | |
93 communications schedule must be sorted in ascending or descending | |
94 order of processor index. (This is necessary so that the data | |
95 coming in for the transpose does not overwrite data that will be | |
96 sent later; for that processor the incoming and outgoing blocks are | |
97 of different non-zero sizes.) Fortunately, because the schedule | |
98 is stall free, each parallel step of the schedule is independent | |
99 of every other step, and we can reorder the steps arbitrarily | |
100 to achieve any desired order on a particular process. | |
101 */ | |
102 | |
103 void free_comm_schedule(int **sched, int npes) | |
104 { | |
105 if (sched) { | |
106 int i; | |
107 | |
108 for (i = 0; i < npes; ++i) | |
109 free(sched[i]); | |
110 free(sched); | |
111 } | |
112 } | |
113 | |
114 void empty_comm_schedule(int **sched, int npes) | |
115 { | |
116 int i; | |
117 for (i = 0; i < npes; ++i) | |
118 sched[i][0] = -1; | |
119 } | |
120 | |
121 extern void fill_comm_schedule(int **sched, int npes); | |
122 | |
123 /* Create a new communications schedule for a given number of processors. | |
124 The schedule is initialized to a deadlock-free, maximum overlap | |
125 schedule. Returns NULL on an error (may print a message to | |
126 stderr if there is a program bug detected). */ | |
127 int **make_comm_schedule(int npes) | |
128 { | |
129 int **sched; | |
130 int i; | |
131 | |
132 sched = (int **) malloc(sizeof(int *) * npes); | |
133 if (!sched) | |
134 return NULL; | |
135 | |
136 for (i = 0; i < npes; ++i) | |
137 sched[i] = NULL; | |
138 | |
139 for (i = 0; i < npes; ++i) { | |
140 sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1)); | |
141 if (!sched[i]) { | |
142 free_comm_schedule(sched,npes); | |
143 return NULL; | |
144 } | |
145 } | |
146 | |
147 empty_comm_schedule(sched,npes); | |
148 fill_comm_schedule(sched,npes); | |
149 | |
150 if (!check_comm_schedule(sched,npes)) { | |
151 free_comm_schedule(sched,npes); | |
152 return NULL; | |
153 } | |
154 | |
155 return sched; | |
156 } | |
157 | |
158 static void add_dest_to_comm_schedule(int **sched, int pe, int dest) | |
159 { | |
160 int i; | |
161 | |
162 for (i = 0; sched[pe][i] != -1; ++i) | |
163 ; | |
164 | |
165 sched[pe][i] = dest; | |
166 sched[pe][i+1] = -1; | |
167 } | |
168 | |
169 static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2) | |
170 { | |
171 add_dest_to_comm_schedule(sched, pe1, pe2); | |
172 if (pe1 != pe2) | |
173 add_dest_to_comm_schedule(sched, pe2, pe1); | |
174 } | |
175 | |
176 /* Simplification of algorithm presented in [1] (we have fewer | |
177 constraints). Produces a perfect schedule (npes steps). */ | |
178 | |
179 void fill_comm_schedule(int **sched, int npes) | |
180 { | |
181 int pe, i, n; | |
182 | |
183 if (npes % 2 == 0) { | |
184 n = npes; | |
185 for (pe = 0; pe < npes; ++pe) | |
186 add_pair_to_comm_schedule(sched,pe,pe); | |
187 } | |
188 else | |
189 n = npes + 1; | |
190 | |
191 for (pe = 0; pe < n - 1; ++pe) { | |
192 add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe); | |
193 | |
194 for (i = 1; i < n/2; ++i) { | |
195 int pe_a, pe_b; | |
196 | |
197 pe_a = pe - i; | |
198 if (pe_a < 0) | |
199 pe_a += n - 1; | |
200 | |
201 pe_b = (pe + i) % (n - 1); | |
202 | |
203 add_pair_to_comm_schedule(sched,pe_a,pe_b); | |
204 } | |
205 } | |
206 } | |
207 | |
208 /* given an array sched[npes], fills it with the communications | |
209 schedule for process pe. */ | |
210 void fill1_comm_sched(int *sched, int which_pe, int npes) | |
211 { | |
212 int pe, i, n, s = 0; | |
213 if (npes % 2 == 0) { | |
214 n = npes; | |
215 sched[s++] = which_pe; | |
216 } | |
217 else | |
218 n = npes + 1; | |
219 for (pe = 0; pe < n - 1; ++pe) { | |
220 if (npes % 2 == 0) { | |
221 if (pe == which_pe) sched[s++] = npes - 1; | |
222 else if (npes - 1 == which_pe) sched[s++] = pe; | |
223 } | |
224 else if (pe == which_pe) sched[s++] = pe; | |
225 | |
226 if (pe != which_pe && which_pe < n - 1) { | |
227 i = (pe - which_pe + (n - 1)) % (n - 1); | |
228 if (i < n/2) | |
229 sched[s++] = (pe + i) % (n - 1); | |
230 | |
231 i = (which_pe - pe + (n - 1)) % (n - 1); | |
232 if (i < n/2) | |
233 sched[s++] = (pe - i + (n - 1)) % (n - 1); | |
234 } | |
235 } | |
236 if (s != npes) { | |
237 fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n", | |
238 s, which_pe, npes); | |
239 exit(EXIT_FAILURE); | |
240 } | |
241 } | |
242 | |
243 /* sort the communication schedule sched for npes so that the schedule | |
244 on process sortpe is ascending or descending (!ascending). */ | |
245 static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending) | |
246 { | |
247 int *sortsched, i; | |
248 sortsched = (int *) malloc(npes * sizeof(int) * 2); | |
249 fill1_comm_sched(sortsched, sortpe, npes); | |
250 if (ascending) | |
251 for (i = 0; i < npes; ++i) | |
252 sortsched[npes + sortsched[i]] = sched[i]; | |
253 else | |
254 for (i = 0; i < npes; ++i) | |
255 sortsched[2*npes - 1 - sortsched[i]] = sched[i]; | |
256 for (i = 0; i < npes; ++i) | |
257 sched[i] = sortsched[npes + i]; | |
258 free(sortsched); | |
259 } | |
260 | |
261 /* Below, we have various checks in case of bugs: */ | |
262 | |
263 /* check for deadlocks by simulating the schedule and looking for | |
264 cycles in the dependency list; returns 0 if there are deadlocks | |
265 (or other errors) */ | |
266 static int check_schedule_deadlock(int **sched, int npes) | |
267 { | |
268 int *step, *depend, *visited, pe, pe2, period, done = 0; | |
269 int counter = 0; | |
270 | |
271 /* step[pe] is the step in the schedule that a given pe is on */ | |
272 step = (int *) malloc(sizeof(int) * npes); | |
273 | |
274 /* depend[pe] is the pe' that pe is currently waiting for a message | |
275 from (-1 if none) */ | |
276 depend = (int *) malloc(sizeof(int) * npes); | |
277 | |
278 /* visited[pe] tells whether we have visited the current pe already | |
279 when we are looking for cycles. */ | |
280 visited = (int *) malloc(sizeof(int) * npes); | |
281 | |
282 if (!step || !depend || !visited) { | |
283 free(step); free(depend); free(visited); | |
284 return 0; | |
285 } | |
286 | |
287 for (pe = 0; pe < npes; ++pe) | |
288 step[pe] = 0; | |
289 | |
290 while (!done) { | |
291 ++counter; | |
292 | |
293 for (pe = 0; pe < npes; ++pe) | |
294 depend[pe] = sched[pe][step[pe]]; | |
295 | |
296 /* now look for cycles in the dependencies with period > 2: */ | |
297 for (pe = 0; pe < npes; ++pe) | |
298 if (depend[pe] != -1) { | |
299 for (pe2 = 0; pe2 < npes; ++pe2) | |
300 visited[pe2] = 0; | |
301 | |
302 period = 0; | |
303 pe2 = pe; | |
304 do { | |
305 visited[pe2] = period + 1; | |
306 pe2 = depend[pe2]; | |
307 period++; | |
308 } while (pe2 != -1 && !visited[pe2]); | |
309 | |
310 if (pe2 == -1) { | |
311 fprintf(stderr, | |
312 "BUG: unterminated cycle in schedule!\n"); | |
313 free(step); free(depend); | |
314 free(visited); | |
315 return 0; | |
316 } | |
317 if (period - (visited[pe2] - 1) > 2) { | |
318 fprintf(stderr,"BUG: deadlock in schedule!\n"); | |
319 free(step); free(depend); | |
320 free(visited); | |
321 return 0; | |
322 } | |
323 | |
324 if (pe2 == pe) | |
325 step[pe]++; | |
326 } | |
327 | |
328 done = 1; | |
329 for (pe = 0; pe < npes; ++pe) | |
330 if (sched[pe][step[pe]] != -1) { | |
331 done = 0; | |
332 break; | |
333 } | |
334 } | |
335 | |
336 free(step); free(depend); free(visited); | |
337 return (counter > 0 ? counter : 1); | |
338 } | |
339 | |
340 /* sanity checks; prints message and returns 0 on failure. | |
341 undocumented feature: the return value on success is actually the | |
342 number of steps required for the schedule to complete, counting | |
343 stalls. */ | |
344 int check_comm_schedule(int **sched, int npes) | |
345 { | |
346 int pe, i, comm_pe; | |
347 | |
348 for (pe = 0; pe < npes; ++pe) { | |
349 for (comm_pe = 0; comm_pe < npes; ++comm_pe) { | |
350 for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i) | |
351 ; | |
352 if (sched[pe][i] == -1) { | |
353 fprintf(stderr,"BUG: schedule never sends message from " | |
354 "%d to %d.\n",pe,comm_pe); | |
355 return 0; /* never send message to comm_pe */ | |
356 } | |
357 } | |
358 for (i = 0; sched[pe][i] != -1; ++i) | |
359 ; | |
360 if (i != npes) { | |
361 fprintf(stderr,"BUG: schedule sends too many messages from " | |
362 "%d\n",pe); | |
363 return 0; | |
364 } | |
365 } | |
366 return check_schedule_deadlock(sched,npes); | |
367 } | |
368 | |
369 /* invert the order of all the schedules; this has no effect on | |
370 its required properties. */ | |
371 void invert_comm_schedule(int **sched, int npes) | |
372 { | |
373 int pe, i; | |
374 | |
375 for (pe = 0; pe < npes; ++pe) | |
376 for (i = 0; i < npes/2; ++i) { | |
377 int dummy = sched[pe][i]; | |
378 sched[pe][i] = sched[pe][npes-1-i]; | |
379 sched[pe][npes-1-i] = dummy; | |
380 } | |
381 } | |
382 | |
383 /* Sort the schedule for sort_pe in ascending order of processor | |
384 index. Unfortunately, for odd npes (when schedule has a stall | |
385 to begin with) this will introduce an extra stall due to | |
386 the motion of the self-communication past a stall. We could | |
387 fix this if it were really important. Actually, we don't | |
388 get an extra stall when sort_pe == 0 or npes-1, which is sufficient | |
389 for our purposes. */ | |
390 void sort_comm_schedule(int **sched, int npes, int sort_pe) | |
391 { | |
392 int i,j,pe; | |
393 | |
394 /* Note that we can do this sort in O(npes) swaps because we know | |
395 that the numbers we are sorting are just 0...npes-1. But we'll | |
396 just do a bubble sort for simplicity here. */ | |
397 | |
398 for (i = 0; i < npes - 1; ++i) | |
399 for (j = i + 1; j < npes; ++j) | |
400 if (sched[sort_pe][i] > sched[sort_pe][j]) { | |
401 for (pe = 0; pe < npes; ++pe) { | |
402 int s = sched[pe][i]; | |
403 sched[pe][i] = sched[pe][j]; | |
404 sched[pe][j] = s; | |
405 } | |
406 } | |
407 } | |
408 | |
409 /* print the schedule (for debugging purposes) */ | |
410 void print_comm_schedule(int **sched, int npes) | |
411 { | |
412 int pe, i, width; | |
413 | |
414 if (npes < 10) | |
415 width = 1; | |
416 else if (npes < 100) | |
417 width = 2; | |
418 else | |
419 width = 3; | |
420 | |
421 for (pe = 0; pe < npes; ++pe) { | |
422 printf("pe %*d schedule:", width, pe); | |
423 for (i = 0; sched[pe][i] != -1; ++i) | |
424 printf(" %*d",width,sched[pe][i]); | |
425 printf("\n"); | |
426 } | |
427 } | |
428 | |
429 int main(int argc, char **argv) | |
430 { | |
431 int **sched; | |
432 int npes = -1, sortpe = -1, steps, i; | |
433 | |
434 if (argc >= 2) { | |
435 npes = atoi(argv[1]); | |
436 if (npes <= 0) { | |
437 fprintf(stderr,"npes must be positive!"); | |
438 return 1; | |
439 } | |
440 } | |
441 if (argc >= 3) { | |
442 sortpe = atoi(argv[2]); | |
443 if (sortpe < 0 || sortpe >= npes) { | |
444 fprintf(stderr,"sortpe must be between 0 and npes-1.\n"); | |
445 return 1; | |
446 } | |
447 } | |
448 | |
449 if (npes != -1) { | |
450 printf("Computing schedule for npes = %d:\n",npes); | |
451 sched = make_comm_schedule(npes); | |
452 if (!sched) { | |
453 fprintf(stderr,"Out of memory!"); | |
454 return 6; | |
455 } | |
456 | |
457 if (steps = check_comm_schedule(sched,npes)) | |
458 printf("schedule OK (takes %d steps to complete).\n", steps); | |
459 else | |
460 printf("schedule not OK.\n"); | |
461 | |
462 print_comm_schedule(sched, npes); | |
463 | |
464 if (sortpe != -1) { | |
465 printf("\nRe-creating schedule for pe = %d...\n", sortpe); | |
466 int *sched1 = (int*) malloc(sizeof(int) * npes); | |
467 for (i = 0; i < npes; ++i) sched1[i] = -1; | |
468 fill1_comm_sched(sched1, sortpe, npes); | |
469 printf(" ="); | |
470 for (i = 0; i < npes; ++i) | |
471 printf(" %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3), | |
472 sched1[i]); | |
473 printf("\n"); | |
474 | |
475 printf("\nSorting schedule for sortpe = %d...\n", sortpe); | |
476 sort_comm_schedule(sched,npes,sortpe); | |
477 | |
478 if (steps = check_comm_schedule(sched,npes)) | |
479 printf("schedule OK (takes %d steps to complete).\n", | |
480 steps); | |
481 else | |
482 printf("schedule not OK.\n"); | |
483 | |
484 print_comm_schedule(sched, npes); | |
485 | |
486 printf("\nInverting schedule...\n"); | |
487 invert_comm_schedule(sched,npes); | |
488 | |
489 if (steps = check_comm_schedule(sched,npes)) | |
490 printf("schedule OK (takes %d steps to complete).\n", | |
491 steps); | |
492 else | |
493 printf("schedule not OK.\n"); | |
494 | |
495 print_comm_schedule(sched, npes); | |
496 | |
497 free_comm_schedule(sched,npes); | |
498 | |
499 free(sched1); | |
500 } | |
501 } | |
502 else { | |
503 printf("Doing infinite tests...\n"); | |
504 for (npes = 1; ; ++npes) { | |
505 int *sched1 = (int*) malloc(sizeof(int) * npes); | |
506 printf("npes = %d...",npes); | |
507 sched = make_comm_schedule(npes); | |
508 if (!sched) { | |
509 fprintf(stderr,"Out of memory!\n"); | |
510 return 5; | |
511 } | |
512 for (sortpe = 0; sortpe < npes; ++sortpe) { | |
513 empty_comm_schedule(sched,npes); | |
514 fill_comm_schedule(sched,npes); | |
515 if (!check_comm_schedule(sched,npes)) { | |
516 fprintf(stderr, | |
517 "\n -- fill error for sortpe = %d!\n",sortpe); | |
518 return 2; | |
519 } | |
520 | |
521 for (i = 0; i < npes; ++i) sched1[i] = -1; | |
522 fill1_comm_sched(sched1, sortpe, npes); | |
523 for (i = 0; i < npes; ++i) | |
524 if (sched1[i] != sched[sortpe][i]) | |
525 fprintf(stderr, | |
526 "\n -- fill1 error for pe = %d!\n", | |
527 sortpe); | |
528 | |
529 sort_comm_schedule(sched,npes,sortpe); | |
530 if (!check_comm_schedule(sched,npes)) { | |
531 fprintf(stderr, | |
532 "\n -- sort error for sortpe = %d!\n",sortpe); | |
533 return 3; | |
534 } | |
535 invert_comm_schedule(sched,npes); | |
536 if (!check_comm_schedule(sched,npes)) { | |
537 fprintf(stderr, | |
538 "\n -- invert error for sortpe = %d!\n", | |
539 sortpe); | |
540 return 4; | |
541 } | |
542 } | |
543 free_comm_schedule(sched,npes); | |
544 printf("OK\n"); | |
545 if (npes % 50 == 0) | |
546 printf("(...Hit Ctrl-C to stop...)\n"); | |
547 free(sched1); | |
548 } | |
549 } | |
550 | |
551 return 0; | |
552 } |