cannam@127: /* cannam@127: * Copyright (c) 2003, 2007-14 Matteo Frigo cannam@127: * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology cannam@127: * cannam@127: * This program is free software; you can redistribute it and/or modify cannam@127: * it under the terms of the GNU General Public License as published by cannam@127: * the Free Software Foundation; either version 2 of the License, or cannam@127: * (at your option) any later version. cannam@127: * cannam@127: * This program is distributed in the hope that it will be useful, cannam@127: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@127: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@127: * GNU General Public License for more details. cannam@127: * cannam@127: * You should have received a copy of the GNU General Public License cannam@127: * along with this program; if not, write to the Free Software cannam@127: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@127: * cannam@127: */ cannam@127: cannam@127: /**********************************************************************/ cannam@127: /* This is a modified and combined version of the sched.c and cannam@127: test_sched.c files shipped with FFTW 2, written to implement and cannam@127: test various all-to-all communications scheduling patterns. cannam@127: cannam@127: It is not used in FFTW 3, but I keep it around in case we ever want cannam@127: to play with this again or to change algorithms. In particular, I cannam@127: used it to implement and test the fill1_comm_sched routine in cannam@127: transpose-pairwise.c, which allows us to create a schedule for one cannam@127: process at a time and is much more compact than the FFTW 2 code. cannam@127: cannam@127: Note that the scheduling algorithm is somewhat modified from that cannam@127: of FFTW 2. Originally, I thought that one "stall" in the schedule cannam@127: was unavoidable for odd numbers of processes, since this is the cannam@127: case for the soccer-timetabling problem. However, because of the cannam@127: self-communication step, we can use the self-communication to fill cannam@127: in the stalls. (Thanks to Ralf Wildenhues for pointing this out.) cannam@127: This greatly simplifies the process re-sorting algorithm. */ cannam@127: cannam@127: /**********************************************************************/ cannam@127: cannam@127: #include cannam@127: #include cannam@127: cannam@127: /* This file contains routines to compute communications schedules for cannam@127: all-to-all communications (complete exchanges) that are performed cannam@127: in-place. (That is, the block that processor x sends to processor cannam@127: y gets replaced on processor x by a block received from processor y.) cannam@127: cannam@127: A schedule, int **sched, is a two-dimensional array where cannam@127: sched[pe][i] is the processor that pe expects to exchange a message cannam@127: with on the i-th step of the exchange. sched[pe][i] == -1 for the cannam@127: i after the last exchange scheduled on pe. cannam@127: cannam@127: Here, processors (pe's, for processing elements), are numbered from cannam@127: 0 to npes-1. cannam@127: cannam@127: There are a couple of constraints that a schedule should satisfy cannam@127: (besides the obvious one that every processor has to communicate cannam@127: with every other processor exactly once). cannam@127: cannam@127: * First, and most importantly, there must be no deadlocks. cannam@127: cannam@127: * Second, we would like to overlap communications as much as possible, cannam@127: so that all exchanges occur in parallel. It turns out that perfect cannam@127: overlap is possible for all number of processes (npes). cannam@127: cannam@127: It turns out that this scheduling problem is actually well-studied, cannam@127: and good solutions are known. The problem is known as a cannam@127: "time-tabling" problem, and is specifically the problem of cannam@127: scheduling a sports competition (where n teams must compete exactly cannam@127: once with every other team). The problem is discussed and cannam@127: algorithms are presented in: cannam@127: cannam@127: [1] J. A. M. Schreuder, "Constructing Timetables for Sport cannam@127: Competitions," Mathematical Programming Study 13, pp. 58-67 (1980). cannam@127: cannam@127: [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint cannam@127: Logic Programming," Proc. of 12th Europ. Conf. on cannam@127: Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996). cannam@127: http://hermes.dis.uniromal.it/~aschaerf/publications.html cannam@127: cannam@127: (These people actually impose a lot of additional constraints that cannam@127: we don't care about, so they are solving harder problems. [1] gives cannam@127: a simple enough algorithm for our purposes, though.) cannam@127: cannam@127: In the timetabling problem, N teams can all play one another in N-1 cannam@127: steps if N is even, and N steps if N is odd. Here, however, cannam@127: there is a "self-communication" step (a team must also "play itself") cannam@127: and so we can always make an optimal N-step schedule regardless of N. cannam@127: cannam@127: However, we have to do more: for a particular processor, the cannam@127: communications schedule must be sorted in ascending or descending cannam@127: order of processor index. (This is necessary so that the data cannam@127: coming in for the transpose does not overwrite data that will be cannam@127: sent later; for that processor the incoming and outgoing blocks are cannam@127: of different non-zero sizes.) Fortunately, because the schedule cannam@127: is stall free, each parallel step of the schedule is independent cannam@127: of every other step, and we can reorder the steps arbitrarily cannam@127: to achieve any desired order on a particular process. cannam@127: */ cannam@127: cannam@127: void free_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: if (sched) { cannam@127: int i; cannam@127: cannam@127: for (i = 0; i < npes; ++i) cannam@127: free(sched[i]); cannam@127: free(sched); cannam@127: } cannam@127: } cannam@127: cannam@127: void empty_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: int i; cannam@127: for (i = 0; i < npes; ++i) cannam@127: sched[i][0] = -1; cannam@127: } cannam@127: cannam@127: extern void fill_comm_schedule(int **sched, int npes); cannam@127: cannam@127: /* Create a new communications schedule for a given number of processors. cannam@127: The schedule is initialized to a deadlock-free, maximum overlap cannam@127: schedule. Returns NULL on an error (may print a message to cannam@127: stderr if there is a program bug detected). */ cannam@127: int **make_comm_schedule(int npes) cannam@127: { cannam@127: int **sched; cannam@127: int i; cannam@127: cannam@127: sched = (int **) malloc(sizeof(int *) * npes); cannam@127: if (!sched) cannam@127: return NULL; cannam@127: cannam@127: for (i = 0; i < npes; ++i) cannam@127: sched[i] = NULL; cannam@127: cannam@127: for (i = 0; i < npes; ++i) { cannam@127: sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1)); cannam@127: if (!sched[i]) { cannam@127: free_comm_schedule(sched,npes); cannam@127: return NULL; cannam@127: } cannam@127: } cannam@127: cannam@127: empty_comm_schedule(sched,npes); cannam@127: fill_comm_schedule(sched,npes); cannam@127: cannam@127: if (!check_comm_schedule(sched,npes)) { cannam@127: free_comm_schedule(sched,npes); cannam@127: return NULL; cannam@127: } cannam@127: cannam@127: return sched; cannam@127: } cannam@127: cannam@127: static void add_dest_to_comm_schedule(int **sched, int pe, int dest) cannam@127: { cannam@127: int i; cannam@127: cannam@127: for (i = 0; sched[pe][i] != -1; ++i) cannam@127: ; cannam@127: cannam@127: sched[pe][i] = dest; cannam@127: sched[pe][i+1] = -1; cannam@127: } cannam@127: cannam@127: static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2) cannam@127: { cannam@127: add_dest_to_comm_schedule(sched, pe1, pe2); cannam@127: if (pe1 != pe2) cannam@127: add_dest_to_comm_schedule(sched, pe2, pe1); cannam@127: } cannam@127: cannam@127: /* Simplification of algorithm presented in [1] (we have fewer cannam@127: constraints). Produces a perfect schedule (npes steps). */ cannam@127: cannam@127: void fill_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: int pe, i, n; cannam@127: cannam@127: if (npes % 2 == 0) { cannam@127: n = npes; cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: add_pair_to_comm_schedule(sched,pe,pe); cannam@127: } cannam@127: else cannam@127: n = npes + 1; cannam@127: cannam@127: for (pe = 0; pe < n - 1; ++pe) { cannam@127: add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe); cannam@127: cannam@127: for (i = 1; i < n/2; ++i) { cannam@127: int pe_a, pe_b; cannam@127: cannam@127: pe_a = pe - i; cannam@127: if (pe_a < 0) cannam@127: pe_a += n - 1; cannam@127: cannam@127: pe_b = (pe + i) % (n - 1); cannam@127: cannam@127: add_pair_to_comm_schedule(sched,pe_a,pe_b); cannam@127: } cannam@127: } cannam@127: } cannam@127: cannam@127: /* given an array sched[npes], fills it with the communications cannam@127: schedule for process pe. */ cannam@127: void fill1_comm_sched(int *sched, int which_pe, int npes) cannam@127: { cannam@127: int pe, i, n, s = 0; cannam@127: if (npes % 2 == 0) { cannam@127: n = npes; cannam@127: sched[s++] = which_pe; cannam@127: } cannam@127: else cannam@127: n = npes + 1; cannam@127: for (pe = 0; pe < n - 1; ++pe) { cannam@127: if (npes % 2 == 0) { cannam@127: if (pe == which_pe) sched[s++] = npes - 1; cannam@127: else if (npes - 1 == which_pe) sched[s++] = pe; cannam@127: } cannam@127: else if (pe == which_pe) sched[s++] = pe; cannam@127: cannam@127: if (pe != which_pe && which_pe < n - 1) { cannam@127: i = (pe - which_pe + (n - 1)) % (n - 1); cannam@127: if (i < n/2) cannam@127: sched[s++] = (pe + i) % (n - 1); cannam@127: cannam@127: i = (which_pe - pe + (n - 1)) % (n - 1); cannam@127: if (i < n/2) cannam@127: sched[s++] = (pe - i + (n - 1)) % (n - 1); cannam@127: } cannam@127: } cannam@127: if (s != npes) { cannam@127: fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n", cannam@127: s, which_pe, npes); cannam@127: exit(EXIT_FAILURE); cannam@127: } cannam@127: } cannam@127: cannam@127: /* sort the communication schedule sched for npes so that the schedule cannam@127: on process sortpe is ascending or descending (!ascending). */ cannam@127: static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending) cannam@127: { cannam@127: int *sortsched, i; cannam@127: sortsched = (int *) malloc(npes * sizeof(int) * 2); cannam@127: fill1_comm_sched(sortsched, sortpe, npes); cannam@127: if (ascending) cannam@127: for (i = 0; i < npes; ++i) cannam@127: sortsched[npes + sortsched[i]] = sched[i]; cannam@127: else cannam@127: for (i = 0; i < npes; ++i) cannam@127: sortsched[2*npes - 1 - sortsched[i]] = sched[i]; cannam@127: for (i = 0; i < npes; ++i) cannam@127: sched[i] = sortsched[npes + i]; cannam@127: free(sortsched); cannam@127: } cannam@127: cannam@127: /* Below, we have various checks in case of bugs: */ cannam@127: cannam@127: /* check for deadlocks by simulating the schedule and looking for cannam@127: cycles in the dependency list; returns 0 if there are deadlocks cannam@127: (or other errors) */ cannam@127: static int check_schedule_deadlock(int **sched, int npes) cannam@127: { cannam@127: int *step, *depend, *visited, pe, pe2, period, done = 0; cannam@127: int counter = 0; cannam@127: cannam@127: /* step[pe] is the step in the schedule that a given pe is on */ cannam@127: step = (int *) malloc(sizeof(int) * npes); cannam@127: cannam@127: /* depend[pe] is the pe' that pe is currently waiting for a message cannam@127: from (-1 if none) */ cannam@127: depend = (int *) malloc(sizeof(int) * npes); cannam@127: cannam@127: /* visited[pe] tells whether we have visited the current pe already cannam@127: when we are looking for cycles. */ cannam@127: visited = (int *) malloc(sizeof(int) * npes); cannam@127: cannam@127: if (!step || !depend || !visited) { cannam@127: free(step); free(depend); free(visited); cannam@127: return 0; cannam@127: } cannam@127: cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: step[pe] = 0; cannam@127: cannam@127: while (!done) { cannam@127: ++counter; cannam@127: cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: depend[pe] = sched[pe][step[pe]]; cannam@127: cannam@127: /* now look for cycles in the dependencies with period > 2: */ cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: if (depend[pe] != -1) { cannam@127: for (pe2 = 0; pe2 < npes; ++pe2) cannam@127: visited[pe2] = 0; cannam@127: cannam@127: period = 0; cannam@127: pe2 = pe; cannam@127: do { cannam@127: visited[pe2] = period + 1; cannam@127: pe2 = depend[pe2]; cannam@127: period++; cannam@127: } while (pe2 != -1 && !visited[pe2]); cannam@127: cannam@127: if (pe2 == -1) { cannam@127: fprintf(stderr, cannam@127: "BUG: unterminated cycle in schedule!\n"); cannam@127: free(step); free(depend); cannam@127: free(visited); cannam@127: return 0; cannam@127: } cannam@127: if (period - (visited[pe2] - 1) > 2) { cannam@127: fprintf(stderr,"BUG: deadlock in schedule!\n"); cannam@127: free(step); free(depend); cannam@127: free(visited); cannam@127: return 0; cannam@127: } cannam@127: cannam@127: if (pe2 == pe) cannam@127: step[pe]++; cannam@127: } cannam@127: cannam@127: done = 1; cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: if (sched[pe][step[pe]] != -1) { cannam@127: done = 0; cannam@127: break; cannam@127: } cannam@127: } cannam@127: cannam@127: free(step); free(depend); free(visited); cannam@127: return (counter > 0 ? counter : 1); cannam@127: } cannam@127: cannam@127: /* sanity checks; prints message and returns 0 on failure. cannam@127: undocumented feature: the return value on success is actually the cannam@127: number of steps required for the schedule to complete, counting cannam@127: stalls. */ cannam@127: int check_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: int pe, i, comm_pe; cannam@127: cannam@127: for (pe = 0; pe < npes; ++pe) { cannam@127: for (comm_pe = 0; comm_pe < npes; ++comm_pe) { cannam@127: for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i) cannam@127: ; cannam@127: if (sched[pe][i] == -1) { cannam@127: fprintf(stderr,"BUG: schedule never sends message from " cannam@127: "%d to %d.\n",pe,comm_pe); cannam@127: return 0; /* never send message to comm_pe */ cannam@127: } cannam@127: } cannam@127: for (i = 0; sched[pe][i] != -1; ++i) cannam@127: ; cannam@127: if (i != npes) { cannam@127: fprintf(stderr,"BUG: schedule sends too many messages from " cannam@127: "%d\n",pe); cannam@127: return 0; cannam@127: } cannam@127: } cannam@127: return check_schedule_deadlock(sched,npes); cannam@127: } cannam@127: cannam@127: /* invert the order of all the schedules; this has no effect on cannam@127: its required properties. */ cannam@127: void invert_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: int pe, i; cannam@127: cannam@127: for (pe = 0; pe < npes; ++pe) cannam@127: for (i = 0; i < npes/2; ++i) { cannam@127: int dummy = sched[pe][i]; cannam@127: sched[pe][i] = sched[pe][npes-1-i]; cannam@127: sched[pe][npes-1-i] = dummy; cannam@127: } cannam@127: } cannam@127: cannam@127: /* Sort the schedule for sort_pe in ascending order of processor cannam@127: index. Unfortunately, for odd npes (when schedule has a stall cannam@127: to begin with) this will introduce an extra stall due to cannam@127: the motion of the self-communication past a stall. We could cannam@127: fix this if it were really important. Actually, we don't cannam@127: get an extra stall when sort_pe == 0 or npes-1, which is sufficient cannam@127: for our purposes. */ cannam@127: void sort_comm_schedule(int **sched, int npes, int sort_pe) cannam@127: { cannam@127: int i,j,pe; cannam@127: cannam@127: /* Note that we can do this sort in O(npes) swaps because we know cannam@127: that the numbers we are sorting are just 0...npes-1. But we'll cannam@127: just do a bubble sort for simplicity here. */ cannam@127: cannam@127: for (i = 0; i < npes - 1; ++i) cannam@127: for (j = i + 1; j < npes; ++j) cannam@127: if (sched[sort_pe][i] > sched[sort_pe][j]) { cannam@127: for (pe = 0; pe < npes; ++pe) { cannam@127: int s = sched[pe][i]; cannam@127: sched[pe][i] = sched[pe][j]; cannam@127: sched[pe][j] = s; cannam@127: } cannam@127: } cannam@127: } cannam@127: cannam@127: /* print the schedule (for debugging purposes) */ cannam@127: void print_comm_schedule(int **sched, int npes) cannam@127: { cannam@127: int pe, i, width; cannam@127: cannam@127: if (npes < 10) cannam@127: width = 1; cannam@127: else if (npes < 100) cannam@127: width = 2; cannam@127: else cannam@127: width = 3; cannam@127: cannam@127: for (pe = 0; pe < npes; ++pe) { cannam@127: printf("pe %*d schedule:", width, pe); cannam@127: for (i = 0; sched[pe][i] != -1; ++i) cannam@127: printf(" %*d",width,sched[pe][i]); cannam@127: printf("\n"); cannam@127: } cannam@127: } cannam@127: cannam@127: int main(int argc, char **argv) cannam@127: { cannam@127: int **sched; cannam@127: int npes = -1, sortpe = -1, steps, i; cannam@127: cannam@127: if (argc >= 2) { cannam@127: npes = atoi(argv[1]); cannam@127: if (npes <= 0) { cannam@127: fprintf(stderr,"npes must be positive!"); cannam@127: return 1; cannam@127: } cannam@127: } cannam@127: if (argc >= 3) { cannam@127: sortpe = atoi(argv[2]); cannam@127: if (sortpe < 0 || sortpe >= npes) { cannam@127: fprintf(stderr,"sortpe must be between 0 and npes-1.\n"); cannam@127: return 1; cannam@127: } cannam@127: } cannam@127: cannam@127: if (npes != -1) { cannam@127: printf("Computing schedule for npes = %d:\n",npes); cannam@127: sched = make_comm_schedule(npes); cannam@127: if (!sched) { cannam@127: fprintf(stderr,"Out of memory!"); cannam@127: return 6; cannam@127: } cannam@127: cannam@127: if (steps = check_comm_schedule(sched,npes)) cannam@127: printf("schedule OK (takes %d steps to complete).\n", steps); cannam@127: else cannam@127: printf("schedule not OK.\n"); cannam@127: cannam@127: print_comm_schedule(sched, npes); cannam@127: cannam@127: if (sortpe != -1) { cannam@127: printf("\nRe-creating schedule for pe = %d...\n", sortpe); cannam@127: int *sched1 = (int*) malloc(sizeof(int) * npes); cannam@127: for (i = 0; i < npes; ++i) sched1[i] = -1; cannam@127: fill1_comm_sched(sched1, sortpe, npes); cannam@127: printf(" ="); cannam@127: for (i = 0; i < npes; ++i) cannam@127: printf(" %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3), cannam@127: sched1[i]); cannam@127: printf("\n"); cannam@127: cannam@127: printf("\nSorting schedule for sortpe = %d...\n", sortpe); cannam@127: sort_comm_schedule(sched,npes,sortpe); cannam@127: cannam@127: if (steps = check_comm_schedule(sched,npes)) cannam@127: printf("schedule OK (takes %d steps to complete).\n", cannam@127: steps); cannam@127: else cannam@127: printf("schedule not OK.\n"); cannam@127: cannam@127: print_comm_schedule(sched, npes); cannam@127: cannam@127: printf("\nInverting schedule...\n"); cannam@127: invert_comm_schedule(sched,npes); cannam@127: cannam@127: if (steps = check_comm_schedule(sched,npes)) cannam@127: printf("schedule OK (takes %d steps to complete).\n", cannam@127: steps); cannam@127: else cannam@127: printf("schedule not OK.\n"); cannam@127: cannam@127: print_comm_schedule(sched, npes); cannam@127: cannam@127: free_comm_schedule(sched,npes); cannam@127: cannam@127: free(sched1); cannam@127: } cannam@127: } cannam@127: else { cannam@127: printf("Doing infinite tests...\n"); cannam@127: for (npes = 1; ; ++npes) { cannam@127: int *sched1 = (int*) malloc(sizeof(int) * npes); cannam@127: printf("npes = %d...",npes); cannam@127: sched = make_comm_schedule(npes); cannam@127: if (!sched) { cannam@127: fprintf(stderr,"Out of memory!\n"); cannam@127: return 5; cannam@127: } cannam@127: for (sortpe = 0; sortpe < npes; ++sortpe) { cannam@127: empty_comm_schedule(sched,npes); cannam@127: fill_comm_schedule(sched,npes); cannam@127: if (!check_comm_schedule(sched,npes)) { cannam@127: fprintf(stderr, cannam@127: "\n -- fill error for sortpe = %d!\n",sortpe); cannam@127: return 2; cannam@127: } cannam@127: cannam@127: for (i = 0; i < npes; ++i) sched1[i] = -1; cannam@127: fill1_comm_sched(sched1, sortpe, npes); cannam@127: for (i = 0; i < npes; ++i) cannam@127: if (sched1[i] != sched[sortpe][i]) cannam@127: fprintf(stderr, cannam@127: "\n -- fill1 error for pe = %d!\n", cannam@127: sortpe); cannam@127: cannam@127: sort_comm_schedule(sched,npes,sortpe); cannam@127: if (!check_comm_schedule(sched,npes)) { cannam@127: fprintf(stderr, cannam@127: "\n -- sort error for sortpe = %d!\n",sortpe); cannam@127: return 3; cannam@127: } cannam@127: invert_comm_schedule(sched,npes); cannam@127: if (!check_comm_schedule(sched,npes)) { cannam@127: fprintf(stderr, cannam@127: "\n -- invert error for sortpe = %d!\n", cannam@127: sortpe); cannam@127: return 4; cannam@127: } cannam@127: } cannam@127: free_comm_schedule(sched,npes); cannam@127: printf("OK\n"); cannam@127: if (npes % 50 == 0) cannam@127: printf("(...Hit Ctrl-C to stop...)\n"); cannam@127: free(sched1); cannam@127: } cannam@127: } cannam@127: cannam@127: return 0; cannam@127: }