| Chris@10 | 1 /* | 
| Chris@10 | 2  * Copyright (c) 2003, 2007-11 Matteo Frigo | 
| Chris@10 | 3  * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology | 
| Chris@10 | 4  * | 
| Chris@10 | 5  * This program is free software; you can redistribute it and/or modify | 
| Chris@10 | 6  * it under the terms of the GNU General Public License as published by | 
| Chris@10 | 7  * the Free Software Foundation; either version 2 of the License, or | 
| Chris@10 | 8  * (at your option) any later version. | 
| Chris@10 | 9  * | 
| Chris@10 | 10  * This program is distributed in the hope that it will be useful, | 
| Chris@10 | 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| Chris@10 | 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| Chris@10 | 13  * GNU General Public License for more details. | 
| Chris@10 | 14  * | 
| Chris@10 | 15  * You should have received a copy of the GNU General Public License | 
| Chris@10 | 16  * along with this program; if not, write to the Free Software | 
| Chris@10 | 17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA | 
| Chris@10 | 18  * | 
| Chris@10 | 19  */ | 
| Chris@10 | 20 | 
| Chris@10 | 21 /**********************************************************************/ | 
| Chris@10 | 22 /* This is a modified and combined version of the sched.c and | 
| Chris@10 | 23    test_sched.c files shipped with FFTW 2, written to implement and | 
| Chris@10 | 24    test various all-to-all communications scheduling patterns. | 
| Chris@10 | 25 | 
| Chris@10 | 26    It is not used in FFTW 3, but I keep it around in case we ever want | 
| Chris@10 | 27    to play with this again or to change algorithms.  In particular, I | 
| Chris@10 | 28    used it to implement and test the fill1_comm_sched routine in | 
| Chris@10 | 29    transpose-pairwise.c, which allows us to create a schedule for one | 
| Chris@10 | 30    process at a time and is much more compact than the FFTW 2 code. | 
| Chris@10 | 31 | 
| Chris@10 | 32    Note that the scheduling algorithm is somewhat modified from that | 
| Chris@10 | 33    of FFTW 2.  Originally, I thought that one "stall" in the schedule | 
| Chris@10 | 34    was unavoidable for odd numbers of processes, since this is the | 
| Chris@10 | 35    case for the soccer-timetabling problem.  However, because of the | 
| Chris@10 | 36    self-communication step, we can use the self-communication to fill | 
| Chris@10 | 37    in the stalls.  (Thanks to Ralf Wildenhues for pointing this out.) | 
| Chris@10 | 38    This greatly simplifies the process re-sorting algorithm. */ | 
| Chris@10 | 39 | 
| Chris@10 | 40 /**********************************************************************/ | 
| Chris@10 | 41 | 
| Chris@10 | 42 #include <stdio.h> | 
| Chris@10 | 43 #include <stdlib.h> | 
| Chris@10 | 44 | 
| Chris@10 | 45 /* This file contains routines to compute communications schedules for | 
| Chris@10 | 46    all-to-all communications (complete exchanges) that are performed | 
| Chris@10 | 47    in-place.  (That is, the block that processor x sends to processor | 
| Chris@10 | 48    y gets replaced on processor x by a block received from processor y.) | 
| Chris@10 | 49 | 
| Chris@10 | 50    A schedule, int **sched, is a two-dimensional array where | 
| Chris@10 | 51    sched[pe][i] is the processor that pe expects to exchange a message | 
| Chris@10 | 52    with on the i-th step of the exchange.  sched[pe][i] == -1 for the | 
| Chris@10 | 53    i after the last exchange scheduled on pe. | 
| Chris@10 | 54 | 
| Chris@10 | 55    Here, processors (pe's, for processing elements), are numbered from | 
| Chris@10 | 56    0 to npes-1. | 
| Chris@10 | 57 | 
| Chris@10 | 58    There are a couple of constraints that a schedule should satisfy | 
| Chris@10 | 59    (besides the obvious one that every processor has to communicate | 
| Chris@10 | 60    with every other processor exactly once). | 
| Chris@10 | 61 | 
| Chris@10 | 62    * First, and most importantly, there must be no deadlocks. | 
| Chris@10 | 63 | 
| Chris@10 | 64    * Second, we would like to overlap communications as much as possible, | 
| Chris@10 | 65    so that all exchanges occur in parallel.  It turns out that perfect | 
| Chris@10 | 66    overlap is possible for all number of processes (npes). | 
| Chris@10 | 67 | 
| Chris@10 | 68    It turns out that this scheduling problem is actually well-studied, | 
| Chris@10 | 69    and good solutions are known.  The problem is known as a | 
| Chris@10 | 70    "time-tabling" problem, and is specifically the problem of | 
| Chris@10 | 71    scheduling a sports competition (where n teams must compete exactly | 
| Chris@10 | 72    once with every other team).  The problem is discussed and | 
| Chris@10 | 73    algorithms are presented in: | 
| Chris@10 | 74 | 
| Chris@10 | 75    [1] J. A. M. Schreuder, "Constructing Timetables for Sport | 
| Chris@10 | 76    Competitions," Mathematical Programming Study 13, pp. 58-67 (1980). | 
| Chris@10 | 77 | 
| Chris@10 | 78    [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint | 
| Chris@10 | 79    Logic Programming," Proc. of 12th Europ. Conf. on | 
| Chris@10 | 80    Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996). | 
| Chris@10 | 81    http://hermes.dis.uniromal.it/~aschaerf/publications.html | 
| Chris@10 | 82 | 
| Chris@10 | 83    (These people actually impose a lot of additional constraints that | 
| Chris@10 | 84    we don't care about, so they are solving harder problems. [1] gives | 
| Chris@10 | 85    a simple enough algorithm for our purposes, though.) | 
| Chris@10 | 86 | 
| Chris@10 | 87    In the timetabling problem, N teams can all play one another in N-1 | 
| Chris@10 | 88    steps if N is even, and N steps if N is odd.  Here, however, | 
| Chris@10 | 89    there is a "self-communication" step (a team must also "play itself") | 
| Chris@10 | 90    and so we can always make an optimal N-step schedule regardless of N. | 
| Chris@10 | 91 | 
| Chris@10 | 92    However, we have to do more: for a particular processor, the | 
| Chris@10 | 93    communications schedule must be sorted in ascending or descending | 
| Chris@10 | 94    order of processor index.  (This is necessary so that the data | 
| Chris@10 | 95    coming in for the transpose does not overwrite data that will be | 
| Chris@10 | 96    sent later; for that processor the incoming and outgoing blocks are | 
| Chris@10 | 97    of different non-zero sizes.)  Fortunately, because the schedule | 
| Chris@10 | 98    is stall free, each parallel step of the schedule is independent | 
| Chris@10 | 99    of every other step, and we can reorder the steps arbitrarily | 
| Chris@10 | 100    to achieve any desired order on a particular process. | 
| Chris@10 | 101 */ | 
| Chris@10 | 102 | 
| Chris@10 | 103 void free_comm_schedule(int **sched, int npes) | 
| Chris@10 | 104 { | 
| Chris@10 | 105      if (sched) { | 
| Chris@10 | 106 	  int i; | 
| Chris@10 | 107 | 
| Chris@10 | 108 	  for (i = 0; i < npes; ++i) | 
| Chris@10 | 109 	       free(sched[i]); | 
| Chris@10 | 110 	  free(sched); | 
| Chris@10 | 111      } | 
| Chris@10 | 112 } | 
| Chris@10 | 113 | 
| Chris@10 | 114 void empty_comm_schedule(int **sched, int npes) | 
| Chris@10 | 115 { | 
| Chris@10 | 116      int i; | 
| Chris@10 | 117      for (i = 0; i < npes; ++i) | 
| Chris@10 | 118 	  sched[i][0] = -1; | 
| Chris@10 | 119 } | 
| Chris@10 | 120 | 
| Chris@10 | 121 extern void fill_comm_schedule(int **sched, int npes); | 
| Chris@10 | 122 | 
| Chris@10 | 123 /* Create a new communications schedule for a given number of processors. | 
| Chris@10 | 124    The schedule is initialized to a deadlock-free, maximum overlap | 
| Chris@10 | 125    schedule.  Returns NULL on an error (may print a message to | 
| Chris@10 | 126    stderr if there is a program bug detected).  */ | 
| Chris@10 | 127 int **make_comm_schedule(int npes) | 
| Chris@10 | 128 { | 
| Chris@10 | 129      int **sched; | 
| Chris@10 | 130      int i; | 
| Chris@10 | 131 | 
| Chris@10 | 132      sched = (int **) malloc(sizeof(int *) * npes); | 
| Chris@10 | 133      if (!sched) | 
| Chris@10 | 134 	  return NULL; | 
| Chris@10 | 135 | 
| Chris@10 | 136      for (i = 0; i < npes; ++i) | 
| Chris@10 | 137 	  sched[i] = NULL; | 
| Chris@10 | 138 | 
| Chris@10 | 139      for (i = 0; i < npes; ++i) { | 
| Chris@10 | 140 	  sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1)); | 
| Chris@10 | 141 	  if (!sched[i]) { | 
| Chris@10 | 142 	       free_comm_schedule(sched,npes); | 
| Chris@10 | 143 	       return NULL; | 
| Chris@10 | 144 	  } | 
| Chris@10 | 145      } | 
| Chris@10 | 146 | 
| Chris@10 | 147      empty_comm_schedule(sched,npes); | 
| Chris@10 | 148      fill_comm_schedule(sched,npes); | 
| Chris@10 | 149 | 
| Chris@10 | 150      if (!check_comm_schedule(sched,npes)) { | 
| Chris@10 | 151 	  free_comm_schedule(sched,npes); | 
| Chris@10 | 152 	  return NULL; | 
| Chris@10 | 153      } | 
| Chris@10 | 154 | 
| Chris@10 | 155      return sched; | 
| Chris@10 | 156 } | 
| Chris@10 | 157 | 
| Chris@10 | 158 static void add_dest_to_comm_schedule(int **sched, int pe, int dest) | 
| Chris@10 | 159 { | 
| Chris@10 | 160      int i; | 
| Chris@10 | 161 | 
| Chris@10 | 162      for (i = 0; sched[pe][i] != -1; ++i) | 
| Chris@10 | 163 	  ; | 
| Chris@10 | 164 | 
| Chris@10 | 165      sched[pe][i] = dest; | 
| Chris@10 | 166      sched[pe][i+1] = -1; | 
| Chris@10 | 167 } | 
| Chris@10 | 168 | 
| Chris@10 | 169 static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2) | 
| Chris@10 | 170 { | 
| Chris@10 | 171      add_dest_to_comm_schedule(sched, pe1, pe2); | 
| Chris@10 | 172      if (pe1 != pe2) | 
| Chris@10 | 173 	  add_dest_to_comm_schedule(sched, pe2, pe1); | 
| Chris@10 | 174 } | 
| Chris@10 | 175 | 
| Chris@10 | 176 /* Simplification of algorithm presented in [1] (we have fewer | 
| Chris@10 | 177    constraints).  Produces a perfect schedule (npes steps).  */ | 
| Chris@10 | 178 | 
| Chris@10 | 179 void fill_comm_schedule(int **sched, int npes) | 
| Chris@10 | 180 { | 
| Chris@10 | 181      int pe, i, n; | 
| Chris@10 | 182 | 
| Chris@10 | 183      if (npes % 2 == 0) { | 
| Chris@10 | 184 	  n = npes; | 
| Chris@10 | 185 	  for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 186 	       add_pair_to_comm_schedule(sched,pe,pe); | 
| Chris@10 | 187      } | 
| Chris@10 | 188      else | 
| Chris@10 | 189 	  n = npes + 1; | 
| Chris@10 | 190 | 
| Chris@10 | 191      for (pe = 0; pe < n - 1; ++pe) { | 
| Chris@10 | 192 	  add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe); | 
| Chris@10 | 193 | 
| Chris@10 | 194 	  for (i = 1; i < n/2; ++i) { | 
| Chris@10 | 195 	       int pe_a, pe_b; | 
| Chris@10 | 196 | 
| Chris@10 | 197 	       pe_a = pe - i; | 
| Chris@10 | 198 	       if (pe_a < 0) | 
| Chris@10 | 199 		    pe_a += n - 1; | 
| Chris@10 | 200 | 
| Chris@10 | 201 	       pe_b = (pe + i) % (n - 1); | 
| Chris@10 | 202 | 
| Chris@10 | 203 	       add_pair_to_comm_schedule(sched,pe_a,pe_b); | 
| Chris@10 | 204 	  } | 
| Chris@10 | 205      } | 
| Chris@10 | 206 } | 
| Chris@10 | 207 | 
| Chris@10 | 208 /* given an array sched[npes], fills it with the communications | 
| Chris@10 | 209    schedule for process pe. */ | 
| Chris@10 | 210 void fill1_comm_sched(int *sched, int which_pe, int npes) | 
| Chris@10 | 211 { | 
| Chris@10 | 212      int pe, i, n, s = 0; | 
| Chris@10 | 213      if (npes % 2 == 0) { | 
| Chris@10 | 214 	  n = npes; | 
| Chris@10 | 215 	  sched[s++] = which_pe; | 
| Chris@10 | 216      } | 
| Chris@10 | 217      else | 
| Chris@10 | 218 	  n = npes + 1; | 
| Chris@10 | 219      for (pe = 0; pe < n - 1; ++pe) { | 
| Chris@10 | 220 	  if (npes % 2 == 0) { | 
| Chris@10 | 221 	       if (pe == which_pe) sched[s++] = npes - 1; | 
| Chris@10 | 222 	       else if (npes - 1 == which_pe) sched[s++] = pe; | 
| Chris@10 | 223 	  } | 
| Chris@10 | 224 	  else if (pe == which_pe) sched[s++] = pe; | 
| Chris@10 | 225 | 
| Chris@10 | 226 	  if (pe != which_pe && which_pe < n - 1) { | 
| Chris@10 | 227 	       i = (pe - which_pe + (n - 1)) % (n - 1); | 
| Chris@10 | 228 	       if (i < n/2) | 
| Chris@10 | 229 		    sched[s++] = (pe + i) % (n - 1); | 
| Chris@10 | 230 | 
| Chris@10 | 231 	       i = (which_pe - pe + (n - 1)) % (n - 1); | 
| Chris@10 | 232 	       if (i < n/2) | 
| Chris@10 | 233 		    sched[s++] = (pe - i + (n - 1)) % (n - 1); | 
| Chris@10 | 234 	  } | 
| Chris@10 | 235      } | 
| Chris@10 | 236      if (s != npes) { | 
| Chris@10 | 237 	  fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n", | 
| Chris@10 | 238 		  s, which_pe, npes); | 
| Chris@10 | 239 	  exit(EXIT_FAILURE); | 
| Chris@10 | 240      } | 
| Chris@10 | 241 } | 
| Chris@10 | 242 | 
| Chris@10 | 243 /* sort the communication schedule sched for npes so that the schedule | 
| Chris@10 | 244    on process sortpe is ascending or descending (!ascending). */ | 
| Chris@10 | 245 static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending) | 
| Chris@10 | 246 { | 
| Chris@10 | 247      int *sortsched, i; | 
| Chris@10 | 248      sortsched = (int *) malloc(npes * sizeof(int) * 2); | 
| Chris@10 | 249      fill1_comm_sched(sortsched, sortpe, npes); | 
| Chris@10 | 250      if (ascending) | 
| Chris@10 | 251           for (i = 0; i < npes; ++i) | 
| Chris@10 | 252                sortsched[npes + sortsched[i]] = sched[i]; | 
| Chris@10 | 253      else | 
| Chris@10 | 254           for (i = 0; i < npes; ++i) | 
| Chris@10 | 255                sortsched[2*npes - 1 - sortsched[i]] = sched[i]; | 
| Chris@10 | 256      for (i = 0; i < npes; ++i) | 
| Chris@10 | 257           sched[i] = sortsched[npes + i]; | 
| Chris@10 | 258      free(sortsched); | 
| Chris@10 | 259 } | 
| Chris@10 | 260 | 
| Chris@10 | 261 /* Below, we have various checks in case of bugs: */ | 
| Chris@10 | 262 | 
| Chris@10 | 263 /* check for deadlocks by simulating the schedule and looking for | 
| Chris@10 | 264    cycles in the dependency list; returns 0 if there are deadlocks | 
| Chris@10 | 265    (or other errors) */ | 
| Chris@10 | 266 static int check_schedule_deadlock(int **sched, int npes) | 
| Chris@10 | 267 { | 
| Chris@10 | 268      int *step, *depend, *visited, pe, pe2, period, done = 0; | 
| Chris@10 | 269      int counter = 0; | 
| Chris@10 | 270 | 
| Chris@10 | 271      /* step[pe] is the step in the schedule that a given pe is on */ | 
| Chris@10 | 272      step = (int *) malloc(sizeof(int) * npes); | 
| Chris@10 | 273 | 
| Chris@10 | 274      /* depend[pe] is the pe' that pe is currently waiting for a message | 
| Chris@10 | 275 	from (-1 if none) */ | 
| Chris@10 | 276      depend = (int *) malloc(sizeof(int) * npes); | 
| Chris@10 | 277 | 
| Chris@10 | 278      /* visited[pe] tells whether we have visited the current pe already | 
| Chris@10 | 279 	when we are looking for cycles. */ | 
| Chris@10 | 280      visited = (int *) malloc(sizeof(int) * npes); | 
| Chris@10 | 281 | 
| Chris@10 | 282      if (!step || !depend || !visited) { | 
| Chris@10 | 283 	  free(step); free(depend); free(visited); | 
| Chris@10 | 284 	  return 0; | 
| Chris@10 | 285      } | 
| Chris@10 | 286 | 
| Chris@10 | 287      for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 288 	  step[pe] = 0; | 
| Chris@10 | 289 | 
| Chris@10 | 290      while (!done) { | 
| Chris@10 | 291 	  ++counter; | 
| Chris@10 | 292 | 
| Chris@10 | 293 	  for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 294 	       depend[pe] = sched[pe][step[pe]]; | 
| Chris@10 | 295 | 
| Chris@10 | 296 	  /* now look for cycles in the dependencies with period > 2: */ | 
| Chris@10 | 297 	  for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 298 	       if (depend[pe] != -1) { | 
| Chris@10 | 299 		    for (pe2 = 0; pe2 < npes; ++pe2) | 
| Chris@10 | 300 			 visited[pe2] = 0; | 
| Chris@10 | 301 | 
| Chris@10 | 302 		    period = 0; | 
| Chris@10 | 303 		    pe2 = pe; | 
| Chris@10 | 304 		    do { | 
| Chris@10 | 305 			 visited[pe2] = period + 1; | 
| Chris@10 | 306 			 pe2 = depend[pe2]; | 
| Chris@10 | 307 			 period++; | 
| Chris@10 | 308 		    } while (pe2 != -1 && !visited[pe2]); | 
| Chris@10 | 309 | 
| Chris@10 | 310 		    if (pe2 == -1) { | 
| Chris@10 | 311 			 fprintf(stderr, | 
| Chris@10 | 312 				 "BUG: unterminated cycle in schedule!\n"); | 
| Chris@10 | 313 			 free(step); free(depend); | 
| Chris@10 | 314 			 free(visited); | 
| Chris@10 | 315 			 return 0; | 
| Chris@10 | 316 		    } | 
| Chris@10 | 317 		    if (period - (visited[pe2] - 1) > 2) { | 
| Chris@10 | 318 			 fprintf(stderr,"BUG: deadlock in schedule!\n"); | 
| Chris@10 | 319 			 free(step); free(depend); | 
| Chris@10 | 320 			 free(visited); | 
| Chris@10 | 321 			 return 0; | 
| Chris@10 | 322 		    } | 
| Chris@10 | 323 | 
| Chris@10 | 324 		    if (pe2 == pe) | 
| Chris@10 | 325 			 step[pe]++; | 
| Chris@10 | 326 	       } | 
| Chris@10 | 327 | 
| Chris@10 | 328 	  done = 1; | 
| Chris@10 | 329 	  for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 330 	       if (sched[pe][step[pe]] != -1) { | 
| Chris@10 | 331 		    done = 0; | 
| Chris@10 | 332 		    break; | 
| Chris@10 | 333 	       } | 
| Chris@10 | 334      } | 
| Chris@10 | 335 | 
| Chris@10 | 336      free(step); free(depend); free(visited); | 
| Chris@10 | 337      return (counter > 0 ? counter : 1); | 
| Chris@10 | 338 } | 
| Chris@10 | 339 | 
| Chris@10 | 340 /* sanity checks; prints message and returns 0 on failure. | 
| Chris@10 | 341    undocumented feature: the return value on success is actually the | 
| Chris@10 | 342    number of steps required for the schedule to complete, counting | 
| Chris@10 | 343    stalls. */ | 
| Chris@10 | 344 int check_comm_schedule(int **sched, int npes) | 
| Chris@10 | 345 { | 
| Chris@10 | 346      int pe, i, comm_pe; | 
| Chris@10 | 347 | 
| Chris@10 | 348      for (pe = 0; pe < npes; ++pe) { | 
| Chris@10 | 349 	  for (comm_pe = 0; comm_pe < npes; ++comm_pe) { | 
| Chris@10 | 350 	       for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i) | 
| Chris@10 | 351 		    ; | 
| Chris@10 | 352 	       if (sched[pe][i] == -1) { | 
| Chris@10 | 353 		    fprintf(stderr,"BUG: schedule never sends message from " | 
| Chris@10 | 354 			    "%d to %d.\n",pe,comm_pe); | 
| Chris@10 | 355 		    return 0;  /* never send message to comm_pe */ | 
| Chris@10 | 356 	       } | 
| Chris@10 | 357 	  } | 
| Chris@10 | 358 	  for (i = 0; sched[pe][i] != -1; ++i) | 
| Chris@10 | 359 	       ; | 
| Chris@10 | 360 	  if (i != npes) { | 
| Chris@10 | 361 	       fprintf(stderr,"BUG: schedule sends too many messages from " | 
| Chris@10 | 362 		       "%d\n",pe); | 
| Chris@10 | 363 	       return 0; | 
| Chris@10 | 364 	  } | 
| Chris@10 | 365      } | 
| Chris@10 | 366      return check_schedule_deadlock(sched,npes); | 
| Chris@10 | 367 } | 
| Chris@10 | 368 | 
| Chris@10 | 369 /* invert the order of all the schedules; this has no effect on | 
| Chris@10 | 370    its required properties. */ | 
| Chris@10 | 371 void invert_comm_schedule(int **sched, int npes) | 
| Chris@10 | 372 { | 
| Chris@10 | 373      int pe, i; | 
| Chris@10 | 374 | 
| Chris@10 | 375      for (pe = 0; pe < npes; ++pe) | 
| Chris@10 | 376 	  for (i = 0; i < npes/2; ++i) { | 
| Chris@10 | 377 	       int dummy = sched[pe][i]; | 
| Chris@10 | 378 	       sched[pe][i] = sched[pe][npes-1-i]; | 
| Chris@10 | 379 	       sched[pe][npes-1-i] = dummy; | 
| Chris@10 | 380 	  } | 
| Chris@10 | 381 } | 
| Chris@10 | 382 | 
| Chris@10 | 383 /* Sort the schedule for sort_pe in ascending order of processor | 
| Chris@10 | 384    index.  Unfortunately, for odd npes (when schedule has a stall | 
| Chris@10 | 385    to begin with) this will introduce an extra stall due to | 
| Chris@10 | 386    the motion of the self-communication past a stall.  We could | 
| Chris@10 | 387    fix this if it were really important.  Actually, we don't | 
| Chris@10 | 388    get an extra stall when sort_pe == 0 or npes-1, which is sufficient | 
| Chris@10 | 389    for our purposes. */ | 
| Chris@10 | 390 void sort_comm_schedule(int **sched, int npes, int sort_pe) | 
| Chris@10 | 391 { | 
| Chris@10 | 392      int i,j,pe; | 
| Chris@10 | 393 | 
| Chris@10 | 394      /* Note that we can do this sort in O(npes) swaps because we know | 
| Chris@10 | 395 	that the numbers we are sorting are just 0...npes-1.   But we'll | 
| Chris@10 | 396 	just do a bubble sort for simplicity here. */ | 
| Chris@10 | 397 | 
| Chris@10 | 398      for (i = 0; i < npes - 1; ++i) | 
| Chris@10 | 399 	  for (j = i + 1; j < npes; ++j) | 
| Chris@10 | 400 	       if (sched[sort_pe][i] > sched[sort_pe][j]) { | 
| Chris@10 | 401 		    for (pe = 0; pe < npes; ++pe) { | 
| Chris@10 | 402 			 int s = sched[pe][i]; | 
| Chris@10 | 403 			 sched[pe][i] = sched[pe][j]; | 
| Chris@10 | 404 			 sched[pe][j] = s; | 
| Chris@10 | 405 		    } | 
| Chris@10 | 406 	       } | 
| Chris@10 | 407 } | 
| Chris@10 | 408 | 
| Chris@10 | 409 /* print the schedule (for debugging purposes) */ | 
| Chris@10 | 410 void print_comm_schedule(int **sched, int npes) | 
| Chris@10 | 411 { | 
| Chris@10 | 412      int pe, i, width; | 
| Chris@10 | 413 | 
| Chris@10 | 414      if (npes < 10) | 
| Chris@10 | 415 	  width = 1; | 
| Chris@10 | 416      else if (npes < 100) | 
| Chris@10 | 417 	  width = 2; | 
| Chris@10 | 418      else | 
| Chris@10 | 419 	  width = 3; | 
| Chris@10 | 420 | 
| Chris@10 | 421      for (pe = 0; pe < npes; ++pe) { | 
| Chris@10 | 422 	  printf("pe %*d schedule:", width, pe); | 
| Chris@10 | 423 	  for (i = 0; sched[pe][i] != -1; ++i) | 
| Chris@10 | 424 	       printf("  %*d",width,sched[pe][i]); | 
| Chris@10 | 425 	  printf("\n"); | 
| Chris@10 | 426      } | 
| Chris@10 | 427 } | 
| Chris@10 | 428 | 
| Chris@10 | 429 int main(int argc, char **argv) | 
| Chris@10 | 430 { | 
| Chris@10 | 431      int **sched; | 
| Chris@10 | 432      int npes = -1, sortpe = -1, steps, i; | 
| Chris@10 | 433 | 
| Chris@10 | 434      if (argc >= 2) { | 
| Chris@10 | 435 	  npes = atoi(argv[1]); | 
| Chris@10 | 436 	  if (npes <= 0) { | 
| Chris@10 | 437 	       fprintf(stderr,"npes must be positive!"); | 
| Chris@10 | 438 	       return 1; | 
| Chris@10 | 439 	  } | 
| Chris@10 | 440      } | 
| Chris@10 | 441      if (argc >= 3) { | 
| Chris@10 | 442 	  sortpe = atoi(argv[2]); | 
| Chris@10 | 443 	  if (sortpe < 0 || sortpe >= npes) { | 
| Chris@10 | 444 	       fprintf(stderr,"sortpe must be between 0 and npes-1.\n"); | 
| Chris@10 | 445 	       return 1; | 
| Chris@10 | 446 	  } | 
| Chris@10 | 447      } | 
| Chris@10 | 448 | 
| Chris@10 | 449      if (npes != -1) { | 
| Chris@10 | 450 	  printf("Computing schedule for npes = %d:\n",npes); | 
| Chris@10 | 451 	  sched = make_comm_schedule(npes); | 
| Chris@10 | 452 	  if (!sched) { | 
| Chris@10 | 453 	       fprintf(stderr,"Out of memory!"); | 
| Chris@10 | 454 	       return 6; | 
| Chris@10 | 455 	  } | 
| Chris@10 | 456 | 
| Chris@10 | 457 	  if (steps = check_comm_schedule(sched,npes)) | 
| Chris@10 | 458 	       printf("schedule OK (takes %d steps to complete).\n", steps); | 
| Chris@10 | 459 	  else | 
| Chris@10 | 460 	       printf("schedule not OK.\n"); | 
| Chris@10 | 461 | 
| Chris@10 | 462 	  print_comm_schedule(sched, npes); | 
| Chris@10 | 463 | 
| Chris@10 | 464 	  if (sortpe != -1) { | 
| Chris@10 | 465 	       printf("\nRe-creating schedule for pe = %d...\n", sortpe); | 
| Chris@10 | 466 	       int *sched1 = (int*) malloc(sizeof(int) * npes); | 
| Chris@10 | 467 	       for (i = 0; i < npes; ++i) sched1[i] = -1; | 
| Chris@10 | 468 	       fill1_comm_sched(sched1, sortpe, npes); | 
| Chris@10 | 469 	       printf("  ="); | 
| Chris@10 | 470 	       for (i = 0; i < npes; ++i) | 
| Chris@10 | 471 		    printf("  %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3), | 
| Chris@10 | 472 			   sched1[i]); | 
| Chris@10 | 473 	       printf("\n"); | 
| Chris@10 | 474 | 
| Chris@10 | 475 	       printf("\nSorting schedule for sortpe = %d...\n", sortpe); | 
| Chris@10 | 476 	       sort_comm_schedule(sched,npes,sortpe); | 
| Chris@10 | 477 | 
| Chris@10 | 478 	       if (steps = check_comm_schedule(sched,npes)) | 
| Chris@10 | 479 		    printf("schedule OK (takes %d steps to complete).\n", | 
| Chris@10 | 480 			   steps); | 
| Chris@10 | 481 	       else | 
| Chris@10 | 482 		    printf("schedule not OK.\n"); | 
| Chris@10 | 483 | 
| Chris@10 | 484 	       print_comm_schedule(sched, npes); | 
| Chris@10 | 485 | 
| Chris@10 | 486 	       printf("\nInverting schedule...\n"); | 
| Chris@10 | 487 	       invert_comm_schedule(sched,npes); | 
| Chris@10 | 488 | 
| Chris@10 | 489 	       if (steps = check_comm_schedule(sched,npes)) | 
| Chris@10 | 490 		    printf("schedule OK (takes %d steps to complete).\n", | 
| Chris@10 | 491 			   steps); | 
| Chris@10 | 492 	       else | 
| Chris@10 | 493 		    printf("schedule not OK.\n"); | 
| Chris@10 | 494 | 
| Chris@10 | 495 	       print_comm_schedule(sched, npes); | 
| Chris@10 | 496 | 
| Chris@10 | 497 	       free_comm_schedule(sched,npes); | 
| Chris@10 | 498 | 
| Chris@10 | 499 	       free(sched1); | 
| Chris@10 | 500 	  } | 
| Chris@10 | 501      } | 
| Chris@10 | 502      else { | 
| Chris@10 | 503 	  printf("Doing infinite tests...\n"); | 
| Chris@10 | 504 	  for (npes = 1; ; ++npes) { | 
| Chris@10 | 505 	       int *sched1 = (int*) malloc(sizeof(int) * npes); | 
| Chris@10 | 506 	       printf("npes = %d...",npes); | 
| Chris@10 | 507 	       sched = make_comm_schedule(npes); | 
| Chris@10 | 508 	       if (!sched) { | 
| Chris@10 | 509 		    fprintf(stderr,"Out of memory!\n"); | 
| Chris@10 | 510 		    return 5; | 
| Chris@10 | 511 	       } | 
| Chris@10 | 512 	       for (sortpe = 0; sortpe < npes; ++sortpe) { | 
| Chris@10 | 513 		    empty_comm_schedule(sched,npes); | 
| Chris@10 | 514 		    fill_comm_schedule(sched,npes); | 
| Chris@10 | 515 		    if (!check_comm_schedule(sched,npes)) { | 
| Chris@10 | 516 			 fprintf(stderr, | 
| Chris@10 | 517 				 "\n -- fill error for sortpe = %d!\n",sortpe); | 
| Chris@10 | 518 			 return 2; | 
| Chris@10 | 519 		    } | 
| Chris@10 | 520 | 
| Chris@10 | 521 		    for (i = 0; i < npes; ++i) sched1[i] = -1; | 
| Chris@10 | 522 		    fill1_comm_sched(sched1, sortpe, npes); | 
| Chris@10 | 523 		    for (i = 0; i < npes; ++i) | 
| Chris@10 | 524 			 if (sched1[i] != sched[sortpe][i]) | 
| Chris@10 | 525 			      fprintf(stderr, | 
| Chris@10 | 526 				      "\n -- fill1 error for pe = %d!\n", | 
| Chris@10 | 527 				      sortpe); | 
| Chris@10 | 528 | 
| Chris@10 | 529 		    sort_comm_schedule(sched,npes,sortpe); | 
| Chris@10 | 530 		    if (!check_comm_schedule(sched,npes)) { | 
| Chris@10 | 531 			 fprintf(stderr, | 
| Chris@10 | 532 				 "\n -- sort error for sortpe = %d!\n",sortpe); | 
| Chris@10 | 533 			 return 3; | 
| Chris@10 | 534 		    } | 
| Chris@10 | 535 		    invert_comm_schedule(sched,npes); | 
| Chris@10 | 536 		    if (!check_comm_schedule(sched,npes)) { | 
| Chris@10 | 537 			 fprintf(stderr, | 
| Chris@10 | 538 				 "\n -- invert error for sortpe = %d!\n", | 
| Chris@10 | 539 				 sortpe); | 
| Chris@10 | 540 			 return 4; | 
| Chris@10 | 541 		    } | 
| Chris@10 | 542 	       } | 
| Chris@10 | 543 	       free_comm_schedule(sched,npes); | 
| Chris@10 | 544 	       printf("OK\n"); | 
| Chris@10 | 545 	       if (npes % 50 == 0) | 
| Chris@10 | 546 		    printf("(...Hit Ctrl-C to stop...)\n"); | 
| Chris@10 | 547 	       free(sched1); | 
| Chris@10 | 548 	  } | 
| Chris@10 | 549      } | 
| Chris@10 | 550 | 
| Chris@10 | 551      return 0; | 
| Chris@10 | 552 } |