annotate src/fftw-3.3.8/mpi/transpose-recurse.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* Recursive "radix-r" distributed transpose, which breaks a transpose
Chris@82 22 over p processes into p/r transposes over r processes plus r
Chris@82 23 transposes over p/r processes. If performed recursively, this
Chris@82 24 produces a total of O(p log p) messages vs. O(p^2) messages for a
Chris@82 25 direct approach.
Chris@82 26
Chris@82 27 However, this is not necessarily an improvement. The total size of
Chris@82 28 all the messages is actually increased from O(N) to O(N log p)
Chris@82 29 where N is the total data size. Also, the amount of local data
Chris@82 30 rearrangement is increased. So, it's not clear, a priori, what the
Chris@82 31 best algorithm will be, and we'll leave it to the planner. (In
Chris@82 32 theory and practice, it looks like this becomes advantageous for
Chris@82 33 large p, in the limit where the message sizes are small and
Chris@82 34 latency-dominated.)
Chris@82 35 */
Chris@82 36
Chris@82 37 #include "mpi-transpose.h"
Chris@82 38 #include <string.h>
Chris@82 39
Chris@82 40 typedef struct {
Chris@82 41 solver super;
Chris@82 42 int (*radix)(int np);
Chris@82 43 const char *nam;
Chris@82 44 int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
Chris@82 45 } S;
Chris@82 46
Chris@82 47 typedef struct {
Chris@82 48 plan_mpi_transpose super;
Chris@82 49
Chris@82 50 plan *cld1, *cldtr, *cldtm;
Chris@82 51 int preserve_input;
Chris@82 52
Chris@82 53 int r; /* "radix" */
Chris@82 54 const char *nam;
Chris@82 55 } P;
Chris@82 56
Chris@82 57 static void apply(const plan *ego_, R *I, R *O)
Chris@82 58 {
Chris@82 59 const P *ego = (const P *) ego_;
Chris@82 60 plan_rdft *cld1, *cldtr, *cldtm;
Chris@82 61
Chris@82 62 cld1 = (plan_rdft *) ego->cld1;
Chris@82 63 if (cld1) cld1->apply((plan *) cld1, I, O);
Chris@82 64
Chris@82 65 if (ego->preserve_input) I = O;
Chris@82 66
Chris@82 67 cldtr = (plan_rdft *) ego->cldtr;
Chris@82 68 if (cldtr) cldtr->apply((plan *) cldtr, O, I);
Chris@82 69
Chris@82 70 cldtm = (plan_rdft *) ego->cldtm;
Chris@82 71 if (cldtm) cldtm->apply((plan *) cldtm, I, O);
Chris@82 72 }
Chris@82 73
Chris@82 74 static int radix_sqrt(int np)
Chris@82 75 {
Chris@82 76 int r;
Chris@82 77 for (r = (int) (X(isqrt)(np)); np % r != 0; ++r)
Chris@82 78 ;
Chris@82 79 return r;
Chris@82 80 }
Chris@82 81
Chris@82 82 static int radix_first(int np)
Chris@82 83 {
Chris@82 84 int r = (int) (X(first_divisor)(np));
Chris@82 85 return (r >= (int) (X(isqrt)(np)) ? 0 : r);
Chris@82 86 }
Chris@82 87
Chris@82 88 /* the local allocated space on process pe required for the given transpose
Chris@82 89 dimensions and block sizes */
Chris@82 90 static INT transpose_space(INT nx, INT ny, INT block, INT tblock, int pe)
Chris@82 91 {
Chris@82 92 return X(imax)(XM(block)(nx, block, pe) * ny,
Chris@82 93 nx * XM(block)(ny, tblock, pe));
Chris@82 94 }
Chris@82 95
Chris@82 96 /* check whether the recursive transposes fit within the space
Chris@82 97 that must have been allocated on each process for this transpose;
Chris@82 98 this must be modified if the subdivision in mkplan is changed! */
Chris@82 99 static int enough_space(INT nx, INT ny, INT block, INT tblock,
Chris@82 100 int r, int n_pes)
Chris@82 101 {
Chris@82 102 int pe;
Chris@82 103 int m = n_pes / r;
Chris@82 104 for (pe = 0; pe < n_pes; ++pe) {
Chris@82 105 INT space = transpose_space(nx, ny, block, tblock, pe);
Chris@82 106 INT b1 = XM(block)(nx, r * block, pe / r);
Chris@82 107 INT b2 = XM(block)(ny, m * tblock, pe % r);
Chris@82 108 if (transpose_space(b1, ny, block, m*tblock, pe % r) > space
Chris@82 109 || transpose_space(nx, b2, r*block, tblock, pe / r) > space)
Chris@82 110 return 0;
Chris@82 111 }
Chris@82 112 return 1;
Chris@82 113 }
Chris@82 114
Chris@82 115 /* In theory, transpose-recurse becomes advantageous for message sizes
Chris@82 116 below some minimum, assuming that the time is dominated by
Chris@82 117 communications. In practice, we want to constrain the minimum
Chris@82 118 message size for transpose-recurse to keep the planning time down.
Chris@82 119 I've set this conservatively according to some simple experiments
Chris@82 120 on a Cray XT3 where the crossover message size was 128, although on
Chris@82 121 a larger-latency machine the crossover will be larger. */
Chris@82 122 #define SMALL_MESSAGE 2048
Chris@82 123
Chris@82 124 static int applicable(const S *ego, const problem *p_,
Chris@82 125 const planner *plnr, int *r)
Chris@82 126 {
Chris@82 127 const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
Chris@82 128 int n_pes;
Chris@82 129 MPI_Comm_size(p->comm, &n_pes);
Chris@82 130 return (1
Chris@82 131 && p->tblock * n_pes == p->ny
Chris@82 132 && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
Chris@82 133 && p->I != p->O))
Chris@82 134 && (*r = ego->radix(n_pes)) && *r < n_pes && *r > 1
Chris@82 135 && enough_space(p->nx, p->ny, p->block, p->tblock, *r, n_pes)
Chris@82 136 && (!CONSERVE_MEMORYP(plnr) || *r > 8
Chris@82 137 || !X(toobig)((p->nx * (p->ny / n_pes) * p->vn) / *r))
Chris@82 138 && (!NO_SLOWP(plnr) ||
Chris@82 139 (p->nx * (p->ny / n_pes) * p->vn) / n_pes <= SMALL_MESSAGE)
Chris@82 140 && ONLY_TRANSPOSEDP(p->flags)
Chris@82 141 );
Chris@82 142 }
Chris@82 143
Chris@82 144 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@82 145 {
Chris@82 146 P *ego = (P *) ego_;
Chris@82 147 X(plan_awake)(ego->cld1, wakefulness);
Chris@82 148 X(plan_awake)(ego->cldtr, wakefulness);
Chris@82 149 X(plan_awake)(ego->cldtm, wakefulness);
Chris@82 150 }
Chris@82 151
Chris@82 152 static void destroy(plan *ego_)
Chris@82 153 {
Chris@82 154 P *ego = (P *) ego_;
Chris@82 155 X(plan_destroy_internal)(ego->cldtm);
Chris@82 156 X(plan_destroy_internal)(ego->cldtr);
Chris@82 157 X(plan_destroy_internal)(ego->cld1);
Chris@82 158 }
Chris@82 159
Chris@82 160 static void print(const plan *ego_, printer *p)
Chris@82 161 {
Chris@82 162 const P *ego = (const P *) ego_;
Chris@82 163 p->print(p, "(mpi-transpose-recurse/%s/%d%s%(%p%)%(%p%)%(%p%))",
Chris@82 164 ego->nam, ego->r, ego->preserve_input==2 ?"/p":"",
Chris@82 165 ego->cld1, ego->cldtr, ego->cldtm);
Chris@82 166 }
Chris@82 167
Chris@82 168 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
Chris@82 169 {
Chris@82 170 const S *ego = (const S *) ego_;
Chris@82 171 const problem_mpi_transpose *p;
Chris@82 172 P *pln;
Chris@82 173 plan *cld1 = 0, *cldtr = 0, *cldtm = 0;
Chris@82 174 R *I, *O;
Chris@82 175 int me, np, r, m;
Chris@82 176 INT b;
Chris@82 177 MPI_Comm comm2;
Chris@82 178 static const plan_adt padt = {
Chris@82 179 XM(transpose_solve), awake, print, destroy
Chris@82 180 };
Chris@82 181
Chris@82 182 UNUSED(ego);
Chris@82 183
Chris@82 184 if (!applicable(ego, p_, plnr, &r))
Chris@82 185 return (plan *) 0;
Chris@82 186
Chris@82 187 p = (const problem_mpi_transpose *) p_;
Chris@82 188
Chris@82 189 MPI_Comm_size(p->comm, &np);
Chris@82 190 MPI_Comm_rank(p->comm, &me);
Chris@82 191 m = np / r;
Chris@82 192 A(r * m == np);
Chris@82 193
Chris@82 194 I = p->I; O = p->O;
Chris@82 195
Chris@82 196 b = XM(block)(p->nx, p->block, me);
Chris@82 197 A(p->tblock * np == p->ny); /* this is currently required for cld1 */
Chris@82 198 if (p->flags & TRANSPOSED_IN) {
Chris@82 199 /* m x r x (bt x b x vn) -> r x m x (bt x b x vn) */
Chris@82 200 INT vn = p->vn * b * p->tblock;
Chris@82 201 cld1 = X(mkplan_f_d)(plnr,
Chris@82 202 X(mkproblem_rdft_0_d)(X(mktensor_3d)
Chris@82 203 (m, r*vn, vn,
Chris@82 204 r, vn, m*vn,
Chris@82 205 vn, 1, 1),
Chris@82 206 I, O),
Chris@82 207 0, 0, NO_SLOW);
Chris@82 208 }
Chris@82 209 else if (I != O) { /* combine cld1 with TRANSPOSED_IN permutation */
Chris@82 210 /* b x m x r x bt x vn -> r x m x bt x b x vn */
Chris@82 211 INT vn = p->vn;
Chris@82 212 INT bt = p->tblock;
Chris@82 213 cld1 = X(mkplan_f_d)(plnr,
Chris@82 214 X(mkproblem_rdft_0_d)(X(mktensor_5d)
Chris@82 215 (b, m*r*bt*vn, vn,
Chris@82 216 m, r*bt*vn, bt*b*vn,
Chris@82 217 r, bt*vn, m*bt*b*vn,
Chris@82 218 bt, vn, b*vn,
Chris@82 219 vn, 1, 1),
Chris@82 220 I, O),
Chris@82 221 0, 0, NO_SLOW);
Chris@82 222 }
Chris@82 223 else { /* TRANSPOSED_IN permutation must be separate for in-place */
Chris@82 224 /* b x (m x r) x bt x vn -> b x (r x m) x bt x vn */
Chris@82 225 INT vn = p->vn * p->tblock;
Chris@82 226 cld1 = X(mkplan_f_d)(plnr,
Chris@82 227 X(mkproblem_rdft_0_d)(X(mktensor_4d)
Chris@82 228 (m, r*vn, vn,
Chris@82 229 r, vn, m*vn,
Chris@82 230 vn, 1, 1,
Chris@82 231 b, np*vn, np*vn),
Chris@82 232 I, O),
Chris@82 233 0, 0, NO_SLOW);
Chris@82 234 }
Chris@82 235 if (XM(any_true)(!cld1, p->comm)) goto nada;
Chris@82 236
Chris@82 237 if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = O;
Chris@82 238
Chris@82 239 b = XM(block)(p->nx, r * p->block, me / r);
Chris@82 240 MPI_Comm_split(p->comm, me / r, me, &comm2);
Chris@82 241 if (b)
Chris@82 242 cldtr = X(mkplan_d)(plnr, XM(mkproblem_transpose)
Chris@82 243 (b, p->ny, p->vn,
Chris@82 244 O, I, p->block, m * p->tblock, comm2,
Chris@82 245 p->I != p->O
Chris@82 246 ? TRANSPOSED_IN : (p->flags & TRANSPOSED_IN)));
Chris@82 247 MPI_Comm_free(&comm2);
Chris@82 248 if (XM(any_true)(b && !cldtr, p->comm)) goto nada;
Chris@82 249
Chris@82 250 b = XM(block)(p->ny, m * p->tblock, me % r);
Chris@82 251 MPI_Comm_split(p->comm, me % r, me, &comm2);
Chris@82 252 if (b)
Chris@82 253 cldtm = X(mkplan_d)(plnr, XM(mkproblem_transpose)
Chris@82 254 (p->nx, b, p->vn,
Chris@82 255 I, O, r * p->block, p->tblock, comm2,
Chris@82 256 TRANSPOSED_IN | (p->flags & TRANSPOSED_OUT)));
Chris@82 257 MPI_Comm_free(&comm2);
Chris@82 258 if (XM(any_true)(b && !cldtm, p->comm)) goto nada;
Chris@82 259
Chris@82 260 pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
Chris@82 261
Chris@82 262 pln->cld1 = cld1;
Chris@82 263 pln->cldtr = cldtr;
Chris@82 264 pln->cldtm = cldtm;
Chris@82 265 pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
Chris@82 266 pln->r = r;
Chris@82 267 pln->nam = ego->nam;
Chris@82 268
Chris@82 269 pln->super.super.ops = cld1->ops;
Chris@82 270 if (cldtr) X(ops_add2)(&cldtr->ops, &pln->super.super.ops);
Chris@82 271 if (cldtm) X(ops_add2)(&cldtm->ops, &pln->super.super.ops);
Chris@82 272
Chris@82 273 return &(pln->super.super);
Chris@82 274
Chris@82 275 nada:
Chris@82 276 X(plan_destroy_internal)(cldtm);
Chris@82 277 X(plan_destroy_internal)(cldtr);
Chris@82 278 X(plan_destroy_internal)(cld1);
Chris@82 279 return (plan *) 0;
Chris@82 280 }
Chris@82 281
Chris@82 282 static solver *mksolver(int preserve_input,
Chris@82 283 int (*radix)(int np), const char *nam)
Chris@82 284 {
Chris@82 285 static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
Chris@82 286 S *slv = MKSOLVER(S, &sadt);
Chris@82 287 slv->preserve_input = preserve_input;
Chris@82 288 slv->radix = radix;
Chris@82 289 slv->nam = nam;
Chris@82 290 return &(slv->super);
Chris@82 291 }
Chris@82 292
Chris@82 293 void XM(transpose_recurse_register)(planner *p)
Chris@82 294 {
Chris@82 295 int preserve_input;
Chris@82 296 for (preserve_input = 0; preserve_input <= 1; ++preserve_input) {
Chris@82 297 REGISTER_SOLVER(p, mksolver(preserve_input, radix_sqrt, "sqrt"));
Chris@82 298 REGISTER_SOLVER(p, mksolver(preserve_input, radix_first, "first"));
Chris@82 299 }
Chris@82 300 }