Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.3/rdft/vrank3-transpose.c @ 10:37bf6b4a2645
Add FFTW3
| author | Chris Cannam |
|---|---|
| date | Wed, 20 Mar 2013 15:35:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 9:c0fb53affa76 | 10:37bf6b4a2645 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2003, 2007-11 Matteo Frigo | |
| 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology | |
| 4 * | |
| 5 * This program is free software; you can redistribute it and/or modify | |
| 6 * it under the terms of the GNU General Public License as published by | |
| 7 * the Free Software Foundation; either version 2 of the License, or | |
| 8 * (at your option) any later version. | |
| 9 * | |
| 10 * This program is distributed in the hope that it will be useful, | |
| 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 * GNU General Public License for more details. | |
| 14 * | |
| 15 * You should have received a copy of the GNU General Public License | |
| 16 * along with this program; if not, write to the Free Software | |
| 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 18 * | |
| 19 */ | |
| 20 | |
| 21 | |
| 22 /* rank-0, vector-rank-3, non-square in-place transposition | |
| 23 (see rank0.c for square transposition) */ | |
| 24 | |
| 25 #include "rdft.h" | |
| 26 | |
| 27 #ifdef HAVE_STRING_H | |
| 28 #include <string.h> /* for memcpy() */ | |
| 29 #endif | |
| 30 | |
| 31 struct P_s; | |
| 32 | |
| 33 typedef struct { | |
| 34 rdftapply apply; | |
| 35 int (*applicable)(const problem_rdft *p, planner *plnr, | |
| 36 int dim0, int dim1, int dim2, INT *nbuf); | |
| 37 int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego); | |
| 38 const char *nam; | |
| 39 } transpose_adt; | |
| 40 | |
| 41 typedef struct { | |
| 42 solver super; | |
| 43 const transpose_adt *adt; | |
| 44 } S; | |
| 45 | |
| 46 typedef struct P_s { | |
| 47 plan_rdft super; | |
| 48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */ | |
| 49 INT nbuf; /* buffer size */ | |
| 50 INT nd, md, d; /* transpose-gcd params */ | |
| 51 INT nc, mc; /* transpose-cut params */ | |
| 52 plan *cld1, *cld2, *cld3; /* children, null if unused */ | |
| 53 const S *slv; | |
| 54 } P; | |
| 55 | |
| 56 | |
| 57 /*************************************************************************/ | |
| 58 /* some utilities for the solvers */ | |
| 59 | |
| 60 static INT gcd(INT a, INT b) | |
| 61 { | |
| 62 INT r; | |
| 63 do { | |
| 64 r = a % b; | |
| 65 a = b; | |
| 66 b = r; | |
| 67 } while (r != 0); | |
| 68 | |
| 69 return a; | |
| 70 } | |
| 71 | |
| 72 /* whether we can transpose with one of our routines expecting | |
| 73 contiguous Ntuples */ | |
| 74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs) | |
| 75 { | |
| 76 return (vs == 1 && b->is == vl && a->os == vl && | |
| 77 ((a->n == b->n && a->is == b->os | |
| 78 && a->is >= b->n && a->is % vl == 0) | |
| 79 || (a->is == b->n * vl && b->os == a->n * vl))); | |
| 80 } | |
| 81 | |
| 82 /* check whether a and b correspond to the first and second dimensions | |
| 83 of a transpose of tuples with vector length = vl, stride = vs. */ | |
| 84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs) | |
| 85 { | |
| 86 return ((a->n == b->n && a->os == b->is && a->is == b->os) | |
| 87 || Ntuple_transposable(a, b, vl, vs)); | |
| 88 } | |
| 89 | |
| 90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2) | |
| 91 { | |
| 92 int dim0, dim1; | |
| 93 | |
| 94 for (dim0 = 0; dim0 < s->rnk; ++dim0) | |
| 95 for (dim1 = 0; dim1 < s->rnk; ++dim1) { | |
| 96 int dim2 = 3 - dim0 - dim1; | |
| 97 if (dim0 == dim1) continue; | |
| 98 if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os) | |
| 99 && transposable(s->dims + dim0, s->dims + dim1, | |
| 100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n, | |
| 101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) { | |
| 102 *pdim0 = dim0; | |
| 103 *pdim1 = dim1; | |
| 104 *pdim2 = dim2; | |
| 105 return 1; | |
| 106 } | |
| 107 } | |
| 108 return 0; | |
| 109 } | |
| 110 | |
| 111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */ | |
| 112 #define MAXBUF 65536 /* maximum non-ugly buffer */ | |
| 113 | |
| 114 /* generic applicability function */ | |
| 115 static int applicable(const solver *ego_, const problem *p_, planner *plnr, | |
| 116 int *dim0, int *dim1, int *dim2, INT *nbuf) | |
| 117 { | |
| 118 const S *ego = (const S *) ego_; | |
| 119 const problem_rdft *p = (const problem_rdft *) p_; | |
| 120 | |
| 121 return (1 | |
| 122 && p->I == p->O | |
| 123 && p->sz->rnk == 0 | |
| 124 && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3) | |
| 125 | |
| 126 && pickdim(p->vecsz, dim0, dim1, dim2) | |
| 127 | |
| 128 /* UGLY if vecloop in wrong order for locality */ | |
| 129 && (!NO_UGLYP(plnr) || | |
| 130 p->vecsz->rnk == 2 || | |
| 131 X(iabs)(p->vecsz->dims[*dim2].is) | |
| 132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is), | |
| 133 X(iabs)(p->vecsz->dims[*dim0].os))) | |
| 134 | |
| 135 /* SLOW if non-square */ | |
| 136 && (!NO_SLOWP(plnr) | |
| 137 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n) | |
| 138 | |
| 139 && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf) | |
| 140 | |
| 141 /* buffers too big are UGLY */ | |
| 142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr)) | |
| 143 || *nbuf <= MAXBUF | |
| 144 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz)) | |
| 145 ); | |
| 146 } | |
| 147 | |
| 148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs) | |
| 149 { | |
| 150 if (p->vecsz->rnk == 2) { | |
| 151 *vl = 1; *vs = 1; | |
| 152 } | |
| 153 else { | |
| 154 *vl = p->vecsz->dims[dim2].n; | |
| 155 *vs = p->vecsz->dims[dim2].is; /* == os */ | |
| 156 } | |
| 157 } | |
| 158 | |
| 159 /*************************************************************************/ | |
| 160 /* Cache-oblivious in-place transpose of non-square matrices, based | |
| 161 on transposes of blocks given by the gcd of the dimensions. | |
| 162 | |
| 163 This algorithm is related to algorithm V5 from Murray Dow, | |
| 164 "Transposing a matrix on a vector computer," Parallel Computing 21 | |
| 165 (12), 1997-2005 (1995), with the modification that we use | |
| 166 cache-oblivious recursive transpose subroutines (and we derived | |
| 167 it independently). | |
| 168 | |
| 169 For a p x q matrix, this requires scratch space equal to the size | |
| 170 of the matrix divided by gcd(p,q). Alternatively, see also the | |
| 171 "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */ | |
| 172 | |
| 173 static void apply_gcd(const plan *ego_, R *I, R *O) | |
| 174 { | |
| 175 const P *ego = (const P *) ego_; | |
| 176 INT n = ego->nd, m = ego->md, d = ego->d; | |
| 177 INT vl = ego->vl; | |
| 178 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); | |
| 179 INT i, num_el = n*m*d*vl; | |
| 180 | |
| 181 A(ego->n == n * d && ego->m == m * d); | |
| 182 UNUSED(O); | |
| 183 | |
| 184 /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix | |
| 185 of vl-tuples and buf contains n*m*d*vl elements. | |
| 186 | |
| 187 In general, to transpose a p x q matrix, you should call this | |
| 188 routine with d = gcd(p, q), n = p/d, and m = q/d. */ | |
| 189 | |
| 190 A(n > 0 && m > 0 && vl > 0); | |
| 191 A(d > 1); | |
| 192 | |
| 193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */ | |
| 194 | |
| 195 /* First, transpose d x (n x d') x m to d x (d' x n) x m, | |
| 196 using the buf matrix. This consists of d transposes | |
| 197 of contiguous n x d' matrices of m-tuples. */ | |
| 198 if (n > 1) { | |
| 199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply; | |
| 200 for (i = 0; i < d; ++i) { | |
| 201 cldapply(ego->cld1, I + i*num_el, buf); | |
| 202 memcpy(I + i*num_el, buf, num_el*sizeof(R)); | |
| 203 } | |
| 204 } | |
| 205 | |
| 206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which | |
| 207 is a square in-place transpose of n*m-tuples: */ | |
| 208 { | |
| 209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply; | |
| 210 cldapply(ego->cld2, I, I); | |
| 211 } | |
| 212 | |
| 213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)), | |
| 214 using the buf matrix. This consists of d' transposes | |
| 215 of contiguous d*n x m matrices. */ | |
| 216 if (m > 1) { | |
| 217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply; | |
| 218 for (i = 0; i < d; ++i) { | |
| 219 cldapply(ego->cld3, I + i*num_el, buf); | |
| 220 memcpy(I + i*num_el, buf, num_el*sizeof(R)); | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 X(ifree)(buf); | |
| 225 } | |
| 226 | |
| 227 static int applicable_gcd(const problem_rdft *p, planner *plnr, | |
| 228 int dim0, int dim1, int dim2, INT *nbuf) | |
| 229 { | |
| 230 INT n = p->vecsz->dims[dim0].n; | |
| 231 INT m = p->vecsz->dims[dim1].n; | |
| 232 INT d, vl, vs; | |
| 233 get_transpose_vec(p, dim2, &vl, &vs); | |
| 234 d = gcd(n, m); | |
| 235 *nbuf = n * (m / d) * vl; | |
| 236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */ | |
| 237 && n != m | |
| 238 && d > 1 | |
| 239 && Ntuple_transposable(p->vecsz->dims + dim0, | |
| 240 p->vecsz->dims + dim1, | |
| 241 vl, vs)); | |
| 242 } | |
| 243 | |
| 244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego) | |
| 245 { | |
| 246 INT n = ego->nd, m = ego->md, d = ego->d; | |
| 247 INT vl = ego->vl; | |
| 248 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); | |
| 249 INT num_el = n*m*d*vl; | |
| 250 | |
| 251 if (n > 1) { | |
| 252 ego->cld1 = X(mkplan_d)(plnr, | |
| 253 X(mkproblem_rdft_0_d)( | |
| 254 X(mktensor_3d)(n, d*m*vl, m*vl, | |
| 255 d, m*vl, n*m*vl, | |
| 256 m*vl, 1, 1), | |
| 257 TAINT(p->I, num_el), buf)); | |
| 258 if (!ego->cld1) | |
| 259 goto nada; | |
| 260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops, | |
| 261 &ego->super.super.ops); | |
| 262 ego->super.super.ops.other += num_el * d * 2; | |
| 263 } | |
| 264 | |
| 265 ego->cld2 = X(mkplan_d)(plnr, | |
| 266 X(mkproblem_rdft_0_d)( | |
| 267 X(mktensor_3d)(d, d*n*m*vl, n*m*vl, | |
| 268 d, n*m*vl, d*n*m*vl, | |
| 269 n*m*vl, 1, 1), | |
| 270 p->I, p->I)); | |
| 271 if (!ego->cld2) | |
| 272 goto nada; | |
| 273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); | |
| 274 | |
| 275 if (m > 1) { | |
| 276 ego->cld3 = X(mkplan_d)(plnr, | |
| 277 X(mkproblem_rdft_0_d)( | |
| 278 X(mktensor_3d)(d*n, m*vl, vl, | |
| 279 m, vl, d*n*vl, | |
| 280 vl, 1, 1), | |
| 281 TAINT(p->I, num_el), buf)); | |
| 282 if (!ego->cld3) | |
| 283 goto nada; | |
| 284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops); | |
| 285 ego->super.super.ops.other += num_el * d * 2; | |
| 286 } | |
| 287 | |
| 288 X(ifree)(buf); | |
| 289 return 1; | |
| 290 | |
| 291 nada: | |
| 292 X(ifree)(buf); | |
| 293 return 0; | |
| 294 } | |
| 295 | |
| 296 static const transpose_adt adt_gcd = | |
| 297 { | |
| 298 apply_gcd, applicable_gcd, mkcldrn_gcd, | |
| 299 "rdft-transpose-gcd" | |
| 300 }; | |
| 301 | |
| 302 /*************************************************************************/ | |
| 303 /* Cache-oblivious in-place transpose of non-square n x m matrices, | |
| 304 based on transposing a sub-matrix first and then transposing the | |
| 305 remainder(s) with the help of a buffer. See also transpose-gcd, | |
| 306 above, if gcd(n,m) is large. | |
| 307 | |
| 308 This algorithm is related to algorithm V3 from Murray Dow, | |
| 309 "Transposing a matrix on a vector computer," Parallel Computing 21 | |
| 310 (12), 1997-2005 (1995), with the modifications that we use | |
| 311 cache-oblivious recursive transpose subroutines and we have the | |
| 312 generalization for large |n-m| below. | |
| 313 | |
| 314 The best case, and the one described by Dow, is for |n-m| small, in | |
| 315 which case we transpose a square sub-matrix of size min(n,m), | |
| 316 handling the remainder via a buffer. This requires scratch space | |
| 317 equal to the size of the matrix times |n-m| / max(n,m). | |
| 318 | |
| 319 As a generalization when |n-m| is not small, we also support cutting | |
| 320 *both* dimensions to an nc x mc matrix which is *not* necessarily | |
| 321 square, but has a large gcd (and can therefore use transpose-gcd). | |
| 322 */ | |
| 323 | |
| 324 static void apply_cut(const plan *ego_, R *I, R *O) | |
| 325 { | |
| 326 const P *ego = (const P *) ego_; | |
| 327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl; | |
| 328 INT i; | |
| 329 R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); | |
| 330 UNUSED(O); | |
| 331 | |
| 332 if (m > mc) { | |
| 333 ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1); | |
| 334 for (i = 0; i < nc; ++i) | |
| 335 memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl)); | |
| 336 } | |
| 337 | |
| 338 ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */ | |
| 339 | |
| 340 if (n > nc) { | |
| 341 R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */ | |
| 342 memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R)); | |
| 343 for (i = mc-1; i >= 0; --i) | |
| 344 memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl)); | |
| 345 ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl); | |
| 346 } | |
| 347 | |
| 348 if (m > mc) { | |
| 349 if (n > nc) | |
| 350 for (i = mc; i < m; ++i) | |
| 351 memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl), | |
| 352 (nc*vl)*sizeof(R)); | |
| 353 else | |
| 354 memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R)); | |
| 355 } | |
| 356 | |
| 357 X(ifree)(buf1); | |
| 358 } | |
| 359 | |
| 360 /* only cut one dimension if the resulting buffer is small enough */ | |
| 361 static int cut1(INT n, INT m, INT vl) | |
| 362 { | |
| 363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV | |
| 364 || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF); | |
| 365 } | |
| 366 | |
| 367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */ | |
| 368 | |
| 369 static int applicable_cut(const problem_rdft *p, planner *plnr, | |
| 370 int dim0, int dim1, int dim2, INT *nbuf) | |
| 371 { | |
| 372 INT n = p->vecsz->dims[dim0].n; | |
| 373 INT m = p->vecsz->dims[dim1].n; | |
| 374 INT vl, vs; | |
| 375 get_transpose_vec(p, dim2, &vl, &vs); | |
| 376 *nbuf = 0; /* always small enough to be non-UGLY (?) */ | |
| 377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */ | |
| 378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */ | |
| 379 && n != m | |
| 380 | |
| 381 /* Don't call transpose-cut recursively (avoid inf. loops): | |
| 382 the non-square sub-transpose produced when !cut1 | |
| 383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m), | |
| 384 for which transpose-gcd is applicable */ | |
| 385 && (cut1(n, m, vl) | |
| 386 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m))) | |
| 387 | |
| 388 && Ntuple_transposable(p->vecsz->dims + dim0, | |
| 389 p->vecsz->dims + dim1, | |
| 390 vl, vs)); | |
| 391 } | |
| 392 | |
| 393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego) | |
| 394 { | |
| 395 INT n = ego->n, m = ego->m, nc, mc; | |
| 396 INT vl = ego->vl; | |
| 397 R *buf; | |
| 398 | |
| 399 /* pick the "best" cut */ | |
| 400 if (cut1(n, m, vl)) { | |
| 401 nc = mc = X(imin)(n,m); | |
| 402 } | |
| 403 else { | |
| 404 INT dc, ns, ms; | |
| 405 dc = gcd(m, n); nc = n; mc = m; | |
| 406 /* search for cut with largest gcd | |
| 407 (TODO: different optimality criteria? different search range?) */ | |
| 408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) { | |
| 409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) { | |
| 410 INT ds = gcd(ms, ns); | |
| 411 if (ds > dc) { | |
| 412 dc = ds; nc = ns; mc = ms; | |
| 413 if (dc == X(imin)(ns, ms)) | |
| 414 break; /* cannot get larger than this */ | |
| 415 } | |
| 416 } | |
| 417 if (dc == X(imin)(n, ms)) | |
| 418 break; /* cannot get larger than this */ | |
| 419 } | |
| 420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m))); | |
| 421 } | |
| 422 ego->nc = nc; | |
| 423 ego->mc = mc; | |
| 424 ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl); | |
| 425 | |
| 426 buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); | |
| 427 | |
| 428 if (m > mc) { | |
| 429 ego->cld1 = X(mkplan_d)(plnr, | |
| 430 X(mkproblem_rdft_0_d)( | |
| 431 X(mktensor_3d)(nc, m*vl, vl, | |
| 432 m-mc, vl, nc*vl, | |
| 433 vl, 1, 1), | |
| 434 p->I + mc*vl, buf)); | |
| 435 if (!ego->cld1) | |
| 436 goto nada; | |
| 437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops); | |
| 438 } | |
| 439 | |
| 440 ego->cld2 = X(mkplan_d)(plnr, | |
| 441 X(mkproblem_rdft_0_d)( | |
| 442 X(mktensor_3d)(nc, mc*vl, vl, | |
| 443 mc, vl, nc*vl, | |
| 444 vl, 1, 1), | |
| 445 p->I, p->I)); | |
| 446 if (!ego->cld2) | |
| 447 goto nada; | |
| 448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); | |
| 449 | |
| 450 if (n > nc) { | |
| 451 ego->cld3 = X(mkplan_d)(plnr, | |
| 452 X(mkproblem_rdft_0_d)( | |
| 453 X(mktensor_3d)(n-nc, m*vl, vl, | |
| 454 m, vl, n*vl, | |
| 455 vl, 1, 1), | |
| 456 buf + (m-mc)*(nc*vl), p->I + nc*vl)); | |
| 457 if (!ego->cld3) | |
| 458 goto nada; | |
| 459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops); | |
| 460 } | |
| 461 | |
| 462 /* memcpy/memmove operations */ | |
| 463 ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc)) | |
| 464 + (n-nc)*m + (m-mc)*nc); | |
| 465 | |
| 466 X(ifree)(buf); | |
| 467 return 1; | |
| 468 | |
| 469 nada: | |
| 470 X(ifree)(buf); | |
| 471 return 0; | |
| 472 } | |
| 473 | |
| 474 static const transpose_adt adt_cut = | |
| 475 { | |
| 476 apply_cut, applicable_cut, mkcldrn_cut, | |
| 477 "rdft-transpose-cut" | |
| 478 }; | |
| 479 | |
| 480 /*************************************************************************/ | |
| 481 /* In-place transpose routine from TOMS, which follows the cycles of | |
| 482 the permutation so that it writes to each location only once. | |
| 483 Because of cache-line and other issues, however, this routine is | |
| 484 typically much slower than transpose-gcd or transpose-cut, even | |
| 485 though the latter do some extra writes. On the other hand, if the | |
| 486 vector length is large then the TOMS routine is best. | |
| 487 | |
| 488 The TOMS routine also has the advantage of requiring less buffer | |
| 489 space for the case of gcd(nx,ny) small. However, in this case it | |
| 490 has been superseded by the combination of the generalized | |
| 491 transpose-cut method with the transpose-gcd method, which can | |
| 492 always transpose with buffers a small fraction of the array size | |
| 493 regardless of gcd(nx,ny). */ | |
| 494 | |
| 495 /* | |
| 496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380). | |
| 497 * | |
| 498 * These routines do in-place transposes of arrays. | |
| 499 * | |
| 500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, | |
| 501 * vol. 3, no. 1, 104-110 (1977) ] | |
| 502 * | |
| 503 * C version by Steven G. Johnson (February 1997). | |
| 504 */ | |
| 505 | |
| 506 /* | |
| 507 * "a" is a 1D array of length ny*nx*N which constains the nx x ny | |
| 508 * matrix of N-tuples to be transposed. "a" is stored in row-major | |
| 509 * order (last index varies fastest). move is a 1D array of length | |
| 510 * move_size used to store information to speed up the process. The | |
| 511 * value move_size=(ny+nx)/2 is recommended. buf should be an array | |
| 512 * of length 2*N. | |
| 513 * | |
| 514 */ | |
| 515 | |
| 516 static void transpose_toms513(R *a, INT nx, INT ny, INT N, | |
| 517 char *move, INT move_size, R *buf) | |
| 518 { | |
| 519 INT i, im, mn; | |
| 520 R *b, *c, *d; | |
| 521 INT ncount; | |
| 522 INT k; | |
| 523 | |
| 524 /* check arguments and initialize: */ | |
| 525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0); | |
| 526 | |
| 527 b = buf; | |
| 528 | |
| 529 /* Cate & Twigg have a special case for nx == ny, but we don't | |
| 530 bother, since we already have special code for this case elsewhere. */ | |
| 531 | |
| 532 c = buf + N; | |
| 533 ncount = 2; /* always at least 2 fixed points */ | |
| 534 k = (mn = ny * nx) - 1; | |
| 535 | |
| 536 for (i = 0; i < move_size; ++i) | |
| 537 move[i] = 0; | |
| 538 | |
| 539 if (ny >= 3 && nx >= 3) | |
| 540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */ | |
| 541 | |
| 542 i = 1; | |
| 543 im = ny; | |
| 544 | |
| 545 while (1) { | |
| 546 INT i1, i2, i1c, i2c; | |
| 547 INT kmi; | |
| 548 | |
| 549 /** Rearrange the elements of a loop | |
| 550 and its companion loop: **/ | |
| 551 | |
| 552 i1 = i; | |
| 553 kmi = k - i; | |
| 554 i1c = kmi; | |
| 555 switch (N) { | |
| 556 case 1: | |
| 557 b[0] = a[i1]; | |
| 558 c[0] = a[i1c]; | |
| 559 break; | |
| 560 case 2: | |
| 561 b[0] = a[2*i1]; | |
| 562 b[1] = a[2*i1+1]; | |
| 563 c[0] = a[2*i1c]; | |
| 564 c[1] = a[2*i1c+1]; | |
| 565 break; | |
| 566 default: | |
| 567 memcpy(b, &a[N * i1], N * sizeof(R)); | |
| 568 memcpy(c, &a[N * i1c], N * sizeof(R)); | |
| 569 } | |
| 570 while (1) { | |
| 571 i2 = ny * i1 - k * (i1 / nx); | |
| 572 i2c = k - i2; | |
| 573 if (i1 < move_size) | |
| 574 move[i1] = 1; | |
| 575 if (i1c < move_size) | |
| 576 move[i1c] = 1; | |
| 577 ncount += 2; | |
| 578 if (i2 == i) | |
| 579 break; | |
| 580 if (i2 == kmi) { | |
| 581 d = b; | |
| 582 b = c; | |
| 583 c = d; | |
| 584 break; | |
| 585 } | |
| 586 switch (N) { | |
| 587 case 1: | |
| 588 a[i1] = a[i2]; | |
| 589 a[i1c] = a[i2c]; | |
| 590 break; | |
| 591 case 2: | |
| 592 a[2*i1] = a[2*i2]; | |
| 593 a[2*i1+1] = a[2*i2+1]; | |
| 594 a[2*i1c] = a[2*i2c]; | |
| 595 a[2*i1c+1] = a[2*i2c+1]; | |
| 596 break; | |
| 597 default: | |
| 598 memcpy(&a[N * i1], &a[N * i2], | |
| 599 N * sizeof(R)); | |
| 600 memcpy(&a[N * i1c], &a[N * i2c], | |
| 601 N * sizeof(R)); | |
| 602 } | |
| 603 i1 = i2; | |
| 604 i1c = i2c; | |
| 605 } | |
| 606 switch (N) { | |
| 607 case 1: | |
| 608 a[i1] = b[0]; | |
| 609 a[i1c] = c[0]; | |
| 610 break; | |
| 611 case 2: | |
| 612 a[2*i1] = b[0]; | |
| 613 a[2*i1+1] = b[1]; | |
| 614 a[2*i1c] = c[0]; | |
| 615 a[2*i1c+1] = c[1]; | |
| 616 break; | |
| 617 default: | |
| 618 memcpy(&a[N * i1], b, N * sizeof(R)); | |
| 619 memcpy(&a[N * i1c], c, N * sizeof(R)); | |
| 620 } | |
| 621 if (ncount >= mn) | |
| 622 break; /* we've moved all elements */ | |
| 623 | |
| 624 /** Search for loops to rearrange: **/ | |
| 625 | |
| 626 while (1) { | |
| 627 INT max = k - i; | |
| 628 ++i; | |
| 629 A(i <= max); | |
| 630 im += ny; | |
| 631 if (im > k) | |
| 632 im -= k; | |
| 633 i2 = im; | |
| 634 if (i == i2) | |
| 635 continue; | |
| 636 if (i >= move_size) { | |
| 637 while (i2 > i && i2 < max) { | |
| 638 i1 = i2; | |
| 639 i2 = ny * i1 - k * (i1 / nx); | |
| 640 } | |
| 641 if (i2 == i) | |
| 642 break; | |
| 643 } else if (!move[i]) | |
| 644 break; | |
| 645 } | |
| 646 } | |
| 647 } | |
| 648 | |
| 649 static void apply_toms513(const plan *ego_, R *I, R *O) | |
| 650 { | |
| 651 const P *ego = (const P *) ego_; | |
| 652 INT n = ego->n, m = ego->m; | |
| 653 INT vl = ego->vl; | |
| 654 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); | |
| 655 UNUSED(O); | |
| 656 transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf); | |
| 657 X(ifree)(buf); | |
| 658 } | |
| 659 | |
| 660 static int applicable_toms513(const problem_rdft *p, planner *plnr, | |
| 661 int dim0, int dim1, int dim2, INT *nbuf) | |
| 662 { | |
| 663 INT n = p->vecsz->dims[dim0].n; | |
| 664 INT m = p->vecsz->dims[dim1].n; | |
| 665 INT vl, vs; | |
| 666 get_transpose_vec(p, dim2, &vl, &vs); | |
| 667 *nbuf = 2*vl | |
| 668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R); | |
| 669 return (!NO_SLOWP(plnr) | |
| 670 && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */ | |
| 671 && n != m | |
| 672 && Ntuple_transposable(p->vecsz->dims + dim0, | |
| 673 p->vecsz->dims + dim1, | |
| 674 vl, vs)); | |
| 675 } | |
| 676 | |
| 677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego) | |
| 678 { | |
| 679 UNUSED(p); UNUSED(plnr); | |
| 680 /* heuristic so that TOMS algorithm is last resort for small vl */ | |
| 681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30); | |
| 682 return 1; | |
| 683 } | |
| 684 | |
| 685 static const transpose_adt adt_toms513 = | |
| 686 { | |
| 687 apply_toms513, applicable_toms513, mkcldrn_toms513, | |
| 688 "rdft-transpose-toms513" | |
| 689 }; | |
| 690 | |
| 691 /*-----------------------------------------------------------------------*/ | |
| 692 /*-----------------------------------------------------------------------*/ | |
| 693 /* generic stuff: */ | |
| 694 | |
| 695 static void awake(plan *ego_, enum wakefulness wakefulness) | |
| 696 { | |
| 697 P *ego = (P *) ego_; | |
| 698 X(plan_awake)(ego->cld1, wakefulness); | |
| 699 X(plan_awake)(ego->cld2, wakefulness); | |
| 700 X(plan_awake)(ego->cld3, wakefulness); | |
| 701 } | |
| 702 | |
| 703 static void print(const plan *ego_, printer *p) | |
| 704 { | |
| 705 const P *ego = (const P *) ego_; | |
| 706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam, | |
| 707 ego->n, ego->m, ego->vl); | |
| 708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1); | |
| 709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2); | |
| 710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3); | |
| 711 p->print(p, ")"); | |
| 712 } | |
| 713 | |
| 714 static void destroy(plan *ego_) | |
| 715 { | |
| 716 P *ego = (P *) ego_; | |
| 717 X(plan_destroy_internal)(ego->cld3); | |
| 718 X(plan_destroy_internal)(ego->cld2); | |
| 719 X(plan_destroy_internal)(ego->cld1); | |
| 720 } | |
| 721 | |
| 722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) | |
| 723 { | |
| 724 const S *ego = (const S *) ego_; | |
| 725 const problem_rdft *p; | |
| 726 int dim0, dim1, dim2; | |
| 727 INT nbuf, vs; | |
| 728 P *pln; | |
| 729 | |
| 730 static const plan_adt padt = { | |
| 731 X(rdft_solve), awake, print, destroy | |
| 732 }; | |
| 733 | |
| 734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf)) | |
| 735 return (plan *) 0; | |
| 736 | |
| 737 p = (const problem_rdft *) p_; | |
| 738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply); | |
| 739 | |
| 740 pln->n = p->vecsz->dims[dim0].n; | |
| 741 pln->m = p->vecsz->dims[dim1].n; | |
| 742 get_transpose_vec(p, dim2, &pln->vl, &vs); | |
| 743 pln->nbuf = nbuf; | |
| 744 pln->d = gcd(pln->n, pln->m); | |
| 745 pln->nd = pln->n / pln->d; | |
| 746 pln->md = pln->m / pln->d; | |
| 747 pln->slv = ego; | |
| 748 | |
| 749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */ | |
| 750 | |
| 751 pln->cld1 = pln->cld2 = pln->cld3 = 0; | |
| 752 if (!ego->adt->mkcldrn(p, plnr, pln)) { | |
| 753 X(plan_destroy_internal)(&(pln->super.super)); | |
| 754 return 0; | |
| 755 } | |
| 756 | |
| 757 return &(pln->super.super); | |
| 758 } | |
| 759 | |
| 760 static solver *mksolver(const transpose_adt *adt) | |
| 761 { | |
| 762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; | |
| 763 S *slv = MKSOLVER(S, &sadt); | |
| 764 slv->adt = adt; | |
| 765 return &(slv->super); | |
| 766 } | |
| 767 | |
| 768 void X(rdft_vrank3_transpose_register)(planner *p) | |
| 769 { | |
| 770 unsigned i; | |
| 771 static const transpose_adt *const adts[] = { | |
| 772 &adt_gcd, &adt_cut, | |
| 773 &adt_toms513 | |
| 774 }; | |
| 775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i) | |
| 776 REGISTER_SOLVER(p, mksolver(adts[i])); | |
| 777 } |
