annotate src/fftw-3.3.5/rdft/vrank3-transpose.c @ 169:223a55898ab9 tip default

Add null config files
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 02 Mar 2020 14:03:47 +0000
parents 7867fa7e1b6b
children
rev   line source
cannam@127 1 /*
cannam@127 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
cannam@127 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
cannam@127 4 *
cannam@127 5 * This program is free software; you can redistribute it and/or modify
cannam@127 6 * it under the terms of the GNU General Public License as published by
cannam@127 7 * the Free Software Foundation; either version 2 of the License, or
cannam@127 8 * (at your option) any later version.
cannam@127 9 *
cannam@127 10 * This program is distributed in the hope that it will be useful,
cannam@127 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@127 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@127 13 * GNU General Public License for more details.
cannam@127 14 *
cannam@127 15 * You should have received a copy of the GNU General Public License
cannam@127 16 * along with this program; if not, write to the Free Software
cannam@127 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@127 18 *
cannam@127 19 */
cannam@127 20
cannam@127 21
cannam@127 22 /* rank-0, vector-rank-3, non-square in-place transposition
cannam@127 23 (see rank0.c for square transposition) */
cannam@127 24
cannam@127 25 #include "rdft.h"
cannam@127 26
cannam@127 27 #ifdef HAVE_STRING_H
cannam@127 28 #include <string.h> /* for memcpy() */
cannam@127 29 #endif
cannam@127 30
cannam@127 31 struct P_s;
cannam@127 32
cannam@127 33 typedef struct {
cannam@127 34 rdftapply apply;
cannam@127 35 int (*applicable)(const problem_rdft *p, planner *plnr,
cannam@127 36 int dim0, int dim1, int dim2, INT *nbuf);
cannam@127 37 int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
cannam@127 38 const char *nam;
cannam@127 39 } transpose_adt;
cannam@127 40
cannam@127 41 typedef struct {
cannam@127 42 solver super;
cannam@127 43 const transpose_adt *adt;
cannam@127 44 } S;
cannam@127 45
cannam@127 46 typedef struct P_s {
cannam@127 47 plan_rdft super;
cannam@127 48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */
cannam@127 49 INT nbuf; /* buffer size */
cannam@127 50 INT nd, md, d; /* transpose-gcd params */
cannam@127 51 INT nc, mc; /* transpose-cut params */
cannam@127 52 plan *cld1, *cld2, *cld3; /* children, null if unused */
cannam@127 53 const S *slv;
cannam@127 54 } P;
cannam@127 55
cannam@127 56
cannam@127 57 /*************************************************************************/
cannam@127 58 /* some utilities for the solvers */
cannam@127 59
cannam@127 60 static INT gcd(INT a, INT b)
cannam@127 61 {
cannam@127 62 INT r;
cannam@127 63 do {
cannam@127 64 r = a % b;
cannam@127 65 a = b;
cannam@127 66 b = r;
cannam@127 67 } while (r != 0);
cannam@127 68
cannam@127 69 return a;
cannam@127 70 }
cannam@127 71
cannam@127 72 /* whether we can transpose with one of our routines expecting
cannam@127 73 contiguous Ntuples */
cannam@127 74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
cannam@127 75 {
cannam@127 76 return (vs == 1 && b->is == vl && a->os == vl &&
cannam@127 77 ((a->n == b->n && a->is == b->os
cannam@127 78 && a->is >= b->n && a->is % vl == 0)
cannam@127 79 || (a->is == b->n * vl && b->os == a->n * vl)));
cannam@127 80 }
cannam@127 81
cannam@127 82 /* check whether a and b correspond to the first and second dimensions
cannam@127 83 of a transpose of tuples with vector length = vl, stride = vs. */
cannam@127 84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
cannam@127 85 {
cannam@127 86 return ((a->n == b->n && a->os == b->is && a->is == b->os)
cannam@127 87 || Ntuple_transposable(a, b, vl, vs));
cannam@127 88 }
cannam@127 89
cannam@127 90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
cannam@127 91 {
cannam@127 92 int dim0, dim1;
cannam@127 93
cannam@127 94 for (dim0 = 0; dim0 < s->rnk; ++dim0)
cannam@127 95 for (dim1 = 0; dim1 < s->rnk; ++dim1) {
cannam@127 96 int dim2 = 3 - dim0 - dim1;
cannam@127 97 if (dim0 == dim1) continue;
cannam@127 98 if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
cannam@127 99 && transposable(s->dims + dim0, s->dims + dim1,
cannam@127 100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
cannam@127 101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
cannam@127 102 *pdim0 = dim0;
cannam@127 103 *pdim1 = dim1;
cannam@127 104 *pdim2 = dim2;
cannam@127 105 return 1;
cannam@127 106 }
cannam@127 107 }
cannam@127 108 return 0;
cannam@127 109 }
cannam@127 110
cannam@127 111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
cannam@127 112 #define MAXBUF 65536 /* maximum non-ugly buffer */
cannam@127 113
cannam@127 114 /* generic applicability function */
cannam@127 115 static int applicable(const solver *ego_, const problem *p_, planner *plnr,
cannam@127 116 int *dim0, int *dim1, int *dim2, INT *nbuf)
cannam@127 117 {
cannam@127 118 const S *ego = (const S *) ego_;
cannam@127 119 const problem_rdft *p = (const problem_rdft *) p_;
cannam@127 120
cannam@127 121 return (1
cannam@127 122 && p->I == p->O
cannam@127 123 && p->sz->rnk == 0
cannam@127 124 && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
cannam@127 125
cannam@127 126 && pickdim(p->vecsz, dim0, dim1, dim2)
cannam@127 127
cannam@127 128 /* UGLY if vecloop in wrong order for locality */
cannam@127 129 && (!NO_UGLYP(plnr) ||
cannam@127 130 p->vecsz->rnk == 2 ||
cannam@127 131 X(iabs)(p->vecsz->dims[*dim2].is)
cannam@127 132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
cannam@127 133 X(iabs)(p->vecsz->dims[*dim0].os)))
cannam@127 134
cannam@127 135 /* SLOW if non-square */
cannam@127 136 && (!NO_SLOWP(plnr)
cannam@127 137 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
cannam@127 138
cannam@127 139 && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
cannam@127 140
cannam@127 141 /* buffers too big are UGLY */
cannam@127 142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
cannam@127 143 || *nbuf <= MAXBUF
cannam@127 144 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
cannam@127 145 );
cannam@127 146 }
cannam@127 147
cannam@127 148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
cannam@127 149 {
cannam@127 150 if (p->vecsz->rnk == 2) {
cannam@127 151 *vl = 1; *vs = 1;
cannam@127 152 }
cannam@127 153 else {
cannam@127 154 *vl = p->vecsz->dims[dim2].n;
cannam@127 155 *vs = p->vecsz->dims[dim2].is; /* == os */
cannam@127 156 }
cannam@127 157 }
cannam@127 158
cannam@127 159 /*************************************************************************/
cannam@127 160 /* Cache-oblivious in-place transpose of non-square matrices, based
cannam@127 161 on transposes of blocks given by the gcd of the dimensions.
cannam@127 162
cannam@127 163 This algorithm is related to algorithm V5 from Murray Dow,
cannam@127 164 "Transposing a matrix on a vector computer," Parallel Computing 21
cannam@127 165 (12), 1997-2005 (1995), with the modification that we use
cannam@127 166 cache-oblivious recursive transpose subroutines (and we derived
cannam@127 167 it independently).
cannam@127 168
cannam@127 169 For a p x q matrix, this requires scratch space equal to the size
cannam@127 170 of the matrix divided by gcd(p,q). Alternatively, see also the
cannam@127 171 "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
cannam@127 172
cannam@127 173 static void apply_gcd(const plan *ego_, R *I, R *O)
cannam@127 174 {
cannam@127 175 const P *ego = (const P *) ego_;
cannam@127 176 INT n = ego->nd, m = ego->md, d = ego->d;
cannam@127 177 INT vl = ego->vl;
cannam@127 178 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@127 179 INT i, num_el = n*m*d*vl;
cannam@127 180
cannam@127 181 A(ego->n == n * d && ego->m == m * d);
cannam@127 182 UNUSED(O);
cannam@127 183
cannam@127 184 /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
cannam@127 185 of vl-tuples and buf contains n*m*d*vl elements.
cannam@127 186
cannam@127 187 In general, to transpose a p x q matrix, you should call this
cannam@127 188 routine with d = gcd(p, q), n = p/d, and m = q/d. */
cannam@127 189
cannam@127 190 A(n > 0 && m > 0 && vl > 0);
cannam@127 191 A(d > 1);
cannam@127 192
cannam@127 193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */
cannam@127 194
cannam@127 195 /* First, transpose d x (n x d') x m to d x (d' x n) x m,
cannam@127 196 using the buf matrix. This consists of d transposes
cannam@127 197 of contiguous n x d' matrices of m-tuples. */
cannam@127 198 if (n > 1) {
cannam@127 199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
cannam@127 200 for (i = 0; i < d; ++i) {
cannam@127 201 cldapply(ego->cld1, I + i*num_el, buf);
cannam@127 202 memcpy(I + i*num_el, buf, num_el*sizeof(R));
cannam@127 203 }
cannam@127 204 }
cannam@127 205
cannam@127 206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
cannam@127 207 is a square in-place transpose of n*m-tuples: */
cannam@127 208 {
cannam@127 209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
cannam@127 210 cldapply(ego->cld2, I, I);
cannam@127 211 }
cannam@127 212
cannam@127 213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
cannam@127 214 using the buf matrix. This consists of d' transposes
cannam@127 215 of contiguous d*n x m matrices. */
cannam@127 216 if (m > 1) {
cannam@127 217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
cannam@127 218 for (i = 0; i < d; ++i) {
cannam@127 219 cldapply(ego->cld3, I + i*num_el, buf);
cannam@127 220 memcpy(I + i*num_el, buf, num_el*sizeof(R));
cannam@127 221 }
cannam@127 222 }
cannam@127 223
cannam@127 224 X(ifree)(buf);
cannam@127 225 }
cannam@127 226
cannam@127 227 static int applicable_gcd(const problem_rdft *p, planner *plnr,
cannam@127 228 int dim0, int dim1, int dim2, INT *nbuf)
cannam@127 229 {
cannam@127 230 INT n = p->vecsz->dims[dim0].n;
cannam@127 231 INT m = p->vecsz->dims[dim1].n;
cannam@127 232 INT d, vl, vs;
cannam@127 233 get_transpose_vec(p, dim2, &vl, &vs);
cannam@127 234 d = gcd(n, m);
cannam@127 235 *nbuf = n * (m / d) * vl;
cannam@127 236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
cannam@127 237 && n != m
cannam@127 238 && d > 1
cannam@127 239 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@127 240 p->vecsz->dims + dim1,
cannam@127 241 vl, vs));
cannam@127 242 }
cannam@127 243
cannam@127 244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
cannam@127 245 {
cannam@127 246 INT n = ego->nd, m = ego->md, d = ego->d;
cannam@127 247 INT vl = ego->vl;
cannam@127 248 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@127 249 INT num_el = n*m*d*vl;
cannam@127 250
cannam@127 251 if (n > 1) {
cannam@127 252 ego->cld1 = X(mkplan_d)(plnr,
cannam@127 253 X(mkproblem_rdft_0_d)(
cannam@127 254 X(mktensor_3d)(n, d*m*vl, m*vl,
cannam@127 255 d, m*vl, n*m*vl,
cannam@127 256 m*vl, 1, 1),
cannam@127 257 TAINT(p->I, num_el), buf));
cannam@127 258 if (!ego->cld1)
cannam@127 259 goto nada;
cannam@127 260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
cannam@127 261 &ego->super.super.ops);
cannam@127 262 ego->super.super.ops.other += num_el * d * 2;
cannam@127 263 }
cannam@127 264
cannam@127 265 ego->cld2 = X(mkplan_d)(plnr,
cannam@127 266 X(mkproblem_rdft_0_d)(
cannam@127 267 X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
cannam@127 268 d, n*m*vl, d*n*m*vl,
cannam@127 269 n*m*vl, 1, 1),
cannam@127 270 p->I, p->I));
cannam@127 271 if (!ego->cld2)
cannam@127 272 goto nada;
cannam@127 273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
cannam@127 274
cannam@127 275 if (m > 1) {
cannam@127 276 ego->cld3 = X(mkplan_d)(plnr,
cannam@127 277 X(mkproblem_rdft_0_d)(
cannam@127 278 X(mktensor_3d)(d*n, m*vl, vl,
cannam@127 279 m, vl, d*n*vl,
cannam@127 280 vl, 1, 1),
cannam@127 281 TAINT(p->I, num_el), buf));
cannam@127 282 if (!ego->cld3)
cannam@127 283 goto nada;
cannam@127 284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
cannam@127 285 ego->super.super.ops.other += num_el * d * 2;
cannam@127 286 }
cannam@127 287
cannam@127 288 X(ifree)(buf);
cannam@127 289 return 1;
cannam@127 290
cannam@127 291 nada:
cannam@127 292 X(ifree)(buf);
cannam@127 293 return 0;
cannam@127 294 }
cannam@127 295
cannam@127 296 static const transpose_adt adt_gcd =
cannam@127 297 {
cannam@127 298 apply_gcd, applicable_gcd, mkcldrn_gcd,
cannam@127 299 "rdft-transpose-gcd"
cannam@127 300 };
cannam@127 301
cannam@127 302 /*************************************************************************/
cannam@127 303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
cannam@127 304 based on transposing a sub-matrix first and then transposing the
cannam@127 305 remainder(s) with the help of a buffer. See also transpose-gcd,
cannam@127 306 above, if gcd(n,m) is large.
cannam@127 307
cannam@127 308 This algorithm is related to algorithm V3 from Murray Dow,
cannam@127 309 "Transposing a matrix on a vector computer," Parallel Computing 21
cannam@127 310 (12), 1997-2005 (1995), with the modifications that we use
cannam@127 311 cache-oblivious recursive transpose subroutines and we have the
cannam@127 312 generalization for large |n-m| below.
cannam@127 313
cannam@127 314 The best case, and the one described by Dow, is for |n-m| small, in
cannam@127 315 which case we transpose a square sub-matrix of size min(n,m),
cannam@127 316 handling the remainder via a buffer. This requires scratch space
cannam@127 317 equal to the size of the matrix times |n-m| / max(n,m).
cannam@127 318
cannam@127 319 As a generalization when |n-m| is not small, we also support cutting
cannam@127 320 *both* dimensions to an nc x mc matrix which is *not* necessarily
cannam@127 321 square, but has a large gcd (and can therefore use transpose-gcd).
cannam@127 322 */
cannam@127 323
cannam@127 324 static void apply_cut(const plan *ego_, R *I, R *O)
cannam@127 325 {
cannam@127 326 const P *ego = (const P *) ego_;
cannam@127 327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
cannam@127 328 INT i;
cannam@127 329 R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@127 330 UNUSED(O);
cannam@127 331
cannam@127 332 if (m > mc) {
cannam@127 333 ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
cannam@127 334 for (i = 0; i < nc; ++i)
cannam@127 335 memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
cannam@127 336 }
cannam@127 337
cannam@127 338 ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
cannam@127 339
cannam@127 340 if (n > nc) {
cannam@127 341 R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
cannam@127 342 memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
cannam@127 343 for (i = mc-1; i >= 0; --i)
cannam@127 344 memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
cannam@127 345 ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
cannam@127 346 }
cannam@127 347
cannam@127 348 if (m > mc) {
cannam@127 349 if (n > nc)
cannam@127 350 for (i = mc; i < m; ++i)
cannam@127 351 memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
cannam@127 352 (nc*vl)*sizeof(R));
cannam@127 353 else
cannam@127 354 memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
cannam@127 355 }
cannam@127 356
cannam@127 357 X(ifree)(buf1);
cannam@127 358 }
cannam@127 359
cannam@127 360 /* only cut one dimension if the resulting buffer is small enough */
cannam@127 361 static int cut1(INT n, INT m, INT vl)
cannam@127 362 {
cannam@127 363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
cannam@127 364 || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
cannam@127 365 }
cannam@127 366
cannam@127 367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
cannam@127 368
cannam@127 369 static int applicable_cut(const problem_rdft *p, planner *plnr,
cannam@127 370 int dim0, int dim1, int dim2, INT *nbuf)
cannam@127 371 {
cannam@127 372 INT n = p->vecsz->dims[dim0].n;
cannam@127 373 INT m = p->vecsz->dims[dim1].n;
cannam@127 374 INT vl, vs;
cannam@127 375 get_transpose_vec(p, dim2, &vl, &vs);
cannam@127 376 *nbuf = 0; /* always small enough to be non-UGLY (?) */
cannam@127 377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
cannam@127 378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
cannam@127 379 && n != m
cannam@127 380
cannam@127 381 /* Don't call transpose-cut recursively (avoid inf. loops):
cannam@127 382 the non-square sub-transpose produced when !cut1
cannam@127 383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
cannam@127 384 for which transpose-gcd is applicable */
cannam@127 385 && (cut1(n, m, vl)
cannam@127 386 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
cannam@127 387
cannam@127 388 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@127 389 p->vecsz->dims + dim1,
cannam@127 390 vl, vs));
cannam@127 391 }
cannam@127 392
cannam@127 393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
cannam@127 394 {
cannam@127 395 INT n = ego->n, m = ego->m, nc, mc;
cannam@127 396 INT vl = ego->vl;
cannam@127 397 R *buf;
cannam@127 398
cannam@127 399 /* pick the "best" cut */
cannam@127 400 if (cut1(n, m, vl)) {
cannam@127 401 nc = mc = X(imin)(n,m);
cannam@127 402 }
cannam@127 403 else {
cannam@127 404 INT dc, ns, ms;
cannam@127 405 dc = gcd(m, n); nc = n; mc = m;
cannam@127 406 /* search for cut with largest gcd
cannam@127 407 (TODO: different optimality criteria? different search range?) */
cannam@127 408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
cannam@127 409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
cannam@127 410 INT ds = gcd(ms, ns);
cannam@127 411 if (ds > dc) {
cannam@127 412 dc = ds; nc = ns; mc = ms;
cannam@127 413 if (dc == X(imin)(ns, ms))
cannam@127 414 break; /* cannot get larger than this */
cannam@127 415 }
cannam@127 416 }
cannam@127 417 if (dc == X(imin)(n, ms))
cannam@127 418 break; /* cannot get larger than this */
cannam@127 419 }
cannam@127 420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
cannam@127 421 }
cannam@127 422 ego->nc = nc;
cannam@127 423 ego->mc = mc;
cannam@127 424 ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
cannam@127 425
cannam@127 426 buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@127 427
cannam@127 428 if (m > mc) {
cannam@127 429 ego->cld1 = X(mkplan_d)(plnr,
cannam@127 430 X(mkproblem_rdft_0_d)(
cannam@127 431 X(mktensor_3d)(nc, m*vl, vl,
cannam@127 432 m-mc, vl, nc*vl,
cannam@127 433 vl, 1, 1),
cannam@127 434 p->I + mc*vl, buf));
cannam@127 435 if (!ego->cld1)
cannam@127 436 goto nada;
cannam@127 437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
cannam@127 438 }
cannam@127 439
cannam@127 440 ego->cld2 = X(mkplan_d)(plnr,
cannam@127 441 X(mkproblem_rdft_0_d)(
cannam@127 442 X(mktensor_3d)(nc, mc*vl, vl,
cannam@127 443 mc, vl, nc*vl,
cannam@127 444 vl, 1, 1),
cannam@127 445 p->I, p->I));
cannam@127 446 if (!ego->cld2)
cannam@127 447 goto nada;
cannam@127 448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
cannam@127 449
cannam@127 450 if (n > nc) {
cannam@127 451 ego->cld3 = X(mkplan_d)(plnr,
cannam@127 452 X(mkproblem_rdft_0_d)(
cannam@127 453 X(mktensor_3d)(n-nc, m*vl, vl,
cannam@127 454 m, vl, n*vl,
cannam@127 455 vl, 1, 1),
cannam@127 456 buf + (m-mc)*(nc*vl), p->I + nc*vl));
cannam@127 457 if (!ego->cld3)
cannam@127 458 goto nada;
cannam@127 459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
cannam@127 460 }
cannam@127 461
cannam@127 462 /* memcpy/memmove operations */
cannam@127 463 ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
cannam@127 464 + (n-nc)*m + (m-mc)*nc);
cannam@127 465
cannam@127 466 X(ifree)(buf);
cannam@127 467 return 1;
cannam@127 468
cannam@127 469 nada:
cannam@127 470 X(ifree)(buf);
cannam@127 471 return 0;
cannam@127 472 }
cannam@127 473
cannam@127 474 static const transpose_adt adt_cut =
cannam@127 475 {
cannam@127 476 apply_cut, applicable_cut, mkcldrn_cut,
cannam@127 477 "rdft-transpose-cut"
cannam@127 478 };
cannam@127 479
cannam@127 480 /*************************************************************************/
cannam@127 481 /* In-place transpose routine from TOMS, which follows the cycles of
cannam@127 482 the permutation so that it writes to each location only once.
cannam@127 483 Because of cache-line and other issues, however, this routine is
cannam@127 484 typically much slower than transpose-gcd or transpose-cut, even
cannam@127 485 though the latter do some extra writes. On the other hand, if the
cannam@127 486 vector length is large then the TOMS routine is best.
cannam@127 487
cannam@127 488 The TOMS routine also has the advantage of requiring less buffer
cannam@127 489 space for the case of gcd(nx,ny) small. However, in this case it
cannam@127 490 has been superseded by the combination of the generalized
cannam@127 491 transpose-cut method with the transpose-gcd method, which can
cannam@127 492 always transpose with buffers a small fraction of the array size
cannam@127 493 regardless of gcd(nx,ny). */
cannam@127 494
cannam@127 495 /*
cannam@127 496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380).
cannam@127 497 *
cannam@127 498 * These routines do in-place transposes of arrays.
cannam@127 499 *
cannam@127 500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
cannam@127 501 * vol. 3, no. 1, 104-110 (1977) ]
cannam@127 502 *
cannam@127 503 * C version by Steven G. Johnson (February 1997).
cannam@127 504 */
cannam@127 505
cannam@127 506 /*
cannam@127 507 * "a" is a 1D array of length ny*nx*N which contains the nx x ny
cannam@127 508 * matrix of N-tuples to be transposed. "a" is stored in row-major
cannam@127 509 * order (last index varies fastest). move is a 1D array of length
cannam@127 510 * move_size used to store information to speed up the process. The
cannam@127 511 * value move_size=(ny+nx)/2 is recommended. buf should be an array
cannam@127 512 * of length 2*N.
cannam@127 513 *
cannam@127 514 */
cannam@127 515
cannam@127 516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
cannam@127 517 char *move, INT move_size, R *buf)
cannam@127 518 {
cannam@127 519 INT i, im, mn;
cannam@127 520 R *b, *c, *d;
cannam@127 521 INT ncount;
cannam@127 522 INT k;
cannam@127 523
cannam@127 524 /* check arguments and initialize: */
cannam@127 525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
cannam@127 526
cannam@127 527 b = buf;
cannam@127 528
cannam@127 529 /* Cate & Twigg have a special case for nx == ny, but we don't
cannam@127 530 bother, since we already have special code for this case elsewhere. */
cannam@127 531
cannam@127 532 c = buf + N;
cannam@127 533 ncount = 2; /* always at least 2 fixed points */
cannam@127 534 k = (mn = ny * nx) - 1;
cannam@127 535
cannam@127 536 for (i = 0; i < move_size; ++i)
cannam@127 537 move[i] = 0;
cannam@127 538
cannam@127 539 if (ny >= 3 && nx >= 3)
cannam@127 540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
cannam@127 541
cannam@127 542 i = 1;
cannam@127 543 im = ny;
cannam@127 544
cannam@127 545 while (1) {
cannam@127 546 INT i1, i2, i1c, i2c;
cannam@127 547 INT kmi;
cannam@127 548
cannam@127 549 /** Rearrange the elements of a loop
cannam@127 550 and its companion loop: **/
cannam@127 551
cannam@127 552 i1 = i;
cannam@127 553 kmi = k - i;
cannam@127 554 i1c = kmi;
cannam@127 555 switch (N) {
cannam@127 556 case 1:
cannam@127 557 b[0] = a[i1];
cannam@127 558 c[0] = a[i1c];
cannam@127 559 break;
cannam@127 560 case 2:
cannam@127 561 b[0] = a[2*i1];
cannam@127 562 b[1] = a[2*i1+1];
cannam@127 563 c[0] = a[2*i1c];
cannam@127 564 c[1] = a[2*i1c+1];
cannam@127 565 break;
cannam@127 566 default:
cannam@127 567 memcpy(b, &a[N * i1], N * sizeof(R));
cannam@127 568 memcpy(c, &a[N * i1c], N * sizeof(R));
cannam@127 569 }
cannam@127 570 while (1) {
cannam@127 571 i2 = ny * i1 - k * (i1 / nx);
cannam@127 572 i2c = k - i2;
cannam@127 573 if (i1 < move_size)
cannam@127 574 move[i1] = 1;
cannam@127 575 if (i1c < move_size)
cannam@127 576 move[i1c] = 1;
cannam@127 577 ncount += 2;
cannam@127 578 if (i2 == i)
cannam@127 579 break;
cannam@127 580 if (i2 == kmi) {
cannam@127 581 d = b;
cannam@127 582 b = c;
cannam@127 583 c = d;
cannam@127 584 break;
cannam@127 585 }
cannam@127 586 switch (N) {
cannam@127 587 case 1:
cannam@127 588 a[i1] = a[i2];
cannam@127 589 a[i1c] = a[i2c];
cannam@127 590 break;
cannam@127 591 case 2:
cannam@127 592 a[2*i1] = a[2*i2];
cannam@127 593 a[2*i1+1] = a[2*i2+1];
cannam@127 594 a[2*i1c] = a[2*i2c];
cannam@127 595 a[2*i1c+1] = a[2*i2c+1];
cannam@127 596 break;
cannam@127 597 default:
cannam@127 598 memcpy(&a[N * i1], &a[N * i2],
cannam@127 599 N * sizeof(R));
cannam@127 600 memcpy(&a[N * i1c], &a[N * i2c],
cannam@127 601 N * sizeof(R));
cannam@127 602 }
cannam@127 603 i1 = i2;
cannam@127 604 i1c = i2c;
cannam@127 605 }
cannam@127 606 switch (N) {
cannam@127 607 case 1:
cannam@127 608 a[i1] = b[0];
cannam@127 609 a[i1c] = c[0];
cannam@127 610 break;
cannam@127 611 case 2:
cannam@127 612 a[2*i1] = b[0];
cannam@127 613 a[2*i1+1] = b[1];
cannam@127 614 a[2*i1c] = c[0];
cannam@127 615 a[2*i1c+1] = c[1];
cannam@127 616 break;
cannam@127 617 default:
cannam@127 618 memcpy(&a[N * i1], b, N * sizeof(R));
cannam@127 619 memcpy(&a[N * i1c], c, N * sizeof(R));
cannam@127 620 }
cannam@127 621 if (ncount >= mn)
cannam@127 622 break; /* we've moved all elements */
cannam@127 623
cannam@127 624 /** Search for loops to rearrange: **/
cannam@127 625
cannam@127 626 while (1) {
cannam@127 627 INT max = k - i;
cannam@127 628 ++i;
cannam@127 629 A(i <= max);
cannam@127 630 im += ny;
cannam@127 631 if (im > k)
cannam@127 632 im -= k;
cannam@127 633 i2 = im;
cannam@127 634 if (i == i2)
cannam@127 635 continue;
cannam@127 636 if (i >= move_size) {
cannam@127 637 while (i2 > i && i2 < max) {
cannam@127 638 i1 = i2;
cannam@127 639 i2 = ny * i1 - k * (i1 / nx);
cannam@127 640 }
cannam@127 641 if (i2 == i)
cannam@127 642 break;
cannam@127 643 } else if (!move[i])
cannam@127 644 break;
cannam@127 645 }
cannam@127 646 }
cannam@127 647 }
cannam@127 648
cannam@127 649 static void apply_toms513(const plan *ego_, R *I, R *O)
cannam@127 650 {
cannam@127 651 const P *ego = (const P *) ego_;
cannam@127 652 INT n = ego->n, m = ego->m;
cannam@127 653 INT vl = ego->vl;
cannam@127 654 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@127 655 UNUSED(O);
cannam@127 656 transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
cannam@127 657 X(ifree)(buf);
cannam@127 658 }
cannam@127 659
cannam@127 660 static int applicable_toms513(const problem_rdft *p, planner *plnr,
cannam@127 661 int dim0, int dim1, int dim2, INT *nbuf)
cannam@127 662 {
cannam@127 663 INT n = p->vecsz->dims[dim0].n;
cannam@127 664 INT m = p->vecsz->dims[dim1].n;
cannam@127 665 INT vl, vs;
cannam@127 666 get_transpose_vec(p, dim2, &vl, &vs);
cannam@127 667 *nbuf = 2*vl
cannam@127 668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
cannam@127 669 return (!NO_SLOWP(plnr)
cannam@127 670 && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
cannam@127 671 && n != m
cannam@127 672 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@127 673 p->vecsz->dims + dim1,
cannam@127 674 vl, vs));
cannam@127 675 }
cannam@127 676
cannam@127 677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
cannam@127 678 {
cannam@127 679 UNUSED(p); UNUSED(plnr);
cannam@127 680 /* heuristic so that TOMS algorithm is last resort for small vl */
cannam@127 681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
cannam@127 682 return 1;
cannam@127 683 }
cannam@127 684
cannam@127 685 static const transpose_adt adt_toms513 =
cannam@127 686 {
cannam@127 687 apply_toms513, applicable_toms513, mkcldrn_toms513,
cannam@127 688 "rdft-transpose-toms513"
cannam@127 689 };
cannam@127 690
cannam@127 691 /*-----------------------------------------------------------------------*/
cannam@127 692 /*-----------------------------------------------------------------------*/
cannam@127 693 /* generic stuff: */
cannam@127 694
cannam@127 695 static void awake(plan *ego_, enum wakefulness wakefulness)
cannam@127 696 {
cannam@127 697 P *ego = (P *) ego_;
cannam@127 698 X(plan_awake)(ego->cld1, wakefulness);
cannam@127 699 X(plan_awake)(ego->cld2, wakefulness);
cannam@127 700 X(plan_awake)(ego->cld3, wakefulness);
cannam@127 701 }
cannam@127 702
cannam@127 703 static void print(const plan *ego_, printer *p)
cannam@127 704 {
cannam@127 705 const P *ego = (const P *) ego_;
cannam@127 706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
cannam@127 707 ego->n, ego->m, ego->vl);
cannam@127 708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
cannam@127 709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
cannam@127 710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
cannam@127 711 p->print(p, ")");
cannam@127 712 }
cannam@127 713
cannam@127 714 static void destroy(plan *ego_)
cannam@127 715 {
cannam@127 716 P *ego = (P *) ego_;
cannam@127 717 X(plan_destroy_internal)(ego->cld3);
cannam@127 718 X(plan_destroy_internal)(ego->cld2);
cannam@127 719 X(plan_destroy_internal)(ego->cld1);
cannam@127 720 }
cannam@127 721
cannam@127 722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
cannam@127 723 {
cannam@127 724 const S *ego = (const S *) ego_;
cannam@127 725 const problem_rdft *p;
cannam@127 726 int dim0, dim1, dim2;
cannam@127 727 INT nbuf, vs;
cannam@127 728 P *pln;
cannam@127 729
cannam@127 730 static const plan_adt padt = {
cannam@127 731 X(rdft_solve), awake, print, destroy
cannam@127 732 };
cannam@127 733
cannam@127 734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
cannam@127 735 return (plan *) 0;
cannam@127 736
cannam@127 737 p = (const problem_rdft *) p_;
cannam@127 738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
cannam@127 739
cannam@127 740 pln->n = p->vecsz->dims[dim0].n;
cannam@127 741 pln->m = p->vecsz->dims[dim1].n;
cannam@127 742 get_transpose_vec(p, dim2, &pln->vl, &vs);
cannam@127 743 pln->nbuf = nbuf;
cannam@127 744 pln->d = gcd(pln->n, pln->m);
cannam@127 745 pln->nd = pln->n / pln->d;
cannam@127 746 pln->md = pln->m / pln->d;
cannam@127 747 pln->slv = ego;
cannam@127 748
cannam@127 749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
cannam@127 750
cannam@127 751 pln->cld1 = pln->cld2 = pln->cld3 = 0;
cannam@127 752 if (!ego->adt->mkcldrn(p, plnr, pln)) {
cannam@127 753 X(plan_destroy_internal)(&(pln->super.super));
cannam@127 754 return 0;
cannam@127 755 }
cannam@127 756
cannam@127 757 return &(pln->super.super);
cannam@127 758 }
cannam@127 759
cannam@127 760 static solver *mksolver(const transpose_adt *adt)
cannam@127 761 {
cannam@127 762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
cannam@127 763 S *slv = MKSOLVER(S, &sadt);
cannam@127 764 slv->adt = adt;
cannam@127 765 return &(slv->super);
cannam@127 766 }
cannam@127 767
cannam@127 768 void X(rdft_vrank3_transpose_register)(planner *p)
cannam@127 769 {
cannam@127 770 unsigned i;
cannam@127 771 static const transpose_adt *const adts[] = {
cannam@127 772 &adt_gcd, &adt_cut,
cannam@127 773 &adt_toms513
cannam@127 774 };
cannam@127 775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
cannam@127 776 REGISTER_SOLVER(p, mksolver(adts[i]));
cannam@127 777 }