annotate src/fftw-3.3.8/rdft/vrank3-transpose.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21
Chris@82 22 /* rank-0, vector-rank-3, non-square in-place transposition
Chris@82 23 (see rank0.c for square transposition) */
Chris@82 24
Chris@82 25 #include "rdft/rdft.h"
Chris@82 26
Chris@82 27 #ifdef HAVE_STRING_H
Chris@82 28 #include <string.h> /* for memcpy() */
Chris@82 29 #endif
Chris@82 30
Chris@82 31 struct P_s;
Chris@82 32
Chris@82 33 typedef struct {
Chris@82 34 rdftapply apply;
Chris@82 35 int (*applicable)(const problem_rdft *p, planner *plnr,
Chris@82 36 int dim0, int dim1, int dim2, INT *nbuf);
Chris@82 37 int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
Chris@82 38 const char *nam;
Chris@82 39 } transpose_adt;
Chris@82 40
Chris@82 41 typedef struct {
Chris@82 42 solver super;
Chris@82 43 const transpose_adt *adt;
Chris@82 44 } S;
Chris@82 45
Chris@82 46 typedef struct P_s {
Chris@82 47 plan_rdft super;
Chris@82 48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */
Chris@82 49 INT nbuf; /* buffer size */
Chris@82 50 INT nd, md, d; /* transpose-gcd params */
Chris@82 51 INT nc, mc; /* transpose-cut params */
Chris@82 52 plan *cld1, *cld2, *cld3; /* children, null if unused */
Chris@82 53 const S *slv;
Chris@82 54 } P;
Chris@82 55
Chris@82 56
Chris@82 57 /*************************************************************************/
Chris@82 58 /* some utilities for the solvers */
Chris@82 59
Chris@82 60 static INT gcd(INT a, INT b)
Chris@82 61 {
Chris@82 62 INT r;
Chris@82 63 do {
Chris@82 64 r = a % b;
Chris@82 65 a = b;
Chris@82 66 b = r;
Chris@82 67 } while (r != 0);
Chris@82 68
Chris@82 69 return a;
Chris@82 70 }
Chris@82 71
Chris@82 72 /* whether we can transpose with one of our routines expecting
Chris@82 73 contiguous Ntuples */
Chris@82 74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
Chris@82 75 {
Chris@82 76 return (vs == 1 && b->is == vl && a->os == vl &&
Chris@82 77 ((a->n == b->n && a->is == b->os
Chris@82 78 && a->is >= b->n && a->is % vl == 0)
Chris@82 79 || (a->is == b->n * vl && b->os == a->n * vl)));
Chris@82 80 }
Chris@82 81
Chris@82 82 /* check whether a and b correspond to the first and second dimensions
Chris@82 83 of a transpose of tuples with vector length = vl, stride = vs. */
Chris@82 84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
Chris@82 85 {
Chris@82 86 return ((a->n == b->n && a->os == b->is && a->is == b->os)
Chris@82 87 || Ntuple_transposable(a, b, vl, vs));
Chris@82 88 }
Chris@82 89
Chris@82 90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
Chris@82 91 {
Chris@82 92 int dim0, dim1;
Chris@82 93
Chris@82 94 for (dim0 = 0; dim0 < s->rnk; ++dim0)
Chris@82 95 for (dim1 = 0; dim1 < s->rnk; ++dim1) {
Chris@82 96 int dim2 = 3 - dim0 - dim1;
Chris@82 97 if (dim0 == dim1) continue;
Chris@82 98 if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
Chris@82 99 && transposable(s->dims + dim0, s->dims + dim1,
Chris@82 100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
Chris@82 101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
Chris@82 102 *pdim0 = dim0;
Chris@82 103 *pdim1 = dim1;
Chris@82 104 *pdim2 = dim2;
Chris@82 105 return 1;
Chris@82 106 }
Chris@82 107 }
Chris@82 108 return 0;
Chris@82 109 }
Chris@82 110
Chris@82 111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
Chris@82 112 #define MAXBUF 65536 /* maximum non-ugly buffer */
Chris@82 113
Chris@82 114 /* generic applicability function */
Chris@82 115 static int applicable(const solver *ego_, const problem *p_, planner *plnr,
Chris@82 116 int *dim0, int *dim1, int *dim2, INT *nbuf)
Chris@82 117 {
Chris@82 118 const S *ego = (const S *) ego_;
Chris@82 119 const problem_rdft *p = (const problem_rdft *) p_;
Chris@82 120
Chris@82 121 return (1
Chris@82 122 && p->I == p->O
Chris@82 123 && p->sz->rnk == 0
Chris@82 124 && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
Chris@82 125
Chris@82 126 && pickdim(p->vecsz, dim0, dim1, dim2)
Chris@82 127
Chris@82 128 /* UGLY if vecloop in wrong order for locality */
Chris@82 129 && (!NO_UGLYP(plnr) ||
Chris@82 130 p->vecsz->rnk == 2 ||
Chris@82 131 X(iabs)(p->vecsz->dims[*dim2].is)
Chris@82 132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
Chris@82 133 X(iabs)(p->vecsz->dims[*dim0].os)))
Chris@82 134
Chris@82 135 /* SLOW if non-square */
Chris@82 136 && (!NO_SLOWP(plnr)
Chris@82 137 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
Chris@82 138
Chris@82 139 && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
Chris@82 140
Chris@82 141 /* buffers too big are UGLY */
Chris@82 142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
Chris@82 143 || *nbuf <= MAXBUF
Chris@82 144 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
Chris@82 145 );
Chris@82 146 }
Chris@82 147
Chris@82 148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
Chris@82 149 {
Chris@82 150 if (p->vecsz->rnk == 2) {
Chris@82 151 *vl = 1; *vs = 1;
Chris@82 152 }
Chris@82 153 else {
Chris@82 154 *vl = p->vecsz->dims[dim2].n;
Chris@82 155 *vs = p->vecsz->dims[dim2].is; /* == os */
Chris@82 156 }
Chris@82 157 }
Chris@82 158
Chris@82 159 /*************************************************************************/
Chris@82 160 /* Cache-oblivious in-place transpose of non-square matrices, based
Chris@82 161 on transposes of blocks given by the gcd of the dimensions.
Chris@82 162
Chris@82 163 This algorithm is related to algorithm V5 from Murray Dow,
Chris@82 164 "Transposing a matrix on a vector computer," Parallel Computing 21
Chris@82 165 (12), 1997-2005 (1995), with the modification that we use
Chris@82 166 cache-oblivious recursive transpose subroutines (and we derived
Chris@82 167 it independently).
Chris@82 168
Chris@82 169 For a p x q matrix, this requires scratch space equal to the size
Chris@82 170 of the matrix divided by gcd(p,q). Alternatively, see also the
Chris@82 171 "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
Chris@82 172
Chris@82 173 static void apply_gcd(const plan *ego_, R *I, R *O)
Chris@82 174 {
Chris@82 175 const P *ego = (const P *) ego_;
Chris@82 176 INT n = ego->nd, m = ego->md, d = ego->d;
Chris@82 177 INT vl = ego->vl;
Chris@82 178 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
Chris@82 179 INT i, num_el = n*m*d*vl;
Chris@82 180
Chris@82 181 A(ego->n == n * d && ego->m == m * d);
Chris@82 182 UNUSED(O);
Chris@82 183
Chris@82 184 /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
Chris@82 185 of vl-tuples and buf contains n*m*d*vl elements.
Chris@82 186
Chris@82 187 In general, to transpose a p x q matrix, you should call this
Chris@82 188 routine with d = gcd(p, q), n = p/d, and m = q/d. */
Chris@82 189
Chris@82 190 A(n > 0 && m > 0 && vl > 0);
Chris@82 191 A(d > 1);
Chris@82 192
Chris@82 193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */
Chris@82 194
Chris@82 195 /* First, transpose d x (n x d') x m to d x (d' x n) x m,
Chris@82 196 using the buf matrix. This consists of d transposes
Chris@82 197 of contiguous n x d' matrices of m-tuples. */
Chris@82 198 if (n > 1) {
Chris@82 199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
Chris@82 200 for (i = 0; i < d; ++i) {
Chris@82 201 cldapply(ego->cld1, I + i*num_el, buf);
Chris@82 202 memcpy(I + i*num_el, buf, num_el*sizeof(R));
Chris@82 203 }
Chris@82 204 }
Chris@82 205
Chris@82 206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
Chris@82 207 is a square in-place transpose of n*m-tuples: */
Chris@82 208 {
Chris@82 209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
Chris@82 210 cldapply(ego->cld2, I, I);
Chris@82 211 }
Chris@82 212
Chris@82 213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
Chris@82 214 using the buf matrix. This consists of d' transposes
Chris@82 215 of contiguous d*n x m matrices. */
Chris@82 216 if (m > 1) {
Chris@82 217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
Chris@82 218 for (i = 0; i < d; ++i) {
Chris@82 219 cldapply(ego->cld3, I + i*num_el, buf);
Chris@82 220 memcpy(I + i*num_el, buf, num_el*sizeof(R));
Chris@82 221 }
Chris@82 222 }
Chris@82 223
Chris@82 224 X(ifree)(buf);
Chris@82 225 }
Chris@82 226
Chris@82 227 static int applicable_gcd(const problem_rdft *p, planner *plnr,
Chris@82 228 int dim0, int dim1, int dim2, INT *nbuf)
Chris@82 229 {
Chris@82 230 INT n = p->vecsz->dims[dim0].n;
Chris@82 231 INT m = p->vecsz->dims[dim1].n;
Chris@82 232 INT d, vl, vs;
Chris@82 233 get_transpose_vec(p, dim2, &vl, &vs);
Chris@82 234 d = gcd(n, m);
Chris@82 235 *nbuf = n * (m / d) * vl;
Chris@82 236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
Chris@82 237 && n != m
Chris@82 238 && d > 1
Chris@82 239 && Ntuple_transposable(p->vecsz->dims + dim0,
Chris@82 240 p->vecsz->dims + dim1,
Chris@82 241 vl, vs));
Chris@82 242 }
Chris@82 243
Chris@82 244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
Chris@82 245 {
Chris@82 246 INT n = ego->nd, m = ego->md, d = ego->d;
Chris@82 247 INT vl = ego->vl;
Chris@82 248 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
Chris@82 249 INT num_el = n*m*d*vl;
Chris@82 250
Chris@82 251 if (n > 1) {
Chris@82 252 ego->cld1 = X(mkplan_d)(plnr,
Chris@82 253 X(mkproblem_rdft_0_d)(
Chris@82 254 X(mktensor_3d)(n, d*m*vl, m*vl,
Chris@82 255 d, m*vl, n*m*vl,
Chris@82 256 m*vl, 1, 1),
Chris@82 257 TAINT(p->I, num_el), buf));
Chris@82 258 if (!ego->cld1)
Chris@82 259 goto nada;
Chris@82 260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
Chris@82 261 &ego->super.super.ops);
Chris@82 262 ego->super.super.ops.other += num_el * d * 2;
Chris@82 263 }
Chris@82 264
Chris@82 265 ego->cld2 = X(mkplan_d)(plnr,
Chris@82 266 X(mkproblem_rdft_0_d)(
Chris@82 267 X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
Chris@82 268 d, n*m*vl, d*n*m*vl,
Chris@82 269 n*m*vl, 1, 1),
Chris@82 270 p->I, p->I));
Chris@82 271 if (!ego->cld2)
Chris@82 272 goto nada;
Chris@82 273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
Chris@82 274
Chris@82 275 if (m > 1) {
Chris@82 276 ego->cld3 = X(mkplan_d)(plnr,
Chris@82 277 X(mkproblem_rdft_0_d)(
Chris@82 278 X(mktensor_3d)(d*n, m*vl, vl,
Chris@82 279 m, vl, d*n*vl,
Chris@82 280 vl, 1, 1),
Chris@82 281 TAINT(p->I, num_el), buf));
Chris@82 282 if (!ego->cld3)
Chris@82 283 goto nada;
Chris@82 284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
Chris@82 285 ego->super.super.ops.other += num_el * d * 2;
Chris@82 286 }
Chris@82 287
Chris@82 288 X(ifree)(buf);
Chris@82 289 return 1;
Chris@82 290
Chris@82 291 nada:
Chris@82 292 X(ifree)(buf);
Chris@82 293 return 0;
Chris@82 294 }
Chris@82 295
Chris@82 296 static const transpose_adt adt_gcd =
Chris@82 297 {
Chris@82 298 apply_gcd, applicable_gcd, mkcldrn_gcd,
Chris@82 299 "rdft-transpose-gcd"
Chris@82 300 };
Chris@82 301
Chris@82 302 /*************************************************************************/
Chris@82 303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
Chris@82 304 based on transposing a sub-matrix first and then transposing the
Chris@82 305 remainder(s) with the help of a buffer. See also transpose-gcd,
Chris@82 306 above, if gcd(n,m) is large.
Chris@82 307
Chris@82 308 This algorithm is related to algorithm V3 from Murray Dow,
Chris@82 309 "Transposing a matrix on a vector computer," Parallel Computing 21
Chris@82 310 (12), 1997-2005 (1995), with the modifications that we use
Chris@82 311 cache-oblivious recursive transpose subroutines and we have the
Chris@82 312 generalization for large |n-m| below.
Chris@82 313
Chris@82 314 The best case, and the one described by Dow, is for |n-m| small, in
Chris@82 315 which case we transpose a square sub-matrix of size min(n,m),
Chris@82 316 handling the remainder via a buffer. This requires scratch space
Chris@82 317 equal to the size of the matrix times |n-m| / max(n,m).
Chris@82 318
Chris@82 319 As a generalization when |n-m| is not small, we also support cutting
Chris@82 320 *both* dimensions to an nc x mc matrix which is *not* necessarily
Chris@82 321 square, but has a large gcd (and can therefore use transpose-gcd).
Chris@82 322 */
Chris@82 323
Chris@82 324 static void apply_cut(const plan *ego_, R *I, R *O)
Chris@82 325 {
Chris@82 326 const P *ego = (const P *) ego_;
Chris@82 327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
Chris@82 328 INT i;
Chris@82 329 R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
Chris@82 330 UNUSED(O);
Chris@82 331
Chris@82 332 if (m > mc) {
Chris@82 333 ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
Chris@82 334 for (i = 0; i < nc; ++i)
Chris@82 335 memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
Chris@82 336 }
Chris@82 337
Chris@82 338 ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
Chris@82 339
Chris@82 340 if (n > nc) {
Chris@82 341 R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
Chris@82 342 memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
Chris@82 343 for (i = mc-1; i >= 0; --i)
Chris@82 344 memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
Chris@82 345 ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
Chris@82 346 }
Chris@82 347
Chris@82 348 if (m > mc) {
Chris@82 349 if (n > nc)
Chris@82 350 for (i = mc; i < m; ++i)
Chris@82 351 memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
Chris@82 352 (nc*vl)*sizeof(R));
Chris@82 353 else
Chris@82 354 memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
Chris@82 355 }
Chris@82 356
Chris@82 357 X(ifree)(buf1);
Chris@82 358 }
Chris@82 359
Chris@82 360 /* only cut one dimension if the resulting buffer is small enough */
Chris@82 361 static int cut1(INT n, INT m, INT vl)
Chris@82 362 {
Chris@82 363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
Chris@82 364 || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
Chris@82 365 }
Chris@82 366
Chris@82 367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
Chris@82 368
Chris@82 369 static int applicable_cut(const problem_rdft *p, planner *plnr,
Chris@82 370 int dim0, int dim1, int dim2, INT *nbuf)
Chris@82 371 {
Chris@82 372 INT n = p->vecsz->dims[dim0].n;
Chris@82 373 INT m = p->vecsz->dims[dim1].n;
Chris@82 374 INT vl, vs;
Chris@82 375 get_transpose_vec(p, dim2, &vl, &vs);
Chris@82 376 *nbuf = 0; /* always small enough to be non-UGLY (?) */
Chris@82 377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
Chris@82 378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
Chris@82 379 && n != m
Chris@82 380
Chris@82 381 /* Don't call transpose-cut recursively (avoid inf. loops):
Chris@82 382 the non-square sub-transpose produced when !cut1
Chris@82 383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
Chris@82 384 for which transpose-gcd is applicable */
Chris@82 385 && (cut1(n, m, vl)
Chris@82 386 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
Chris@82 387
Chris@82 388 && Ntuple_transposable(p->vecsz->dims + dim0,
Chris@82 389 p->vecsz->dims + dim1,
Chris@82 390 vl, vs));
Chris@82 391 }
Chris@82 392
Chris@82 393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
Chris@82 394 {
Chris@82 395 INT n = ego->n, m = ego->m, nc, mc;
Chris@82 396 INT vl = ego->vl;
Chris@82 397 R *buf;
Chris@82 398
Chris@82 399 /* pick the "best" cut */
Chris@82 400 if (cut1(n, m, vl)) {
Chris@82 401 nc = mc = X(imin)(n,m);
Chris@82 402 }
Chris@82 403 else {
Chris@82 404 INT dc, ns, ms;
Chris@82 405 dc = gcd(m, n); nc = n; mc = m;
Chris@82 406 /* search for cut with largest gcd
Chris@82 407 (TODO: different optimality criteria? different search range?) */
Chris@82 408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
Chris@82 409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
Chris@82 410 INT ds = gcd(ms, ns);
Chris@82 411 if (ds > dc) {
Chris@82 412 dc = ds; nc = ns; mc = ms;
Chris@82 413 if (dc == X(imin)(ns, ms))
Chris@82 414 break; /* cannot get larger than this */
Chris@82 415 }
Chris@82 416 }
Chris@82 417 if (dc == X(imin)(n, ms))
Chris@82 418 break; /* cannot get larger than this */
Chris@82 419 }
Chris@82 420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
Chris@82 421 }
Chris@82 422 ego->nc = nc;
Chris@82 423 ego->mc = mc;
Chris@82 424 ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
Chris@82 425
Chris@82 426 buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
Chris@82 427
Chris@82 428 if (m > mc) {
Chris@82 429 ego->cld1 = X(mkplan_d)(plnr,
Chris@82 430 X(mkproblem_rdft_0_d)(
Chris@82 431 X(mktensor_3d)(nc, m*vl, vl,
Chris@82 432 m-mc, vl, nc*vl,
Chris@82 433 vl, 1, 1),
Chris@82 434 p->I + mc*vl, buf));
Chris@82 435 if (!ego->cld1)
Chris@82 436 goto nada;
Chris@82 437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
Chris@82 438 }
Chris@82 439
Chris@82 440 ego->cld2 = X(mkplan_d)(plnr,
Chris@82 441 X(mkproblem_rdft_0_d)(
Chris@82 442 X(mktensor_3d)(nc, mc*vl, vl,
Chris@82 443 mc, vl, nc*vl,
Chris@82 444 vl, 1, 1),
Chris@82 445 p->I, p->I));
Chris@82 446 if (!ego->cld2)
Chris@82 447 goto nada;
Chris@82 448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
Chris@82 449
Chris@82 450 if (n > nc) {
Chris@82 451 ego->cld3 = X(mkplan_d)(plnr,
Chris@82 452 X(mkproblem_rdft_0_d)(
Chris@82 453 X(mktensor_3d)(n-nc, m*vl, vl,
Chris@82 454 m, vl, n*vl,
Chris@82 455 vl, 1, 1),
Chris@82 456 buf + (m-mc)*(nc*vl), p->I + nc*vl));
Chris@82 457 if (!ego->cld3)
Chris@82 458 goto nada;
Chris@82 459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
Chris@82 460 }
Chris@82 461
Chris@82 462 /* memcpy/memmove operations */
Chris@82 463 ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
Chris@82 464 + (n-nc)*m + (m-mc)*nc);
Chris@82 465
Chris@82 466 X(ifree)(buf);
Chris@82 467 return 1;
Chris@82 468
Chris@82 469 nada:
Chris@82 470 X(ifree)(buf);
Chris@82 471 return 0;
Chris@82 472 }
Chris@82 473
Chris@82 474 static const transpose_adt adt_cut =
Chris@82 475 {
Chris@82 476 apply_cut, applicable_cut, mkcldrn_cut,
Chris@82 477 "rdft-transpose-cut"
Chris@82 478 };
Chris@82 479
Chris@82 480 /*************************************************************************/
Chris@82 481 /* In-place transpose routine from TOMS, which follows the cycles of
Chris@82 482 the permutation so that it writes to each location only once.
Chris@82 483 Because of cache-line and other issues, however, this routine is
Chris@82 484 typically much slower than transpose-gcd or transpose-cut, even
Chris@82 485 though the latter do some extra writes. On the other hand, if the
Chris@82 486 vector length is large then the TOMS routine is best.
Chris@82 487
Chris@82 488 The TOMS routine also has the advantage of requiring less buffer
Chris@82 489 space for the case of gcd(nx,ny) small. However, in this case it
Chris@82 490 has been superseded by the combination of the generalized
Chris@82 491 transpose-cut method with the transpose-gcd method, which can
Chris@82 492 always transpose with buffers a small fraction of the array size
Chris@82 493 regardless of gcd(nx,ny). */
Chris@82 494
Chris@82 495 /*
Chris@82 496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380).
Chris@82 497 *
Chris@82 498 * These routines do in-place transposes of arrays.
Chris@82 499 *
Chris@82 500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
Chris@82 501 * vol. 3, no. 1, 104-110 (1977) ]
Chris@82 502 *
Chris@82 503 * C version by Steven G. Johnson (February 1997).
Chris@82 504 */
Chris@82 505
Chris@82 506 /*
Chris@82 507 * "a" is a 1D array of length ny*nx*N which constains the nx x ny
Chris@82 508 * matrix of N-tuples to be transposed. "a" is stored in row-major
Chris@82 509 * order (last index varies fastest). move is a 1D array of length
Chris@82 510 * move_size used to store information to speed up the process. The
Chris@82 511 * value move_size=(ny+nx)/2 is recommended. buf should be an array
Chris@82 512 * of length 2*N.
Chris@82 513 *
Chris@82 514 */
Chris@82 515
Chris@82 516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
Chris@82 517 char *move, INT move_size, R *buf)
Chris@82 518 {
Chris@82 519 INT i, im, mn;
Chris@82 520 R *b, *c, *d;
Chris@82 521 INT ncount;
Chris@82 522 INT k;
Chris@82 523
Chris@82 524 /* check arguments and initialize: */
Chris@82 525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
Chris@82 526
Chris@82 527 b = buf;
Chris@82 528
Chris@82 529 /* Cate & Twigg have a special case for nx == ny, but we don't
Chris@82 530 bother, since we already have special code for this case elsewhere. */
Chris@82 531
Chris@82 532 c = buf + N;
Chris@82 533 ncount = 2; /* always at least 2 fixed points */
Chris@82 534 k = (mn = ny * nx) - 1;
Chris@82 535
Chris@82 536 for (i = 0; i < move_size; ++i)
Chris@82 537 move[i] = 0;
Chris@82 538
Chris@82 539 if (ny >= 3 && nx >= 3)
Chris@82 540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
Chris@82 541
Chris@82 542 i = 1;
Chris@82 543 im = ny;
Chris@82 544
Chris@82 545 while (1) {
Chris@82 546 INT i1, i2, i1c, i2c;
Chris@82 547 INT kmi;
Chris@82 548
Chris@82 549 /** Rearrange the elements of a loop
Chris@82 550 and its companion loop: **/
Chris@82 551
Chris@82 552 i1 = i;
Chris@82 553 kmi = k - i;
Chris@82 554 i1c = kmi;
Chris@82 555 switch (N) {
Chris@82 556 case 1:
Chris@82 557 b[0] = a[i1];
Chris@82 558 c[0] = a[i1c];
Chris@82 559 break;
Chris@82 560 case 2:
Chris@82 561 b[0] = a[2*i1];
Chris@82 562 b[1] = a[2*i1+1];
Chris@82 563 c[0] = a[2*i1c];
Chris@82 564 c[1] = a[2*i1c+1];
Chris@82 565 break;
Chris@82 566 default:
Chris@82 567 memcpy(b, &a[N * i1], N * sizeof(R));
Chris@82 568 memcpy(c, &a[N * i1c], N * sizeof(R));
Chris@82 569 }
Chris@82 570 while (1) {
Chris@82 571 i2 = ny * i1 - k * (i1 / nx);
Chris@82 572 i2c = k - i2;
Chris@82 573 if (i1 < move_size)
Chris@82 574 move[i1] = 1;
Chris@82 575 if (i1c < move_size)
Chris@82 576 move[i1c] = 1;
Chris@82 577 ncount += 2;
Chris@82 578 if (i2 == i)
Chris@82 579 break;
Chris@82 580 if (i2 == kmi) {
Chris@82 581 d = b;
Chris@82 582 b = c;
Chris@82 583 c = d;
Chris@82 584 break;
Chris@82 585 }
Chris@82 586 switch (N) {
Chris@82 587 case 1:
Chris@82 588 a[i1] = a[i2];
Chris@82 589 a[i1c] = a[i2c];
Chris@82 590 break;
Chris@82 591 case 2:
Chris@82 592 a[2*i1] = a[2*i2];
Chris@82 593 a[2*i1+1] = a[2*i2+1];
Chris@82 594 a[2*i1c] = a[2*i2c];
Chris@82 595 a[2*i1c+1] = a[2*i2c+1];
Chris@82 596 break;
Chris@82 597 default:
Chris@82 598 memcpy(&a[N * i1], &a[N * i2],
Chris@82 599 N * sizeof(R));
Chris@82 600 memcpy(&a[N * i1c], &a[N * i2c],
Chris@82 601 N * sizeof(R));
Chris@82 602 }
Chris@82 603 i1 = i2;
Chris@82 604 i1c = i2c;
Chris@82 605 }
Chris@82 606 switch (N) {
Chris@82 607 case 1:
Chris@82 608 a[i1] = b[0];
Chris@82 609 a[i1c] = c[0];
Chris@82 610 break;
Chris@82 611 case 2:
Chris@82 612 a[2*i1] = b[0];
Chris@82 613 a[2*i1+1] = b[1];
Chris@82 614 a[2*i1c] = c[0];
Chris@82 615 a[2*i1c+1] = c[1];
Chris@82 616 break;
Chris@82 617 default:
Chris@82 618 memcpy(&a[N * i1], b, N * sizeof(R));
Chris@82 619 memcpy(&a[N * i1c], c, N * sizeof(R));
Chris@82 620 }
Chris@82 621 if (ncount >= mn)
Chris@82 622 break; /* we've moved all elements */
Chris@82 623
Chris@82 624 /** Search for loops to rearrange: **/
Chris@82 625
Chris@82 626 while (1) {
Chris@82 627 INT max = k - i;
Chris@82 628 ++i;
Chris@82 629 A(i <= max);
Chris@82 630 im += ny;
Chris@82 631 if (im > k)
Chris@82 632 im -= k;
Chris@82 633 i2 = im;
Chris@82 634 if (i == i2)
Chris@82 635 continue;
Chris@82 636 if (i >= move_size) {
Chris@82 637 while (i2 > i && i2 < max) {
Chris@82 638 i1 = i2;
Chris@82 639 i2 = ny * i1 - k * (i1 / nx);
Chris@82 640 }
Chris@82 641 if (i2 == i)
Chris@82 642 break;
Chris@82 643 } else if (!move[i])
Chris@82 644 break;
Chris@82 645 }
Chris@82 646 }
Chris@82 647 }
Chris@82 648
Chris@82 649 static void apply_toms513(const plan *ego_, R *I, R *O)
Chris@82 650 {
Chris@82 651 const P *ego = (const P *) ego_;
Chris@82 652 INT n = ego->n, m = ego->m;
Chris@82 653 INT vl = ego->vl;
Chris@82 654 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
Chris@82 655 UNUSED(O);
Chris@82 656 transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
Chris@82 657 X(ifree)(buf);
Chris@82 658 }
Chris@82 659
Chris@82 660 static int applicable_toms513(const problem_rdft *p, planner *plnr,
Chris@82 661 int dim0, int dim1, int dim2, INT *nbuf)
Chris@82 662 {
Chris@82 663 INT n = p->vecsz->dims[dim0].n;
Chris@82 664 INT m = p->vecsz->dims[dim1].n;
Chris@82 665 INT vl, vs;
Chris@82 666 get_transpose_vec(p, dim2, &vl, &vs);
Chris@82 667 *nbuf = 2*vl
Chris@82 668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
Chris@82 669 return (!NO_SLOWP(plnr)
Chris@82 670 && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
Chris@82 671 && n != m
Chris@82 672 && Ntuple_transposable(p->vecsz->dims + dim0,
Chris@82 673 p->vecsz->dims + dim1,
Chris@82 674 vl, vs));
Chris@82 675 }
Chris@82 676
Chris@82 677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
Chris@82 678 {
Chris@82 679 UNUSED(p); UNUSED(plnr);
Chris@82 680 /* heuristic so that TOMS algorithm is last resort for small vl */
Chris@82 681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
Chris@82 682 return 1;
Chris@82 683 }
Chris@82 684
Chris@82 685 static const transpose_adt adt_toms513 =
Chris@82 686 {
Chris@82 687 apply_toms513, applicable_toms513, mkcldrn_toms513,
Chris@82 688 "rdft-transpose-toms513"
Chris@82 689 };
Chris@82 690
Chris@82 691 /*-----------------------------------------------------------------------*/
Chris@82 692 /*-----------------------------------------------------------------------*/
Chris@82 693 /* generic stuff: */
Chris@82 694
Chris@82 695 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@82 696 {
Chris@82 697 P *ego = (P *) ego_;
Chris@82 698 X(plan_awake)(ego->cld1, wakefulness);
Chris@82 699 X(plan_awake)(ego->cld2, wakefulness);
Chris@82 700 X(plan_awake)(ego->cld3, wakefulness);
Chris@82 701 }
Chris@82 702
Chris@82 703 static void print(const plan *ego_, printer *p)
Chris@82 704 {
Chris@82 705 const P *ego = (const P *) ego_;
Chris@82 706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
Chris@82 707 ego->n, ego->m, ego->vl);
Chris@82 708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
Chris@82 709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
Chris@82 710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
Chris@82 711 p->print(p, ")");
Chris@82 712 }
Chris@82 713
Chris@82 714 static void destroy(plan *ego_)
Chris@82 715 {
Chris@82 716 P *ego = (P *) ego_;
Chris@82 717 X(plan_destroy_internal)(ego->cld3);
Chris@82 718 X(plan_destroy_internal)(ego->cld2);
Chris@82 719 X(plan_destroy_internal)(ego->cld1);
Chris@82 720 }
Chris@82 721
Chris@82 722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
Chris@82 723 {
Chris@82 724 const S *ego = (const S *) ego_;
Chris@82 725 const problem_rdft *p;
Chris@82 726 int dim0, dim1, dim2;
Chris@82 727 INT nbuf, vs;
Chris@82 728 P *pln;
Chris@82 729
Chris@82 730 static const plan_adt padt = {
Chris@82 731 X(rdft_solve), awake, print, destroy
Chris@82 732 };
Chris@82 733
Chris@82 734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
Chris@82 735 return (plan *) 0;
Chris@82 736
Chris@82 737 p = (const problem_rdft *) p_;
Chris@82 738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
Chris@82 739
Chris@82 740 pln->n = p->vecsz->dims[dim0].n;
Chris@82 741 pln->m = p->vecsz->dims[dim1].n;
Chris@82 742 get_transpose_vec(p, dim2, &pln->vl, &vs);
Chris@82 743 pln->nbuf = nbuf;
Chris@82 744 pln->d = gcd(pln->n, pln->m);
Chris@82 745 pln->nd = pln->n / pln->d;
Chris@82 746 pln->md = pln->m / pln->d;
Chris@82 747 pln->slv = ego;
Chris@82 748
Chris@82 749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
Chris@82 750
Chris@82 751 pln->cld1 = pln->cld2 = pln->cld3 = 0;
Chris@82 752 if (!ego->adt->mkcldrn(p, plnr, pln)) {
Chris@82 753 X(plan_destroy_internal)(&(pln->super.super));
Chris@82 754 return 0;
Chris@82 755 }
Chris@82 756
Chris@82 757 return &(pln->super.super);
Chris@82 758 }
Chris@82 759
Chris@82 760 static solver *mksolver(const transpose_adt *adt)
Chris@82 761 {
Chris@82 762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
Chris@82 763 S *slv = MKSOLVER(S, &sadt);
Chris@82 764 slv->adt = adt;
Chris@82 765 return &(slv->super);
Chris@82 766 }
Chris@82 767
Chris@82 768 void X(rdft_vrank3_transpose_register)(planner *p)
Chris@82 769 {
Chris@82 770 unsigned i;
Chris@82 771 static const transpose_adt *const adts[] = {
Chris@82 772 &adt_gcd, &adt_cut,
Chris@82 773 &adt_toms513
Chris@82 774 };
Chris@82 775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
Chris@82 776 REGISTER_SOLVER(p, mksolver(adts[i]));
Chris@82 777 }