cannam@167: /* cannam@167: * Copyright (c) 2003, 2007-14 Matteo Frigo cannam@167: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology cannam@167: * cannam@167: * This program is free software; you can redistribute it and/or modify cannam@167: * it under the terms of the GNU General Public License as published by cannam@167: * the Free Software Foundation; either version 2 of the License, or cannam@167: * (at your option) any later version. cannam@167: * cannam@167: * This program is distributed in the hope that it will be useful, cannam@167: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@167: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@167: * GNU General Public License for more details. cannam@167: * cannam@167: * You should have received a copy of the GNU General Public License cannam@167: * along with this program; if not, write to the Free Software cannam@167: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@167: * cannam@167: */ cannam@167: cannam@167: cannam@167: /* rank-0, vector-rank-3, non-square in-place transposition cannam@167: (see rank0.c for square transposition) */ cannam@167: cannam@167: #include "rdft/rdft.h" cannam@167: cannam@167: #ifdef HAVE_STRING_H cannam@167: #include /* for memcpy() */ cannam@167: #endif cannam@167: cannam@167: struct P_s; cannam@167: cannam@167: typedef struct { cannam@167: rdftapply apply; cannam@167: int (*applicable)(const problem_rdft *p, planner *plnr, cannam@167: int dim0, int dim1, int dim2, INT *nbuf); cannam@167: int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego); cannam@167: const char *nam; cannam@167: } transpose_adt; cannam@167: cannam@167: typedef struct { cannam@167: solver super; cannam@167: const transpose_adt *adt; cannam@167: } S; cannam@167: cannam@167: typedef struct P_s { cannam@167: plan_rdft super; cannam@167: INT n, m, vl; /* transpose n x m matrix of vl-tuples */ cannam@167: INT nbuf; /* buffer size */ cannam@167: INT nd, md, d; /* transpose-gcd params */ cannam@167: INT nc, mc; /* transpose-cut params */ cannam@167: plan *cld1, *cld2, *cld3; /* children, null if unused */ cannam@167: const S *slv; cannam@167: } P; cannam@167: cannam@167: cannam@167: /*************************************************************************/ cannam@167: /* some utilities for the solvers */ cannam@167: cannam@167: static INT gcd(INT a, INT b) cannam@167: { cannam@167: INT r; cannam@167: do { cannam@167: r = a % b; cannam@167: a = b; cannam@167: b = r; cannam@167: } while (r != 0); cannam@167: cannam@167: return a; cannam@167: } cannam@167: cannam@167: /* whether we can transpose with one of our routines expecting cannam@167: contiguous Ntuples */ cannam@167: static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs) cannam@167: { cannam@167: return (vs == 1 && b->is == vl && a->os == vl && cannam@167: ((a->n == b->n && a->is == b->os cannam@167: && a->is >= b->n && a->is % vl == 0) cannam@167: || (a->is == b->n * vl && b->os == a->n * vl))); cannam@167: } cannam@167: cannam@167: /* check whether a and b correspond to the first and second dimensions cannam@167: of a transpose of tuples with vector length = vl, stride = vs. */ cannam@167: static int transposable(const iodim *a, const iodim *b, INT vl, INT vs) cannam@167: { cannam@167: return ((a->n == b->n && a->os == b->is && a->is == b->os) cannam@167: || Ntuple_transposable(a, b, vl, vs)); cannam@167: } cannam@167: cannam@167: static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2) cannam@167: { cannam@167: int dim0, dim1; cannam@167: cannam@167: for (dim0 = 0; dim0 < s->rnk; ++dim0) cannam@167: for (dim1 = 0; dim1 < s->rnk; ++dim1) { cannam@167: int dim2 = 3 - dim0 - dim1; cannam@167: if (dim0 == dim1) continue; cannam@167: if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os) cannam@167: && transposable(s->dims + dim0, s->dims + dim1, cannam@167: s->rnk == 2 ? (INT)1 : s->dims[dim2].n, cannam@167: s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) { cannam@167: *pdim0 = dim0; cannam@167: *pdim1 = dim1; cannam@167: *pdim2 = dim2; cannam@167: return 1; cannam@167: } cannam@167: } cannam@167: return 0; cannam@167: } cannam@167: cannam@167: #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */ cannam@167: #define MAXBUF 65536 /* maximum non-ugly buffer */ cannam@167: cannam@167: /* generic applicability function */ cannam@167: static int applicable(const solver *ego_, const problem *p_, planner *plnr, cannam@167: int *dim0, int *dim1, int *dim2, INT *nbuf) cannam@167: { cannam@167: const S *ego = (const S *) ego_; cannam@167: const problem_rdft *p = (const problem_rdft *) p_; cannam@167: cannam@167: return (1 cannam@167: && p->I == p->O cannam@167: && p->sz->rnk == 0 cannam@167: && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3) cannam@167: cannam@167: && pickdim(p->vecsz, dim0, dim1, dim2) cannam@167: cannam@167: /* UGLY if vecloop in wrong order for locality */ cannam@167: && (!NO_UGLYP(plnr) || cannam@167: p->vecsz->rnk == 2 || cannam@167: X(iabs)(p->vecsz->dims[*dim2].is) cannam@167: < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is), cannam@167: X(iabs)(p->vecsz->dims[*dim0].os))) cannam@167: cannam@167: /* SLOW if non-square */ cannam@167: && (!NO_SLOWP(plnr) cannam@167: || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n) cannam@167: cannam@167: && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf) cannam@167: cannam@167: /* buffers too big are UGLY */ cannam@167: && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr)) cannam@167: || *nbuf <= MAXBUF cannam@167: || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz)) cannam@167: ); cannam@167: } cannam@167: cannam@167: static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs) cannam@167: { cannam@167: if (p->vecsz->rnk == 2) { cannam@167: *vl = 1; *vs = 1; cannam@167: } cannam@167: else { cannam@167: *vl = p->vecsz->dims[dim2].n; cannam@167: *vs = p->vecsz->dims[dim2].is; /* == os */ cannam@167: } cannam@167: } cannam@167: cannam@167: /*************************************************************************/ cannam@167: /* Cache-oblivious in-place transpose of non-square matrices, based cannam@167: on transposes of blocks given by the gcd of the dimensions. cannam@167: cannam@167: This algorithm is related to algorithm V5 from Murray Dow, cannam@167: "Transposing a matrix on a vector computer," Parallel Computing 21 cannam@167: (12), 1997-2005 (1995), with the modification that we use cannam@167: cache-oblivious recursive transpose subroutines (and we derived cannam@167: it independently). cannam@167: cannam@167: For a p x q matrix, this requires scratch space equal to the size cannam@167: of the matrix divided by gcd(p,q). Alternatively, see also the cannam@167: "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */ cannam@167: cannam@167: static void apply_gcd(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: INT n = ego->nd, m = ego->md, d = ego->d; cannam@167: INT vl = ego->vl; cannam@167: R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); cannam@167: INT i, num_el = n*m*d*vl; cannam@167: cannam@167: A(ego->n == n * d && ego->m == m * d); cannam@167: UNUSED(O); cannam@167: cannam@167: /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix cannam@167: of vl-tuples and buf contains n*m*d*vl elements. cannam@167: cannam@167: In general, to transpose a p x q matrix, you should call this cannam@167: routine with d = gcd(p, q), n = p/d, and m = q/d. */ cannam@167: cannam@167: A(n > 0 && m > 0 && vl > 0); cannam@167: A(d > 1); cannam@167: cannam@167: /* treat as (d x n) x (d' x m) matrix. (d' = d) */ cannam@167: cannam@167: /* First, transpose d x (n x d') x m to d x (d' x n) x m, cannam@167: using the buf matrix. This consists of d transposes cannam@167: of contiguous n x d' matrices of m-tuples. */ cannam@167: if (n > 1) { cannam@167: rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply; cannam@167: for (i = 0; i < d; ++i) { cannam@167: cldapply(ego->cld1, I + i*num_el, buf); cannam@167: memcpy(I + i*num_el, buf, num_el*sizeof(R)); cannam@167: } cannam@167: } cannam@167: cannam@167: /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which cannam@167: is a square in-place transpose of n*m-tuples: */ cannam@167: { cannam@167: rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply; cannam@167: cldapply(ego->cld2, I, I); cannam@167: } cannam@167: cannam@167: /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)), cannam@167: using the buf matrix. This consists of d' transposes cannam@167: of contiguous d*n x m matrices. */ cannam@167: if (m > 1) { cannam@167: rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply; cannam@167: for (i = 0; i < d; ++i) { cannam@167: cldapply(ego->cld3, I + i*num_el, buf); cannam@167: memcpy(I + i*num_el, buf, num_el*sizeof(R)); cannam@167: } cannam@167: } cannam@167: cannam@167: X(ifree)(buf); cannam@167: } cannam@167: cannam@167: static int applicable_gcd(const problem_rdft *p, planner *plnr, cannam@167: int dim0, int dim1, int dim2, INT *nbuf) cannam@167: { cannam@167: INT n = p->vecsz->dims[dim0].n; cannam@167: INT m = p->vecsz->dims[dim1].n; cannam@167: INT d, vl, vs; cannam@167: get_transpose_vec(p, dim2, &vl, &vs); cannam@167: d = gcd(n, m); cannam@167: *nbuf = n * (m / d) * vl; cannam@167: return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */ cannam@167: && n != m cannam@167: && d > 1 cannam@167: && Ntuple_transposable(p->vecsz->dims + dim0, cannam@167: p->vecsz->dims + dim1, cannam@167: vl, vs)); cannam@167: } cannam@167: cannam@167: static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego) cannam@167: { cannam@167: INT n = ego->nd, m = ego->md, d = ego->d; cannam@167: INT vl = ego->vl; cannam@167: R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); cannam@167: INT num_el = n*m*d*vl; cannam@167: cannam@167: if (n > 1) { cannam@167: ego->cld1 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(n, d*m*vl, m*vl, cannam@167: d, m*vl, n*m*vl, cannam@167: m*vl, 1, 1), cannam@167: TAINT(p->I, num_el), buf)); cannam@167: if (!ego->cld1) cannam@167: goto nada; cannam@167: X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops, cannam@167: &ego->super.super.ops); cannam@167: ego->super.super.ops.other += num_el * d * 2; cannam@167: } cannam@167: cannam@167: ego->cld2 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(d, d*n*m*vl, n*m*vl, cannam@167: d, n*m*vl, d*n*m*vl, cannam@167: n*m*vl, 1, 1), cannam@167: p->I, p->I)); cannam@167: if (!ego->cld2) cannam@167: goto nada; cannam@167: X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); cannam@167: cannam@167: if (m > 1) { cannam@167: ego->cld3 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(d*n, m*vl, vl, cannam@167: m, vl, d*n*vl, cannam@167: vl, 1, 1), cannam@167: TAINT(p->I, num_el), buf)); cannam@167: if (!ego->cld3) cannam@167: goto nada; cannam@167: X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops); cannam@167: ego->super.super.ops.other += num_el * d * 2; cannam@167: } cannam@167: cannam@167: X(ifree)(buf); cannam@167: return 1; cannam@167: cannam@167: nada: cannam@167: X(ifree)(buf); cannam@167: return 0; cannam@167: } cannam@167: cannam@167: static const transpose_adt adt_gcd = cannam@167: { cannam@167: apply_gcd, applicable_gcd, mkcldrn_gcd, cannam@167: "rdft-transpose-gcd" cannam@167: }; cannam@167: cannam@167: /*************************************************************************/ cannam@167: /* Cache-oblivious in-place transpose of non-square n x m matrices, cannam@167: based on transposing a sub-matrix first and then transposing the cannam@167: remainder(s) with the help of a buffer. See also transpose-gcd, cannam@167: above, if gcd(n,m) is large. cannam@167: cannam@167: This algorithm is related to algorithm V3 from Murray Dow, cannam@167: "Transposing a matrix on a vector computer," Parallel Computing 21 cannam@167: (12), 1997-2005 (1995), with the modifications that we use cannam@167: cache-oblivious recursive transpose subroutines and we have the cannam@167: generalization for large |n-m| below. cannam@167: cannam@167: The best case, and the one described by Dow, is for |n-m| small, in cannam@167: which case we transpose a square sub-matrix of size min(n,m), cannam@167: handling the remainder via a buffer. This requires scratch space cannam@167: equal to the size of the matrix times |n-m| / max(n,m). cannam@167: cannam@167: As a generalization when |n-m| is not small, we also support cutting cannam@167: *both* dimensions to an nc x mc matrix which is *not* necessarily cannam@167: square, but has a large gcd (and can therefore use transpose-gcd). cannam@167: */ cannam@167: cannam@167: static void apply_cut(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl; cannam@167: INT i; cannam@167: R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); cannam@167: UNUSED(O); cannam@167: cannam@167: if (m > mc) { cannam@167: ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1); cannam@167: for (i = 0; i < nc; ++i) cannam@167: memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl)); cannam@167: } cannam@167: cannam@167: ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */ cannam@167: cannam@167: if (n > nc) { cannam@167: R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */ cannam@167: memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R)); cannam@167: for (i = mc-1; i >= 0; --i) cannam@167: memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl)); cannam@167: ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl); cannam@167: } cannam@167: cannam@167: if (m > mc) { cannam@167: if (n > nc) cannam@167: for (i = mc; i < m; ++i) cannam@167: memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl), cannam@167: (nc*vl)*sizeof(R)); cannam@167: else cannam@167: memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R)); cannam@167: } cannam@167: cannam@167: X(ifree)(buf1); cannam@167: } cannam@167: cannam@167: /* only cut one dimension if the resulting buffer is small enough */ cannam@167: static int cut1(INT n, INT m, INT vl) cannam@167: { cannam@167: return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV cannam@167: || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF); cannam@167: } cannam@167: cannam@167: #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */ cannam@167: cannam@167: static int applicable_cut(const problem_rdft *p, planner *plnr, cannam@167: int dim0, int dim1, int dim2, INT *nbuf) cannam@167: { cannam@167: INT n = p->vecsz->dims[dim0].n; cannam@167: INT m = p->vecsz->dims[dim1].n; cannam@167: INT vl, vs; cannam@167: get_transpose_vec(p, dim2, &vl, &vs); cannam@167: *nbuf = 0; /* always small enough to be non-UGLY (?) */ cannam@167: A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */ cannam@167: return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */ cannam@167: && n != m cannam@167: cannam@167: /* Don't call transpose-cut recursively (avoid inf. loops): cannam@167: the non-square sub-transpose produced when !cut1 cannam@167: should always have gcd(n,m) >= min(CUT_NSRCH,n,m), cannam@167: for which transpose-gcd is applicable */ cannam@167: && (cut1(n, m, vl) cannam@167: || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m))) cannam@167: cannam@167: && Ntuple_transposable(p->vecsz->dims + dim0, cannam@167: p->vecsz->dims + dim1, cannam@167: vl, vs)); cannam@167: } cannam@167: cannam@167: static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego) cannam@167: { cannam@167: INT n = ego->n, m = ego->m, nc, mc; cannam@167: INT vl = ego->vl; cannam@167: R *buf; cannam@167: cannam@167: /* pick the "best" cut */ cannam@167: if (cut1(n, m, vl)) { cannam@167: nc = mc = X(imin)(n,m); cannam@167: } cannam@167: else { cannam@167: INT dc, ns, ms; cannam@167: dc = gcd(m, n); nc = n; mc = m; cannam@167: /* search for cut with largest gcd cannam@167: (TODO: different optimality criteria? different search range?) */ cannam@167: for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) { cannam@167: for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) { cannam@167: INT ds = gcd(ms, ns); cannam@167: if (ds > dc) { cannam@167: dc = ds; nc = ns; mc = ms; cannam@167: if (dc == X(imin)(ns, ms)) cannam@167: break; /* cannot get larger than this */ cannam@167: } cannam@167: } cannam@167: if (dc == X(imin)(n, ms)) cannam@167: break; /* cannot get larger than this */ cannam@167: } cannam@167: A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m))); cannam@167: } cannam@167: ego->nc = nc; cannam@167: ego->mc = mc; cannam@167: ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl); cannam@167: cannam@167: buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); cannam@167: cannam@167: if (m > mc) { cannam@167: ego->cld1 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(nc, m*vl, vl, cannam@167: m-mc, vl, nc*vl, cannam@167: vl, 1, 1), cannam@167: p->I + mc*vl, buf)); cannam@167: if (!ego->cld1) cannam@167: goto nada; cannam@167: X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops); cannam@167: } cannam@167: cannam@167: ego->cld2 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(nc, mc*vl, vl, cannam@167: mc, vl, nc*vl, cannam@167: vl, 1, 1), cannam@167: p->I, p->I)); cannam@167: if (!ego->cld2) cannam@167: goto nada; cannam@167: X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops); cannam@167: cannam@167: if (n > nc) { cannam@167: ego->cld3 = X(mkplan_d)(plnr, cannam@167: X(mkproblem_rdft_0_d)( cannam@167: X(mktensor_3d)(n-nc, m*vl, vl, cannam@167: m, vl, n*vl, cannam@167: vl, 1, 1), cannam@167: buf + (m-mc)*(nc*vl), p->I + nc*vl)); cannam@167: if (!ego->cld3) cannam@167: goto nada; cannam@167: X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops); cannam@167: } cannam@167: cannam@167: /* memcpy/memmove operations */ cannam@167: ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc)) cannam@167: + (n-nc)*m + (m-mc)*nc); cannam@167: cannam@167: X(ifree)(buf); cannam@167: return 1; cannam@167: cannam@167: nada: cannam@167: X(ifree)(buf); cannam@167: return 0; cannam@167: } cannam@167: cannam@167: static const transpose_adt adt_cut = cannam@167: { cannam@167: apply_cut, applicable_cut, mkcldrn_cut, cannam@167: "rdft-transpose-cut" cannam@167: }; cannam@167: cannam@167: /*************************************************************************/ cannam@167: /* In-place transpose routine from TOMS, which follows the cycles of cannam@167: the permutation so that it writes to each location only once. cannam@167: Because of cache-line and other issues, however, this routine is cannam@167: typically much slower than transpose-gcd or transpose-cut, even cannam@167: though the latter do some extra writes. On the other hand, if the cannam@167: vector length is large then the TOMS routine is best. cannam@167: cannam@167: The TOMS routine also has the advantage of requiring less buffer cannam@167: space for the case of gcd(nx,ny) small. However, in this case it cannam@167: has been superseded by the combination of the generalized cannam@167: transpose-cut method with the transpose-gcd method, which can cannam@167: always transpose with buffers a small fraction of the array size cannam@167: regardless of gcd(nx,ny). */ cannam@167: cannam@167: /* cannam@167: * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380). cannam@167: * cannam@167: * These routines do in-place transposes of arrays. cannam@167: * cannam@167: * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, cannam@167: * vol. 3, no. 1, 104-110 (1977) ] cannam@167: * cannam@167: * C version by Steven G. Johnson (February 1997). cannam@167: */ cannam@167: cannam@167: /* cannam@167: * "a" is a 1D array of length ny*nx*N which constains the nx x ny cannam@167: * matrix of N-tuples to be transposed. "a" is stored in row-major cannam@167: * order (last index varies fastest). move is a 1D array of length cannam@167: * move_size used to store information to speed up the process. The cannam@167: * value move_size=(ny+nx)/2 is recommended. buf should be an array cannam@167: * of length 2*N. cannam@167: * cannam@167: */ cannam@167: cannam@167: static void transpose_toms513(R *a, INT nx, INT ny, INT N, cannam@167: char *move, INT move_size, R *buf) cannam@167: { cannam@167: INT i, im, mn; cannam@167: R *b, *c, *d; cannam@167: INT ncount; cannam@167: INT k; cannam@167: cannam@167: /* check arguments and initialize: */ cannam@167: A(ny > 0 && nx > 0 && N > 0 && move_size > 0); cannam@167: cannam@167: b = buf; cannam@167: cannam@167: /* Cate & Twigg have a special case for nx == ny, but we don't cannam@167: bother, since we already have special code for this case elsewhere. */ cannam@167: cannam@167: c = buf + N; cannam@167: ncount = 2; /* always at least 2 fixed points */ cannam@167: k = (mn = ny * nx) - 1; cannam@167: cannam@167: for (i = 0; i < move_size; ++i) cannam@167: move[i] = 0; cannam@167: cannam@167: if (ny >= 3 && nx >= 3) cannam@167: ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */ cannam@167: cannam@167: i = 1; cannam@167: im = ny; cannam@167: cannam@167: while (1) { cannam@167: INT i1, i2, i1c, i2c; cannam@167: INT kmi; cannam@167: cannam@167: /** Rearrange the elements of a loop cannam@167: and its companion loop: **/ cannam@167: cannam@167: i1 = i; cannam@167: kmi = k - i; cannam@167: i1c = kmi; cannam@167: switch (N) { cannam@167: case 1: cannam@167: b[0] = a[i1]; cannam@167: c[0] = a[i1c]; cannam@167: break; cannam@167: case 2: cannam@167: b[0] = a[2*i1]; cannam@167: b[1] = a[2*i1+1]; cannam@167: c[0] = a[2*i1c]; cannam@167: c[1] = a[2*i1c+1]; cannam@167: break; cannam@167: default: cannam@167: memcpy(b, &a[N * i1], N * sizeof(R)); cannam@167: memcpy(c, &a[N * i1c], N * sizeof(R)); cannam@167: } cannam@167: while (1) { cannam@167: i2 = ny * i1 - k * (i1 / nx); cannam@167: i2c = k - i2; cannam@167: if (i1 < move_size) cannam@167: move[i1] = 1; cannam@167: if (i1c < move_size) cannam@167: move[i1c] = 1; cannam@167: ncount += 2; cannam@167: if (i2 == i) cannam@167: break; cannam@167: if (i2 == kmi) { cannam@167: d = b; cannam@167: b = c; cannam@167: c = d; cannam@167: break; cannam@167: } cannam@167: switch (N) { cannam@167: case 1: cannam@167: a[i1] = a[i2]; cannam@167: a[i1c] = a[i2c]; cannam@167: break; cannam@167: case 2: cannam@167: a[2*i1] = a[2*i2]; cannam@167: a[2*i1+1] = a[2*i2+1]; cannam@167: a[2*i1c] = a[2*i2c]; cannam@167: a[2*i1c+1] = a[2*i2c+1]; cannam@167: break; cannam@167: default: cannam@167: memcpy(&a[N * i1], &a[N * i2], cannam@167: N * sizeof(R)); cannam@167: memcpy(&a[N * i1c], &a[N * i2c], cannam@167: N * sizeof(R)); cannam@167: } cannam@167: i1 = i2; cannam@167: i1c = i2c; cannam@167: } cannam@167: switch (N) { cannam@167: case 1: cannam@167: a[i1] = b[0]; cannam@167: a[i1c] = c[0]; cannam@167: break; cannam@167: case 2: cannam@167: a[2*i1] = b[0]; cannam@167: a[2*i1+1] = b[1]; cannam@167: a[2*i1c] = c[0]; cannam@167: a[2*i1c+1] = c[1]; cannam@167: break; cannam@167: default: cannam@167: memcpy(&a[N * i1], b, N * sizeof(R)); cannam@167: memcpy(&a[N * i1c], c, N * sizeof(R)); cannam@167: } cannam@167: if (ncount >= mn) cannam@167: break; /* we've moved all elements */ cannam@167: cannam@167: /** Search for loops to rearrange: **/ cannam@167: cannam@167: while (1) { cannam@167: INT max = k - i; cannam@167: ++i; cannam@167: A(i <= max); cannam@167: im += ny; cannam@167: if (im > k) cannam@167: im -= k; cannam@167: i2 = im; cannam@167: if (i == i2) cannam@167: continue; cannam@167: if (i >= move_size) { cannam@167: while (i2 > i && i2 < max) { cannam@167: i1 = i2; cannam@167: i2 = ny * i1 - k * (i1 / nx); cannam@167: } cannam@167: if (i2 == i) cannam@167: break; cannam@167: } else if (!move[i]) cannam@167: break; cannam@167: } cannam@167: } cannam@167: } cannam@167: cannam@167: static void apply_toms513(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: INT n = ego->n, m = ego->m; cannam@167: INT vl = ego->vl; cannam@167: R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS); cannam@167: UNUSED(O); cannam@167: transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf); cannam@167: X(ifree)(buf); cannam@167: } cannam@167: cannam@167: static int applicable_toms513(const problem_rdft *p, planner *plnr, cannam@167: int dim0, int dim1, int dim2, INT *nbuf) cannam@167: { cannam@167: INT n = p->vecsz->dims[dim0].n; cannam@167: INT m = p->vecsz->dims[dim1].n; cannam@167: INT vl, vs; cannam@167: get_transpose_vec(p, dim2, &vl, &vs); cannam@167: *nbuf = 2*vl cannam@167: + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R); cannam@167: return (!NO_SLOWP(plnr) cannam@167: && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */ cannam@167: && n != m cannam@167: && Ntuple_transposable(p->vecsz->dims + dim0, cannam@167: p->vecsz->dims + dim1, cannam@167: vl, vs)); cannam@167: } cannam@167: cannam@167: static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego) cannam@167: { cannam@167: UNUSED(p); UNUSED(plnr); cannam@167: /* heuristic so that TOMS algorithm is last resort for small vl */ cannam@167: ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30); cannam@167: return 1; cannam@167: } cannam@167: cannam@167: static const transpose_adt adt_toms513 = cannam@167: { cannam@167: apply_toms513, applicable_toms513, mkcldrn_toms513, cannam@167: "rdft-transpose-toms513" cannam@167: }; cannam@167: cannam@167: /*-----------------------------------------------------------------------*/ cannam@167: /*-----------------------------------------------------------------------*/ cannam@167: /* generic stuff: */ cannam@167: cannam@167: static void awake(plan *ego_, enum wakefulness wakefulness) cannam@167: { cannam@167: P *ego = (P *) ego_; cannam@167: X(plan_awake)(ego->cld1, wakefulness); cannam@167: X(plan_awake)(ego->cld2, wakefulness); cannam@167: X(plan_awake)(ego->cld3, wakefulness); cannam@167: } cannam@167: cannam@167: static void print(const plan *ego_, printer *p) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam, cannam@167: ego->n, ego->m, ego->vl); cannam@167: if (ego->cld1) p->print(p, "%(%p%)", ego->cld1); cannam@167: if (ego->cld2) p->print(p, "%(%p%)", ego->cld2); cannam@167: if (ego->cld3) p->print(p, "%(%p%)", ego->cld3); cannam@167: p->print(p, ")"); cannam@167: } cannam@167: cannam@167: static void destroy(plan *ego_) cannam@167: { cannam@167: P *ego = (P *) ego_; cannam@167: X(plan_destroy_internal)(ego->cld3); cannam@167: X(plan_destroy_internal)(ego->cld2); cannam@167: X(plan_destroy_internal)(ego->cld1); cannam@167: } cannam@167: cannam@167: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) cannam@167: { cannam@167: const S *ego = (const S *) ego_; cannam@167: const problem_rdft *p; cannam@167: int dim0, dim1, dim2; cannam@167: INT nbuf, vs; cannam@167: P *pln; cannam@167: cannam@167: static const plan_adt padt = { cannam@167: X(rdft_solve), awake, print, destroy cannam@167: }; cannam@167: cannam@167: if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf)) cannam@167: return (plan *) 0; cannam@167: cannam@167: p = (const problem_rdft *) p_; cannam@167: pln = MKPLAN_RDFT(P, &padt, ego->adt->apply); cannam@167: cannam@167: pln->n = p->vecsz->dims[dim0].n; cannam@167: pln->m = p->vecsz->dims[dim1].n; cannam@167: get_transpose_vec(p, dim2, &pln->vl, &vs); cannam@167: pln->nbuf = nbuf; cannam@167: pln->d = gcd(pln->n, pln->m); cannam@167: pln->nd = pln->n / pln->d; cannam@167: pln->md = pln->m / pln->d; cannam@167: pln->slv = ego; cannam@167: cannam@167: X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */ cannam@167: cannam@167: pln->cld1 = pln->cld2 = pln->cld3 = 0; cannam@167: if (!ego->adt->mkcldrn(p, plnr, pln)) { cannam@167: X(plan_destroy_internal)(&(pln->super.super)); cannam@167: return 0; cannam@167: } cannam@167: cannam@167: return &(pln->super.super); cannam@167: } cannam@167: cannam@167: static solver *mksolver(const transpose_adt *adt) cannam@167: { cannam@167: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; cannam@167: S *slv = MKSOLVER(S, &sadt); cannam@167: slv->adt = adt; cannam@167: return &(slv->super); cannam@167: } cannam@167: cannam@167: void X(rdft_vrank3_transpose_register)(planner *p) cannam@167: { cannam@167: unsigned i; cannam@167: static const transpose_adt *const adts[] = { cannam@167: &adt_gcd, &adt_cut, cannam@167: &adt_toms513 cannam@167: }; cannam@167: for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i) cannam@167: REGISTER_SOLVER(p, mksolver(adts[i])); cannam@167: }