cannam@167: /* cannam@167: * Copyright (c) 2003, 2007-14 Matteo Frigo cannam@167: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology cannam@167: * cannam@167: * This program is free software; you can redistribute it and/or modify cannam@167: * it under the terms of the GNU General Public License as published by cannam@167: * the Free Software Foundation; either version 2 of the License, or cannam@167: * (at your option) any later version. cannam@167: * cannam@167: * This program is distributed in the hope that it will be useful, cannam@167: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@167: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@167: * GNU General Public License for more details. cannam@167: * cannam@167: * You should have received a copy of the GNU General Public License cannam@167: * along with this program; if not, write to the Free Software cannam@167: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@167: * cannam@167: */ cannam@167: cannam@167: cannam@167: /* plans for rank-0 RDFTs (copy operations) */ cannam@167: cannam@167: #include "rdft/rdft.h" cannam@167: cannam@167: #ifdef HAVE_STRING_H cannam@167: #include /* for memcpy() */ cannam@167: #endif cannam@167: cannam@167: #define MAXRNK 32 /* FIXME: should malloc() */ cannam@167: cannam@167: typedef struct { cannam@167: plan_rdft super; cannam@167: INT vl; cannam@167: int rnk; cannam@167: iodim d[MAXRNK]; cannam@167: const char *nam; cannam@167: } P; cannam@167: cannam@167: typedef struct { cannam@167: solver super; cannam@167: rdftapply apply; cannam@167: int (*applicable)(const P *pln, const problem_rdft *p); cannam@167: const char *nam; cannam@167: } S; cannam@167: cannam@167: /* copy up to MAXRNK dimensions from problem into plan. If a cannam@167: contiguous dimension exists, save its length in pln->vl */ cannam@167: static int fill_iodim(P *pln, const problem_rdft *p) cannam@167: { cannam@167: int i; cannam@167: const tensor *vecsz = p->vecsz; cannam@167: cannam@167: pln->vl = 1; cannam@167: pln->rnk = 0; cannam@167: for (i = 0; i < vecsz->rnk; ++i) { cannam@167: /* extract contiguous dimensions */ cannam@167: if (pln->vl == 1 && cannam@167: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) cannam@167: pln->vl = vecsz->dims[i].n; cannam@167: else if (pln->rnk == MAXRNK) cannam@167: return 0; cannam@167: else cannam@167: pln->d[pln->rnk++] = vecsz->dims[i]; cannam@167: } cannam@167: cannam@167: return 1; cannam@167: } cannam@167: cannam@167: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ cannam@167: static void copy(const iodim *d, int rnk, INT vl, cannam@167: R *I, R *O, cannam@167: cpy2d_func cpy2d) cannam@167: { cannam@167: A(rnk >= 2); cannam@167: if (rnk == 2) cannam@167: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); cannam@167: else { cannam@167: INT i; cannam@167: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) cannam@167: copy(d + 1, rnk - 1, vl, I, O, cpy2d); cannam@167: } cannam@167: } cannam@167: cannam@167: /* FIXME: should be more general */ cannam@167: static int transposep(const P *pln) cannam@167: { cannam@167: int i; cannam@167: cannam@167: for (i = 0; i < pln->rnk - 2; ++i) cannam@167: if (pln->d[i].is != pln->d[i].os) cannam@167: return 0; cannam@167: cannam@167: return (pln->d[i].n == pln->d[i+1].n && cannam@167: pln->d[i].is == pln->d[i+1].os && cannam@167: pln->d[i].os == pln->d[i+1].is); cannam@167: } cannam@167: cannam@167: /* generic higher-rank transpose routine, calls transpose2d() to do cannam@167: * the real work */ cannam@167: static void transpose(const iodim *d, int rnk, INT vl, cannam@167: R *I, cannam@167: transpose_func transpose2d) cannam@167: { cannam@167: A(rnk >= 2); cannam@167: if (rnk == 2) cannam@167: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); cannam@167: else { cannam@167: INT i; cannam@167: for (i = 0; i < d[0].n; ++i, I += d[0].is) cannam@167: transpose(d + 1, rnk - 1, vl, I, transpose2d); cannam@167: } cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank 0,1,2, out of place, iterative */ cannam@167: static void apply_iter(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: cannam@167: switch (ego->rnk) { cannam@167: case 0: cannam@167: X(cpy1d)(I, O, ego->vl, 1, 1, 1); cannam@167: break; cannam@167: case 1: cannam@167: X(cpy1d)(I, O, cannam@167: ego->d[0].n, ego->d[0].is, ego->d[0].os, cannam@167: ego->vl); cannam@167: break; cannam@167: default: cannam@167: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); cannam@167: break; cannam@167: } cannam@167: } cannam@167: cannam@167: static int applicable_iter(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: UNUSED(pln); cannam@167: return (p->I != p->O); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* out of place, write contiguous output */ cannam@167: static void apply_cpy2dco(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); cannam@167: } cannam@167: cannam@167: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: int rnk = pln->rnk; cannam@167: return (1 cannam@167: && p->I != p->O cannam@167: && rnk >= 2 cannam@167: cannam@167: /* must not duplicate apply_iter */ cannam@167: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) cannam@167: || cannam@167: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) cannam@167: ); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* out of place, tiled, no buffering */ cannam@167: static void apply_tiled(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); cannam@167: } cannam@167: cannam@167: static int applicable_tiled(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: return (1 cannam@167: && p->I != p->O cannam@167: && pln->rnk >= 2 cannam@167: cannam@167: /* somewhat arbitrary */ cannam@167: && X(compute_tilesz)(pln->vl, 1) > 4 cannam@167: ); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* out of place, tiled, with buffer */ cannam@167: static void apply_tiledbuf(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); cannam@167: } cannam@167: cannam@167: #define applicable_tiledbuf applicable_tiled cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank 0, out of place, using memcpy */ cannam@167: static void apply_memcpy(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: cannam@167: A(ego->rnk == 0); cannam@167: memcpy(O, I, ego->vl * sizeof(R)); cannam@167: } cannam@167: cannam@167: static int applicable_memcpy(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: return (1 cannam@167: && p->I != p->O cannam@167: && pln->rnk == 0 cannam@167: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ cannam@167: ); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place cannam@167: transposes of vl-tuples ... for large vl it should be more cannam@167: efficient to use memcpy than the tiled stuff). */ cannam@167: cannam@167: static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) cannam@167: { cannam@167: INT i, n = d->n, is = d->is, os = d->os; cannam@167: if (rnk == 1) cannam@167: for (i = 0; i < n; ++i, I += is, O += os) cannam@167: memcpy(O, I, cpysz); cannam@167: else { cannam@167: --rnk; ++d; cannam@167: for (i = 0; i < n; ++i, I += is, O += os) cannam@167: memcpy_loop(cpysz, rnk, d, I, O); cannam@167: } cannam@167: } cannam@167: cannam@167: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); cannam@167: } cannam@167: cannam@167: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: return (p->I != p->O cannam@167: && pln->rnk > 0 cannam@167: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank 2, in place, square transpose, iterative */ cannam@167: static void apply_ip_sq(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: UNUSED(O); cannam@167: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); cannam@167: } cannam@167: cannam@167: cannam@167: static int applicable_ip_sq(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: return (1 cannam@167: && p->I == p->O cannam@167: && pln->rnk >= 2 cannam@167: && transposep(pln)); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank 2, in place, square transpose, tiled */ cannam@167: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: UNUSED(O); cannam@167: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); cannam@167: } cannam@167: cannam@167: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) cannam@167: { cannam@167: return (1 cannam@167: && applicable_ip_sq(pln, p) cannam@167: cannam@167: /* somewhat arbitrary */ cannam@167: && X(compute_tilesz)(pln->vl, 2) > 4 cannam@167: ); cannam@167: } cannam@167: cannam@167: /**************************************************************/ cannam@167: /* rank 2, in place, square transpose, tiled, buffered */ cannam@167: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: UNUSED(O); cannam@167: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); cannam@167: } cannam@167: cannam@167: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled cannam@167: cannam@167: /**************************************************************/ cannam@167: static int applicable(const S *ego, const problem *p_) cannam@167: { cannam@167: const problem_rdft *p = (const problem_rdft *) p_; cannam@167: P pln; cannam@167: return (1 cannam@167: && p->sz->rnk == 0 cannam@167: && FINITE_RNK(p->vecsz->rnk) cannam@167: && fill_iodim(&pln, p) cannam@167: && ego->applicable(&pln, p) cannam@167: ); cannam@167: } cannam@167: cannam@167: static void print(const plan *ego_, printer *p) cannam@167: { cannam@167: const P *ego = (const P *) ego_; cannam@167: int i; cannam@167: p->print(p, "(%s/%D", ego->nam, ego->vl); cannam@167: for (i = 0; i < ego->rnk; ++i) cannam@167: p->print(p, "%v", ego->d[i].n); cannam@167: p->print(p, ")"); cannam@167: } cannam@167: cannam@167: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) cannam@167: { cannam@167: const problem_rdft *p; cannam@167: const S *ego = (const S *) ego_; cannam@167: P *pln; cannam@167: int retval; cannam@167: cannam@167: static const plan_adt padt = { cannam@167: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) cannam@167: }; cannam@167: cannam@167: UNUSED(plnr); cannam@167: cannam@167: if (!applicable(ego, p_)) cannam@167: return (plan *) 0; cannam@167: cannam@167: p = (const problem_rdft *) p_; cannam@167: pln = MKPLAN_RDFT(P, &padt, ego->apply); cannam@167: cannam@167: retval = fill_iodim(pln, p); cannam@167: (void)retval; /* UNUSED unless DEBUG */ cannam@167: A(retval); cannam@167: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ cannam@167: pln->nam = ego->nam; cannam@167: cannam@167: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ cannam@167: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); cannam@167: return &(pln->super.super); cannam@167: } cannam@167: cannam@167: cannam@167: void X(rdft_rank0_register)(planner *p) cannam@167: { cannam@167: unsigned i; cannam@167: static struct { cannam@167: rdftapply apply; cannam@167: int (*applicable)(const P *, const problem_rdft *); cannam@167: const char *nam; cannam@167: } tab[] = { cannam@167: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, cannam@167: { apply_memcpy_loop, applicable_memcpy_loop, cannam@167: "rdft-rank0-memcpy-loop" }, cannam@167: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, cannam@167: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, cannam@167: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, cannam@167: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, cannam@167: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, cannam@167: { cannam@167: apply_ip_sq_tiled, cannam@167: applicable_ip_sq_tiled, cannam@167: "rdft-rank0-ip-sq-tiled" cannam@167: }, cannam@167: { cannam@167: apply_ip_sq_tiledbuf, cannam@167: applicable_ip_sq_tiledbuf, cannam@167: "rdft-rank0-ip-sq-tiledbuf" cannam@167: }, cannam@167: }; cannam@167: cannam@167: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { cannam@167: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; cannam@167: S *slv = MKSOLVER(S, &sadt); cannam@167: slv->apply = tab[i].apply; cannam@167: slv->applicable = tab[i].applicable; cannam@167: slv->nam = tab[i].nam; cannam@167: REGISTER_SOLVER(p, &(slv->super)); cannam@167: } cannam@167: }