cannam@127: /* cannam@127: * Copyright (c) 2003, 2007-14 Matteo Frigo cannam@127: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology cannam@127: * cannam@127: * This program is free software; you can redistribute it and/or modify cannam@127: * it under the terms of the GNU General Public License as published by cannam@127: * the Free Software Foundation; either version 2 of the License, or cannam@127: * (at your option) any later version. cannam@127: * cannam@127: * This program is distributed in the hope that it will be useful, cannam@127: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@127: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@127: * GNU General Public License for more details. cannam@127: * cannam@127: * You should have received a copy of the GNU General Public License cannam@127: * along with this program; if not, write to the Free Software cannam@127: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@127: * cannam@127: */ cannam@127: cannam@127: cannam@127: /* plans for rank-0 RDFTs (copy operations) */ cannam@127: cannam@127: #include "rdft.h" cannam@127: cannam@127: #ifdef HAVE_STRING_H cannam@127: #include /* for memcpy() */ cannam@127: #endif cannam@127: cannam@127: #define MAXRNK 32 /* FIXME: should malloc() */ cannam@127: cannam@127: typedef struct { cannam@127: plan_rdft super; cannam@127: INT vl; cannam@127: int rnk; cannam@127: iodim d[MAXRNK]; cannam@127: const char *nam; cannam@127: } P; cannam@127: cannam@127: typedef struct { cannam@127: solver super; cannam@127: rdftapply apply; cannam@127: int (*applicable)(const P *pln, const problem_rdft *p); cannam@127: const char *nam; cannam@127: } S; cannam@127: cannam@127: /* copy up to MAXRNK dimensions from problem into plan. If a cannam@127: contiguous dimension exists, save its length in pln->vl */ cannam@127: static int fill_iodim(P *pln, const problem_rdft *p) cannam@127: { cannam@127: int i; cannam@127: const tensor *vecsz = p->vecsz; cannam@127: cannam@127: pln->vl = 1; cannam@127: pln->rnk = 0; cannam@127: for (i = 0; i < vecsz->rnk; ++i) { cannam@127: /* extract contiguous dimensions */ cannam@127: if (pln->vl == 1 && cannam@127: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) cannam@127: pln->vl = vecsz->dims[i].n; cannam@127: else if (pln->rnk == MAXRNK) cannam@127: return 0; cannam@127: else cannam@127: pln->d[pln->rnk++] = vecsz->dims[i]; cannam@127: } cannam@127: cannam@127: return 1; cannam@127: } cannam@127: cannam@127: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ cannam@127: static void copy(const iodim *d, int rnk, INT vl, cannam@127: R *I, R *O, cannam@127: cpy2d_func cpy2d) cannam@127: { cannam@127: A(rnk >= 2); cannam@127: if (rnk == 2) cannam@127: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); cannam@127: else { cannam@127: INT i; cannam@127: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) cannam@127: copy(d + 1, rnk - 1, vl, I, O, cpy2d); cannam@127: } cannam@127: } cannam@127: cannam@127: /* FIXME: should be more general */ cannam@127: static int transposep(const P *pln) cannam@127: { cannam@127: int i; cannam@127: cannam@127: for (i = 0; i < pln->rnk - 2; ++i) cannam@127: if (pln->d[i].is != pln->d[i].os) cannam@127: return 0; cannam@127: cannam@127: return (pln->d[i].n == pln->d[i+1].n && cannam@127: pln->d[i].is == pln->d[i+1].os && cannam@127: pln->d[i].os == pln->d[i+1].is); cannam@127: } cannam@127: cannam@127: /* generic higher-rank transpose routine, calls transpose2d() to do cannam@127: * the real work */ cannam@127: static void transpose(const iodim *d, int rnk, INT vl, cannam@127: R *I, cannam@127: transpose_func transpose2d) cannam@127: { cannam@127: A(rnk >= 2); cannam@127: if (rnk == 2) cannam@127: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); cannam@127: else { cannam@127: INT i; cannam@127: for (i = 0; i < d[0].n; ++i, I += d[0].is) cannam@127: transpose(d + 1, rnk - 1, vl, I, transpose2d); cannam@127: } cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank 0,1,2, out of place, iterative */ cannam@127: static void apply_iter(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: cannam@127: switch (ego->rnk) { cannam@127: case 0: cannam@127: X(cpy1d)(I, O, ego->vl, 1, 1, 1); cannam@127: break; cannam@127: case 1: cannam@127: X(cpy1d)(I, O, cannam@127: ego->d[0].n, ego->d[0].is, ego->d[0].os, cannam@127: ego->vl); cannam@127: break; cannam@127: default: cannam@127: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); cannam@127: break; cannam@127: } cannam@127: } cannam@127: cannam@127: static int applicable_iter(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: UNUSED(pln); cannam@127: return (p->I != p->O); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* out of place, write contiguous output */ cannam@127: static void apply_cpy2dco(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); cannam@127: } cannam@127: cannam@127: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: int rnk = pln->rnk; cannam@127: return (1 cannam@127: && p->I != p->O cannam@127: && rnk >= 2 cannam@127: cannam@127: /* must not duplicate apply_iter */ cannam@127: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) cannam@127: || cannam@127: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) cannam@127: ); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* out of place, tiled, no buffering */ cannam@127: static void apply_tiled(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); cannam@127: } cannam@127: cannam@127: static int applicable_tiled(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: return (1 cannam@127: && p->I != p->O cannam@127: && pln->rnk >= 2 cannam@127: cannam@127: /* somewhat arbitrary */ cannam@127: && X(compute_tilesz)(pln->vl, 1) > 4 cannam@127: ); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* out of place, tiled, with buffer */ cannam@127: static void apply_tiledbuf(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); cannam@127: } cannam@127: cannam@127: #define applicable_tiledbuf applicable_tiled cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank 0, out of place, using memcpy */ cannam@127: static void apply_memcpy(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: cannam@127: A(ego->rnk == 0); cannam@127: memcpy(O, I, ego->vl * sizeof(R)); cannam@127: } cannam@127: cannam@127: static int applicable_memcpy(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: return (1 cannam@127: && p->I != p->O cannam@127: && pln->rnk == 0 cannam@127: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ cannam@127: ); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place cannam@127: transposes of vl-tuples ... for large vl it should be more cannam@127: efficient to use memcpy than the tiled stuff). */ cannam@127: cannam@127: static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) cannam@127: { cannam@127: INT i, n = d->n, is = d->is, os = d->os; cannam@127: if (rnk == 1) cannam@127: for (i = 0; i < n; ++i, I += is, O += os) cannam@127: memcpy(O, I, cpysz); cannam@127: else { cannam@127: --rnk; ++d; cannam@127: for (i = 0; i < n; ++i, I += is, O += os) cannam@127: memcpy_loop(cpysz, rnk, d, I, O); cannam@127: } cannam@127: } cannam@127: cannam@127: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); cannam@127: } cannam@127: cannam@127: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: return (p->I != p->O cannam@127: && pln->rnk > 0 cannam@127: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank 2, in place, square transpose, iterative */ cannam@127: static void apply_ip_sq(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: UNUSED(O); cannam@127: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); cannam@127: } cannam@127: cannam@127: cannam@127: static int applicable_ip_sq(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: return (1 cannam@127: && p->I == p->O cannam@127: && pln->rnk >= 2 cannam@127: && transposep(pln)); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank 2, in place, square transpose, tiled */ cannam@127: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: UNUSED(O); cannam@127: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); cannam@127: } cannam@127: cannam@127: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) cannam@127: { cannam@127: return (1 cannam@127: && applicable_ip_sq(pln, p) cannam@127: cannam@127: /* somewhat arbitrary */ cannam@127: && X(compute_tilesz)(pln->vl, 2) > 4 cannam@127: ); cannam@127: } cannam@127: cannam@127: /**************************************************************/ cannam@127: /* rank 2, in place, square transpose, tiled, buffered */ cannam@127: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: UNUSED(O); cannam@127: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); cannam@127: } cannam@127: cannam@127: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled cannam@127: cannam@127: /**************************************************************/ cannam@127: static int applicable(const S *ego, const problem *p_) cannam@127: { cannam@127: const problem_rdft *p = (const problem_rdft *) p_; cannam@127: P pln; cannam@127: return (1 cannam@127: && p->sz->rnk == 0 cannam@127: && FINITE_RNK(p->vecsz->rnk) cannam@127: && fill_iodim(&pln, p) cannam@127: && ego->applicable(&pln, p) cannam@127: ); cannam@127: } cannam@127: cannam@127: static void print(const plan *ego_, printer *p) cannam@127: { cannam@127: const P *ego = (const P *) ego_; cannam@127: int i; cannam@127: p->print(p, "(%s/%D", ego->nam, ego->vl); cannam@127: for (i = 0; i < ego->rnk; ++i) cannam@127: p->print(p, "%v", ego->d[i].n); cannam@127: p->print(p, ")"); cannam@127: } cannam@127: cannam@127: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) cannam@127: { cannam@127: const problem_rdft *p; cannam@127: const S *ego = (const S *) ego_; cannam@127: P *pln; cannam@127: int retval; cannam@127: cannam@127: static const plan_adt padt = { cannam@127: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) cannam@127: }; cannam@127: cannam@127: UNUSED(plnr); cannam@127: cannam@127: if (!applicable(ego, p_)) cannam@127: return (plan *) 0; cannam@127: cannam@127: p = (const problem_rdft *) p_; cannam@127: pln = MKPLAN_RDFT(P, &padt, ego->apply); cannam@127: cannam@127: retval = fill_iodim(pln, p); cannam@127: (void)retval; /* UNUSED unless DEBUG */ cannam@127: A(retval); cannam@127: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ cannam@127: pln->nam = ego->nam; cannam@127: cannam@127: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ cannam@127: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); cannam@127: return &(pln->super.super); cannam@127: } cannam@127: cannam@127: cannam@127: void X(rdft_rank0_register)(planner *p) cannam@127: { cannam@127: unsigned i; cannam@127: static struct { cannam@127: rdftapply apply; cannam@127: int (*applicable)(const P *, const problem_rdft *); cannam@127: const char *nam; cannam@127: } tab[] = { cannam@127: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, cannam@127: { apply_memcpy_loop, applicable_memcpy_loop, cannam@127: "rdft-rank0-memcpy-loop" }, cannam@127: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, cannam@127: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, cannam@127: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, cannam@127: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, cannam@127: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, cannam@127: { cannam@127: apply_ip_sq_tiled, cannam@127: applicable_ip_sq_tiled, cannam@127: "rdft-rank0-ip-sq-tiled" cannam@127: }, cannam@127: { cannam@127: apply_ip_sq_tiledbuf, cannam@127: applicable_ip_sq_tiledbuf, cannam@127: "rdft-rank0-ip-sq-tiledbuf" cannam@127: }, cannam@127: }; cannam@127: cannam@127: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { cannam@127: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; cannam@127: S *slv = MKSOLVER(S, &sadt); cannam@127: slv->apply = tab[i].apply; cannam@127: slv->applicable = tab[i].applicable; cannam@127: slv->nam = tab[i].nam; cannam@127: REGISTER_SOLVER(p, &(slv->super)); cannam@127: } cannam@127: }