Chris@10: /* Chris@10: * Copyright (c) 2003, 2007-11 Matteo Frigo Chris@10: * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology Chris@10: * Chris@10: * This program is free software; you can redistribute it and/or modify Chris@10: * it under the terms of the GNU General Public License as published by Chris@10: * the Free Software Foundation; either version 2 of the License, or Chris@10: * (at your option) any later version. Chris@10: * Chris@10: * This program is distributed in the hope that it will be useful, Chris@10: * but WITHOUT ANY WARRANTY; without even the implied warranty of Chris@10: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Chris@10: * GNU General Public License for more details. Chris@10: * Chris@10: * You should have received a copy of the GNU General Public License Chris@10: * along with this program; if not, write to the Free Software Chris@10: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Chris@10: * Chris@10: */ Chris@10: Chris@10: Chris@10: /* plans for rank-0 RDFTs (copy operations) */ Chris@10: Chris@10: #include "rdft.h" Chris@10: Chris@10: #ifdef HAVE_STRING_H Chris@10: #include /* for memcpy() */ Chris@10: #endif Chris@10: Chris@10: #define MAXRNK 32 /* FIXME: should malloc() */ Chris@10: Chris@10: typedef struct { Chris@10: plan_rdft super; Chris@10: INT vl; Chris@10: int rnk; Chris@10: iodim d[MAXRNK]; Chris@10: const char *nam; Chris@10: } P; Chris@10: Chris@10: typedef struct { Chris@10: solver super; Chris@10: rdftapply apply; Chris@10: int (*applicable)(const P *pln, const problem_rdft *p); Chris@10: const char *nam; Chris@10: } S; Chris@10: Chris@10: /* copy up to MAXRNK dimensions from problem into plan. If a Chris@10: contiguous dimension exists, save its length in pln->vl */ Chris@10: static int fill_iodim(P *pln, const problem_rdft *p) Chris@10: { Chris@10: int i; Chris@10: const tensor *vecsz = p->vecsz; Chris@10: Chris@10: pln->vl = 1; Chris@10: pln->rnk = 0; Chris@10: for (i = 0; i < vecsz->rnk; ++i) { Chris@10: /* extract contiguous dimensions */ Chris@10: if (pln->vl == 1 && Chris@10: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) Chris@10: pln->vl = vecsz->dims[i].n; Chris@10: else if (pln->rnk == MAXRNK) Chris@10: return 0; Chris@10: else Chris@10: pln->d[pln->rnk++] = vecsz->dims[i]; Chris@10: } Chris@10: Chris@10: return 1; Chris@10: } Chris@10: Chris@10: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ Chris@10: static void copy(const iodim *d, int rnk, INT vl, Chris@10: R *I, R *O, Chris@10: cpy2d_func cpy2d) Chris@10: { Chris@10: A(rnk >= 2); Chris@10: if (rnk == 2) Chris@10: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); Chris@10: else { Chris@10: INT i; Chris@10: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) Chris@10: copy(d + 1, rnk - 1, vl, I, O, cpy2d); Chris@10: } Chris@10: } Chris@10: Chris@10: /* FIXME: should be more general */ Chris@10: static int transposep(const P *pln) Chris@10: { Chris@10: int i; Chris@10: Chris@10: for (i = 0; i < pln->rnk - 2; ++i) Chris@10: if (pln->d[i].is != pln->d[i].os) Chris@10: return 0; Chris@10: Chris@10: return (pln->d[i].n == pln->d[i+1].n && Chris@10: pln->d[i].is == pln->d[i+1].os && Chris@10: pln->d[i].os == pln->d[i+1].is); Chris@10: } Chris@10: Chris@10: /* generic higher-rank transpose routine, calls transpose2d() to do Chris@10: * the real work */ Chris@10: static void transpose(const iodim *d, int rnk, INT vl, Chris@10: R *I, Chris@10: transpose_func transpose2d) Chris@10: { Chris@10: A(rnk >= 2); Chris@10: if (rnk == 2) Chris@10: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); Chris@10: else { Chris@10: INT i; Chris@10: for (i = 0; i < d[0].n; ++i, I += d[0].is) Chris@10: transpose(d + 1, rnk - 1, vl, I, transpose2d); Chris@10: } Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank 0,1,2, out of place, iterative */ Chris@10: static void apply_iter(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: Chris@10: switch (ego->rnk) { Chris@10: case 0: Chris@10: X(cpy1d)(I, O, ego->vl, 1, 1, 1); Chris@10: break; Chris@10: case 1: Chris@10: X(cpy1d)(I, O, Chris@10: ego->d[0].n, ego->d[0].is, ego->d[0].os, Chris@10: ego->vl); Chris@10: break; Chris@10: default: Chris@10: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); Chris@10: break; Chris@10: } Chris@10: } Chris@10: Chris@10: static int applicable_iter(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: UNUSED(pln); Chris@10: return (p->I != p->O); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* out of place, write contiguous output */ Chris@10: static void apply_cpy2dco(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); Chris@10: } Chris@10: Chris@10: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: int rnk = pln->rnk; Chris@10: return (1 Chris@10: && p->I != p->O Chris@10: && rnk >= 2 Chris@10: Chris@10: /* must not duplicate apply_iter */ Chris@10: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) Chris@10: || Chris@10: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) Chris@10: ); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* out of place, tiled, no buffering */ Chris@10: static void apply_tiled(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); Chris@10: } Chris@10: Chris@10: static int applicable_tiled(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: return (1 Chris@10: && p->I != p->O Chris@10: && pln->rnk >= 2 Chris@10: Chris@10: /* somewhat arbitrary */ Chris@10: && X(compute_tilesz)(pln->vl, 1) > 4 Chris@10: ); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* out of place, tiled, with buffer */ Chris@10: static void apply_tiledbuf(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); Chris@10: } Chris@10: Chris@10: #define applicable_tiledbuf applicable_tiled Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank 0, out of place, using memcpy */ Chris@10: static void apply_memcpy(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: Chris@10: A(ego->rnk == 0); Chris@10: memcpy(O, I, ego->vl * sizeof(R)); Chris@10: } Chris@10: Chris@10: static int applicable_memcpy(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: return (1 Chris@10: && p->I != p->O Chris@10: && pln->rnk == 0 Chris@10: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ Chris@10: ); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place Chris@10: transposes of vl-tuples ... for large vl it should be more Chris@10: efficient to use memcpy than the tiled stuff). */ Chris@10: Chris@10: static void memcpy_loop(INT cpysz, int rnk, const iodim *d, R *I, R *O) Chris@10: { Chris@10: INT i, n = d->n, is = d->is, os = d->os; Chris@10: if (rnk == 1) Chris@10: for (i = 0; i < n; ++i, I += is, O += os) Chris@10: memcpy(O, I, cpysz); Chris@10: else { Chris@10: --rnk; ++d; Chris@10: for (i = 0; i < n; ++i, I += is, O += os) Chris@10: memcpy_loop(cpysz, rnk, d, I, O); Chris@10: } Chris@10: } Chris@10: Chris@10: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); Chris@10: } Chris@10: Chris@10: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: return (p->I != p->O Chris@10: && pln->rnk > 0 Chris@10: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank 2, in place, square transpose, iterative */ Chris@10: static void apply_ip_sq(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: UNUSED(O); Chris@10: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); Chris@10: } Chris@10: Chris@10: Chris@10: static int applicable_ip_sq(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: return (1 Chris@10: && p->I == p->O Chris@10: && pln->rnk >= 2 Chris@10: && transposep(pln)); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank 2, in place, square transpose, tiled */ Chris@10: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: UNUSED(O); Chris@10: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); Chris@10: } Chris@10: Chris@10: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) Chris@10: { Chris@10: return (1 Chris@10: && applicable_ip_sq(pln, p) Chris@10: Chris@10: /* somewhat arbitrary */ Chris@10: && X(compute_tilesz)(pln->vl, 2) > 4 Chris@10: ); Chris@10: } Chris@10: Chris@10: /**************************************************************/ Chris@10: /* rank 2, in place, square transpose, tiled, buffered */ Chris@10: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: UNUSED(O); Chris@10: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); Chris@10: } Chris@10: Chris@10: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled Chris@10: Chris@10: /**************************************************************/ Chris@10: static int applicable(const S *ego, const problem *p_) Chris@10: { Chris@10: const problem_rdft *p = (const problem_rdft *) p_; Chris@10: P pln; Chris@10: return (1 Chris@10: && p->sz->rnk == 0 Chris@10: && FINITE_RNK(p->vecsz->rnk) Chris@10: && fill_iodim(&pln, p) Chris@10: && ego->applicable(&pln, p) Chris@10: ); Chris@10: } Chris@10: Chris@10: static void print(const plan *ego_, printer *p) Chris@10: { Chris@10: const P *ego = (const P *) ego_; Chris@10: int i; Chris@10: p->print(p, "(%s/%D", ego->nam, ego->vl); Chris@10: for (i = 0; i < ego->rnk; ++i) Chris@10: p->print(p, "%v", ego->d[i].n); Chris@10: p->print(p, ")"); Chris@10: } Chris@10: Chris@10: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) Chris@10: { Chris@10: const problem_rdft *p; Chris@10: const S *ego = (const S *) ego_; Chris@10: P *pln; Chris@10: int retval; Chris@10: Chris@10: static const plan_adt padt = { Chris@10: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) Chris@10: }; Chris@10: Chris@10: UNUSED(plnr); Chris@10: Chris@10: if (!applicable(ego, p_)) Chris@10: return (plan *) 0; Chris@10: Chris@10: p = (const problem_rdft *) p_; Chris@10: pln = MKPLAN_RDFT(P, &padt, ego->apply); Chris@10: Chris@10: retval = fill_iodim(pln, p); Chris@10: (void)retval; /* UNUSED unless DEBUG */ Chris@10: A(retval); Chris@10: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ Chris@10: pln->nam = ego->nam; Chris@10: Chris@10: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ Chris@10: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); Chris@10: return &(pln->super.super); Chris@10: } Chris@10: Chris@10: Chris@10: void X(rdft_rank0_register)(planner *p) Chris@10: { Chris@10: unsigned i; Chris@10: static struct { Chris@10: rdftapply apply; Chris@10: int (*applicable)(const P *, const problem_rdft *); Chris@10: const char *nam; Chris@10: } tab[] = { Chris@10: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, Chris@10: { apply_memcpy_loop, applicable_memcpy_loop, Chris@10: "rdft-rank0-memcpy-loop" }, Chris@10: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, Chris@10: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, Chris@10: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, Chris@10: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, Chris@10: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, Chris@10: { Chris@10: apply_ip_sq_tiled, Chris@10: applicable_ip_sq_tiled, Chris@10: "rdft-rank0-ip-sq-tiled" Chris@10: }, Chris@10: { Chris@10: apply_ip_sq_tiledbuf, Chris@10: applicable_ip_sq_tiledbuf, Chris@10: "rdft-rank0-ip-sq-tiledbuf" Chris@10: }, Chris@10: }; Chris@10: Chris@10: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { Chris@10: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; Chris@10: S *slv = MKSOLVER(S, &sadt); Chris@10: slv->apply = tab[i].apply; Chris@10: slv->applicable = tab[i].applicable; Chris@10: slv->nam = tab[i].nam; Chris@10: REGISTER_SOLVER(p, &(slv->super)); Chris@10: } Chris@10: }