Chris@42: /* Chris@42: * Copyright (c) 2003, 2007-14 Matteo Frigo Chris@42: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology Chris@42: * Chris@42: * This program is free software; you can redistribute it and/or modify Chris@42: * it under the terms of the GNU General Public License as published by Chris@42: * the Free Software Foundation; either version 2 of the License, or Chris@42: * (at your option) any later version. Chris@42: * Chris@42: * This program is distributed in the hope that it will be useful, Chris@42: * but WITHOUT ANY WARRANTY; without even the implied warranty of Chris@42: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Chris@42: * GNU General Public License for more details. Chris@42: * Chris@42: * You should have received a copy of the GNU General Public License Chris@42: * along with this program; if not, write to the Free Software Chris@42: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Chris@42: * Chris@42: */ Chris@42: Chris@42: Chris@42: /* plans for rank-0 RDFTs (copy operations) */ Chris@42: Chris@42: #include "rdft.h" Chris@42: Chris@42: #ifdef HAVE_STRING_H Chris@42: #include /* for memcpy() */ Chris@42: #endif Chris@42: Chris@42: #define MAXRNK 32 /* FIXME: should malloc() */ Chris@42: Chris@42: typedef struct { Chris@42: plan_rdft super; Chris@42: INT vl; Chris@42: int rnk; Chris@42: iodim d[MAXRNK]; Chris@42: const char *nam; Chris@42: } P; Chris@42: Chris@42: typedef struct { Chris@42: solver super; Chris@42: rdftapply apply; Chris@42: int (*applicable)(const P *pln, const problem_rdft *p); Chris@42: const char *nam; Chris@42: } S; Chris@42: Chris@42: /* copy up to MAXRNK dimensions from problem into plan. If a Chris@42: contiguous dimension exists, save its length in pln->vl */ Chris@42: static int fill_iodim(P *pln, const problem_rdft *p) Chris@42: { Chris@42: int i; Chris@42: const tensor *vecsz = p->vecsz; Chris@42: Chris@42: pln->vl = 1; Chris@42: pln->rnk = 0; Chris@42: for (i = 0; i < vecsz->rnk; ++i) { Chris@42: /* extract contiguous dimensions */ Chris@42: if (pln->vl == 1 && Chris@42: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) Chris@42: pln->vl = vecsz->dims[i].n; Chris@42: else if (pln->rnk == MAXRNK) Chris@42: return 0; Chris@42: else Chris@42: pln->d[pln->rnk++] = vecsz->dims[i]; Chris@42: } Chris@42: Chris@42: return 1; Chris@42: } Chris@42: Chris@42: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ Chris@42: static void copy(const iodim *d, int rnk, INT vl, Chris@42: R *I, R *O, Chris@42: cpy2d_func cpy2d) Chris@42: { Chris@42: A(rnk >= 2); Chris@42: if (rnk == 2) Chris@42: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); Chris@42: else { Chris@42: INT i; Chris@42: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) Chris@42: copy(d + 1, rnk - 1, vl, I, O, cpy2d); Chris@42: } Chris@42: } Chris@42: Chris@42: /* FIXME: should be more general */ Chris@42: static int transposep(const P *pln) Chris@42: { Chris@42: int i; Chris@42: Chris@42: for (i = 0; i < pln->rnk - 2; ++i) Chris@42: if (pln->d[i].is != pln->d[i].os) Chris@42: return 0; Chris@42: Chris@42: return (pln->d[i].n == pln->d[i+1].n && Chris@42: pln->d[i].is == pln->d[i+1].os && Chris@42: pln->d[i].os == pln->d[i+1].is); Chris@42: } Chris@42: Chris@42: /* generic higher-rank transpose routine, calls transpose2d() to do Chris@42: * the real work */ Chris@42: static void transpose(const iodim *d, int rnk, INT vl, Chris@42: R *I, Chris@42: transpose_func transpose2d) Chris@42: { Chris@42: A(rnk >= 2); Chris@42: if (rnk == 2) Chris@42: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); Chris@42: else { Chris@42: INT i; Chris@42: for (i = 0; i < d[0].n; ++i, I += d[0].is) Chris@42: transpose(d + 1, rnk - 1, vl, I, transpose2d); Chris@42: } Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank 0,1,2, out of place, iterative */ Chris@42: static void apply_iter(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: Chris@42: switch (ego->rnk) { Chris@42: case 0: Chris@42: X(cpy1d)(I, O, ego->vl, 1, 1, 1); Chris@42: break; Chris@42: case 1: Chris@42: X(cpy1d)(I, O, Chris@42: ego->d[0].n, ego->d[0].is, ego->d[0].os, Chris@42: ego->vl); Chris@42: break; Chris@42: default: Chris@42: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); Chris@42: break; Chris@42: } Chris@42: } Chris@42: Chris@42: static int applicable_iter(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: UNUSED(pln); Chris@42: return (p->I != p->O); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* out of place, write contiguous output */ Chris@42: static void apply_cpy2dco(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); Chris@42: } Chris@42: Chris@42: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: int rnk = pln->rnk; Chris@42: return (1 Chris@42: && p->I != p->O Chris@42: && rnk >= 2 Chris@42: Chris@42: /* must not duplicate apply_iter */ Chris@42: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) Chris@42: || Chris@42: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) Chris@42: ); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* out of place, tiled, no buffering */ Chris@42: static void apply_tiled(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); Chris@42: } Chris@42: Chris@42: static int applicable_tiled(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: return (1 Chris@42: && p->I != p->O Chris@42: && pln->rnk >= 2 Chris@42: Chris@42: /* somewhat arbitrary */ Chris@42: && X(compute_tilesz)(pln->vl, 1) > 4 Chris@42: ); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* out of place, tiled, with buffer */ Chris@42: static void apply_tiledbuf(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); Chris@42: } Chris@42: Chris@42: #define applicable_tiledbuf applicable_tiled Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank 0, out of place, using memcpy */ Chris@42: static void apply_memcpy(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: Chris@42: A(ego->rnk == 0); Chris@42: memcpy(O, I, ego->vl * sizeof(R)); Chris@42: } Chris@42: Chris@42: static int applicable_memcpy(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: return (1 Chris@42: && p->I != p->O Chris@42: && pln->rnk == 0 Chris@42: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ Chris@42: ); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place Chris@42: transposes of vl-tuples ... for large vl it should be more Chris@42: efficient to use memcpy than the tiled stuff). */ Chris@42: Chris@42: static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) Chris@42: { Chris@42: INT i, n = d->n, is = d->is, os = d->os; Chris@42: if (rnk == 1) Chris@42: for (i = 0; i < n; ++i, I += is, O += os) Chris@42: memcpy(O, I, cpysz); Chris@42: else { Chris@42: --rnk; ++d; Chris@42: for (i = 0; i < n; ++i, I += is, O += os) Chris@42: memcpy_loop(cpysz, rnk, d, I, O); Chris@42: } Chris@42: } Chris@42: Chris@42: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); Chris@42: } Chris@42: Chris@42: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: return (p->I != p->O Chris@42: && pln->rnk > 0 Chris@42: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank 2, in place, square transpose, iterative */ Chris@42: static void apply_ip_sq(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: UNUSED(O); Chris@42: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); Chris@42: } Chris@42: Chris@42: Chris@42: static int applicable_ip_sq(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: return (1 Chris@42: && p->I == p->O Chris@42: && pln->rnk >= 2 Chris@42: && transposep(pln)); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank 2, in place, square transpose, tiled */ Chris@42: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: UNUSED(O); Chris@42: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); Chris@42: } Chris@42: Chris@42: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) Chris@42: { Chris@42: return (1 Chris@42: && applicable_ip_sq(pln, p) Chris@42: Chris@42: /* somewhat arbitrary */ Chris@42: && X(compute_tilesz)(pln->vl, 2) > 4 Chris@42: ); Chris@42: } Chris@42: Chris@42: /**************************************************************/ Chris@42: /* rank 2, in place, square transpose, tiled, buffered */ Chris@42: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: UNUSED(O); Chris@42: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); Chris@42: } Chris@42: Chris@42: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled Chris@42: Chris@42: /**************************************************************/ Chris@42: static int applicable(const S *ego, const problem *p_) Chris@42: { Chris@42: const problem_rdft *p = (const problem_rdft *) p_; Chris@42: P pln; Chris@42: return (1 Chris@42: && p->sz->rnk == 0 Chris@42: && FINITE_RNK(p->vecsz->rnk) Chris@42: && fill_iodim(&pln, p) Chris@42: && ego->applicable(&pln, p) Chris@42: ); Chris@42: } Chris@42: Chris@42: static void print(const plan *ego_, printer *p) Chris@42: { Chris@42: const P *ego = (const P *) ego_; Chris@42: int i; Chris@42: p->print(p, "(%s/%D", ego->nam, ego->vl); Chris@42: for (i = 0; i < ego->rnk; ++i) Chris@42: p->print(p, "%v", ego->d[i].n); Chris@42: p->print(p, ")"); Chris@42: } Chris@42: Chris@42: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) Chris@42: { Chris@42: const problem_rdft *p; Chris@42: const S *ego = (const S *) ego_; Chris@42: P *pln; Chris@42: int retval; Chris@42: Chris@42: static const plan_adt padt = { Chris@42: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) Chris@42: }; Chris@42: Chris@42: UNUSED(plnr); Chris@42: Chris@42: if (!applicable(ego, p_)) Chris@42: return (plan *) 0; Chris@42: Chris@42: p = (const problem_rdft *) p_; Chris@42: pln = MKPLAN_RDFT(P, &padt, ego->apply); Chris@42: Chris@42: retval = fill_iodim(pln, p); Chris@42: (void)retval; /* UNUSED unless DEBUG */ Chris@42: A(retval); Chris@42: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ Chris@42: pln->nam = ego->nam; Chris@42: Chris@42: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ Chris@42: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); Chris@42: return &(pln->super.super); Chris@42: } Chris@42: Chris@42: Chris@42: void X(rdft_rank0_register)(planner *p) Chris@42: { Chris@42: unsigned i; Chris@42: static struct { Chris@42: rdftapply apply; Chris@42: int (*applicable)(const P *, const problem_rdft *); Chris@42: const char *nam; Chris@42: } tab[] = { Chris@42: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, Chris@42: { apply_memcpy_loop, applicable_memcpy_loop, Chris@42: "rdft-rank0-memcpy-loop" }, Chris@42: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, Chris@42: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, Chris@42: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, Chris@42: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, Chris@42: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, Chris@42: { Chris@42: apply_ip_sq_tiled, Chris@42: applicable_ip_sq_tiled, Chris@42: "rdft-rank0-ip-sq-tiled" Chris@42: }, Chris@42: { Chris@42: apply_ip_sq_tiledbuf, Chris@42: applicable_ip_sq_tiledbuf, Chris@42: "rdft-rank0-ip-sq-tiledbuf" Chris@42: }, Chris@42: }; Chris@42: Chris@42: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { Chris@42: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; Chris@42: S *slv = MKSOLVER(S, &sadt); Chris@42: slv->apply = tab[i].apply; Chris@42: slv->applicable = tab[i].applicable; Chris@42: slv->nam = tab[i].nam; Chris@42: REGISTER_SOLVER(p, &(slv->super)); Chris@42: } Chris@42: }