Chris@82: /* Chris@82: * Copyright (c) 2003, 2007-14 Matteo Frigo Chris@82: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology Chris@82: * Chris@82: * This program is free software; you can redistribute it and/or modify Chris@82: * it under the terms of the GNU General Public License as published by Chris@82: * the Free Software Foundation; either version 2 of the License, or Chris@82: * (at your option) any later version. Chris@82: * Chris@82: * This program is distributed in the hope that it will be useful, Chris@82: * but WITHOUT ANY WARRANTY; without even the implied warranty of Chris@82: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Chris@82: * GNU General Public License for more details. Chris@82: * Chris@82: * You should have received a copy of the GNU General Public License Chris@82: * along with this program; if not, write to the Free Software Chris@82: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Chris@82: * Chris@82: */ Chris@82: Chris@82: Chris@82: /* plans for rank-0 RDFTs (copy operations) */ Chris@82: Chris@82: #include "rdft/rdft.h" Chris@82: Chris@82: #ifdef HAVE_STRING_H Chris@82: #include /* for memcpy() */ Chris@82: #endif Chris@82: Chris@82: #define MAXRNK 32 /* FIXME: should malloc() */ Chris@82: Chris@82: typedef struct { Chris@82: plan_rdft super; Chris@82: INT vl; Chris@82: int rnk; Chris@82: iodim d[MAXRNK]; Chris@82: const char *nam; Chris@82: } P; Chris@82: Chris@82: typedef struct { Chris@82: solver super; Chris@82: rdftapply apply; Chris@82: int (*applicable)(const P *pln, const problem_rdft *p); Chris@82: const char *nam; Chris@82: } S; Chris@82: Chris@82: /* copy up to MAXRNK dimensions from problem into plan. If a Chris@82: contiguous dimension exists, save its length in pln->vl */ Chris@82: static int fill_iodim(P *pln, const problem_rdft *p) Chris@82: { Chris@82: int i; Chris@82: const tensor *vecsz = p->vecsz; Chris@82: Chris@82: pln->vl = 1; Chris@82: pln->rnk = 0; Chris@82: for (i = 0; i < vecsz->rnk; ++i) { Chris@82: /* extract contiguous dimensions */ Chris@82: if (pln->vl == 1 && Chris@82: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) Chris@82: pln->vl = vecsz->dims[i].n; Chris@82: else if (pln->rnk == MAXRNK) Chris@82: return 0; Chris@82: else Chris@82: pln->d[pln->rnk++] = vecsz->dims[i]; Chris@82: } Chris@82: Chris@82: return 1; Chris@82: } Chris@82: Chris@82: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ Chris@82: static void copy(const iodim *d, int rnk, INT vl, Chris@82: R *I, R *O, Chris@82: cpy2d_func cpy2d) Chris@82: { Chris@82: A(rnk >= 2); Chris@82: if (rnk == 2) Chris@82: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); Chris@82: else { Chris@82: INT i; Chris@82: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) Chris@82: copy(d + 1, rnk - 1, vl, I, O, cpy2d); Chris@82: } Chris@82: } Chris@82: Chris@82: /* FIXME: should be more general */ Chris@82: static int transposep(const P *pln) Chris@82: { Chris@82: int i; Chris@82: Chris@82: for (i = 0; i < pln->rnk - 2; ++i) Chris@82: if (pln->d[i].is != pln->d[i].os) Chris@82: return 0; Chris@82: Chris@82: return (pln->d[i].n == pln->d[i+1].n && Chris@82: pln->d[i].is == pln->d[i+1].os && Chris@82: pln->d[i].os == pln->d[i+1].is); Chris@82: } Chris@82: Chris@82: /* generic higher-rank transpose routine, calls transpose2d() to do Chris@82: * the real work */ Chris@82: static void transpose(const iodim *d, int rnk, INT vl, Chris@82: R *I, Chris@82: transpose_func transpose2d) Chris@82: { Chris@82: A(rnk >= 2); Chris@82: if (rnk == 2) Chris@82: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); Chris@82: else { Chris@82: INT i; Chris@82: for (i = 0; i < d[0].n; ++i, I += d[0].is) Chris@82: transpose(d + 1, rnk - 1, vl, I, transpose2d); Chris@82: } Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank 0,1,2, out of place, iterative */ Chris@82: static void apply_iter(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: Chris@82: switch (ego->rnk) { Chris@82: case 0: Chris@82: X(cpy1d)(I, O, ego->vl, 1, 1, 1); Chris@82: break; Chris@82: case 1: Chris@82: X(cpy1d)(I, O, Chris@82: ego->d[0].n, ego->d[0].is, ego->d[0].os, Chris@82: ego->vl); Chris@82: break; Chris@82: default: Chris@82: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); Chris@82: break; Chris@82: } Chris@82: } Chris@82: Chris@82: static int applicable_iter(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: UNUSED(pln); Chris@82: return (p->I != p->O); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* out of place, write contiguous output */ Chris@82: static void apply_cpy2dco(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); Chris@82: } Chris@82: Chris@82: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: int rnk = pln->rnk; Chris@82: return (1 Chris@82: && p->I != p->O Chris@82: && rnk >= 2 Chris@82: Chris@82: /* must not duplicate apply_iter */ Chris@82: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) Chris@82: || Chris@82: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) Chris@82: ); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* out of place, tiled, no buffering */ Chris@82: static void apply_tiled(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); Chris@82: } Chris@82: Chris@82: static int applicable_tiled(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: return (1 Chris@82: && p->I != p->O Chris@82: && pln->rnk >= 2 Chris@82: Chris@82: /* somewhat arbitrary */ Chris@82: && X(compute_tilesz)(pln->vl, 1) > 4 Chris@82: ); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* out of place, tiled, with buffer */ Chris@82: static void apply_tiledbuf(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); Chris@82: } Chris@82: Chris@82: #define applicable_tiledbuf applicable_tiled Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank 0, out of place, using memcpy */ Chris@82: static void apply_memcpy(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: Chris@82: A(ego->rnk == 0); Chris@82: memcpy(O, I, ego->vl * sizeof(R)); Chris@82: } Chris@82: Chris@82: static int applicable_memcpy(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: return (1 Chris@82: && p->I != p->O Chris@82: && pln->rnk == 0 Chris@82: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ Chris@82: ); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place Chris@82: transposes of vl-tuples ... for large vl it should be more Chris@82: efficient to use memcpy than the tiled stuff). */ Chris@82: Chris@82: static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) Chris@82: { Chris@82: INT i, n = d->n, is = d->is, os = d->os; Chris@82: if (rnk == 1) Chris@82: for (i = 0; i < n; ++i, I += is, O += os) Chris@82: memcpy(O, I, cpysz); Chris@82: else { Chris@82: --rnk; ++d; Chris@82: for (i = 0; i < n; ++i, I += is, O += os) Chris@82: memcpy_loop(cpysz, rnk, d, I, O); Chris@82: } Chris@82: } Chris@82: Chris@82: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); Chris@82: } Chris@82: Chris@82: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: return (p->I != p->O Chris@82: && pln->rnk > 0 Chris@82: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank 2, in place, square transpose, iterative */ Chris@82: static void apply_ip_sq(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: UNUSED(O); Chris@82: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); Chris@82: } Chris@82: Chris@82: Chris@82: static int applicable_ip_sq(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: return (1 Chris@82: && p->I == p->O Chris@82: && pln->rnk >= 2 Chris@82: && transposep(pln)); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank 2, in place, square transpose, tiled */ Chris@82: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: UNUSED(O); Chris@82: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); Chris@82: } Chris@82: Chris@82: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) Chris@82: { Chris@82: return (1 Chris@82: && applicable_ip_sq(pln, p) Chris@82: Chris@82: /* somewhat arbitrary */ Chris@82: && X(compute_tilesz)(pln->vl, 2) > 4 Chris@82: ); Chris@82: } Chris@82: Chris@82: /**************************************************************/ Chris@82: /* rank 2, in place, square transpose, tiled, buffered */ Chris@82: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: UNUSED(O); Chris@82: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); Chris@82: } Chris@82: Chris@82: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled Chris@82: Chris@82: /**************************************************************/ Chris@82: static int applicable(const S *ego, const problem *p_) Chris@82: { Chris@82: const problem_rdft *p = (const problem_rdft *) p_; Chris@82: P pln; Chris@82: return (1 Chris@82: && p->sz->rnk == 0 Chris@82: && FINITE_RNK(p->vecsz->rnk) Chris@82: && fill_iodim(&pln, p) Chris@82: && ego->applicable(&pln, p) Chris@82: ); Chris@82: } Chris@82: Chris@82: static void print(const plan *ego_, printer *p) Chris@82: { Chris@82: const P *ego = (const P *) ego_; Chris@82: int i; Chris@82: p->print(p, "(%s/%D", ego->nam, ego->vl); Chris@82: for (i = 0; i < ego->rnk; ++i) Chris@82: p->print(p, "%v", ego->d[i].n); Chris@82: p->print(p, ")"); Chris@82: } Chris@82: Chris@82: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) Chris@82: { Chris@82: const problem_rdft *p; Chris@82: const S *ego = (const S *) ego_; Chris@82: P *pln; Chris@82: int retval; Chris@82: Chris@82: static const plan_adt padt = { Chris@82: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) Chris@82: }; Chris@82: Chris@82: UNUSED(plnr); Chris@82: Chris@82: if (!applicable(ego, p_)) Chris@82: return (plan *) 0; Chris@82: Chris@82: p = (const problem_rdft *) p_; Chris@82: pln = MKPLAN_RDFT(P, &padt, ego->apply); Chris@82: Chris@82: retval = fill_iodim(pln, p); Chris@82: (void)retval; /* UNUSED unless DEBUG */ Chris@82: A(retval); Chris@82: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ Chris@82: pln->nam = ego->nam; Chris@82: Chris@82: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ Chris@82: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); Chris@82: return &(pln->super.super); Chris@82: } Chris@82: Chris@82: Chris@82: void X(rdft_rank0_register)(planner *p) Chris@82: { Chris@82: unsigned i; Chris@82: static struct { Chris@82: rdftapply apply; Chris@82: int (*applicable)(const P *, const problem_rdft *); Chris@82: const char *nam; Chris@82: } tab[] = { Chris@82: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, Chris@82: { apply_memcpy_loop, applicable_memcpy_loop, Chris@82: "rdft-rank0-memcpy-loop" }, Chris@82: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, Chris@82: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, Chris@82: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, Chris@82: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, Chris@82: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, Chris@82: { Chris@82: apply_ip_sq_tiled, Chris@82: applicable_ip_sq_tiled, Chris@82: "rdft-rank0-ip-sq-tiled" Chris@82: }, Chris@82: { Chris@82: apply_ip_sq_tiledbuf, Chris@82: applicable_ip_sq_tiledbuf, Chris@82: "rdft-rank0-ip-sq-tiledbuf" Chris@82: }, Chris@82: }; Chris@82: Chris@82: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { Chris@82: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; Chris@82: S *slv = MKSOLVER(S, &sadt); Chris@82: slv->apply = tab[i].apply; Chris@82: slv->applicable = tab[i].applicable; Chris@82: slv->nam = tab[i].nam; Chris@82: REGISTER_SOLVER(p, &(slv->super)); Chris@82: } Chris@82: }