Chris@19: /* Chris@19: * Copyright (c) 2003, 2007-14 Matteo Frigo Chris@19: * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology Chris@19: * Chris@19: * This program is free software; you can redistribute it and/or modify Chris@19: * it under the terms of the GNU General Public License as published by Chris@19: * the Free Software Foundation; either version 2 of the License, or Chris@19: * (at your option) any later version. Chris@19: * Chris@19: * This program is distributed in the hope that it will be useful, Chris@19: * but WITHOUT ANY WARRANTY; without even the implied warranty of Chris@19: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Chris@19: * GNU General Public License for more details. Chris@19: * Chris@19: * You should have received a copy of the GNU General Public License Chris@19: * along with this program; if not, write to the Free Software Chris@19: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Chris@19: * Chris@19: */ Chris@19: Chris@19: Chris@19: /* plans for rank-0 RDFTs (copy operations) */ Chris@19: Chris@19: #include "rdft.h" Chris@19: Chris@19: #ifdef HAVE_STRING_H Chris@19: #include /* for memcpy() */ Chris@19: #endif Chris@19: Chris@19: #define MAXRNK 32 /* FIXME: should malloc() */ Chris@19: Chris@19: typedef struct { Chris@19: plan_rdft super; Chris@19: INT vl; Chris@19: int rnk; Chris@19: iodim d[MAXRNK]; Chris@19: const char *nam; Chris@19: } P; Chris@19: Chris@19: typedef struct { Chris@19: solver super; Chris@19: rdftapply apply; Chris@19: int (*applicable)(const P *pln, const problem_rdft *p); Chris@19: const char *nam; Chris@19: } S; Chris@19: Chris@19: /* copy up to MAXRNK dimensions from problem into plan. If a Chris@19: contiguous dimension exists, save its length in pln->vl */ Chris@19: static int fill_iodim(P *pln, const problem_rdft *p) Chris@19: { Chris@19: int i; Chris@19: const tensor *vecsz = p->vecsz; Chris@19: Chris@19: pln->vl = 1; Chris@19: pln->rnk = 0; Chris@19: for (i = 0; i < vecsz->rnk; ++i) { Chris@19: /* extract contiguous dimensions */ Chris@19: if (pln->vl == 1 && Chris@19: vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) Chris@19: pln->vl = vecsz->dims[i].n; Chris@19: else if (pln->rnk == MAXRNK) Chris@19: return 0; Chris@19: else Chris@19: pln->d[pln->rnk++] = vecsz->dims[i]; Chris@19: } Chris@19: Chris@19: return 1; Chris@19: } Chris@19: Chris@19: /* generic higher-rank copy routine, calls cpy2d() to do the real work */ Chris@19: static void copy(const iodim *d, int rnk, INT vl, Chris@19: R *I, R *O, Chris@19: cpy2d_func cpy2d) Chris@19: { Chris@19: A(rnk >= 2); Chris@19: if (rnk == 2) Chris@19: cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); Chris@19: else { Chris@19: INT i; Chris@19: for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) Chris@19: copy(d + 1, rnk - 1, vl, I, O, cpy2d); Chris@19: } Chris@19: } Chris@19: Chris@19: /* FIXME: should be more general */ Chris@19: static int transposep(const P *pln) Chris@19: { Chris@19: int i; Chris@19: Chris@19: for (i = 0; i < pln->rnk - 2; ++i) Chris@19: if (pln->d[i].is != pln->d[i].os) Chris@19: return 0; Chris@19: Chris@19: return (pln->d[i].n == pln->d[i+1].n && Chris@19: pln->d[i].is == pln->d[i+1].os && Chris@19: pln->d[i].os == pln->d[i+1].is); Chris@19: } Chris@19: Chris@19: /* generic higher-rank transpose routine, calls transpose2d() to do Chris@19: * the real work */ Chris@19: static void transpose(const iodim *d, int rnk, INT vl, Chris@19: R *I, Chris@19: transpose_func transpose2d) Chris@19: { Chris@19: A(rnk >= 2); Chris@19: if (rnk == 2) Chris@19: transpose2d(I, d[0].n, d[0].is, d[0].os, vl); Chris@19: else { Chris@19: INT i; Chris@19: for (i = 0; i < d[0].n; ++i, I += d[0].is) Chris@19: transpose(d + 1, rnk - 1, vl, I, transpose2d); Chris@19: } Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank 0,1,2, out of place, iterative */ Chris@19: static void apply_iter(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: Chris@19: switch (ego->rnk) { Chris@19: case 0: Chris@19: X(cpy1d)(I, O, ego->vl, 1, 1, 1); Chris@19: break; Chris@19: case 1: Chris@19: X(cpy1d)(I, O, Chris@19: ego->d[0].n, ego->d[0].is, ego->d[0].os, Chris@19: ego->vl); Chris@19: break; Chris@19: default: Chris@19: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); Chris@19: break; Chris@19: } Chris@19: } Chris@19: Chris@19: static int applicable_iter(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: UNUSED(pln); Chris@19: return (p->I != p->O); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* out of place, write contiguous output */ Chris@19: static void apply_cpy2dco(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); Chris@19: } Chris@19: Chris@19: static int applicable_cpy2dco(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: int rnk = pln->rnk; Chris@19: return (1 Chris@19: && p->I != p->O Chris@19: && rnk >= 2 Chris@19: Chris@19: /* must not duplicate apply_iter */ Chris@19: && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) Chris@19: || Chris@19: X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) Chris@19: ); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* out of place, tiled, no buffering */ Chris@19: static void apply_tiled(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); Chris@19: } Chris@19: Chris@19: static int applicable_tiled(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: return (1 Chris@19: && p->I != p->O Chris@19: && pln->rnk >= 2 Chris@19: Chris@19: /* somewhat arbitrary */ Chris@19: && X(compute_tilesz)(pln->vl, 1) > 4 Chris@19: ); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* out of place, tiled, with buffer */ Chris@19: static void apply_tiledbuf(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); Chris@19: } Chris@19: Chris@19: #define applicable_tiledbuf applicable_tiled Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank 0, out of place, using memcpy */ Chris@19: static void apply_memcpy(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: Chris@19: A(ego->rnk == 0); Chris@19: memcpy(O, I, ego->vl * sizeof(R)); Chris@19: } Chris@19: Chris@19: static int applicable_memcpy(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: return (1 Chris@19: && p->I != p->O Chris@19: && pln->rnk == 0 Chris@19: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ Chris@19: ); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place Chris@19: transposes of vl-tuples ... for large vl it should be more Chris@19: efficient to use memcpy than the tiled stuff). */ Chris@19: Chris@19: static void memcpy_loop(INT cpysz, int rnk, const iodim *d, R *I, R *O) Chris@19: { Chris@19: INT i, n = d->n, is = d->is, os = d->os; Chris@19: if (rnk == 1) Chris@19: for (i = 0; i < n; ++i, I += is, O += os) Chris@19: memcpy(O, I, cpysz); Chris@19: else { Chris@19: --rnk; ++d; Chris@19: for (i = 0; i < n; ++i, I += is, O += os) Chris@19: memcpy_loop(cpysz, rnk, d, I, O); Chris@19: } Chris@19: } Chris@19: Chris@19: static void apply_memcpy_loop(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); Chris@19: } Chris@19: Chris@19: static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: return (p->I != p->O Chris@19: && pln->rnk > 0 Chris@19: && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank 2, in place, square transpose, iterative */ Chris@19: static void apply_ip_sq(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: UNUSED(O); Chris@19: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); Chris@19: } Chris@19: Chris@19: Chris@19: static int applicable_ip_sq(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: return (1 Chris@19: && p->I == p->O Chris@19: && pln->rnk >= 2 Chris@19: && transposep(pln)); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank 2, in place, square transpose, tiled */ Chris@19: static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: UNUSED(O); Chris@19: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); Chris@19: } Chris@19: Chris@19: static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) Chris@19: { Chris@19: return (1 Chris@19: && applicable_ip_sq(pln, p) Chris@19: Chris@19: /* somewhat arbitrary */ Chris@19: && X(compute_tilesz)(pln->vl, 2) > 4 Chris@19: ); Chris@19: } Chris@19: Chris@19: /**************************************************************/ Chris@19: /* rank 2, in place, square transpose, tiled, buffered */ Chris@19: static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: UNUSED(O); Chris@19: transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); Chris@19: } Chris@19: Chris@19: #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled Chris@19: Chris@19: /**************************************************************/ Chris@19: static int applicable(const S *ego, const problem *p_) Chris@19: { Chris@19: const problem_rdft *p = (const problem_rdft *) p_; Chris@19: P pln; Chris@19: return (1 Chris@19: && p->sz->rnk == 0 Chris@19: && FINITE_RNK(p->vecsz->rnk) Chris@19: && fill_iodim(&pln, p) Chris@19: && ego->applicable(&pln, p) Chris@19: ); Chris@19: } Chris@19: Chris@19: static void print(const plan *ego_, printer *p) Chris@19: { Chris@19: const P *ego = (const P *) ego_; Chris@19: int i; Chris@19: p->print(p, "(%s/%D", ego->nam, ego->vl); Chris@19: for (i = 0; i < ego->rnk; ++i) Chris@19: p->print(p, "%v", ego->d[i].n); Chris@19: p->print(p, ")"); Chris@19: } Chris@19: Chris@19: static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) Chris@19: { Chris@19: const problem_rdft *p; Chris@19: const S *ego = (const S *) ego_; Chris@19: P *pln; Chris@19: int retval; Chris@19: Chris@19: static const plan_adt padt = { Chris@19: X(rdft_solve), X(null_awake), print, X(plan_null_destroy) Chris@19: }; Chris@19: Chris@19: UNUSED(plnr); Chris@19: Chris@19: if (!applicable(ego, p_)) Chris@19: return (plan *) 0; Chris@19: Chris@19: p = (const problem_rdft *) p_; Chris@19: pln = MKPLAN_RDFT(P, &padt, ego->apply); Chris@19: Chris@19: retval = fill_iodim(pln, p); Chris@19: (void)retval; /* UNUSED unless DEBUG */ Chris@19: A(retval); Chris@19: A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ Chris@19: pln->nam = ego->nam; Chris@19: Chris@19: /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ Chris@19: X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); Chris@19: return &(pln->super.super); Chris@19: } Chris@19: Chris@19: Chris@19: void X(rdft_rank0_register)(planner *p) Chris@19: { Chris@19: unsigned i; Chris@19: static struct { Chris@19: rdftapply apply; Chris@19: int (*applicable)(const P *, const problem_rdft *); Chris@19: const char *nam; Chris@19: } tab[] = { Chris@19: { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, Chris@19: { apply_memcpy_loop, applicable_memcpy_loop, Chris@19: "rdft-rank0-memcpy-loop" }, Chris@19: { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, Chris@19: { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, Chris@19: { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, Chris@19: { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, Chris@19: { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, Chris@19: { Chris@19: apply_ip_sq_tiled, Chris@19: applicable_ip_sq_tiled, Chris@19: "rdft-rank0-ip-sq-tiled" Chris@19: }, Chris@19: { Chris@19: apply_ip_sq_tiledbuf, Chris@19: applicable_ip_sq_tiledbuf, Chris@19: "rdft-rank0-ip-sq-tiledbuf" Chris@19: }, Chris@19: }; Chris@19: Chris@19: for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { Chris@19: static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; Chris@19: S *slv = MKSOLVER(S, &sadt); Chris@19: slv->apply = tab[i].apply; Chris@19: slv->applicable = tab[i].applicable; Chris@19: slv->nam = tab[i].nam; Chris@19: REGISTER_SOLVER(p, &(slv->super)); Chris@19: } Chris@19: }