annotate src/fftw-3.3.5/reodft/rodft00e-r2hc.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 /* Do a RODFT00 problem via an R2HC problem, with some pre/post-processing.
Chris@42 23
Chris@42 24 This code uses the trick from FFTPACK, also documented in a similar
Chris@42 25 form by Numerical Recipes. Unfortunately, this algorithm seems to
Chris@42 26 have intrinsic numerical problems (similar to those in
Chris@42 27 reodft11e-r2hc.c), possibly due to the fact that it multiplies its
Chris@42 28 input by a sine, causing a loss of precision near the zero. For
Chris@42 29 transforms of 16k points, it has already lost three or four decimal
Chris@42 30 places of accuracy, which we deem unacceptable.
Chris@42 31
Chris@42 32 So, we have abandoned this algorithm in favor of the one in
Chris@42 33 rodft00-r2hc-pad.c, which unfortunately sacrifices 30-50% in speed.
Chris@42 34 The only other alternative in the literature that does not have
Chris@42 35 similar numerical difficulties seems to be the direct adaptation of
Chris@42 36 the Cooley-Tukey decomposition for antisymmetric data, but this
Chris@42 37 would require a whole new set of codelets and it's not clear that
Chris@42 38 it's worth it at this point. However, we did implement the latter
Chris@42 39 algorithm for the specific case of odd n (logically adapting the
Chris@42 40 split-radix algorithm); see reodft00e-splitradix.c. */
Chris@42 41
Chris@42 42 #include "reodft.h"
Chris@42 43
Chris@42 44 typedef struct {
Chris@42 45 solver super;
Chris@42 46 } S;
Chris@42 47
Chris@42 48 typedef struct {
Chris@42 49 plan_rdft super;
Chris@42 50 plan *cld;
Chris@42 51 twid *td;
Chris@42 52 INT is, os;
Chris@42 53 INT n;
Chris@42 54 INT vl;
Chris@42 55 INT ivs, ovs;
Chris@42 56 } P;
Chris@42 57
Chris@42 58 static void apply(const plan *ego_, R *I, R *O)
Chris@42 59 {
Chris@42 60 const P *ego = (const P *) ego_;
Chris@42 61 INT is = ego->is, os = ego->os;
Chris@42 62 INT i, n = ego->n;
Chris@42 63 INT iv, vl = ego->vl;
Chris@42 64 INT ivs = ego->ivs, ovs = ego->ovs;
Chris@42 65 R *W = ego->td->W;
Chris@42 66 R *buf;
Chris@42 67
Chris@42 68 buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
Chris@42 69
Chris@42 70 for (iv = 0; iv < vl; ++iv, I += ivs, O += ovs) {
Chris@42 71 buf[0] = 0;
Chris@42 72 for (i = 1; i < n - i; ++i) {
Chris@42 73 E a, b, apb, amb;
Chris@42 74 a = I[is * (i - 1)];
Chris@42 75 b = I[is * ((n - i) - 1)];
Chris@42 76 apb = K(2.0) * W[i] * (a + b);
Chris@42 77 amb = (a - b);
Chris@42 78 buf[i] = apb + amb;
Chris@42 79 buf[n - i] = apb - amb;
Chris@42 80 }
Chris@42 81 if (i == n - i) {
Chris@42 82 buf[i] = K(4.0) * I[is * (i - 1)];
Chris@42 83 }
Chris@42 84
Chris@42 85 {
Chris@42 86 plan_rdft *cld = (plan_rdft *) ego->cld;
Chris@42 87 cld->apply((plan *) cld, buf, buf);
Chris@42 88 }
Chris@42 89
Chris@42 90 /* FIXME: use recursive/cascade summation for better stability? */
Chris@42 91 O[0] = buf[0] * 0.5;
Chris@42 92 for (i = 1; i + i < n - 1; ++i) {
Chris@42 93 INT k = i + i;
Chris@42 94 O[os * (k - 1)] = -buf[n - i];
Chris@42 95 O[os * k] = O[os * (k - 2)] + buf[i];
Chris@42 96 }
Chris@42 97 if (i + i == n - 1) {
Chris@42 98 O[os * (n - 2)] = -buf[n - i];
Chris@42 99 }
Chris@42 100 }
Chris@42 101
Chris@42 102 X(ifree)(buf);
Chris@42 103 }
Chris@42 104
Chris@42 105 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@42 106 {
Chris@42 107 P *ego = (P *) ego_;
Chris@42 108 static const tw_instr rodft00e_tw[] = {
Chris@42 109 { TW_SIN, 0, 1 },
Chris@42 110 { TW_NEXT, 1, 0 }
Chris@42 111 };
Chris@42 112
Chris@42 113 X(plan_awake)(ego->cld, wakefulness);
Chris@42 114
Chris@42 115 X(twiddle_awake)(wakefulness,
Chris@42 116 &ego->td, rodft00e_tw, 2*ego->n, 1, (ego->n+1)/2);
Chris@42 117 }
Chris@42 118
Chris@42 119 static void destroy(plan *ego_)
Chris@42 120 {
Chris@42 121 P *ego = (P *) ego_;
Chris@42 122 X(plan_destroy_internal)(ego->cld);
Chris@42 123 }
Chris@42 124
Chris@42 125 static void print(const plan *ego_, printer *p)
Chris@42 126 {
Chris@42 127 const P *ego = (const P *) ego_;
Chris@42 128 p->print(p, "(rodft00e-r2hc-%D%v%(%p%))", ego->n - 1, ego->vl, ego->cld);
Chris@42 129 }
Chris@42 130
Chris@42 131 static int applicable0(const solver *ego_, const problem *p_)
Chris@42 132 {
Chris@42 133 const problem_rdft *p = (const problem_rdft *) p_;
Chris@42 134 UNUSED(ego_);
Chris@42 135
Chris@42 136 return (1
Chris@42 137 && p->sz->rnk == 1
Chris@42 138 && p->vecsz->rnk <= 1
Chris@42 139 && p->kind[0] == RODFT00
Chris@42 140 );
Chris@42 141 }
Chris@42 142
Chris@42 143 static int applicable(const solver *ego, const problem *p, const planner *plnr)
Chris@42 144 {
Chris@42 145 return (!NO_SLOWP(plnr) && applicable0(ego, p));
Chris@42 146 }
Chris@42 147
Chris@42 148 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
Chris@42 149 {
Chris@42 150 P *pln;
Chris@42 151 const problem_rdft *p;
Chris@42 152 plan *cld;
Chris@42 153 R *buf;
Chris@42 154 INT n;
Chris@42 155 opcnt ops;
Chris@42 156
Chris@42 157 static const plan_adt padt = {
Chris@42 158 X(rdft_solve), awake, print, destroy
Chris@42 159 };
Chris@42 160
Chris@42 161 if (!applicable(ego_, p_, plnr))
Chris@42 162 return (plan *)0;
Chris@42 163
Chris@42 164 p = (const problem_rdft *) p_;
Chris@42 165
Chris@42 166 n = p->sz->dims[0].n + 1;
Chris@42 167 buf = (R *) MALLOC(sizeof(R) * n, BUFFERS);
Chris@42 168
Chris@42 169 cld = X(mkplan_d)(plnr, X(mkproblem_rdft_1_d)(X(mktensor_1d)(n, 1, 1),
Chris@42 170 X(mktensor_0d)(),
Chris@42 171 buf, buf, R2HC));
Chris@42 172 X(ifree)(buf);
Chris@42 173 if (!cld)
Chris@42 174 return (plan *)0;
Chris@42 175
Chris@42 176 pln = MKPLAN_RDFT(P, &padt, apply);
Chris@42 177
Chris@42 178 pln->n = n;
Chris@42 179 pln->is = p->sz->dims[0].is;
Chris@42 180 pln->os = p->sz->dims[0].os;
Chris@42 181 pln->cld = cld;
Chris@42 182 pln->td = 0;
Chris@42 183
Chris@42 184 X(tensor_tornk1)(p->vecsz, &pln->vl, &pln->ivs, &pln->ovs);
Chris@42 185
Chris@42 186 X(ops_zero)(&ops);
Chris@42 187 ops.other = 4 + (n-1)/2 * 5 + (n-2)/2 * 5;
Chris@42 188 ops.add = (n-1)/2 * 4 + (n-2)/2 * 1;
Chris@42 189 ops.mul = 1 + (n-1)/2 * 2;
Chris@42 190 if (n % 2 == 0)
Chris@42 191 ops.mul += 1;
Chris@42 192
Chris@42 193 X(ops_zero)(&pln->super.super.ops);
Chris@42 194 X(ops_madd2)(pln->vl, &ops, &pln->super.super.ops);
Chris@42 195 X(ops_madd2)(pln->vl, &cld->ops, &pln->super.super.ops);
Chris@42 196
Chris@42 197 return &(pln->super.super);
Chris@42 198 }
Chris@42 199
Chris@42 200 /* constructor */
Chris@42 201 static solver *mksolver(void)
Chris@42 202 {
Chris@42 203 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
Chris@42 204 S *slv = MKSOLVER(S, &sadt);
Chris@42 205 return &(slv->super);
Chris@42 206 }
Chris@42 207
Chris@42 208 void X(rodft00e_r2hc_register)(planner *p)
Chris@42 209 {
Chris@42 210 REGISTER_SOLVER(p, mksolver());
Chris@42 211 }