annotate src/fftw-3.3.3/rdft/dht-rader.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 #include "rdft.h"
Chris@10 22
Chris@10 23 /*
Chris@10 24 * Compute DHTs of prime sizes using Rader's trick: turn them
Chris@10 25 * into convolutions of size n - 1, which we then perform via a pair
Chris@10 26 * of FFTs. (We can then do prime real FFTs via rdft-dht.c.)
Chris@10 27 *
Chris@10 28 * Optionally (determined by the "pad" field of the solver), we can
Chris@10 29 * perform the (cyclic) convolution by zero-padding to a size
Chris@10 30 * >= 2*(n-1) - 1. This is advantageous if n-1 has large prime factors.
Chris@10 31 *
Chris@10 32 */
Chris@10 33
Chris@10 34 typedef struct {
Chris@10 35 solver super;
Chris@10 36 int pad;
Chris@10 37 } S;
Chris@10 38
Chris@10 39 typedef struct {
Chris@10 40 plan_rdft super;
Chris@10 41
Chris@10 42 plan *cld1, *cld2;
Chris@10 43 R *omega;
Chris@10 44 INT n, npad, g, ginv;
Chris@10 45 INT is, os;
Chris@10 46 plan *cld_omega;
Chris@10 47 } P;
Chris@10 48
Chris@10 49 static rader_tl *omegas = 0;
Chris@10 50
Chris@10 51 /***************************************************************************/
Chris@10 52
Chris@10 53 /* If R2HC_ONLY_CONV is 1, we use a trick to perform the convolution
Chris@10 54 purely in terms of R2HC transforms, as opposed to R2HC followed by H2RC.
Chris@10 55 This requires a few more operations, but allows us to share the same
Chris@10 56 plan/codelets for both Rader children. */
Chris@10 57 #define R2HC_ONLY_CONV 1
Chris@10 58
Chris@10 59 static void apply(const plan *ego_, R *I, R *O)
Chris@10 60 {
Chris@10 61 const P *ego = (const P *) ego_;
Chris@10 62 INT n = ego->n; /* prime */
Chris@10 63 INT npad = ego->npad; /* == n - 1 for unpadded Rader; always even */
Chris@10 64 INT is = ego->is, os;
Chris@10 65 INT k, gpower, g;
Chris@10 66 R *buf, *omega;
Chris@10 67 R r0;
Chris@10 68
Chris@10 69 buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
Chris@10 70
Chris@10 71 /* First, permute the input, storing in buf: */
Chris@10 72 g = ego->g;
Chris@10 73 for (gpower = 1, k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
Chris@10 74 buf[k] = I[gpower * is];
Chris@10 75 }
Chris@10 76 /* gpower == g^(n-1) mod n == 1 */;
Chris@10 77
Chris@10 78 A(n - 1 <= npad);
Chris@10 79 for (k = n - 1; k < npad; ++k) /* optionally, zero-pad convolution */
Chris@10 80 buf[k] = 0;
Chris@10 81
Chris@10 82 os = ego->os;
Chris@10 83
Chris@10 84 /* compute RDFT of buf, storing in buf (i.e., in-place): */
Chris@10 85 {
Chris@10 86 plan_rdft *cld = (plan_rdft *) ego->cld1;
Chris@10 87 cld->apply((plan *) cld, buf, buf);
Chris@10 88 }
Chris@10 89
Chris@10 90 /* set output DC component: */
Chris@10 91 O[0] = (r0 = I[0]) + buf[0];
Chris@10 92
Chris@10 93 /* now, multiply by omega: */
Chris@10 94 omega = ego->omega;
Chris@10 95 buf[0] *= omega[0];
Chris@10 96 for (k = 1; k < npad/2; ++k) {
Chris@10 97 E rB, iB, rW, iW, a, b;
Chris@10 98 rW = omega[k];
Chris@10 99 iW = omega[npad - k];
Chris@10 100 rB = buf[k];
Chris@10 101 iB = buf[npad - k];
Chris@10 102 a = rW * rB - iW * iB;
Chris@10 103 b = rW * iB + iW * rB;
Chris@10 104 #if R2HC_ONLY_CONV
Chris@10 105 buf[k] = a + b;
Chris@10 106 buf[npad - k] = a - b;
Chris@10 107 #else
Chris@10 108 buf[k] = a;
Chris@10 109 buf[npad - k] = b;
Chris@10 110 #endif
Chris@10 111 }
Chris@10 112 /* Nyquist component: */
Chris@10 113 A(k + k == npad); /* since npad is even */
Chris@10 114 buf[k] *= omega[k];
Chris@10 115
Chris@10 116 /* this will add input[0] to all of the outputs after the ifft */
Chris@10 117 buf[0] += r0;
Chris@10 118
Chris@10 119 /* inverse FFT: */
Chris@10 120 {
Chris@10 121 plan_rdft *cld = (plan_rdft *) ego->cld2;
Chris@10 122 cld->apply((plan *) cld, buf, buf);
Chris@10 123 }
Chris@10 124
Chris@10 125 /* do inverse permutation to unshuffle the output: */
Chris@10 126 A(gpower == 1);
Chris@10 127 #if R2HC_ONLY_CONV
Chris@10 128 O[os] = buf[0];
Chris@10 129 gpower = g = ego->ginv;
Chris@10 130 A(npad == n - 1 || npad/2 >= n - 1);
Chris@10 131 if (npad == n - 1) {
Chris@10 132 for (k = 1; k < npad/2; ++k, gpower = MULMOD(gpower, g, n)) {
Chris@10 133 O[gpower * os] = buf[k] + buf[npad - k];
Chris@10 134 }
Chris@10 135 O[gpower * os] = buf[k];
Chris@10 136 ++k, gpower = MULMOD(gpower, g, n);
Chris@10 137 for (; k < npad; ++k, gpower = MULMOD(gpower, g, n)) {
Chris@10 138 O[gpower * os] = buf[npad - k] - buf[k];
Chris@10 139 }
Chris@10 140 }
Chris@10 141 else {
Chris@10 142 for (k = 1; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
Chris@10 143 O[gpower * os] = buf[k] + buf[npad - k];
Chris@10 144 }
Chris@10 145 }
Chris@10 146 #else
Chris@10 147 g = ego->ginv;
Chris@10 148 for (k = 0; k < n - 1; ++k, gpower = MULMOD(gpower, g, n)) {
Chris@10 149 O[gpower * os] = buf[k];
Chris@10 150 }
Chris@10 151 #endif
Chris@10 152 A(gpower == 1);
Chris@10 153
Chris@10 154 X(ifree)(buf);
Chris@10 155 }
Chris@10 156
Chris@10 157 static R *mkomega(enum wakefulness wakefulness,
Chris@10 158 plan *p_, INT n, INT npad, INT ginv)
Chris@10 159 {
Chris@10 160 plan_rdft *p = (plan_rdft *) p_;
Chris@10 161 R *omega;
Chris@10 162 INT i, gpower;
Chris@10 163 trigreal scale;
Chris@10 164 triggen *t;
Chris@10 165
Chris@10 166 if ((omega = X(rader_tl_find)(n, npad + 1, ginv, omegas)))
Chris@10 167 return omega;
Chris@10 168
Chris@10 169 omega = (R *)MALLOC(sizeof(R) * npad, TWIDDLES);
Chris@10 170
Chris@10 171 scale = npad; /* normalization for convolution */
Chris@10 172
Chris@10 173 t = X(mktriggen)(wakefulness, n);
Chris@10 174 for (i = 0, gpower = 1; i < n-1; ++i, gpower = MULMOD(gpower, ginv, n)) {
Chris@10 175 trigreal w[2];
Chris@10 176 t->cexpl(t, gpower, w);
Chris@10 177 omega[i] = (w[0] + w[1]) / scale;
Chris@10 178 }
Chris@10 179 X(triggen_destroy)(t);
Chris@10 180 A(gpower == 1);
Chris@10 181
Chris@10 182 A(npad == n - 1 || npad >= 2*(n - 1) - 1);
Chris@10 183
Chris@10 184 for (; i < npad; ++i)
Chris@10 185 omega[i] = K(0.0);
Chris@10 186 if (npad > n - 1)
Chris@10 187 for (i = 1; i < n-1; ++i)
Chris@10 188 omega[npad - i] = omega[n - 1 - i];
Chris@10 189
Chris@10 190 p->apply(p_, omega, omega);
Chris@10 191
Chris@10 192 X(rader_tl_insert)(n, npad + 1, ginv, omega, &omegas);
Chris@10 193 return omega;
Chris@10 194 }
Chris@10 195
Chris@10 196 static void free_omega(R *omega)
Chris@10 197 {
Chris@10 198 X(rader_tl_delete)(omega, &omegas);
Chris@10 199 }
Chris@10 200
Chris@10 201 /***************************************************************************/
Chris@10 202
Chris@10 203 static void awake(plan *ego_, enum wakefulness wakefulness)
Chris@10 204 {
Chris@10 205 P *ego = (P *) ego_;
Chris@10 206
Chris@10 207 X(plan_awake)(ego->cld1, wakefulness);
Chris@10 208 X(plan_awake)(ego->cld2, wakefulness);
Chris@10 209 X(plan_awake)(ego->cld_omega, wakefulness);
Chris@10 210
Chris@10 211 switch (wakefulness) {
Chris@10 212 case SLEEPY:
Chris@10 213 free_omega(ego->omega);
Chris@10 214 ego->omega = 0;
Chris@10 215 break;
Chris@10 216 default:
Chris@10 217 ego->g = X(find_generator)(ego->n);
Chris@10 218 ego->ginv = X(power_mod)(ego->g, ego->n - 2, ego->n);
Chris@10 219 A(MULMOD(ego->g, ego->ginv, ego->n) == 1);
Chris@10 220
Chris@10 221 A(!ego->omega);
Chris@10 222 ego->omega = mkomega(wakefulness,
Chris@10 223 ego->cld_omega,ego->n,ego->npad,ego->ginv);
Chris@10 224 break;
Chris@10 225 }
Chris@10 226 }
Chris@10 227
Chris@10 228 static void destroy(plan *ego_)
Chris@10 229 {
Chris@10 230 P *ego = (P *) ego_;
Chris@10 231 X(plan_destroy_internal)(ego->cld_omega);
Chris@10 232 X(plan_destroy_internal)(ego->cld2);
Chris@10 233 X(plan_destroy_internal)(ego->cld1);
Chris@10 234 }
Chris@10 235
Chris@10 236 static void print(const plan *ego_, printer *p)
Chris@10 237 {
Chris@10 238 const P *ego = (const P *) ego_;
Chris@10 239
Chris@10 240 p->print(p, "(dht-rader-%D/%D%ois=%oos=%(%p%)",
Chris@10 241 ego->n, ego->npad, ego->is, ego->os, ego->cld1);
Chris@10 242 if (ego->cld2 != ego->cld1)
Chris@10 243 p->print(p, "%(%p%)", ego->cld2);
Chris@10 244 if (ego->cld_omega != ego->cld1 && ego->cld_omega != ego->cld2)
Chris@10 245 p->print(p, "%(%p%)", ego->cld_omega);
Chris@10 246 p->putchr(p, ')');
Chris@10 247 }
Chris@10 248
Chris@10 249 static int applicable(const solver *ego, const problem *p_, const planner *plnr)
Chris@10 250 {
Chris@10 251 const problem_rdft *p = (const problem_rdft *) p_;
Chris@10 252 UNUSED(ego);
Chris@10 253 return (1
Chris@10 254 && p->sz->rnk == 1
Chris@10 255 && p->vecsz->rnk == 0
Chris@10 256 && p->kind[0] == DHT
Chris@10 257 && X(is_prime)(p->sz->dims[0].n)
Chris@10 258 && p->sz->dims[0].n > 2
Chris@10 259 && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
Chris@10 260 /* proclaim the solver SLOW if p-1 is not easily
Chris@10 261 factorizable. Unlike in the complex case where
Chris@10 262 Bluestein can solve the problem, in the DHT case we
Chris@10 263 may have no other choice */
Chris@10 264 && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
Chris@10 265 );
Chris@10 266 }
Chris@10 267
Chris@10 268 static INT choose_transform_size(INT minsz)
Chris@10 269 {
Chris@10 270 static const INT primes[] = { 2, 3, 5, 0 };
Chris@10 271 while (!X(factors_into)(minsz, primes) || minsz % 2)
Chris@10 272 ++minsz;
Chris@10 273 return minsz;
Chris@10 274 }
Chris@10 275
Chris@10 276 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
Chris@10 277 {
Chris@10 278 const S *ego = (const S *) ego_;
Chris@10 279 const problem_rdft *p = (const problem_rdft *) p_;
Chris@10 280 P *pln;
Chris@10 281 INT n, npad;
Chris@10 282 INT is, os;
Chris@10 283 plan *cld1 = (plan *) 0;
Chris@10 284 plan *cld2 = (plan *) 0;
Chris@10 285 plan *cld_omega = (plan *) 0;
Chris@10 286 R *buf = (R *) 0;
Chris@10 287 problem *cldp;
Chris@10 288
Chris@10 289 static const plan_adt padt = {
Chris@10 290 X(rdft_solve), awake, print, destroy
Chris@10 291 };
Chris@10 292
Chris@10 293 if (!applicable(ego_, p_, plnr))
Chris@10 294 return (plan *) 0;
Chris@10 295
Chris@10 296 n = p->sz->dims[0].n;
Chris@10 297 is = p->sz->dims[0].is;
Chris@10 298 os = p->sz->dims[0].os;
Chris@10 299
Chris@10 300 if (ego->pad)
Chris@10 301 npad = choose_transform_size(2 * (n - 1) - 1);
Chris@10 302 else
Chris@10 303 npad = n - 1;
Chris@10 304
Chris@10 305 /* initial allocation for the purpose of planning */
Chris@10 306 buf = (R *) MALLOC(sizeof(R) * npad, BUFFERS);
Chris@10 307
Chris@10 308 cld1 = X(mkplan_f_d)(plnr,
Chris@10 309 X(mkproblem_rdft_1_d)(X(mktensor_1d)(npad, 1, 1),
Chris@10 310 X(mktensor_1d)(1, 0, 0),
Chris@10 311 buf, buf,
Chris@10 312 R2HC),
Chris@10 313 NO_SLOW, 0, 0);
Chris@10 314 if (!cld1) goto nada;
Chris@10 315
Chris@10 316 cldp =
Chris@10 317 X(mkproblem_rdft_1_d)(
Chris@10 318 X(mktensor_1d)(npad, 1, 1),
Chris@10 319 X(mktensor_1d)(1, 0, 0),
Chris@10 320 buf, buf,
Chris@10 321 #if R2HC_ONLY_CONV
Chris@10 322 R2HC
Chris@10 323 #else
Chris@10 324 HC2R
Chris@10 325 #endif
Chris@10 326 );
Chris@10 327 if (!(cld2 = X(mkplan_f_d)(plnr, cldp, NO_SLOW, 0, 0)))
Chris@10 328 goto nada;
Chris@10 329
Chris@10 330 /* plan for omega */
Chris@10 331 cld_omega = X(mkplan_f_d)(plnr,
Chris@10 332 X(mkproblem_rdft_1_d)(
Chris@10 333 X(mktensor_1d)(npad, 1, 1),
Chris@10 334 X(mktensor_1d)(1, 0, 0),
Chris@10 335 buf, buf, R2HC),
Chris@10 336 NO_SLOW, ESTIMATE, 0);
Chris@10 337 if (!cld_omega) goto nada;
Chris@10 338
Chris@10 339 /* deallocate buffers; let awake() or apply() allocate them for real */
Chris@10 340 X(ifree)(buf);
Chris@10 341 buf = 0;
Chris@10 342
Chris@10 343 pln = MKPLAN_RDFT(P, &padt, apply);
Chris@10 344 pln->cld1 = cld1;
Chris@10 345 pln->cld2 = cld2;
Chris@10 346 pln->cld_omega = cld_omega;
Chris@10 347 pln->omega = 0;
Chris@10 348 pln->n = n;
Chris@10 349 pln->npad = npad;
Chris@10 350 pln->is = is;
Chris@10 351 pln->os = os;
Chris@10 352
Chris@10 353 X(ops_add)(&cld1->ops, &cld2->ops, &pln->super.super.ops);
Chris@10 354 pln->super.super.ops.other += (npad/2-1)*6 + npad + n + (n-1) * ego->pad;
Chris@10 355 pln->super.super.ops.add += (npad/2-1)*2 + 2 + (n-1) * ego->pad;
Chris@10 356 pln->super.super.ops.mul += (npad/2-1)*4 + 2 + ego->pad;
Chris@10 357 #if R2HC_ONLY_CONV
Chris@10 358 pln->super.super.ops.other += n-2 - ego->pad;
Chris@10 359 pln->super.super.ops.add += (npad/2-1)*2 + (n-2) - ego->pad;
Chris@10 360 #endif
Chris@10 361
Chris@10 362 return &(pln->super.super);
Chris@10 363
Chris@10 364 nada:
Chris@10 365 X(ifree0)(buf);
Chris@10 366 X(plan_destroy_internal)(cld_omega);
Chris@10 367 X(plan_destroy_internal)(cld2);
Chris@10 368 X(plan_destroy_internal)(cld1);
Chris@10 369 return 0;
Chris@10 370 }
Chris@10 371
Chris@10 372 /* constructors */
Chris@10 373
Chris@10 374 static solver *mksolver(int pad)
Chris@10 375 {
Chris@10 376 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
Chris@10 377 S *slv = MKSOLVER(S, &sadt);
Chris@10 378 slv->pad = pad;
Chris@10 379 return &(slv->super);
Chris@10 380 }
Chris@10 381
Chris@10 382 void X(dht_rader_register)(planner *p)
Chris@10 383 {
Chris@10 384 REGISTER_SOLVER(p, mksolver(0));
Chris@10 385 REGISTER_SOLVER(p, mksolver(1));
Chris@10 386 }