annotate src/fftw-3.3.5/libbench2/mp.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 #include "config.h"
Chris@42 2 #include "bench.h"
Chris@42 3 #include <math.h>
Chris@42 4
Chris@42 5 #define DG unsigned short
Chris@42 6 #define ACC unsigned long
Chris@42 7 #define REAL bench_real
Chris@42 8 #define BITS_IN_REAL 53 /* mantissa */
Chris@42 9
Chris@42 10 #define SHFT 16
Chris@42 11 #define RADIX 65536L
Chris@42 12 #define IRADIX (1.0 / RADIX)
Chris@42 13 #define LO(x) ((x) & (RADIX - 1))
Chris@42 14 #define HI(x) ((x) >> SHFT)
Chris@42 15 #define HI_SIGNED(x) \
Chris@42 16 ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
Chris@42 17 #define ZEROEXP (-32768)
Chris@42 18
Chris@42 19 #define LEN 10
Chris@42 20
Chris@42 21 typedef struct {
Chris@42 22 short sign;
Chris@42 23 short expt;
Chris@42 24 DG d[LEN];
Chris@42 25 } N[1];
Chris@42 26
Chris@42 27 #define EXA a->expt
Chris@42 28 #define EXB b->expt
Chris@42 29 #define EXC c->expt
Chris@42 30
Chris@42 31 #define AD a->d
Chris@42 32 #define BD b->d
Chris@42 33
Chris@42 34 #define SGNA a->sign
Chris@42 35 #define SGNB b->sign
Chris@42 36
Chris@42 37 static const N zero = {{ 1, ZEROEXP, {0} }};
Chris@42 38
Chris@42 39 static void cpy(const N a, N b)
Chris@42 40 {
Chris@42 41 *b = *a;
Chris@42 42 }
Chris@42 43
Chris@42 44 static void fromreal(REAL x, N a)
Chris@42 45 {
Chris@42 46 int i, e;
Chris@42 47
Chris@42 48 cpy(zero, a);
Chris@42 49 if (x == 0.0) return;
Chris@42 50
Chris@42 51 if (x >= 0) { SGNA = 1; }
Chris@42 52 else { SGNA = -1; x = -x; }
Chris@42 53
Chris@42 54 e = 0;
Chris@42 55 while (x >= 1.0) { x *= IRADIX; ++e; }
Chris@42 56 while (x < IRADIX) { x *= RADIX; --e; }
Chris@42 57 EXA = e;
Chris@42 58
Chris@42 59 for (i = LEN - 1; i >= 0 && x != 0.0; --i) {
Chris@42 60 REAL y;
Chris@42 61
Chris@42 62 x *= RADIX;
Chris@42 63 y = (REAL) ((int) x);
Chris@42 64 AD[i] = (DG)y;
Chris@42 65 x -= y;
Chris@42 66 }
Chris@42 67 }
Chris@42 68
Chris@42 69 static void fromshort(int x, N a)
Chris@42 70 {
Chris@42 71 cpy(zero, a);
Chris@42 72
Chris@42 73 if (x < 0) { x = -x; SGNA = -1; }
Chris@42 74 else { SGNA = 1; }
Chris@42 75 EXA = 1;
Chris@42 76 AD[LEN - 1] = x;
Chris@42 77 }
Chris@42 78
Chris@42 79 static void pack(DG *d, int e, int s, int l, N a)
Chris@42 80 {
Chris@42 81 int i, j;
Chris@42 82
Chris@42 83 for (i = l - 1; i >= 0; --i, --e)
Chris@42 84 if (d[i] != 0)
Chris@42 85 break;
Chris@42 86
Chris@42 87 if (i < 0) {
Chris@42 88 /* number is zero */
Chris@42 89 cpy(zero, a);
Chris@42 90 } else {
Chris@42 91 EXA = e;
Chris@42 92 SGNA = s;
Chris@42 93
Chris@42 94 if (i >= LEN - 1) {
Chris@42 95 for (j = LEN - 1; j >= 0; --i, --j)
Chris@42 96 AD[j] = d[i];
Chris@42 97 } else {
Chris@42 98 for (j = LEN - 1; i >= 0; --i, --j)
Chris@42 99 AD[j] = d[i];
Chris@42 100 for ( ; j >= 0; --j)
Chris@42 101 AD[j] = 0;
Chris@42 102 }
Chris@42 103 }
Chris@42 104 }
Chris@42 105
Chris@42 106
Chris@42 107 /* compare absolute values */
Chris@42 108 static int abscmp(const N a, const N b)
Chris@42 109 {
Chris@42 110 int i;
Chris@42 111 if (EXA > EXB) return 1;
Chris@42 112 if (EXA < EXB) return -1;
Chris@42 113 for (i = LEN - 1; i >= 0; --i) {
Chris@42 114 if (AD[i] > BD[i])
Chris@42 115 return 1;
Chris@42 116 if (AD[i] < BD[i])
Chris@42 117 return -1;
Chris@42 118 }
Chris@42 119 return 0;
Chris@42 120 }
Chris@42 121
Chris@42 122 static int eq(const N a, const N b)
Chris@42 123 {
Chris@42 124 return (SGNA == SGNB) && (abscmp(a, b) == 0);
Chris@42 125 }
Chris@42 126
Chris@42 127 /* add magnitudes, for |a| >= |b| */
Chris@42 128 static void addmag0(int s, const N a, const N b, N c)
Chris@42 129 {
Chris@42 130 int ia, ib;
Chris@42 131 ACC r = 0;
Chris@42 132 DG d[LEN + 1];
Chris@42 133
Chris@42 134 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
Chris@42 135 r += (ACC)AD[ia] + (ACC)BD[ib];
Chris@42 136 d[ia] = LO(r);
Chris@42 137 r = HI(r);
Chris@42 138 }
Chris@42 139 for (; ia < LEN; ++ia) {
Chris@42 140 r += (ACC)AD[ia];
Chris@42 141 d[ia] = LO(r);
Chris@42 142 r = HI(r);
Chris@42 143 }
Chris@42 144 d[ia] = LO(r);
Chris@42 145 pack(d, EXA + 1, s * SGNA, LEN + 1, c);
Chris@42 146 }
Chris@42 147
Chris@42 148 static void addmag(int s, const N a, const N b, N c)
Chris@42 149 {
Chris@42 150 if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c);
Chris@42 151 }
Chris@42 152
Chris@42 153 /* subtract magnitudes, for |a| >= |b| */
Chris@42 154 static void submag0(int s, const N a, const N b, N c)
Chris@42 155 {
Chris@42 156 int ia, ib;
Chris@42 157 ACC r = 0;
Chris@42 158 DG d[LEN];
Chris@42 159
Chris@42 160 for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
Chris@42 161 r += (ACC)AD[ia] - (ACC)BD[ib];
Chris@42 162 d[ia] = LO(r);
Chris@42 163 r = HI_SIGNED(r);
Chris@42 164 }
Chris@42 165 for (; ia < LEN; ++ia) {
Chris@42 166 r += (ACC)AD[ia];
Chris@42 167 d[ia] = LO(r);
Chris@42 168 r = HI_SIGNED(r);
Chris@42 169 }
Chris@42 170
Chris@42 171 pack(d, EXA, s * SGNA, LEN, c);
Chris@42 172 }
Chris@42 173
Chris@42 174 static void submag(int s, const N a, const N b, N c)
Chris@42 175 {
Chris@42 176 if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c);
Chris@42 177 }
Chris@42 178
Chris@42 179 /* c = a + b */
Chris@42 180 static void add(const N a, const N b, N c)
Chris@42 181 {
Chris@42 182 if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c);
Chris@42 183 }
Chris@42 184
Chris@42 185 static void sub(const N a, const N b, N c)
Chris@42 186 {
Chris@42 187 if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c);
Chris@42 188 }
Chris@42 189
Chris@42 190 static void mul(const N a, const N b, N c)
Chris@42 191 {
Chris@42 192 DG d[2 * LEN];
Chris@42 193 int i, j, k;
Chris@42 194 ACC r;
Chris@42 195
Chris@42 196 for (i = 0; i < LEN; ++i)
Chris@42 197 d[2 * i] = d[2 * i + 1] = 0;
Chris@42 198
Chris@42 199 for (i = 0; i < LEN; ++i) {
Chris@42 200 ACC ai = AD[i];
Chris@42 201 if (ai) {
Chris@42 202 r = 0;
Chris@42 203 for (j = 0, k = i; j < LEN; ++j, ++k) {
Chris@42 204 r += ai * (ACC)BD[j] + (ACC)d[k];
Chris@42 205 d[k] = LO(r);
Chris@42 206 r = HI(r);
Chris@42 207 }
Chris@42 208 d[k] = LO(r);
Chris@42 209 }
Chris@42 210 }
Chris@42 211
Chris@42 212 pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
Chris@42 213 }
Chris@42 214
Chris@42 215 static REAL toreal(const N a)
Chris@42 216 {
Chris@42 217 REAL h, l, f;
Chris@42 218 int i, bits;
Chris@42 219 ACC r;
Chris@42 220 DG sticky;
Chris@42 221
Chris@42 222 if (EXA != ZEROEXP) {
Chris@42 223 f = IRADIX;
Chris@42 224 i = LEN;
Chris@42 225
Chris@42 226 bits = 0;
Chris@42 227 h = (r = AD[--i]) * f; f *= IRADIX;
Chris@42 228 for (bits = 0; r > 0; ++bits)
Chris@42 229 r >>= 1;
Chris@42 230
Chris@42 231 /* first digit */
Chris@42 232 while (bits + SHFT <= BITS_IN_REAL) {
Chris@42 233 h += AD[--i] * f; f *= IRADIX; bits += SHFT;
Chris@42 234 }
Chris@42 235
Chris@42 236 /* guard digit (leave one bit for sticky bit, hence `<' instead
Chris@42 237 of `<=') */
Chris@42 238 bits = 0; l = 0.0;
Chris@42 239 while (bits + SHFT < BITS_IN_REAL) {
Chris@42 240 l += AD[--i] * f; f *= IRADIX; bits += SHFT;
Chris@42 241 }
Chris@42 242
Chris@42 243 /* sticky bit */
Chris@42 244 sticky = 0;
Chris@42 245 while (i > 0)
Chris@42 246 sticky |= AD[--i];
Chris@42 247
Chris@42 248 if (sticky)
Chris@42 249 l += (RADIX / 2) * f;
Chris@42 250
Chris@42 251 h += l;
Chris@42 252
Chris@42 253 for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;
Chris@42 254 for (i = 0; i > EXA; --i) h *= IRADIX;
Chris@42 255 if (SGNA == -1) h = -h;
Chris@42 256 return h;
Chris@42 257 } else {
Chris@42 258 return 0.0;
Chris@42 259 }
Chris@42 260 }
Chris@42 261
Chris@42 262 static void neg(N a)
Chris@42 263 {
Chris@42 264 SGNA = -SGNA;
Chris@42 265 }
Chris@42 266
Chris@42 267 static void inv(const N a, N x)
Chris@42 268 {
Chris@42 269 N w, z, one, two;
Chris@42 270
Chris@42 271 fromreal(1.0 / toreal(a), x); /* initial guess */
Chris@42 272 fromshort(1, one);
Chris@42 273 fromshort(2, two);
Chris@42 274
Chris@42 275 for (;;) {
Chris@42 276 /* Newton */
Chris@42 277 mul(a, x, w);
Chris@42 278 sub(two, w, z);
Chris@42 279 if (eq(one, z)) break;
Chris@42 280 mul(x, z, x);
Chris@42 281 }
Chris@42 282 }
Chris@42 283
Chris@42 284
Chris@42 285 /* 2 pi */
Chris@42 286 static const N n2pi = {{
Chris@42 287 1, 1,
Chris@42 288 {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
Chris@42 289 }};
Chris@42 290
Chris@42 291 /* 1 / 31! */
Chris@42 292 static const N i31fac = {{
Chris@42 293 1, -7,
Chris@42 294 {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
Chris@42 295 }};
Chris@42 296
Chris@42 297
Chris@42 298 /* 1 / 32! */
Chris@42 299 static const N i32fac = {{
Chris@42 300 1, -7,
Chris@42 301 {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
Chris@42 302 }};
Chris@42 303
Chris@42 304 static void msin(const N a, N b)
Chris@42 305 {
Chris@42 306 N a2, g, k;
Chris@42 307 int i;
Chris@42 308
Chris@42 309 cpy(i31fac, g);
Chris@42 310 cpy(g, b);
Chris@42 311 mul(a, a, a2);
Chris@42 312
Chris@42 313 /* Taylor */
Chris@42 314 for (i = 31; i > 1; i -= 2) {
Chris@42 315 fromshort(i * (i - 1), k);
Chris@42 316 mul(k, g, g);
Chris@42 317 mul(a2, b, k);
Chris@42 318 sub(g, k, b);
Chris@42 319 }
Chris@42 320 mul(a, b, b);
Chris@42 321 }
Chris@42 322
Chris@42 323 static void mcos(const N a, N b)
Chris@42 324 {
Chris@42 325 N a2, g, k;
Chris@42 326 int i;
Chris@42 327
Chris@42 328 cpy(i32fac, g);
Chris@42 329 cpy(g, b);
Chris@42 330 mul(a, a, a2);
Chris@42 331
Chris@42 332 /* Taylor */
Chris@42 333 for (i = 32; i > 0; i -= 2) {
Chris@42 334 fromshort(i * (i - 1), k);
Chris@42 335 mul(k, g, g);
Chris@42 336 mul(a2, b, k);
Chris@42 337 sub(g, k, b);
Chris@42 338 }
Chris@42 339 }
Chris@42 340
Chris@42 341 static void by2pi(REAL m, REAL n, N a)
Chris@42 342 {
Chris@42 343 N b;
Chris@42 344
Chris@42 345 fromreal(n, b);
Chris@42 346 inv(b, a);
Chris@42 347 fromreal(m, b);
Chris@42 348 mul(a, b, a);
Chris@42 349 mul(n2pi, a, a);
Chris@42 350 }
Chris@42 351
Chris@42 352 static void sin2pi(REAL m, REAL n, N a);
Chris@42 353 static void cos2pi(REAL m, REAL n, N a)
Chris@42 354 {
Chris@42 355 N b;
Chris@42 356 if (m < 0) cos2pi(-m, n, a);
Chris@42 357 else if (m > n * 0.5) cos2pi(n - m, n, a);
Chris@42 358 else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);}
Chris@42 359 else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a);
Chris@42 360 else { by2pi(m, n, b); mcos(b, a); }
Chris@42 361 }
Chris@42 362
Chris@42 363 static void sin2pi(REAL m, REAL n, N a)
Chris@42 364 {
Chris@42 365 N b;
Chris@42 366 if (m < 0) {sin2pi(-m, n, a); neg(a);}
Chris@42 367 else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);}
Chris@42 368 else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);}
Chris@42 369 else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);}
Chris@42 370 else {by2pi(m, n, b); msin(b, a);}
Chris@42 371 }
Chris@42 372
Chris@42 373 /*----------------------------------------------------------------------*/
Chris@42 374 /* FFT stuff */
Chris@42 375
Chris@42 376 /* (r0 + i i0)(r1 + i i1) */
Chris@42 377 static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
Chris@42 378 {
Chris@42 379 N s, t, q;
Chris@42 380 mul(r0, r1, s);
Chris@42 381 mul(i0, i1, t);
Chris@42 382 sub(s, t, q);
Chris@42 383 mul(r0, i1, s);
Chris@42 384 mul(i0, r1, t);
Chris@42 385 add(s, t, i2);
Chris@42 386 cpy(q, r2);
Chris@42 387 }
Chris@42 388
Chris@42 389 /* (r0 - i i0)(r1 + i i1) */
Chris@42 390 static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
Chris@42 391 {
Chris@42 392 N s, t, q;
Chris@42 393 mul(r0, r1, s);
Chris@42 394 mul(i0, i1, t);
Chris@42 395 add(s, t, q);
Chris@42 396 mul(r0, i1, s);
Chris@42 397 mul(i0, r1, t);
Chris@42 398 sub(s, t, i2);
Chris@42 399 cpy(q, r2);
Chris@42 400 }
Chris@42 401
Chris@42 402 static void mcexp(int m, int n, N r, N i)
Chris@42 403 {
Chris@42 404 static int cached_n = -1;
Chris@42 405 static N w[64][2];
Chris@42 406 int k, j;
Chris@42 407 if (n != cached_n) {
Chris@42 408 for (j = 1, k = 0; j < n; j += j, ++k) {
Chris@42 409 cos2pi(j, n, w[k][0]);
Chris@42 410 sin2pi(j, n, w[k][1]);
Chris@42 411 }
Chris@42 412 cached_n = n;
Chris@42 413 }
Chris@42 414
Chris@42 415 fromshort(1, r);
Chris@42 416 fromshort(0, i);
Chris@42 417 if (m > 0) {
Chris@42 418 for (k = 0; m; ++k, m >>= 1)
Chris@42 419 if (m & 1)
Chris@42 420 cmul(w[k][0], w[k][1], r, i, r, i);
Chris@42 421 } else {
Chris@42 422 m = -m;
Chris@42 423 for (k = 0; m; ++k, m >>= 1)
Chris@42 424 if (m & 1)
Chris@42 425 cmulj(w[k][0], w[k][1], r, i, r, i);
Chris@42 426 }
Chris@42 427 }
Chris@42 428
Chris@42 429 static void bitrev(int n, N *a)
Chris@42 430 {
Chris@42 431 int i, j, m;
Chris@42 432 for (i = j = 0; i < n - 1; ++i) {
Chris@42 433 if (i < j) {
Chris@42 434 N t;
Chris@42 435 cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
Chris@42 436 cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
Chris@42 437 }
Chris@42 438
Chris@42 439 /* bit reversed counter */
Chris@42 440 m = n; do { m >>= 1; j ^= m; } while (!(j & m));
Chris@42 441 }
Chris@42 442 }
Chris@42 443
Chris@42 444 static void fft0(int n, N *a, int sign)
Chris@42 445 {
Chris@42 446 int i, j, k;
Chris@42 447
Chris@42 448 bitrev(n, a);
Chris@42 449 for (i = 1; i < n; i = 2 * i) {
Chris@42 450 for (j = 0; j < i; ++j) {
Chris@42 451 N wr, wi;
Chris@42 452 mcexp(sign * (int)j, 2 * i, wr, wi);
Chris@42 453 for (k = j; k < n; k += 2 * i) {
Chris@42 454 N *a0 = a + 2 * k;
Chris@42 455 N *a1 = a0 + 2 * i;
Chris@42 456 N r0, i0, r1, i1, t0, t1, xr, xi;
Chris@42 457 cpy(a0[0], r0); cpy(a0[1], i0);
Chris@42 458 cpy(a1[0], r1); cpy(a1[1], i1);
Chris@42 459 mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
Chris@42 460 mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
Chris@42 461 add(r0, xr, a0[0]); add(i0, xi, a0[1]);
Chris@42 462 sub(r0, xr, a1[0]); sub(i0, xi, a1[1]);
Chris@42 463 }
Chris@42 464 }
Chris@42 465 }
Chris@42 466 }
Chris@42 467
Chris@42 468 /* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
Chris@42 469 static void bluestein_sequence(int n, N *a)
Chris@42 470 {
Chris@42 471 int k, ksq, n2 = 2 * n;
Chris@42 472
Chris@42 473 ksq = 1; /* (-1)^2 */
Chris@42 474 for (k = 0; k < n; ++k) {
Chris@42 475 /* careful with overflow */
Chris@42 476 ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
Chris@42 477 mcexp(ksq, n2, a[2*k], a[2*k+1]);
Chris@42 478 }
Chris@42 479 }
Chris@42 480
Chris@42 481 static int pow2_atleast(int x)
Chris@42 482 {
Chris@42 483 int h;
Chris@42 484 for (h = 1; h < x; h = 2 * h)
Chris@42 485 ;
Chris@42 486 return h;
Chris@42 487 }
Chris@42 488
Chris@42 489 static N *cached_bluestein_w = 0;
Chris@42 490 static N *cached_bluestein_y = 0;
Chris@42 491 static int cached_bluestein_n = -1;
Chris@42 492
Chris@42 493 static void bluestein(int n, N *a)
Chris@42 494 {
Chris@42 495 int nb = pow2_atleast(2 * n);
Chris@42 496 N *b = (N *)bench_malloc(2 * nb * sizeof(N));
Chris@42 497 N *w = cached_bluestein_w;
Chris@42 498 N *y = cached_bluestein_y;
Chris@42 499 N nbinv;
Chris@42 500 int i;
Chris@42 501
Chris@42 502 fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */
Chris@42 503
Chris@42 504 if (cached_bluestein_n != n) {
Chris@42 505 if (w) bench_free(w);
Chris@42 506 if (y) bench_free(y);
Chris@42 507 w = (N *)bench_malloc(2 * n * sizeof(N));
Chris@42 508 y = (N *)bench_malloc(2 * nb * sizeof(N));
Chris@42 509 cached_bluestein_n = n;
Chris@42 510 cached_bluestein_w = w;
Chris@42 511 cached_bluestein_y = y;
Chris@42 512
Chris@42 513 bluestein_sequence(n, w);
Chris@42 514 for (i = 0; i < 2*nb; ++i) cpy(zero, y[i]);
Chris@42 515
Chris@42 516 for (i = 0; i < n; ++i) {
Chris@42 517 cpy(w[2*i], y[2*i]);
Chris@42 518 cpy(w[2*i+1], y[2*i+1]);
Chris@42 519 }
Chris@42 520 for (i = 1; i < n; ++i) {
Chris@42 521 cpy(w[2*i], y[2*(nb-i)]);
Chris@42 522 cpy(w[2*i+1], y[2*(nb-i)+1]);
Chris@42 523 }
Chris@42 524
Chris@42 525 fft0(nb, y, -1);
Chris@42 526 }
Chris@42 527
Chris@42 528 for (i = 0; i < 2*nb; ++i) cpy(zero, b[i]);
Chris@42 529
Chris@42 530 for (i = 0; i < n; ++i)
Chris@42 531 cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);
Chris@42 532
Chris@42 533 /* scaled convolution b * y */
Chris@42 534 fft0(nb, b, -1);
Chris@42 535
Chris@42 536 for (i = 0; i < nb; ++i)
Chris@42 537 cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
Chris@42 538 fft0(nb, b, 1);
Chris@42 539
Chris@42 540 for (i = 0; i < n; ++i) {
Chris@42 541 cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
Chris@42 542 mul(nbinv, a[2*i], a[2*i]);
Chris@42 543 mul(nbinv, a[2*i+1], a[2*i+1]);
Chris@42 544 }
Chris@42 545
Chris@42 546 bench_free(b);
Chris@42 547 }
Chris@42 548
Chris@42 549 static void swapri(int n, N *a)
Chris@42 550 {
Chris@42 551 int i;
Chris@42 552 for (i = 0; i < n; ++i) {
Chris@42 553 N t;
Chris@42 554 cpy(a[2 * i], t);
Chris@42 555 cpy(a[2 * i + 1], a[2 * i]);
Chris@42 556 cpy(t, a[2 * i + 1]);
Chris@42 557 }
Chris@42 558 }
Chris@42 559
Chris@42 560 static void fft1(int n, N *a, int sign)
Chris@42 561 {
Chris@42 562 if (power_of_two(n)) {
Chris@42 563 fft0(n, a, sign);
Chris@42 564 } else {
Chris@42 565 if (sign == 1) swapri(n, a);
Chris@42 566 bluestein(n, a);
Chris@42 567 if (sign == 1) swapri(n, a);
Chris@42 568 }
Chris@42 569 }
Chris@42 570
Chris@42 571 static void fromrealv(int n, bench_complex *a, N *b)
Chris@42 572 {
Chris@42 573 int i;
Chris@42 574
Chris@42 575 for (i = 0; i < n; ++i) {
Chris@42 576 fromreal(c_re(a[i]), b[2 * i]);
Chris@42 577 fromreal(c_im(a[i]), b[2 * i + 1]);
Chris@42 578 }
Chris@42 579 }
Chris@42 580
Chris@42 581 static void compare(int n, N *a, N *b, double *err)
Chris@42 582 {
Chris@42 583 int i;
Chris@42 584 double e1, e2, einf;
Chris@42 585 double n1, n2, ninf;
Chris@42 586
Chris@42 587 e1 = e2 = einf = 0.0;
Chris@42 588 n1 = n2 = ninf = 0.0;
Chris@42 589
Chris@42 590 # define DO(x1, x2, xinf, var) { \
Chris@42 591 double d = var; \
Chris@42 592 if (d < 0) d = -d; \
Chris@42 593 x1 += d; x2 += d * d; if (d > xinf) xinf = d; \
Chris@42 594 }
Chris@42 595
Chris@42 596 for (i = 0; i < 2 * n; ++i) {
Chris@42 597 N dd;
Chris@42 598 sub(a[i], b[i], dd);
Chris@42 599 DO(n1, n2, ninf, toreal(a[i]));
Chris@42 600 DO(e1, e2, einf, toreal(dd));
Chris@42 601 }
Chris@42 602
Chris@42 603 # undef DO
Chris@42 604 err[0] = e1 / n1;
Chris@42 605 err[1] = sqrt(e2 / n2);
Chris@42 606 err[2] = einf / ninf;
Chris@42 607 }
Chris@42 608
Chris@42 609 void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
Chris@42 610 int sign, double err[6])
Chris@42 611 {
Chris@42 612 N *b = (N *)bench_malloc(2 * n * sizeof(N));
Chris@42 613 N *fftb = (N *)bench_malloc(2 * n * sizeof(N));
Chris@42 614 N mn, ninv;
Chris@42 615 int i;
Chris@42 616
Chris@42 617 fromreal(n, mn); inv(mn, ninv);
Chris@42 618
Chris@42 619 /* forward error */
Chris@42 620 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
Chris@42 621 fft1(n, b, sign);
Chris@42 622 compare(n, b, fftb, err);
Chris@42 623
Chris@42 624 /* backward error */
Chris@42 625 fromrealv(n, a, b); fromrealv(n, ffta, fftb);
Chris@42 626 for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]);
Chris@42 627 fft1(n, fftb, -sign);
Chris@42 628 compare(n, b, fftb, err + 3);
Chris@42 629
Chris@42 630 bench_free(fftb);
Chris@42 631 bench_free(b);
Chris@42 632 }
Chris@42 633
Chris@42 634 void fftaccuracy_done(void)
Chris@42 635 {
Chris@42 636 if (cached_bluestein_w) bench_free(cached_bluestein_w);
Chris@42 637 if (cached_bluestein_y) bench_free(cached_bluestein_y);
Chris@42 638 cached_bluestein_w = 0;
Chris@42 639 cached_bluestein_y = 0;
Chris@42 640 cached_bluestein_n = -1;
Chris@42 641 }