Mercurial > hg > sv-dependency-builds
diff src/fftw-3.3.3/libbench2/mp.c @ 95:89f5e221ed7b
Add FFTW3
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/fftw-3.3.3/libbench2/mp.c Wed Mar 20 15:35:50 2013 +0000 @@ -0,0 +1,641 @@ +#include "config.h" +#include "bench.h" +#include <math.h> + +#define DG unsigned short +#define ACC unsigned long +#define REAL bench_real +#define BITS_IN_REAL 53 /* mantissa */ + +#define SHFT 16 +#define RADIX 65536L +#define IRADIX (1.0 / RADIX) +#define LO(x) ((x) & (RADIX - 1)) +#define HI(x) ((x) >> SHFT) +#define HI_SIGNED(x) \ + ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1)) +#define ZEROEXP (-32768) + +#define LEN 10 + +typedef struct { + short sign; + short expt; + DG d[LEN]; +} N[1]; + +#define EXA a->expt +#define EXB b->expt +#define EXC c->expt + +#define AD a->d +#define BD b->d + +#define SGNA a->sign +#define SGNB b->sign + +static const N zero = {{ 1, ZEROEXP, {0} }}; + +static void cpy(const N a, N b) +{ + *b = *a; +} + +static void fromreal(REAL x, N a) +{ + int i, e; + + cpy(zero, a); + if (x == 0.0) return; + + if (x >= 0) { SGNA = 1; } + else { SGNA = -1; x = -x; } + + e = 0; + while (x >= 1.0) { x *= IRADIX; ++e; } + while (x < IRADIX) { x *= RADIX; --e; } + EXA = e; + + for (i = LEN - 1; i >= 0 && x != 0.0; --i) { + REAL y; + + x *= RADIX; + y = (REAL) ((int) x); + AD[i] = (DG)y; + x -= y; + } +} + +static void fromshort(int x, N a) +{ + cpy(zero, a); + + if (x < 0) { x = -x; SGNA = -1; } + else { SGNA = 1; } + EXA = 1; + AD[LEN - 1] = x; +} + +static void pack(DG *d, int e, int s, int l, N a) +{ + int i, j; + + for (i = l - 1; i >= 0; --i, --e) + if (d[i] != 0) + break; + + if (i < 0) { + /* number is zero */ + cpy(zero, a); + } else { + EXA = e; + SGNA = s; + + if (i >= LEN - 1) { + for (j = LEN - 1; j >= 0; --i, --j) + AD[j] = d[i]; + } else { + for (j = LEN - 1; i >= 0; --i, --j) + AD[j] = d[i]; + for ( ; j >= 0; --j) + AD[j] = 0; + } + } +} + + +/* compare absolute values */ +static int abscmp(const N a, const N b) +{ + int i; + if (EXA > EXB) return 1; + if (EXA < EXB) return -1; + for (i = LEN - 1; i >= 0; --i) { + if (AD[i] > BD[i]) + return 1; + if (AD[i] < BD[i]) + return -1; + } + return 0; +} + +static int eq(const N a, const N b) +{ + return (SGNA == SGNB) && (abscmp(a, b) == 0); +} + +/* add magnitudes, for |a| >= |b| */ +static void addmag0(int s, const N a, const N b, N c) +{ + int ia, ib; + ACC r = 0; + DG d[LEN + 1]; + + for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { + r += (ACC)AD[ia] + (ACC)BD[ib]; + d[ia] = LO(r); + r = HI(r); + } + for (; ia < LEN; ++ia) { + r += (ACC)AD[ia]; + d[ia] = LO(r); + r = HI(r); + } + d[ia] = LO(r); + pack(d, EXA + 1, s * SGNA, LEN + 1, c); +} + +static void addmag(int s, const N a, const N b, N c) +{ + if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c); +} + +/* subtract magnitudes, for |a| >= |b| */ +static void submag0(int s, const N a, const N b, N c) +{ + int ia, ib; + ACC r = 0; + DG d[LEN]; + + for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { + r += (ACC)AD[ia] - (ACC)BD[ib]; + d[ia] = LO(r); + r = HI_SIGNED(r); + } + for (; ia < LEN; ++ia) { + r += (ACC)AD[ia]; + d[ia] = LO(r); + r = HI_SIGNED(r); + } + + pack(d, EXA, s * SGNA, LEN, c); +} + +static void submag(int s, const N a, const N b, N c) +{ + if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c); +} + +/* c = a + b */ +static void add(const N a, const N b, N c) +{ + if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c); +} + +static void sub(const N a, const N b, N c) +{ + if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c); +} + +static void mul(const N a, const N b, N c) +{ + DG d[2 * LEN]; + int i, j, k; + ACC r; + + for (i = 0; i < LEN; ++i) + d[2 * i] = d[2 * i + 1] = 0; + + for (i = 0; i < LEN; ++i) { + ACC ai = AD[i]; + if (ai) { + r = 0; + for (j = 0, k = i; j < LEN; ++j, ++k) { + r += ai * (ACC)BD[j] + (ACC)d[k]; + d[k] = LO(r); + r = HI(r); + } + d[k] = LO(r); + } + } + + pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c); +} + +static REAL toreal(const N a) +{ + REAL h, l, f; + int i, bits; + ACC r; + DG sticky; + + if (EXA != ZEROEXP) { + f = IRADIX; + i = LEN; + + bits = 0; + h = (r = AD[--i]) * f; f *= IRADIX; + for (bits = 0; r > 0; ++bits) + r >>= 1; + + /* first digit */ + while (bits + SHFT <= BITS_IN_REAL) { + h += AD[--i] * f; f *= IRADIX; bits += SHFT; + } + + /* guard digit (leave one bit for sticky bit, hence `<' instead + of `<=') */ + bits = 0; l = 0.0; + while (bits + SHFT < BITS_IN_REAL) { + l += AD[--i] * f; f *= IRADIX; bits += SHFT; + } + + /* sticky bit */ + sticky = 0; + while (i > 0) + sticky |= AD[--i]; + + if (sticky) + l += (RADIX / 2) * f; + + h += l; + + for (i = 0; i < EXA; ++i) h *= (REAL)RADIX; + for (i = 0; i > EXA; --i) h *= IRADIX; + if (SGNA == -1) h = -h; + return h; + } else { + return 0.0; + } +} + +static void neg(N a) +{ + SGNA = -SGNA; +} + +static void inv(const N a, N x) +{ + N w, z, one, two; + + fromreal(1.0 / toreal(a), x); /* initial guess */ + fromshort(1, one); + fromshort(2, two); + + for (;;) { + /* Newton */ + mul(a, x, w); + sub(two, w, z); + if (eq(one, z)) break; + mul(x, z, x); + } +} + + +/* 2 pi */ +static const N n2pi = {{ + 1, 1, + {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6} +}}; + +/* 1 / 31! */ +static const N i31fac = {{ + 1, -7, + {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382} +}}; + + +/* 1 / 32! */ +static const N i32fac = {{ + 1, -7, + {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293} +}}; + +static void msin(const N a, N b) +{ + N a2, g, k; + int i; + + cpy(i31fac, g); + cpy(g, b); + mul(a, a, a2); + + /* Taylor */ + for (i = 31; i > 1; i -= 2) { + fromshort(i * (i - 1), k); + mul(k, g, g); + mul(a2, b, k); + sub(g, k, b); + } + mul(a, b, b); +} + +static void mcos(const N a, N b) +{ + N a2, g, k; + int i; + + cpy(i32fac, g); + cpy(g, b); + mul(a, a, a2); + + /* Taylor */ + for (i = 32; i > 0; i -= 2) { + fromshort(i * (i - 1), k); + mul(k, g, g); + mul(a2, b, k); + sub(g, k, b); + } +} + +static void by2pi(REAL m, REAL n, N a) +{ + N b; + + fromreal(n, b); + inv(b, a); + fromreal(m, b); + mul(a, b, a); + mul(n2pi, a, a); +} + +static void sin2pi(REAL m, REAL n, N a); +static void cos2pi(REAL m, REAL n, N a) +{ + N b; + if (m < 0) cos2pi(-m, n, a); + else if (m > n * 0.5) cos2pi(n - m, n, a); + else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);} + else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a); + else { by2pi(m, n, b); mcos(b, a); } +} + +static void sin2pi(REAL m, REAL n, N a) +{ + N b; + if (m < 0) {sin2pi(-m, n, a); neg(a);} + else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);} + else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);} + else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);} + else {by2pi(m, n, b); msin(b, a);} +} + +/*----------------------------------------------------------------------*/ +/* FFT stuff */ + +/* (r0 + i i0)(r1 + i i1) */ +static void cmul(N r0, N i0, N r1, N i1, N r2, N i2) +{ + N s, t, q; + mul(r0, r1, s); + mul(i0, i1, t); + sub(s, t, q); + mul(r0, i1, s); + mul(i0, r1, t); + add(s, t, i2); + cpy(q, r2); +} + +/* (r0 - i i0)(r1 + i i1) */ +static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2) +{ + N s, t, q; + mul(r0, r1, s); + mul(i0, i1, t); + add(s, t, q); + mul(r0, i1, s); + mul(i0, r1, t); + sub(s, t, i2); + cpy(q, r2); +} + +static void mcexp(int m, int n, N r, N i) +{ + static int cached_n = -1; + static N w[64][2]; + int k, j; + if (n != cached_n) { + for (j = 1, k = 0; j < n; j += j, ++k) { + cos2pi(j, n, w[k][0]); + sin2pi(j, n, w[k][1]); + } + cached_n = n; + } + + fromshort(1, r); + fromshort(0, i); + if (m > 0) { + for (k = 0; m; ++k, m >>= 1) + if (m & 1) + cmul(w[k][0], w[k][1], r, i, r, i); + } else { + m = -m; + for (k = 0; m; ++k, m >>= 1) + if (m & 1) + cmulj(w[k][0], w[k][1], r, i, r, i); + } +} + +static void bitrev(int n, N *a) +{ + int i, j, m; + for (i = j = 0; i < n - 1; ++i) { + if (i < j) { + N t; + cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]); + cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]); + } + + /* bit reversed counter */ + m = n; do { m >>= 1; j ^= m; } while (!(j & m)); + } +} + +static void fft0(int n, N *a, int sign) +{ + int i, j, k; + + bitrev(n, a); + for (i = 1; i < n; i = 2 * i) { + for (j = 0; j < i; ++j) { + N wr, wi; + mcexp(sign * (int)j, 2 * i, wr, wi); + for (k = j; k < n; k += 2 * i) { + N *a0 = a + 2 * k; + N *a1 = a0 + 2 * i; + N r0, i0, r1, i1, t0, t1, xr, xi; + cpy(a0[0], r0); cpy(a0[1], i0); + cpy(a1[0], r1); cpy(a1[1], i1); + mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr); + mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi); + add(r0, xr, a0[0]); add(i0, xi, a0[1]); + sub(r0, xr, a1[0]); sub(i0, xi, a1[1]); + } + } + } +} + +/* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */ +static void bluestein_sequence(int n, N *a) +{ + int k, ksq, n2 = 2 * n; + + ksq = 1; /* (-1)^2 */ + for (k = 0; k < n; ++k) { + /* careful with overflow */ + ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2; + mcexp(ksq, n2, a[2*k], a[2*k+1]); + } +} + +static int pow2_atleast(int x) +{ + int h; + for (h = 1; h < x; h = 2 * h) + ; + return h; +} + +static N *cached_bluestein_w = 0; +static N *cached_bluestein_y = 0; +static int cached_bluestein_n = -1; + +static void bluestein(int n, N *a) +{ + int nb = pow2_atleast(2 * n); + N *b = (N *)bench_malloc(2 * nb * sizeof(N)); + N *w = cached_bluestein_w; + N *y = cached_bluestein_y; + N nbinv; + int i; + + fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */ + + if (cached_bluestein_n != n) { + if (w) bench_free(w); + if (y) bench_free(y); + w = (N *)bench_malloc(2 * n * sizeof(N)); + y = (N *)bench_malloc(2 * nb * sizeof(N)); + cached_bluestein_n = n; + cached_bluestein_w = w; + cached_bluestein_y = y; + + bluestein_sequence(n, w); + for (i = 0; i < 2*nb; ++i) cpy(zero, y[i]); + + for (i = 0; i < n; ++i) { + cpy(w[2*i], y[2*i]); + cpy(w[2*i+1], y[2*i+1]); + } + for (i = 1; i < n; ++i) { + cpy(w[2*i], y[2*(nb-i)]); + cpy(w[2*i+1], y[2*(nb-i)+1]); + } + + fft0(nb, y, -1); + } + + for (i = 0; i < 2*nb; ++i) cpy(zero, b[i]); + + for (i = 0; i < n; ++i) + cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]); + + /* scaled convolution b * y */ + fft0(nb, b, -1); + + for (i = 0; i < nb; ++i) + cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]); + fft0(nb, b, 1); + + for (i = 0; i < n; ++i) { + cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]); + mul(nbinv, a[2*i], a[2*i]); + mul(nbinv, a[2*i+1], a[2*i+1]); + } + + bench_free(b); +} + +static void swapri(int n, N *a) +{ + int i; + for (i = 0; i < n; ++i) { + N t; + cpy(a[2 * i], t); + cpy(a[2 * i + 1], a[2 * i]); + cpy(t, a[2 * i + 1]); + } +} + +static void fft1(int n, N *a, int sign) +{ + if (power_of_two(n)) { + fft0(n, a, sign); + } else { + if (sign == 1) swapri(n, a); + bluestein(n, a); + if (sign == 1) swapri(n, a); + } +} + +static void fromrealv(int n, bench_complex *a, N *b) +{ + int i; + + for (i = 0; i < n; ++i) { + fromreal(c_re(a[i]), b[2 * i]); + fromreal(c_im(a[i]), b[2 * i + 1]); + } +} + +static void compare(int n, N *a, N *b, double *err) +{ + int i; + double e1, e2, einf; + double n1, n2, ninf; + + e1 = e2 = einf = 0.0; + n1 = n2 = ninf = 0.0; + +# define DO(x1, x2, xinf, var) { \ + double d = var; \ + if (d < 0) d = -d; \ + x1 += d; x2 += d * d; if (d > xinf) xinf = d; \ +} + + for (i = 0; i < 2 * n; ++i) { + N dd; + sub(a[i], b[i], dd); + DO(n1, n2, ninf, toreal(a[i])); + DO(e1, e2, einf, toreal(dd)); + } + +# undef DO + err[0] = e1 / n1; + err[1] = sqrt(e2 / n2); + err[2] = einf / ninf; +} + +void fftaccuracy(int n, bench_complex *a, bench_complex *ffta, + int sign, double err[6]) +{ + N *b = (N *)bench_malloc(2 * n * sizeof(N)); + N *fftb = (N *)bench_malloc(2 * n * sizeof(N)); + N mn, ninv; + int i; + + fromreal(n, mn); inv(mn, ninv); + + /* forward error */ + fromrealv(n, a, b); fromrealv(n, ffta, fftb); + fft1(n, b, sign); + compare(n, b, fftb, err); + + /* backward error */ + fromrealv(n, a, b); fromrealv(n, ffta, fftb); + for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]); + fft1(n, fftb, -sign); + compare(n, b, fftb, err + 3); + + bench_free(fftb); + bench_free(b); +} + +void fftaccuracy_done(void) +{ + if (cached_bluestein_w) bench_free(cached_bluestein_w); + if (cached_bluestein_y) bench_free(cached_bluestein_y); + cached_bluestein_w = 0; + cached_bluestein_y = 0; + cached_bluestein_n = -1; +}