view src/fftw-3.3.5/libbench2/mp.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
line wrap: on
line source
#include "config.h"
#include "bench.h"
#include <math.h>

#define DG unsigned short
#define ACC unsigned long
#define REAL bench_real
#define BITS_IN_REAL 53 /* mantissa */

#define SHFT 16
#define RADIX 65536L
#define IRADIX (1.0 / RADIX)
#define LO(x) ((x) & (RADIX - 1))
#define HI(x) ((x) >> SHFT)
#define HI_SIGNED(x) \
   ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1))
#define ZEROEXP (-32768)

#define LEN 10

typedef struct {
     short sign;
     short expt;
     DG d[LEN]; 
} N[1];

#define EXA a->expt
#define EXB b->expt
#define EXC c->expt

#define AD a->d
#define BD b->d

#define SGNA a->sign
#define SGNB b->sign

static const N zero = {{ 1, ZEROEXP, {0} }};

static void cpy(const N a, N b)
{
     *b = *a;
}

static void fromreal(REAL x, N a)
{
     int i, e;

     cpy(zero, a);
     if (x == 0.0) return;
     
     if (x >= 0) { SGNA = 1; }
     else       { SGNA = -1; x = -x; }

     e = 0;
     while (x >= 1.0) { x *= IRADIX; ++e; }
     while (x < IRADIX) { x *= RADIX; --e; }
     EXA = e;
     
     for (i = LEN - 1; i >= 0 && x != 0.0; --i) {
	  REAL y;

	  x *= RADIX;
	  y = (REAL) ((int) x);
	  AD[i] = (DG)y;
	  x -= y;
     }
}

static void fromshort(int x, N a)
{
     cpy(zero, a);

     if (x < 0) { x = -x; SGNA = -1; } 
     else { SGNA = 1; }
     EXA = 1;
     AD[LEN - 1] = x;
}

static void pack(DG *d, int e, int s, int l, N a)
{
     int i, j;

     for (i = l - 1; i >= 0; --i, --e) 
	  if (d[i] != 0) 
	       break;

     if (i < 0) {
	  /* number is zero */
	  cpy(zero, a);
     } else {
	  EXA = e;
	  SGNA = s;

	  if (i >= LEN - 1) {
	       for (j = LEN - 1; j >= 0; --i, --j)
		    AD[j] = d[i];
	  } else {
	       for (j = LEN - 1; i >= 0; --i, --j)
		    AD[j] = d[i];
	       for ( ; j >= 0; --j)
		    AD[j] = 0;
	  }
     }
}


/* compare absolute values */
static int abscmp(const N a, const N b)
{
     int i;
     if (EXA > EXB) return 1;
     if (EXA < EXB) return -1;
     for (i = LEN - 1; i >= 0; --i) {
	  if (AD[i] > BD[i])
	       return 1;
	  if (AD[i] < BD[i])
	       return -1;
     }
     return 0;
}

static int eq(const N a, const N b)
{
     return (SGNA == SGNB) && (abscmp(a, b) == 0);
}

/* add magnitudes, for |a| >= |b| */
static void addmag0(int s, const N a, const N b, N c)
{
     int ia, ib;
     ACC r = 0;
     DG d[LEN + 1];

     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
	  r += (ACC)AD[ia] + (ACC)BD[ib];
	  d[ia] = LO(r);
	  r = HI(r);
     }     
     for (; ia < LEN; ++ia) {
	  r += (ACC)AD[ia];
	  d[ia] = LO(r);
	  r = HI(r);
     }
     d[ia] = LO(r);
     pack(d, EXA + 1, s * SGNA, LEN + 1, c);
}

static void addmag(int s, const N a, const N b, N c)
{
     if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c);
}

/* subtract magnitudes, for |a| >= |b| */
static void submag0(int s, const N a, const N b, N c)
{
     int ia, ib;
     ACC r = 0;
     DG d[LEN];

     for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) {
	  r += (ACC)AD[ia] - (ACC)BD[ib];
	  d[ia] = LO(r);
	  r = HI_SIGNED(r);
     }     
     for (; ia < LEN; ++ia) {
	  r += (ACC)AD[ia];
	  d[ia] = LO(r);
	  r = HI_SIGNED(r);
     }

     pack(d, EXA, s * SGNA, LEN, c);
}

static void submag(int s, const N a, const N b, N c)
{
     if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c);
}

/* c = a + b */
static void add(const N a, const N b, N c)
{
     if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c);
}

static void sub(const N a, const N b, N c)
{
     if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c);
}

static void mul(const N a, const N b, N c)
{
     DG d[2 * LEN];
     int i, j, k;
     ACC r;

     for (i = 0; i < LEN; ++i)
	  d[2 * i] = d[2 * i + 1] = 0;

     for (i = 0; i < LEN; ++i) {
	  ACC ai = AD[i];
	  if (ai) {
	       r = 0;
	       for (j = 0, k = i; j < LEN; ++j, ++k) {
		    r += ai * (ACC)BD[j] + (ACC)d[k];
		    d[k] = LO(r);
		    r = HI(r);
	       }
	       d[k] = LO(r);
	  }
     }

     pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c);
}

static REAL toreal(const N a)
{
     REAL h, l, f;
     int i, bits;
     ACC r;
     DG sticky;

     if (EXA != ZEROEXP) {
	  f = IRADIX;
	  i = LEN;

	  bits = 0;
	  h = (r = AD[--i]) * f; f *= IRADIX;
	  for (bits = 0; r > 0; ++bits)
	       r >>= 1;

	  /* first digit */
	  while (bits + SHFT <= BITS_IN_REAL) {
	       h += AD[--i] * f;  f *= IRADIX; bits += SHFT;
	  }

	  /* guard digit (leave one bit for sticky bit, hence `<' instead
	     of `<=') */
	  bits = 0; l = 0.0;
	  while (bits + SHFT < BITS_IN_REAL) {
	       l += AD[--i] * f;  f *= IRADIX; bits += SHFT;
	  }
	  
	  /* sticky bit */
	  sticky = 0;
	  while (i > 0) 
	       sticky |= AD[--i];

	  if (sticky)
	       l += (RADIX / 2) * f;

	  h += l;

	  for (i = 0; i < EXA; ++i) h *= (REAL)RADIX;
	  for (i = 0; i > EXA; --i) h *= IRADIX;
	  if (SGNA == -1) h = -h;
	  return h;
     } else {
	  return 0.0;
     }
}

static void neg(N a)
{
     SGNA = -SGNA;
}

static void inv(const N a, N x)
{
     N w, z, one, two;

     fromreal(1.0 / toreal(a), x); /* initial guess */
     fromshort(1, one);
     fromshort(2, two);

     for (;;) {
	  /* Newton */
	  mul(a, x, w);
	  sub(two, w, z);
	  if (eq(one, z)) break;
	  mul(x, z, x);
     }
}


/* 2 pi */
static const N n2pi = {{
     1, 1,
     {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6}
}};

/* 1 / 31! */
static const N i31fac = {{ 
     1, -7, 
     {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382}
}};


/* 1 / 32! */
static const N i32fac = {{
     1, -7,
     {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293}
}};

static void msin(const N a, N b)
{
     N a2, g, k;
     int i;

     cpy(i31fac, g);
     cpy(g, b);
     mul(a, a, a2);

     /* Taylor */
     for (i = 31; i > 1; i -= 2) {
	  fromshort(i * (i - 1), k);
	  mul(k, g, g);
	  mul(a2, b, k);
	  sub(g, k, b);
     }
     mul(a, b, b);
}

static void mcos(const N a, N b)
{
     N a2, g, k;
     int i;

     cpy(i32fac, g);
     cpy(g, b);
     mul(a, a, a2);

     /* Taylor */
     for (i = 32; i > 0; i -= 2) {
	  fromshort(i * (i - 1), k);
	  mul(k, g, g);
	  mul(a2, b, k);
	  sub(g, k, b);
     }
}

static void by2pi(REAL m, REAL n, N a)
{
     N b;

     fromreal(n, b);
     inv(b, a);
     fromreal(m, b);
     mul(a, b, a);
     mul(n2pi, a, a);
}

static void sin2pi(REAL m, REAL n, N a);
static void cos2pi(REAL m, REAL n, N a)
{
     N b;
     if (m < 0) cos2pi(-m, n, a);
     else if (m > n * 0.5) cos2pi(n - m, n, a);
     else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);}
     else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a);
     else { by2pi(m, n, b); mcos(b, a); }
}

static void sin2pi(REAL m, REAL n, N a)
{
     N b;
     if (m < 0)  {sin2pi(-m, n, a); neg(a);}
     else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);}
     else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);}
     else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);}
     else {by2pi(m, n, b); msin(b, a);}
}

/*----------------------------------------------------------------------*/
/* FFT stuff */

/* (r0 + i i0)(r1 + i i1) */
static void cmul(N r0, N i0, N r1, N i1, N r2, N i2)
{
     N s, t, q;
     mul(r0, r1, s);
     mul(i0, i1, t);
     sub(s, t, q);
     mul(r0, i1, s);
     mul(i0, r1, t);
     add(s, t, i2);
     cpy(q, r2);
}

/* (r0 - i i0)(r1 + i i1) */
static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2)
{
     N s, t, q;
     mul(r0, r1, s);
     mul(i0, i1, t);
     add(s, t, q);
     mul(r0, i1, s);
     mul(i0, r1, t);
     sub(s, t, i2);
     cpy(q, r2);
}

static void mcexp(int m, int n, N r, N i)
{
     static int cached_n = -1;
     static N w[64][2];
     int k, j;
     if (n != cached_n) {
	  for (j = 1, k = 0; j < n; j += j, ++k) {
	       cos2pi(j, n, w[k][0]);
	       sin2pi(j, n, w[k][1]);
	  }
	  cached_n = n;
     }

     fromshort(1, r);
     fromshort(0, i);
     if (m > 0) {
	  for (k = 0; m; ++k, m >>= 1) 
	       if (m & 1)
		    cmul(w[k][0], w[k][1], r, i, r, i);
     } else {
	  m = -m;
	  for (k = 0; m; ++k, m >>= 1) 
	       if (m & 1)
		    cmulj(w[k][0], w[k][1], r, i, r, i);
     }
}

static void bitrev(int n, N *a)
{
     int i, j, m;
     for (i = j = 0; i < n - 1; ++i) {
	  if (i < j) {
	       N t;
	       cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]);
	       cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]);
	  }

	  /* bit reversed counter */
	  m = n; do { m >>= 1; j ^= m; } while (!(j & m));
     }
}

static void fft0(int n, N *a, int sign)
{
     int i, j, k;

     bitrev(n, a);
     for (i = 1; i < n; i = 2 * i) {
	  for (j = 0; j < i; ++j) {
	       N wr, wi;
	       mcexp(sign * (int)j, 2 * i, wr, wi);
	       for (k = j; k < n; k += 2 * i) {
		    N *a0 = a + 2 * k;
		    N *a1 = a0 + 2 * i;
		    N r0, i0, r1, i1, t0, t1, xr, xi;
		    cpy(a0[0], r0); cpy(a0[1], i0);
		    cpy(a1[0], r1); cpy(a1[1], i1);
		    mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr);
		    mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi);
		    add(r0, xr, a0[0]);  add(i0, xi, a0[1]);
		    sub(r0, xr, a1[0]);  sub(i0, xi, a1[1]);
	       }
	  }
     }
}

/* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */
static void bluestein_sequence(int n, N *a)
{
     int k, ksq, n2 = 2 * n;

     ksq = 1; /* (-1)^2 */
     for (k = 0; k < n; ++k) {
	  /* careful with overflow */
	  ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2;
	  mcexp(ksq, n2, a[2*k], a[2*k+1]);
     }
}

static int pow2_atleast(int x)
{
     int h;
     for (h = 1; h < x; h = 2 * h)
	  ;
     return h;
}

static N *cached_bluestein_w = 0;
static N *cached_bluestein_y = 0;
static int cached_bluestein_n = -1;

static void bluestein(int n, N *a)
{
     int nb = pow2_atleast(2 * n);
     N *b = (N *)bench_malloc(2 * nb * sizeof(N));
     N *w = cached_bluestein_w;
     N *y = cached_bluestein_y;
     N nbinv;
     int i;

     fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */

     if (cached_bluestein_n != n) {
	  if (w) bench_free(w);
	  if (y) bench_free(y);
	  w = (N *)bench_malloc(2 * n * sizeof(N));
	  y = (N *)bench_malloc(2 * nb * sizeof(N));
	  cached_bluestein_n = n;
	  cached_bluestein_w = w;
	  cached_bluestein_y = y;

	  bluestein_sequence(n, w);
	  for (i = 0; i < 2*nb; ++i)  cpy(zero, y[i]);

	  for (i = 0; i < n; ++i) {
	       cpy(w[2*i], y[2*i]);
	       cpy(w[2*i+1], y[2*i+1]);
	  }
	  for (i = 1; i < n; ++i) {
	       cpy(w[2*i], y[2*(nb-i)]);
	       cpy(w[2*i+1], y[2*(nb-i)+1]);
	  }

	  fft0(nb, y, -1);
     }

     for (i = 0; i < 2*nb; ++i)  cpy(zero, b[i]);
     
     for (i = 0; i < n; ++i) 
	  cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]);

     /* scaled convolution b * y */
     fft0(nb, b, -1);

     for (i = 0; i < nb; ++i) 
	  cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]);
     fft0(nb, b, 1);

     for (i = 0; i < n; ++i) {
	  cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]);
	  mul(nbinv, a[2*i], a[2*i]);
	  mul(nbinv, a[2*i+1], a[2*i+1]);
     }

     bench_free(b);
}

static void swapri(int n, N *a)
{
     int i;
     for (i = 0; i < n; ++i) {
	  N t;
	  cpy(a[2 * i], t);
	  cpy(a[2 * i + 1], a[2 * i]);
	  cpy(t, a[2 * i + 1]);
     }
}

static void fft1(int n, N *a, int sign)
{
     if (power_of_two(n)) {
	  fft0(n, a, sign);
     } else {
	  if (sign == 1) swapri(n, a);
	  bluestein(n, a);
	  if (sign == 1) swapri(n, a);
     }
}

static void fromrealv(int n, bench_complex *a, N *b)
{
     int i;

     for (i = 0; i < n; ++i) {
	  fromreal(c_re(a[i]), b[2 * i]);
	  fromreal(c_im(a[i]), b[2 * i + 1]);
     }
}

static void compare(int n, N *a, N *b, double *err)
{
     int i;
     double e1, e2, einf;
     double n1, n2, ninf;

     e1 = e2 = einf = 0.0;
     n1 = n2 = ninf = 0.0;

#    define DO(x1, x2, xinf, var) { 			\
     double d = var;					\
     if (d < 0) d = -d;					\
     x1 += d; x2 += d * d; if (d > xinf) xinf = d;	\
}
	  
     for (i = 0; i < 2 * n; ++i) {
	  N dd;
	  sub(a[i], b[i], dd);
	  DO(n1, n2, ninf, toreal(a[i]));
	  DO(e1, e2, einf, toreal(dd));
     }

#    undef DO
     err[0] = e1 / n1;
     err[1] = sqrt(e2 / n2);
     err[2] = einf / ninf;
}

void fftaccuracy(int n, bench_complex *a, bench_complex *ffta,
		 int sign, double err[6])
{
     N *b = (N *)bench_malloc(2 * n * sizeof(N));
     N *fftb = (N *)bench_malloc(2 * n * sizeof(N));
     N mn, ninv;
     int i;

     fromreal(n, mn); inv(mn, ninv);

     /* forward error */
     fromrealv(n, a, b); fromrealv(n, ffta, fftb);
     fft1(n, b, sign);
     compare(n, b, fftb, err);

     /* backward error */
     fromrealv(n, a, b); fromrealv(n, ffta, fftb);
     for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]);
     fft1(n, fftb, -sign);
     compare(n, b, fftb, err + 3);

     bench_free(fftb);
     bench_free(b);
}

void fftaccuracy_done(void)
{
     if (cached_bluestein_w) bench_free(cached_bluestein_w);
     if (cached_bluestein_y) bench_free(cached_bluestein_y);
     cached_bluestein_w = 0;
     cached_bluestein_y = 0;
     cached_bluestein_n = -1;
}