Chris@42: /*
Chris@42:  * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42:  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42:  *
Chris@42:  * This program is free software; you can redistribute it and/or modify
Chris@42:  * it under the terms of the GNU General Public License as published by
Chris@42:  * the Free Software Foundation; either version 2 of the License, or
Chris@42:  * (at your option) any later version.
Chris@42:  *
Chris@42:  * This program is distributed in the hope that it will be useful,
Chris@42:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Chris@42:  * GNU General Public License for more details.
Chris@42:  *
Chris@42:  * You should have received a copy of the GNU General Public License
Chris@42:  * along with this program; if not, write to the Free Software
Chris@42:  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
Chris@42:  *
Chris@42:  */
Chris@42: 
Chris@42: 
Chris@42: /* FFTW internal header file */
Chris@42: #ifndef __IFFTW_H__
Chris@42: #define __IFFTW_H__
Chris@42: 
Chris@42: #include "config.h"
Chris@42: 
Chris@42: #include <stdlib.h>		/* size_t */
Chris@42: #include <stdarg.h>		/* va_list */
Chris@42: #include <stddef.h>             /* ptrdiff_t */
Chris@42: #include <limits.h>             /* INT_MAX */
Chris@42: 
Chris@42: #if HAVE_SYS_TYPES_H
Chris@42: # include <sys/types.h>
Chris@42: #endif
Chris@42: 
Chris@42: #if HAVE_STDINT_H
Chris@42: # include <stdint.h>             /* uintptr_t, maybe */
Chris@42: #endif
Chris@42: 
Chris@42: #if HAVE_INTTYPES_H
Chris@42: # include <inttypes.h>           /* uintptr_t, maybe */
Chris@42: #endif
Chris@42: 
Chris@42: #ifdef __cplusplus
Chris@42: extern "C"
Chris@42: {
Chris@42: #endif /* __cplusplus */
Chris@42: 
Chris@42: /* Windows annoyances -- since tests/hook.c uses some internal
Chris@42:    FFTW functions, we need to give them the dllexport attribute
Chris@42:    under Windows when compiling as a DLL (see api/fftw3.h). */
Chris@42: #if defined(FFTW_EXTERN)
Chris@42: #  define IFFTW_EXTERN FFTW_EXTERN
Chris@42: #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
Chris@42:  && (defined(_WIN32) || defined(__WIN32__))
Chris@42: #  define IFFTW_EXTERN extern __declspec(dllexport)
Chris@42: #else
Chris@42: #  define IFFTW_EXTERN extern
Chris@42: #endif
Chris@42: 
Chris@42: /* determine precision and name-mangling scheme */
Chris@42: #define CONCAT(prefix, name) prefix ## name
Chris@42: #if defined(FFTW_SINGLE)
Chris@42:   typedef float R;
Chris@42: # define X(name) CONCAT(fftwf_, name)
Chris@42: #elif defined(FFTW_LDOUBLE)
Chris@42:   typedef long double R;
Chris@42: # define X(name) CONCAT(fftwl_, name)
Chris@42: # define TRIGREAL_IS_LONG_DOUBLE
Chris@42: #elif defined(FFTW_QUAD)
Chris@42:   typedef __float128 R;
Chris@42: # define X(name) CONCAT(fftwq_, name)
Chris@42: # define TRIGREAL_IS_QUAD
Chris@42: #else
Chris@42:   typedef double R;
Chris@42: # define X(name) CONCAT(fftw_, name)
Chris@42: #endif
Chris@42: 
Chris@42: /*
Chris@42:   integral type large enough to contain a stride (what ``int'' should
Chris@42:   have been in the first place).
Chris@42: */
Chris@42: typedef ptrdiff_t INT;
Chris@42: 
Chris@42: /* dummy use of unused parameters to silence compiler warnings.
Chris@42:    Both the argument and the whole expansion are parenthesized so
Chris@42:    that UNUSED(a + b) casts the complete expression to void instead
Chris@42:    of expanding to (void)a + b. */
Chris@42: #define UNUSED(x) ((void)(x))
Chris@42: 
Chris@42: /* number of elements of a true array (not of a pointer) */
Chris@42: #define NELEM(array) (sizeof(array) / sizeof(*(array)))
Chris@42: 
Chris@42: #define FFT_SIGN (-1)  /* sign convention for forward transforms */
Chris@42: extern void X(extract_reim)(int sign, R *c, R **r, R **i);
Chris@42: 
Chris@42: #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
Chris@42: 
Chris@42: #define STRINGIZEx(x) #x
Chris@42: #define STRINGIZE(x) STRINGIZEx(x)
Chris@42: #define CIMPLIES(ante, post) (!(ante) || (post))
Chris@42: 
Chris@42: /* define HAVE_SIMD if any simd extensions are supported */
Chris@42: #if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
Chris@42:       defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
Chris@42:       defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
Chris@42:       defined(HAVE_KCVI) || \
Chris@42:       defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
Chris@42:       defined(HAVE_MIPS_PS) || \
Chris@42:       defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
Chris@42: #define HAVE_SIMD 1
Chris@42: #else
Chris@42: #define HAVE_SIMD 0
Chris@42: #endif
Chris@42: 
Chris@42: extern int X(have_simd_sse2)(void);
Chris@42: extern int X(have_simd_avx)(void);
Chris@42: extern int X(have_simd_avx_128_fma)(void);
Chris@42: extern int X(have_simd_avx2)(void);
Chris@42: extern int X(have_simd_avx2_128)(void);
Chris@42: extern int X(have_simd_avx512)(void);
Chris@42: extern int X(have_simd_altivec)(void);
Chris@42: extern int X(have_simd_vsx)(void);
Chris@42: extern int X(have_simd_neon)(void);
Chris@42: 
Chris@42: /* forward declarations */
Chris@42: typedef struct problem_s problem;
Chris@42: typedef struct plan_s plan;
Chris@42: typedef struct solver_s solver;
Chris@42: typedef struct planner_s planner;
Chris@42: typedef struct printer_s printer;
Chris@42: typedef struct scanner_s scanner;
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* alloca: */
Chris@42: #if HAVE_SIMD
Chris@42: #  if defined(HAVE_KCVI) || defined(HAVE_AVX512)
Chris@42: #    define MIN_ALIGNMENT 64
Chris@42: #  elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
Chris@42: #    define MIN_ALIGNMENT 32  /* best alignment for AVX, conservative for
Chris@42: 			       * everything else */
Chris@42: #  else
Chris@42:      /* Note that we cannot use 32-byte alignment for all SIMD.  For
Chris@42: 	example, MacOS X malloc is 16-byte aligned, but there was no
Chris@42: 	posix_memalign in MacOS X until version 10.6. */
Chris@42: #    define MIN_ALIGNMENT 16
Chris@42: #  endif
Chris@42: #endif
Chris@42: 
Chris@42: #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
Chris@42:    /* use alloca if available */
Chris@42: 
Chris@42: #ifndef alloca
Chris@42: #ifdef __GNUC__
Chris@42: # define alloca __builtin_alloca
Chris@42: #else
Chris@42: # ifdef _MSC_VER
Chris@42: #  include <malloc.h>
Chris@42: #  define alloca _alloca
Chris@42: # else
Chris@42: #  if HAVE_ALLOCA_H
Chris@42: #   include <alloca.h>
Chris@42: #  else
Chris@42: #   ifdef _AIX
Chris@42:  #pragma alloca
Chris@42: #   else
Chris@42: #    ifndef alloca /* predefined by HP cc +Olibcalls */
Chris@42: void *alloca(size_t);
Chris@42: #    endif
Chris@42: #   endif
Chris@42: #  endif
Chris@42: # endif
Chris@42: #endif
Chris@42: #endif
Chris@42: 
Chris@42: #  ifdef MIN_ALIGNMENT /* aligned variant: request MIN_ALIGNMENT extra
Chris@42: 			  bytes from alloca, then round p up to the next
Chris@42: 			  multiple of MIN_ALIGNMENT (16, 32 or 64 above,
Chris@42: 			  so the mask trick is valid).  p is assigned
Chris@42: 			  twice and must be an lvalue of type T. */
Chris@42: #    define STACK_MALLOC(T, p, n)				\
Chris@42:      {								\
Chris@42:          p = (T)alloca((n) + MIN_ALIGNMENT);			\
Chris@42:          p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) &	\
Chris@42:                (~(uintptr_t)(MIN_ALIGNMENT - 1)));		\
Chris@42:      }
Chris@42: #    define STACK_FREE(n) /* expands to nothing in the alloca case */
Chris@42: #  else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
Chris@42: #    define STACK_MALLOC(T, p, n) p = (T)alloca(n) /* no alignment fix-up */
Chris@42: #    define STACK_FREE(n) /* expands to nothing in the alloca case */
Chris@42: #  endif
Chris@42: 
Chris@42: #else /* ! HAVE_ALLOCA */
Chris@42:    /* use malloc instead of alloca */
Chris@42: #  define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
Chris@42: #  define STACK_FREE(n) X(ifree)(n)
Chris@42: #endif /* ! HAVE_ALLOCA */
Chris@42: 
Chris@42: /* allocation of buffers.  If these grow too large use malloc(), else
Chris@42:    use STACK_MALLOC (hopefully reducing to alloca()). */
Chris@42: 
Chris@42: /* 64KiB ought to be enough for anybody */
Chris@42: #define MAX_STACK_ALLOC ((size_t)64 * 1024)
Chris@42: 
Chris@42: /* BUF_ALLOC(T, p, n): set p (cast to pointer type T) to a buffer of
Chris@42:    n bytes.  Requests smaller than MAX_STACK_ALLOC go through
Chris@42:    STACK_MALLOC; all others use MALLOC with the BUFFERS tag.  Release
Chris@42:    the buffer with BUF_FREE(p, n), passing the same n so that both
Chris@42:    macros take the same branch.  n is evaluated more than once; it is
Chris@42:    parenthesized in the comparison so that an argument such as
Chris@42:    ``a ? b : c'' is compared as a whole.  The expansion is a brace
Chris@42:    block, so it cannot be used as the unbraced body of an if that has
Chris@42:    an else. */
Chris@42: #define BUF_ALLOC(T, p, n)			\
Chris@42: {						\
Chris@42:      if ((n) < MAX_STACK_ALLOC) {		\
Chris@42: 	  STACK_MALLOC(T, p, n);		\
Chris@42:      } else {					\
Chris@42: 	  p = (T)MALLOC(n, BUFFERS);		\
Chris@42:      }						\
Chris@42: }
Chris@42: 
Chris@42: /* BUF_FREE(p, n): release a buffer obtained from BUF_ALLOC(T, p, n).
Chris@42:    n must be the size given at allocation time: it selects between
Chris@42:    STACK_FREE and X(ifree) with the same test BUF_ALLOC used.  n is
Chris@42:    parenthesized so that low-precedence argument expressions compare
Chris@42:    as a whole, exactly as in BUF_ALLOC. */
Chris@42: #define BUF_FREE(p, n)				\
Chris@42: {						\
Chris@42:      if ((n) < MAX_STACK_ALLOC) {		\
Chris@42: 	  STACK_FREE(p);			\
Chris@42:      } else {					\
Chris@42: 	  X(ifree)(p);				\
Chris@42:      }						\
Chris@42: }
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* define uintptr_t if it is not already defined */
Chris@42: 
Chris@42: #ifndef HAVE_UINTPTR_T
Chris@42: #  if SIZEOF_VOID_P == 0
Chris@42: #    error sizeof void* is unknown!
Chris@42: #  elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
Chris@42:      typedef unsigned int uintptr_t;
Chris@42: #  elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
Chris@42:      typedef unsigned long uintptr_t;
Chris@42: #  elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
Chris@42:      typedef unsigned long long uintptr_t;
Chris@42: #  else
Chris@42: #    error no unsigned integer type matches void* sizeof!
Chris@42: #  endif
Chris@42: #endif
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* We can do an optimization for copying pairs of (aligned) floats
Chris@42:    when in single precision if 2*float = double.
Chris@42: 
Chris@42:    FFTW_2R_IS_DOUBLE is evaluated here, once, to a plain 0 or 1.  If
Chris@42:    the macro itself expanded to an expression containing ``defined'',
Chris@42:    every ``#if FFTW_2R_IS_DOUBLE'' would produce the defined operator
Chris@42:    through macro expansion, which the C standard leaves undefined
Chris@42:    (GCC reports it under -Wexpansion-to-defined). */
Chris@42: #if defined(FFTW_SINGLE) \
Chris@42:     && SIZEOF_FLOAT != 0 \
Chris@42:     && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT
Chris@42: #  define FFTW_2R_IS_DOUBLE 1
Chris@42: #else
Chris@42: #  define FFTW_2R_IS_DOUBLE 0
Chris@42: #endif
Chris@42: 
Chris@42: #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* assert.c: */
Chris@42: IFFTW_EXTERN void X(assertion_failed)(const char *s, 
Chris@42: 				      int line, const char *file);
Chris@42: 
Chris@42: /* always check */
Chris@42: #define CK(ex)						 \
Chris@42:       (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42: 
Chris@42: #ifdef FFTW_DEBUG
Chris@42: /* check only if debug enabled */
Chris@42: #define A(ex)						 \
Chris@42:       (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
Chris@42: #else
Chris@42: #define A(ex) /* nothing */
Chris@42: #endif
Chris@42: 
Chris@42: extern void X(debug)(const char *format, ...);
Chris@42: #define D X(debug)
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* kalloc.c: */
Chris@42: extern void *X(kernel_malloc)(size_t n);
Chris@42: extern void X(kernel_free)(void *p);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* alloc.c: */
Chris@42: 
Chris@42: /* objects allocated by malloc, for statistical purposes */
Chris@42: enum malloc_tag {
Chris@42:      EVERYTHING,
Chris@42:      PLANS,
Chris@42:      SOLVERS,
Chris@42:      PROBLEMS,
Chris@42:      BUFFERS,
Chris@42:      HASHT,
Chris@42:      TENSORS,
Chris@42:      PLANNERS,
Chris@42:      SLVDESCS,
Chris@42:      TWIDDLES,
Chris@42:      STRIDES,
Chris@42:      OTHER,
Chris@42:      MALLOC_WHAT_LAST		/* must be last */
Chris@42: };
Chris@42: 
Chris@42: IFFTW_EXTERN void X(ifree)(void *ptr);
Chris@42: extern void X(ifree0)(void *ptr);
Chris@42: 
Chris@42: #ifdef FFTW_DEBUG_MALLOC
Chris@42: 
Chris@42: IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
Chris@42: 			     const char *file, int line);
Chris@42: #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
Chris@42: IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
Chris@42: 
Chris@42: #else /* ! FFTW_DEBUG_MALLOC */
Chris@42: 
Chris@42: IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
Chris@42: #define MALLOC(n, what)  X(malloc_plain)(n)
Chris@42: 
Chris@42: #endif
Chris@42: 
Chris@42: #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
Chris@42: extern int X(in_thread);
Chris@42: #  define IN_THREAD X(in_thread)
Chris@42: #  define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
Chris@42: #  define THREAD_OFF X(in_thread) = in_thread_save; }
Chris@42: #else
Chris@42: #  define IN_THREAD 0
Chris@42: #  define THREAD_ON 
Chris@42: #  define THREAD_OFF 
Chris@42: #endif
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* low-resolution clock */
Chris@42: 
Chris@42: #ifdef FAKE_CRUDE_TIME
Chris@42:  typedef int crude_time;
Chris@42: #else
Chris@42: # if TIME_WITH_SYS_TIME
Chris@42: #  include <sys/time.h>
Chris@42: #  include <time.h>
Chris@42: # else
Chris@42: #  if HAVE_SYS_TIME_H
Chris@42: #   include <sys/time.h>
Chris@42: #  else
Chris@42: #   include <time.h>
Chris@42: #  endif
Chris@42: # endif
Chris@42: 
Chris@42: # ifdef HAVE_BSDGETTIMEOFDAY
Chris@42: # ifndef HAVE_GETTIMEOFDAY
Chris@42: # define gettimeofday BSDgettimeofday
Chris@42: # define HAVE_GETTIMEOFDAY 1
Chris@42: # endif
Chris@42: # endif
Chris@42: 
Chris@42: # if defined(HAVE_GETTIMEOFDAY)
Chris@42:    typedef struct timeval crude_time;
Chris@42: # else
Chris@42:    typedef clock_t crude_time;
Chris@42: # endif
Chris@42: #endif /* else FAKE_CRUDE_TIME */
Chris@42: 
Chris@42: crude_time X(get_crude_time)(void);
Chris@42: double X(elapsed_since)(const planner *plnr, const problem *p,
Chris@42: 			crude_time t0); /* time in seconds since t0 */
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* ops.c: */
Chris@42: /*
Chris@42:  * ops counter.  The total number of additions is add + fma
Chris@42:  * and the total number of multiplications is mul + fma.
Chris@42:  * Total flops = add + mul + 2 * fma
Chris@42:  */
Chris@42: typedef struct {
Chris@42:      double add;	/* additions that are not part of an fma */
Chris@42:      double mul;	/* multiplications that are not part of an fma */
Chris@42:      double fma;	/* each one counts as one addition plus one
Chris@42: 			   multiplication in the totals above */
Chris@42:      double other;	/* not included in the flop total above */
Chris@42: } opcnt;
Chris@42: 
Chris@42: void X(ops_zero)(opcnt *dst);
Chris@42: void X(ops_other)(INT o, opcnt *dst);
Chris@42: void X(ops_cpy)(const opcnt *src, opcnt *dst);
Chris@42: 
Chris@42: void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42: void X(ops_add2)(const opcnt *a, opcnt *dst);
Chris@42: 
Chris@42: /* dst = m * a + b */
Chris@42: void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
Chris@42: 
Chris@42: /* dst += m * a */
Chris@42: void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
Chris@42: 
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* minmax.c: */
Chris@42: INT X(imax)(INT a, INT b);
Chris@42: INT X(imin)(INT a, INT b);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* iabs.c: */
Chris@42: INT X(iabs)(INT a);
Chris@42: 
Chris@42: /* inline version */
Chris@42: #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* md5.c */
Chris@42: 
Chris@42: #if SIZEOF_UNSIGNED_INT >= 4
Chris@42: typedef unsigned int md5uint;
Chris@42: #else
Chris@42: typedef unsigned long md5uint; /* at least 32 bits as per C standard */
Chris@42: #endif
Chris@42: 
Chris@42: typedef md5uint md5sig[4];
Chris@42: 
Chris@42: typedef struct {
Chris@42:      md5sig s; /* state and signature */
Chris@42: 
Chris@42:      /* fields not meant to be used outside md5.c: */
Chris@42:      unsigned char c[64]; /* stuff not yet processed */
Chris@42:      unsigned l;  /* total length.  Should be 64 bits long, but this is
Chris@42: 		     good enough for us */
Chris@42: } md5;
Chris@42: 
Chris@42: void X(md5begin)(md5 *p);
Chris@42: void X(md5putb)(md5 *p, const void *d_, size_t len);
Chris@42: void X(md5puts)(md5 *p, const char *s);
Chris@42: void X(md5putc)(md5 *p, unsigned char c);
Chris@42: void X(md5int)(md5 *p, int i);
Chris@42: void X(md5INT)(md5 *p, INT i);
Chris@42: void X(md5unsigned)(md5 *p, unsigned i);
Chris@42: void X(md5end)(md5 *p);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* tensor.c: */
Chris@42: #define STRUCT_HACK_KR
Chris@42: #undef STRUCT_HACK_C99
Chris@42: 
Chris@42: typedef struct {
Chris@42:      INT n;			/* presumably the extent of this dimension
Chris@42: 				   -- TODO confirm against tensor.c */
Chris@42:      INT is;			/* input stride */
Chris@42:      INT os;			/* output stride */
Chris@42: } iodim;
Chris@42: 
Chris@42: typedef struct {
Chris@42:      int rnk;			/* rank; may be RNK_MINFTY, test with
Chris@42: 				   FINITE_RNK before using it as a count */
Chris@42: #if defined(STRUCT_HACK_KR)
Chris@42:      iodim dims[1];		/* variant in effect: STRUCT_HACK_KR is
Chris@42: 				   defined just above.  Presumably the
Chris@42: 				   struct is over-allocated so that more
Chris@42: 				   than one entry is usable -- TODO confirm */
Chris@42: #elif defined(STRUCT_HACK_C99)
Chris@42:      iodim dims[];		/* C99 flexible array member */
Chris@42: #else
Chris@42:      iodim *dims;		/* separately stored array */
Chris@42: #endif
Chris@42: } tensor;
Chris@42: 
Chris@42: /*
Chris@42:   Definition of rank -infinity.
Chris@42:   This definition has the property that if you want rank 0 or 1,
Chris@42:   you can simply test for rank <= 1.  This is a common case.
Chris@42:  
Chris@42:   A tensor of rank -infinity has size 0.
Chris@42: */
Chris@42: #define RNK_MINFTY  INT_MAX
Chris@42: #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
Chris@42: 
Chris@42: typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
Chris@42: 
Chris@42: tensor *X(mktensor)(int rnk);
Chris@42: tensor *X(mktensor_0d)(void);
Chris@42: tensor *X(mktensor_1d)(INT n, INT is, INT os);
Chris@42: tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
Chris@42: 		       INT n1, INT is1, INT os1);
Chris@42: tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
Chris@42: 		       INT n1, INT is1, INT os1,
Chris@42: 		       INT n2, INT is2, INT os2);
Chris@42: tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
Chris@42: 		       INT n1, INT is1, INT os1,
Chris@42: 		       INT n2, INT is2, INT os2,
Chris@42: 		       INT n3, INT is3, INT os3);
Chris@42: tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
Chris@42: 		       INT n1, INT is1, INT os1,
Chris@42: 		       INT n2, INT is2, INT os2,
Chris@42: 		       INT n3, INT is3, INT os3,
Chris@42: 		       INT n4, INT is4, INT os4);
Chris@42: INT X(tensor_sz)(const tensor *sz);
Chris@42: void X(tensor_md5)(md5 *p, const tensor *t);
Chris@42: INT X(tensor_max_index)(const tensor *sz);
Chris@42: INT X(tensor_min_istride)(const tensor *sz);
Chris@42: INT X(tensor_min_ostride)(const tensor *sz);
Chris@42: INT X(tensor_min_stride)(const tensor *sz);
Chris@42: int X(tensor_inplace_strides)(const tensor *sz);
Chris@42: int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
Chris@42: int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
Chris@42:                                inplace_kind k);
Chris@42: tensor *X(tensor_copy)(const tensor *sz);
Chris@42: int X(tensor_kosherp)(const tensor *x);
Chris@42: 
Chris@42: tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
Chris@42: tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
Chris@42: tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
Chris@42: tensor *X(tensor_compress)(const tensor *sz);
Chris@42: tensor *X(tensor_compress_contiguous)(const tensor *sz);
Chris@42: tensor *X(tensor_append)(const tensor *a, const tensor *b);
Chris@42: void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
Chris@42: int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
Chris@42: void X(tensor_destroy)(tensor *sz);
Chris@42: void X(tensor_destroy2)(tensor *a, tensor *b);
Chris@42: void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
Chris@42: void X(tensor_print)(const tensor *sz, printer *p);
Chris@42: int X(dimcmp)(const iodim *a, const iodim *b);
Chris@42: int X(tensor_equal)(const tensor *a, const tensor *b);
Chris@42: int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* problem.c: */
Chris@42: enum { 
Chris@42:      /* a problem that cannot be solved */
Chris@42:      PROBLEM_UNSOLVABLE,
Chris@42: 
Chris@42:      PROBLEM_DFT, 
Chris@42:      PROBLEM_RDFT,
Chris@42:      PROBLEM_RDFT2,
Chris@42: 
Chris@42:      /* for mpi/ subdirectory */
Chris@42:      PROBLEM_MPI_DFT,
Chris@42:      PROBLEM_MPI_RDFT,
Chris@42:      PROBLEM_MPI_RDFT2,
Chris@42:      PROBLEM_MPI_TRANSPOSE,
Chris@42: 
Chris@42:      PROBLEM_LAST 
Chris@42: };
Chris@42: 
Chris@42: typedef struct {
Chris@42:      int problem_kind;
Chris@42:      void (*hash) (const problem *ego, md5 *p);
Chris@42:      void (*zero) (const problem *ego);
Chris@42:      void (*print) (const problem *ego, printer *p);
Chris@42:      void (*destroy) (problem *ego);
Chris@42: } problem_adt;
Chris@42: 
Chris@42: struct problem_s {
Chris@42:      const problem_adt *adt;
Chris@42: };
Chris@42: 
Chris@42: problem *X(mkproblem)(size_t sz, const problem_adt *adt);
Chris@42: void X(problem_destroy)(problem *ego);
Chris@42: problem *X(mkproblem_unsolvable)(void);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* print.c */
Chris@42: struct printer_s {
Chris@42:      void (*print)(printer *p, const char *format, ...);
Chris@42:      void (*vprint)(printer *p, const char *format, va_list ap);
Chris@42:      void (*putchr)(printer *p, char c);
Chris@42:      void (*cleanup)(printer *p);
Chris@42:      int indent;
Chris@42:      int indent_incr;
Chris@42: };
Chris@42: 
Chris@42: printer *X(mkprinter)(size_t size, 
Chris@42: 		      void (*putchr)(printer *p, char c),
Chris@42: 		      void (*cleanup)(printer *p));
Chris@42: IFFTW_EXTERN void X(printer_destroy)(printer *p);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* scan.c */
Chris@42: struct scanner_s {
Chris@42:      int (*scan)(scanner *sc, const char *format, ...);
Chris@42:      int (*vscan)(scanner *sc, const char *format, va_list ap);
Chris@42:      int (*getchr)(scanner *sc);
Chris@42:      int ungotc;
Chris@42: };
Chris@42: 
Chris@42: scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
Chris@42: void X(scanner_destroy)(scanner *sc);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* plan.c: */
Chris@42: 
Chris@42: enum wakefulness {
Chris@42:      SLEEPY,
Chris@42:      AWAKE_ZERO,
Chris@42:      AWAKE_SQRTN_TABLE,
Chris@42:      AWAKE_SINCOS
Chris@42: };
Chris@42: 
Chris@42: typedef struct {
Chris@42:      void (*solve)(const plan *ego, const problem *p);
Chris@42:      void (*awake)(plan *ego, enum wakefulness wakefulness);
Chris@42:      void (*print)(const plan *ego, printer *p);
Chris@42:      void (*destroy)(plan *ego);
Chris@42: } plan_adt;
Chris@42: 
Chris@42: struct plan_s {
Chris@42:      const plan_adt *adt;
Chris@42:      opcnt ops;
Chris@42:      double pcost;
Chris@42:      enum wakefulness wakefulness; /* used for debugging only */
Chris@42:      int could_prune_now_p;
Chris@42: };
Chris@42: 
Chris@42: plan *X(mkplan)(size_t size, const plan_adt *adt);
Chris@42: void X(plan_destroy_internal)(plan *ego);
Chris@42: IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42: void X(plan_null_destroy)(plan *ego);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* solver.c: */
Chris@42: typedef struct {
Chris@42:      int problem_kind;
Chris@42:      plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
Chris@42:      void (*destroy)(solver *ego);
Chris@42: } solver_adt;
Chris@42: 
Chris@42: struct solver_s {
Chris@42:      const solver_adt *adt;
Chris@42:      int refcnt;
Chris@42: };
Chris@42: 
Chris@42: solver *X(mksolver)(size_t size, const solver_adt *adt);
Chris@42: void X(solver_use)(solver *ego);
Chris@42: void X(solver_destroy)(solver *ego);
Chris@42: void X(solver_register)(planner *plnr, solver *s);
Chris@42: 
Chris@42: /* shorthand: call X(mksolver) with sizeof(type) and adt, and cast the
Chris@42:    result to type *.  The expansion is fully parenthesized so that
Chris@42:    MKSOLVER(T, adt)->member applies -> to the cast pointer rather than
Chris@42:    to the solver * returned by X(mksolver). */
Chris@42: #define MKSOLVER(type, adt) ((type *)X(mksolver)(sizeof(type), (adt)))
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* planner.c */
Chris@42: 
Chris@42: typedef struct slvdesc_s {
Chris@42:      solver *slv;
Chris@42:      const char *reg_nam;
Chris@42:      unsigned nam_hash;
Chris@42:      int reg_id;
Chris@42:      int next_for_same_problem_kind;
Chris@42: } slvdesc;
Chris@42: 
Chris@42: typedef struct solution_s solution; /* opaque */
Chris@42: 
Chris@42: /* interpretation of L and U: 
Chris@42: 
Chris@42:    - if it returns a plan, the planner guarantees that all applicable
Chris@42:      plans at least as impatient as U have been tried, and that each
Chris@42:      plan in the solution is at least as impatient as L.
Chris@42:    
Chris@42:    - if it returns 0, the planner guarantees to have tried all solvers
Chris@42:      at least as impatient as L, and that none of them was applicable.
Chris@42: 
Chris@42:    The structure is packed to fit into 64 bits.
Chris@42: */
Chris@42: 
Chris@42: typedef struct {
Chris@42:      unsigned l:20;		/* impatience flags read through PLNR_L */
Chris@42:      unsigned hash_info:3;	/* 3 bits, as many as the hashtable
Chris@42: 				   information values 0x1/0x2/0x4 below;
Chris@42: 				   presumably holds those -- TODO confirm */
Chris@42: #    define BITS_FOR_TIMELIMIT 9
Chris@42:      unsigned timelimit_impatience:BITS_FOR_TIMELIMIT; /* read through
Chris@42: 				   PLNR_TIMELIMIT_IMPATIENCE; together with
Chris@42: 				   l and hash_info: 20 + 3 + 9 = 32 bits */
Chris@42:      unsigned u:20;		/* impatience flags read through PLNR_U */
Chris@42:      
Chris@42:      /* abstraction break: we store the solver here to pad the
Chris@42: 	structure to 64 bits.  Otherwise, the struct is padded to 64
Chris@42: 	bits anyway, and another word is allocated for slvndx. */
Chris@42: #    define BITS_FOR_SLVNDX 12
Chris@42:      unsigned slvndx:BITS_FOR_SLVNDX; /* with u: 20 + 12 = 32 bits */
Chris@42: } flags_t;
Chris@42: 
Chris@42: /* impatience flags  */
Chris@42: enum {
Chris@42:      BELIEVE_PCOST = 0x0001,
Chris@42:      ESTIMATE = 0x0002,
Chris@42:      NO_DFT_R2HC = 0x0004,
Chris@42:      NO_SLOW = 0x0008,
Chris@42:      NO_VRECURSE = 0x0010,
Chris@42:      NO_INDIRECT_OP = 0x0020,
Chris@42:      NO_LARGE_GENERIC = 0x0040,
Chris@42:      NO_RANK_SPLITS = 0x0080,
Chris@42:      NO_VRANK_SPLITS = 0x0100,
Chris@42:      NO_NONTHREADED = 0x0200,
Chris@42:      NO_BUFFERING = 0x0400,
Chris@42:      NO_FIXED_RADIX_LARGE_N = 0x0800,
Chris@42:      NO_DESTROY_INPUT = 0x1000,
Chris@42:      NO_SIMD = 0x2000,
Chris@42:      CONSERVE_MEMORY = 0x4000,
Chris@42:      NO_DHT_R2HC = 0x8000,
Chris@42:      NO_UGLY = 0x10000,
Chris@42:      ALLOW_PRUNING = 0x20000
Chris@42: };
Chris@42: 
Chris@42: /* hashtable information */
Chris@42: enum {
Chris@42:      BLESSING = 0x1u,   /* save this entry */
Chris@42:      H_VALID = 0x2u,    /* valid hashtable entry */
Chris@42:      H_LIVE = 0x4u      /* entry is nonempty, implies H_VALID */
Chris@42: };
Chris@42: 
Chris@42: #define PLNR_L(plnr) ((plnr)->flags.l)
Chris@42: #define PLNR_U(plnr) ((plnr)->flags.u)
Chris@42: #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
Chris@42: 
Chris@42: #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
Chris@42: #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
Chris@42: #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
Chris@42: 
Chris@42: #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
Chris@42: #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
Chris@42: #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
Chris@42: #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
Chris@42: #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
Chris@42: #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
Chris@42: #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
Chris@42: #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
Chris@42: #define NO_FIXED_RADIX_LARGE_NP(plnr) \
Chris@42:   (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
Chris@42: #define NO_NONTHREADEDP(plnr) \
Chris@42:   ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
Chris@42: 
Chris@42: #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
Chris@42: #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
Chris@42: #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
Chris@42: #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
Chris@42: #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
Chris@42: 
Chris@42: typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
Chris@42: 
Chris@42: typedef enum { 
Chris@42:      /* WISDOM_NORMAL: planner may or may not use wisdom */
Chris@42:      WISDOM_NORMAL, 
Chris@42: 
Chris@42:      /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
Chris@42:      WISDOM_ONLY, 
Chris@42: 
Chris@42:      /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
Chris@42:      WISDOM_IS_BOGUS,
Chris@42: 
Chris@42:      /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
Chris@42:      WISDOM_IGNORE_INFEASIBLE,
Chris@42: 
Chris@42:      /* WISDOM_IGNORE_ALL: planner ignores all */
Chris@42:      WISDOM_IGNORE_ALL
Chris@42: } wisdom_state_t;
Chris@42: 
Chris@42: typedef struct {
Chris@42:      void (*register_solver)(planner *ego, solver *s);
Chris@42:      plan *(*mkplan)(planner *ego, const problem *p);
Chris@42:      void (*forget)(planner *ego, amnesia a);
Chris@42:      void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
Chris@42: 						 word in C++. */
Chris@42:      int (*imprt)(planner *ego, scanner *sc);
Chris@42: } planner_adt;
Chris@42: 
Chris@42: /* hash table of solutions */
Chris@42: typedef struct {
Chris@42:      solution *solutions;
Chris@42:      unsigned hashsiz, nelem;
Chris@42: 
Chris@42:      /* statistics */
Chris@42:      int lookup, succ_lookup, lookup_iter;
Chris@42:      int insert, insert_iter, insert_unknown;
Chris@42:      int nrehash;
Chris@42: } hashtab;
Chris@42: 
Chris@42: typedef enum { COST_SUM, COST_MAX } cost_kind;
Chris@42: 
Chris@42: struct planner_s {
Chris@42:      const planner_adt *adt;
Chris@42:      void (*hook)(struct planner_s *plnr, plan *pln, 
Chris@42: 		  const problem *p, int optimalp);
Chris@42:      double (*cost_hook)(const problem *p, double t, cost_kind k);
Chris@42:      int (*wisdom_ok_hook)(const problem *p, flags_t flags);
Chris@42:      void (*nowisdom_hook)(const problem *p);
Chris@42:      wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
Chris@42: 
Chris@42:      /* solver descriptors */
Chris@42:      slvdesc *slvdescs;
Chris@42:      unsigned nslvdesc, slvdescsiz;
Chris@42:      const char *cur_reg_nam;
Chris@42:      int cur_reg_id;
Chris@42:      int slvdescs_for_problem_kind[PROBLEM_LAST];
Chris@42: 
Chris@42:      wisdom_state_t wisdom_state;
Chris@42: 
Chris@42:      hashtab htab_blessed;
Chris@42:      hashtab htab_unblessed;
Chris@42: 
Chris@42:      int nthr;
Chris@42:      flags_t flags;
Chris@42: 
Chris@42:      crude_time start_time;
Chris@42:      double timelimit; /* elapsed_since(start_time) at which to bail out */
Chris@42:      int timed_out; /* whether most recent search timed out */
Chris@42:      int need_timeout_check;
Chris@42: 
Chris@42:      /* various statistics */
Chris@42:      int nplan;    /* number of plans evaluated */
Chris@42:      double pcost, epcost; /* total pcost of measured/estimated plans */
Chris@42:      int nprob;    /* number of problems evaluated */
Chris@42: };
Chris@42: 
Chris@42: planner *X(mkplanner)(void);
Chris@42: void X(planner_destroy)(planner *ego);
Chris@42: 
Chris@42: /*
Chris@42:   Iterate over all solvers.   Read:
Chris@42:  
Chris@42:   @article{ baker93iterators,
Chris@42:   author = "Henry G. Baker, Jr.",
Chris@42:   title = "Iterators: Signs of Weakness in Object-Oriented Languages",
Chris@42:   journal = "{ACM} {OOPS} Messenger",
Chris@42:   volume = "4",
Chris@42:   number = "3",
Chris@42:   pages = "18--25"
Chris@42:   }
Chris@42: */
Chris@42: /* FORALL_SOLVERS(ego, s, p, what): for every index below
Chris@42:    ego->nslvdesc, in increasing order, declare slvdesc *p pointing at
Chris@42:    that entry of ego->slvdescs and solver *s = p->slv, then execute
Chris@42:    WHAT.  ego is parenthesized so that an argument such as &planner
Chris@42:    works; it is evaluated on every iteration.  _cnt is the macro's
Chris@42:    own loop counter. */
Chris@42: #define FORALL_SOLVERS(ego, s, p, what)			\
Chris@42: {							\
Chris@42:      unsigned _cnt;					\
Chris@42:      for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) {	\
Chris@42: 	  slvdesc *p = (ego)->slvdescs + _cnt;		\
Chris@42: 	  solver *s = p->slv;				\
Chris@42: 	  what;						\
Chris@42:      }							\
Chris@42: }
Chris@42: 
Chris@42: /* FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what): start at index
Chris@42:    ego->slvdescs_for_problem_kind[kind] and follow the
Chris@42:    next_for_same_problem_kind links until a negative index is
Chris@42:    reached.  For each entry visited, slvdesc *p points at it,
Chris@42:    solver *s = p->slv, and WHAT is executed.  The link is read after
Chris@42:    WHAT has run.  ego and kind are parenthesized so that arbitrary
Chris@42:    expressions can be passed. */
Chris@42: #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what)		\
Chris@42: {								\
Chris@42:      int _cnt = (ego)->slvdescs_for_problem_kind[(kind)]; 	\
Chris@42:      while (_cnt >= 0) {					\
Chris@42: 	  slvdesc *p = (ego)->slvdescs + _cnt;			\
Chris@42: 	  solver *s = p->slv;					\
Chris@42: 	  what;							\
Chris@42: 	  _cnt = p->next_for_same_problem_kind;			\
Chris@42:      }								\
Chris@42: }
Chris@42: 
Chris@42: 
Chris@42: /* make plan, destroy problem
Chris@42:    Both functions take a planner and a problem and return a plan *.
Chris@42:    NOTE(review): l_set, u_set and u_reset look like planner-flag bit
Chris@42:    masks applied around the call -- confirm against the definition,
Chris@42:    which is not visible in this header. */
Chris@42: plan *X(mkplan_d)(planner *ego, problem *p);
Chris@42: plan *X(mkplan_f_d)(planner *ego, problem *p, 
Chris@42: 		    unsigned l_set, unsigned u_set, unsigned u_reset);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* stride.c: */
Chris@42: 
Chris@42: /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
Chris@42: #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE) /* enabled for the listed x86 compiler macros, but never when FFTW_LDOUBLE is defined */
Chris@42: #define PRECOMPUTE_ARRAY_INDICES
Chris@42: #endif
Chris@42: 
Chris@42: extern const INT X(an_INT_guaranteed_to_be_zero); /* operand of both
Chris@42:    MAKE_VOLATILE_STRIDE variants; going by its name the value is 0, but
Chris@42:    since only this extern declaration is visible the compiler cannot
Chris@42:    fold it away */
Chris@42: 
Chris@42: #ifdef PRECOMPUTE_ARRAY_INDICES
Chris@42: /* Precomputed flavour: a stride is a pointer to INT and WS(stride, i)
Chris@42:    reads element i of it.  NOTE(review): X(mkstride)(n, s) presumably
Chris@42:    builds the table of s * i for 0 <= i < n -- confirm in stride.c. */
Chris@42: typedef INT *stride;
Chris@42: #define WS(stride, i)  ((stride)[i])
Chris@42: extern stride X(mkstride)(INT n, INT s);
Chris@42: void X(stride_destroy)(stride p);
Chris@42: /* hackery to prevent the compiler from copying the strides array
Chris@42:    onto the stack */
Chris@42: #define MAKE_VOLATILE_STRIDE(nptr, x) ((x) = (x) + X(an_INT_guaranteed_to_be_zero))
Chris@42: #else
Chris@42: 
Chris@42: /* Plain flavour: a stride is just an INT and WS(stride, i) multiplies.
Chris@42:    Both macro arguments are parenthesized so that expressions such as
Chris@42:    WS(s, i + 1) expand to s * (i + 1) rather than s * i + 1.
Chris@42:    mkstride collapses to its stride argument and stride_destroy to a
Chris@42:    no-op for each of the three name prefixes. */
Chris@42: typedef INT stride;
Chris@42: #define WS(stride, i)  ((stride) * (i))
Chris@42: #define fftwf_mkstride(n, stride) (stride)
Chris@42: #define fftw_mkstride(n, stride) (stride)
Chris@42: #define fftwl_mkstride(n, stride) (stride)
Chris@42: #define fftwf_stride_destroy(p) ((void) (p))
Chris@42: #define fftw_stride_destroy(p) ((void) (p))
Chris@42: #define fftwl_stride_destroy(p) ((void) (p))
Chris@42: 
Chris@42: /* hackery to prevent the compiler from ``optimizing'' induction
Chris@42:    variables in codelet loops.  The problem is that for each K and for
Chris@42:    each expression of the form P[I + STRIDE * K] in a loop, most
Chris@42:    compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
Chris@42:    For large values of K this behavior overflows the
Chris@42:    register set, which is likely worse than doing the index computation
Chris@42:    in the first place.
Chris@42: 
Chris@42:    If we guess that there are more than
Chris@42:    ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
Chris@42:    the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
Chris@42:    be 0, but the compiler does not know this. 
Chris@42: 
Chris@42:    16 registers ought to be enough for anybody, or so the amd64 and ARM ISAs
Chris@42:    seem to imply.
Chris@42: */
Chris@42: #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
Chris@42: #define MAKE_VOLATILE_STRIDE(nptr, x)                   \
Chris@42:      ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?   \
Chris@42:         0 :                                             \
Chris@42:       ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
Chris@42: #endif /* PRECOMPUTE_ARRAY_INDICES */
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* solvtab.c */
Chris@42: 
Chris@42: /* One entry of a solver table: a hook that is handed the planner,
Chris@42:    plus a name string for that hook. */
Chris@42: struct solvtab_s {
Chris@42:      void (*reg)(planner *);
Chris@42:      const char *reg_nam;
Chris@42: };
Chris@42: 
Chris@42: /* A solver table is an array of such entries; its length is not part
Chris@42:    of the type. */
Chris@42: typedef struct solvtab_s solvtab[];
Chris@42: 
Chris@42: void X(solvtab_exec)(const solvtab tbl, planner *p);
Chris@42: 
Chris@42: /* Entry initializers: SOLVTAB(s) pairs s with STRINGIZE(s);
Chris@42:    SOLVTAB_END is the all-zero entry (presumably the table
Chris@42:    terminator -- confirm in solvtab.c). */
Chris@42: #define SOLVTAB(s) { s, STRINGIZE(s) }
Chris@42: #define SOLVTAB_END { 0, 0 }
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* pickdim.c
Chris@42:    X(pickdim) receives a dimension selector which_dim, an array buddies
Chris@42:    of nbuddies ints, a read-only tensor sz, an int oop and an output
Chris@42:    pointer dp, and returns an int.
Chris@42:    NOTE(review): presumably the chosen dimension is stored in *dp and
Chris@42:    the return value is a success flag -- confirm in pickdim.c. */
Chris@42: int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
Chris@42: 	       const tensor *sz, int oop, int *dp);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* twiddle.c */
Chris@42: /* little language to express twiddle factors computation */
Chris@42: /* opcodes of the twiddle little language, numbered 0 through 5 */
Chris@42: enum {
Chris@42:      TW_COS = 0,
Chris@42:      TW_SIN = 1,
Chris@42:      TW_CEXP = 2,
Chris@42:      TW_NEXT = 3,
Chris@42:      TW_FULL = 4,
Chris@42:      TW_HALF = 5
Chris@42: };
Chris@42: 
Chris@42: typedef struct {               /* one instruction of the twiddle little language */
Chris@42:      unsigned char op;         /* presumably one of the TW_* codes -- TODO confirm */
Chris@42:      signed char v;            /* NOTE(review): operand; its meaning is not visible in this header */
Chris@42:      short i;                  /* NOTE(review): operand; its meaning is not visible in this header */
Chris@42: } tw_instr;
Chris@42: 
Chris@42: typedef struct twid_s {
Chris@42:      R *W;                     /* array of twiddle factors */
Chris@42:      INT n, r, m;                /* transform order, radix, # twiddle rows */
Chris@42:      int refcnt;               /* presumably a reference count -- TODO confirm in twiddle.c */
Chris@42:      const tw_instr *instr;    /* read-only tw_instr sequence associated with W */
Chris@42:      struct twid_s *cdr;       /* link to another twid; the name suggests the
Chris@42:                                   rest of a Lisp-style list -- TODO confirm */
Chris@42:      enum wakefulness wakefulness; /* same type as the first argument of X(twiddle_awake) */
Chris@42: } twid;
Chris@42: 
Chris@42: INT X(twiddle_length)(INT r, const tw_instr *p); /* returns an INT computed from r and the
Chris@42:    read-only instruction sequence p; presumably the number of twiddle values -- TODO confirm */
Chris@42: void X(twiddle_awake)(enum wakefulness wakefulness,
Chris@42: 		      twid **pp, const tw_instr *instr, INT n, INT r, INT m); /* pp is a
Chris@42:    twid **, so the callee can replace the caller's twid *; instr, n, r
Chris@42:    and m carry the same names as the corresponding twid fields */
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* trig.c */
Chris@42: /* trigreal is long double, __float128 or double, depending on which
Chris@42:    TRIGREAL_IS_* macro is defined; TRIGREAL_IS_LONG_DOUBLE wins if both
Chris@42:    are, and double is the default when neither is. */
Chris@42: #ifdef TRIGREAL_IS_LONG_DOUBLE
Chris@42: typedef long double trigreal;
Chris@42: #elif defined(TRIGREAL_IS_QUAD)
Chris@42: typedef __float128 trigreal;
Chris@42: #else
Chris@42: typedef double trigreal;
Chris@42: #endif
Chris@42: 
Chris@42: typedef struct triggen_s triggen;
Chris@42: 
Chris@42: struct triggen_s {
Chris@42:      void (*cexp)(triggen *t, INT m, R *result);          /* output goes through result, in precision R */
Chris@42:      void (*cexpl)(triggen *t, INT m, trigreal *result);  /* same shape as cexp, but the output is trigreal */
Chris@42:      void (*rotate)(triggen *p, INT m, R xr, R xi, R *res); /* NOTE(review): presumably rotates (xr, xi)
Chris@42:         by the angle selected by m and stores the result in res -- confirm in trig.c */
Chris@42: 
Chris@42:      INT twshft;           /* NOTE(review): twshft, twradix, twmsk, W0 and W1 look like */
Chris@42:      INT twradix;          /* the parameters and storage of trigreal lookup tables;     */
Chris@42:      INT twmsk;            /* their exact use is defined in trig.c, not in this header  */
Chris@42:      trigreal *W0, *W1;
Chris@42:      INT n;                /* same name and type as the n argument of X(mktriggen) */
Chris@42: };
Chris@42: 
Chris@42: triggen *X(mktriggen)(enum wakefulness wakefulness, INT n); /* returns a triggen *;
Chris@42:    presumably a new generator for order n -- TODO confirm in trig.c */
Chris@42: void X(triggen_destroy)(triggen *p); /* presumably releases a triggen obtained
Chris@42:    from X(mktriggen) -- TODO confirm */
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* primes.c:
Chris@42:    MULMOD(x, y, p): when x + y <= 92681 the result is (x * y) % p,
Chris@42:    computed directly.  For nonnegative operands the product is then at
Chris@42:    most 46340 * 46341 = 2147441940, which still fits in a signed 32-bit
Chris@42:    integer.  Otherwise the work is handed to X(safe_mulmod) (presumably
Chris@42:    the same value computed without overflow -- confirm in primes.c).
Chris@42:    Every argument can be evaluated more than once, so pass expressions
Chris@42:    without side effects. */
Chris@42: 
Chris@42: #define MULMOD(x, y, p) \
Chris@42:    (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
Chris@42: 
Chris@42: INT X(safe_mulmod)(INT x, INT y, INT p); /* fallback branch of MULMOD */
Chris@42: INT X(power_mod)(INT n, INT m, INT p);   /* presumably n^m mod p -- TODO confirm */
Chris@42: INT X(find_generator)(INT p);            /* presumably a generator of the multiplicative group mod p -- TODO confirm */
Chris@42: INT X(first_divisor)(INT n);             /* presumably the smallest nontrivial divisor of n -- TODO confirm */
Chris@42: int X(is_prime)(INT n);                  /* returns int, presumably a boolean */
Chris@42: INT X(next_prime)(INT n);
Chris@42: int X(factors_into)(INT n, const INT *primes); /* NOTE(review): how the primes array is
Chris@42:    terminated is not visible in this header */
Chris@42: int X(factors_into_small_primes)(INT n); /* returns int, presumably a boolean */
Chris@42: INT X(choose_radix)(INT r, INT n);
Chris@42: INT X(isqrt)(INT n);                     /* presumably the integer square root -- TODO confirm */
Chris@42: INT X(modulo)(INT a, INT n);             /* NOTE(review): sign convention of the result is not visible here */
Chris@42: 
Chris@42: #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
Chris@42: 
Chris@42: /* thresholds below which certain solvers are considered SLOW.  These are guesses
Chris@42:    believed to be conservative */
Chris@42: #define GENERIC_MAX_SLOW     16
Chris@42: #define RADER_MAX_SLOW       32
Chris@42: #define BLUESTEIN_MAX_SLOW   24
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* rader.c:
Chris@42:    rader_tl is opaque in this header (struct rader_tls is only named,
Chris@42:    never defined here).  insert and delete take a rader_tl ** and can
Chris@42:    therefore update the caller's pointer; find takes a rader_tl * and
Chris@42:    returns an R *.
Chris@42:    NOTE(review): presumably a list of arrays W keyed by (k1, k2, k3)
Chris@42:    -- confirm in rader.c. */
Chris@42: typedef struct rader_tls rader_tl;
Chris@42: 
Chris@42: void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
Chris@42: R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
Chris@42: void X(rader_tl_delete)(R *W, rader_tl **tl);
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* copy/transposition routines
Chris@42:    NOTE(review): judging by the parameter names, I and O are input and
Chris@42:    output arrays, (n0, is0, os0) and (n1, is1, os1) are the length and
Chris@42:    input/output strides of two loop dimensions, and vl is a vector
Chris@42:    length -- confirm against the definitions, which are not in this
Chris@42:    header.  The cpy2d, cpy2d_ci, cpy2d_co, cpy2d_tiled and
Chris@42:    cpy2d_tiledbuf prototypes share one parameter list; the three
Chris@42:    cpy2d_pair variants share another, which takes two inputs and two
Chris@42:    outputs and has no vl. */
Chris@42: 
Chris@42: /* lower bound to the cache size, for tiled routines */
Chris@42: #define CACHESIZE 8192 /* NOTE(review): the unit is not stated here -- confirm */
Chris@42: 
Chris@42: INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
Chris@42: 
Chris@42: void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
Chris@42: 	       void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
Chris@42: 	       void *args); /* f takes the same four bound parameters as tile2d
Chris@42:    itself plus a void *; presumably it is called once per tile with args
Chris@42:    passed through -- TODO confirm */
Chris@42: void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
Chris@42: void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0); /* only outputs, no input array */
Chris@42: void X(cpy2d)(R *I, R *O,
Chris@42: 	      INT n0, INT is0, INT os0,
Chris@42: 	      INT n1, INT is1, INT os1,
Chris@42: 	      INT vl);
Chris@42: void X(cpy2d_ci)(R *I, R *O,
Chris@42: 		 INT n0, INT is0, INT os0,
Chris@42: 		 INT n1, INT is1, INT os1,
Chris@42: 		 INT vl);
Chris@42: void X(cpy2d_co)(R *I, R *O,
Chris@42: 		 INT n0, INT is0, INT os0,
Chris@42: 		 INT n1, INT is1, INT os1,
Chris@42: 		 INT vl);
Chris@42: void X(cpy2d_tiled)(R *I, R *O,
Chris@42: 		    INT n0, INT is0, INT os0,
Chris@42: 		    INT n1, INT is1, INT os1, 
Chris@42: 		    INT vl);
Chris@42: void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@42: 		       INT n0, INT is0, INT os0,
Chris@42: 		       INT n1, INT is1, INT os1, 
Chris@42: 		       INT vl);
Chris@42: void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
Chris@42: 		   INT n0, INT is0, INT os0,
Chris@42: 		   INT n1, INT is1, INT os1);
Chris@42: void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
Chris@42: 		      INT n0, INT is0, INT os0,
Chris@42: 		      INT n1, INT is1, INT os1);
Chris@42: void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
Chris@42: 		      INT n0, INT is0, INT os0,
Chris@42: 		      INT n1, INT is1, INT os1);
Chris@42: 
Chris@42: void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl); /* single array argument I, no separate output */
Chris@42: void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42: void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
Chris@42: 
Chris@42: typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); /* pointer type matching the
Chris@42:    three transpose prototypes declared just before it */
Chris@42: typedef void (*cpy2d_func)(R *I, R *O,
Chris@42: 			   INT n0, INT is0, INT os0,
Chris@42: 			   INT n1, INT is1, INT os1,
Chris@42: 			   INT vl); /* pointer type matching the parameter list of X(cpy2d)
Chris@42:    and of its _ci, _co, _tiled and _tiledbuf variants */
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* misc stuff
Chris@42:    NOTE(review): the definitions of these helpers are not visible in
Chris@42:    this header.  What can be read off the prototypes: 
Chris@42:    X(random_estimate_seed) is declared only when FFTW_RANDOM_ESTIMATOR
Chris@42:    is defined; X(ialignment_of) alone carries the IFFTW_EXTERN
Chris@42:    decoration; X(iestimate_cost) takes const pointers to planner, plan
Chris@42:    and problem, whereas X(measure_execution_time) takes a non-const
Chris@42:    plan. */
Chris@42: void X(null_awake)(plan *ego, enum wakefulness wakefulness);
Chris@42: double X(iestimate_cost)(const planner *, const plan *, const problem *);
Chris@42: 
Chris@42: #ifdef FFTW_RANDOM_ESTIMATOR
Chris@42: extern unsigned X(random_estimate_seed);
Chris@42: #endif
Chris@42: 
Chris@42: double X(measure_execution_time)(const planner *plnr, 
Chris@42: 				 plan *pln, const problem *p);
Chris@42: IFFTW_EXTERN int X(ialignment_of)(R *p);
Chris@42: unsigned X(hash)(const char *s);
Chris@42: INT X(nbuf)(INT n, INT vl, INT maxnbuf);
Chris@42: int X(nbuf_redundant)(INT n, INT vl, size_t which, 
Chris@42: 		      const INT *maxnbuf, size_t nmaxnbuf);
Chris@42: INT X(bufdist)(INT n, INT vl);
Chris@42: int X(toobig)(INT n);
Chris@42: int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
Chris@42: 
Chris@42: /* Pointer tainting.  With HAVE_SIMD the two low-order bits of an R *
Chris@42:    carry a tag: TAINTOF(p) extracts those bits, UNTAINT(p) clears them
Chris@42:    and yields an R *, and TAINT / JOIN_TAINT defer to X(taint) /
Chris@42:    X(join_taint).  Without HAVE_SIMD, TAINT and UNTAINT yield p
Chris@42:    unchanged, TAINTOF is the constant 0 and JOIN_TAINT yields p1; the
Chris@42:    s and p2 arguments are then not evaluated at all.  Every expansion
Chris@42:    is parenthesized so it can be used inside a larger expression. */
Chris@42: #if HAVE_SIMD
Chris@42: R *X(taint)(R *p, INT s);
Chris@42: R *X(join_taint)(R *p1, R *p2);
Chris@42: #define TAINT(p, s) X(taint)(p, s)
Chris@42: #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
Chris@42: #define TAINTOF(p) (((uintptr_t)(p)) & 3)
Chris@42: #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
Chris@42: #else
Chris@42: #define TAINT(p, s) (p)
Chris@42: #define UNTAINT(p) (p)
Chris@42: #define TAINTOF(p) 0
Chris@42: #define JOIN_TAINT(p1, p2) (p1)
Chris@42: #endif
Chris@42: 
Chris@42: #ifdef FFTW_DEBUG_ALIGNMENT /* debug builds: declare a local double and
Chris@42:    use CK to check that the low three bits of its address are zero,
Chris@42:    i.e. that it is 8-byte aligned; the expansion is a brace-enclosed
Chris@42:    block.  Otherwise the macro expands to nothing. */
Chris@42: #  define ASSERT_ALIGNED_DOUBLE {		\
Chris@42:      double __foo;				\
Chris@42:      CK(!(((uintptr_t) &__foo) & 0x7));		\
Chris@42: }
Chris@42: #else
Chris@42: #  define ASSERT_ALIGNED_DOUBLE 
Chris@42: #endif /* FFTW_DEBUG_ALIGNMENT */
Chris@42: 
Chris@42: 
Chris@42: 
Chris@42: /*-----------------------------------------------------------------------*/
Chris@42: /* macros used in codelets to reduce source code size */
Chris@42: 
Chris@42: /* E is the type in which codelets do their arithmetic; here it is
Chris@42:    simply R. */
Chris@42: typedef R E;
Chris@42: 
Chris@42: /* K(x) turns the numeric literal x into a constant of type E.  The
Chris@42:    literal gets an L suffix under FFTW_LDOUBLE, a Q suffix under
Chris@42:    FFTW_QUAD, and no suffix otherwise; the suffix is pasted on, so x
Chris@42:    has to be a bare literal in those two configurations. */
Chris@42: #ifdef FFTW_LDOUBLE
Chris@42: #  define K(x) ((E) x##L)
Chris@42: #elif defined(FFTW_QUAD)
Chris@42: #  define K(x) ((E) x##Q)
Chris@42: #else
Chris@42: #  define K(x) ((E) x)
Chris@42: #endif
Chris@42: 
Chris@42: /* DK(name, value) declares a const E called name, initialized with
Chris@42:    K(value); the caller supplies the terminating semicolon. */
Chris@42: #define DK(name, value) const E name = K(value)
Chris@42: 
Chris@42: /* FMA macros */
Chris@42: 
Chris@42: #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
Chris@42: /* The obvious expression a * b + c does not work.  If both x = a * b
Chris@42:    + c and y = a * b - c appear in the source, gcc computes t = a * b,
Chris@42:    x = t + c, y = t - c, thus destroying the fma.
Chris@42: 
Chris@42:    This peculiar coding seems to do the right thing on all of
Chris@42:    gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3.  It does the right thing
Chris@42:    on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
Chris@42:    `x' for the single-assignment form).
Chris@42: 
Chris@42:    However, gcc-4.0 is a formidable adversary which succeeds in
Chris@42:    pessimizing two fma's into one multiplication and two additions.
Chris@42:    It does it very early in the game---before the optimization passes
Chris@42:    even start.  The only real workaround seems to use fake inline asm
Chris@42:    such as
Chris@42: 
Chris@42:      asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
Chris@42:      return a * b + c;
Chris@42:      
Chris@42:    in each of the FMA, FMS, FNMA, and FNMS functions.  However, this
Chris@42:    does not solve the problem either, because two equal asm statements
Chris@42:    count as a common subexpression!  One must use *different* fake asm
Chris@42:    statements:
Chris@42: 
Chris@42:    in FMA:
Chris@42:      asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
Chris@42: 
Chris@42:    in FMS:
Chris@42:      asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
Chris@42: 
Chris@42:    etc.
Chris@42: 
Chris@42:    After these changes, gcc recalcitrantly generates the fma that was
Chris@42:    in the source to begin with.  However, the extra asm() cruft
Chris@42:    confuses other passes of gcc, notably the instruction scheduler.
Chris@42:    (Of course, one could also generate the fma directly via inline
Chris@42:    asm, but this confuses the scheduler even more.)
Chris@42: 
Chris@42:    Steven and I have submitted more than one bug report to the gcc
Chris@42:    mailing list over the past few years, to no effect.  Thus, I give
Chris@42:    up.  gcc-4.0 can go to hell.  I'll wait at least until gcc-4.3 is
Chris@42:    out before touching this crap again.
Chris@42: */
Chris@42: static __inline__ E FMA(E a, E b, E c) /* returns a * b + c */
Chris@42: {
Chris@42:      E x = a * b;   /* product and sum are deliberately two separate
Chris@42:                        statements on the same variable; see the long
Chris@42:                        comment above about gcc and fma's */
Chris@42:      x = x + c;
Chris@42:      return x;
Chris@42: }
Chris@42: 
Chris@42: static __inline__ E FMS(E a, E b, E c) /* returns a * b - c */
Chris@42: {
Chris@42:      E x = a * b;   /* product first, subtraction as a separate statement
Chris@42:                        on the same variable; the shape is intentional */
Chris@42:      x = x - c;
Chris@42:      return x;
Chris@42: }
Chris@42: 
Chris@42: static __inline__ E FNMA(E a, E b, E c) /* returns -(a * b + c) */
Chris@42: {
Chris@42:      E x = a * b;   /* product first; sum and negation follow as a separate
Chris@42:                        statement on the same variable; the shape is intentional */
Chris@42:      x = - (x + c);
Chris@42:      return x;
Chris@42: }
Chris@42: 
Chris@42: static __inline__ E FNMS(E a, E b, E c) /* returns -(a * b - c), i.e. c - a * b */
Chris@42: {
Chris@42:      E x = a * b;   /* product first; difference and negation follow as a
Chris@42:                        separate statement on the same variable; the shape
Chris@42:                        is intentional */
Chris@42:      x = - (x - c);
Chris@42:      return x;
Chris@42: }
Chris@42: #else
Chris@42: #define FMA(a, b, c) (((a) * (b)) + (c))      /*   a * b + c  */
Chris@42: #define FMS(a, b, c) (((a) * (b)) - (c))      /*   a * b - c  */
Chris@42: #define FNMA(a, b, c) (- (((a) * (b)) + (c))) /* -(a * b + c) */
Chris@42: #define FNMS(a, b, c) ((c) - ((a) * (b)))     /*   c - a * b; the inline-function variant
Chris@42:    of FNMS spells the same quantity as -(a * b - c) */
Chris@42: #endif
Chris@42: 
Chris@42: #ifdef __cplusplus
Chris@42: }  /* extern "C" */
Chris@42: #endif /* __cplusplus */
Chris@42: 
Chris@42: #endif /* __IFFTW_H__ */