cannam@95: /* cannam@95: * Copyright (c) 2003, 2007-11 Matteo Frigo cannam@95: * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology cannam@95: * cannam@95: * This program is free software; you can redistribute it and/or modify cannam@95: * it under the terms of the GNU General Public License as published by cannam@95: * the Free Software Foundation; either version 2 of the License, or cannam@95: * (at your option) any later version. cannam@95: * cannam@95: * This program is distributed in the hope that it will be useful, cannam@95: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@95: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@95: * GNU General Public License for more details. cannam@95: * cannam@95: * You should have received a copy of the GNU General Public License cannam@95: * along with this program; if not, write to the Free Software cannam@95: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA cannam@95: * cannam@95: */ cannam@95: cannam@95: cannam@95: /* FFTW internal header file */ cannam@95: #ifndef __IFFTW_H__ cannam@95: #define __IFFTW_H__ cannam@95: cannam@95: #include "config.h" cannam@95: cannam@95: #include /* size_t */ cannam@95: #include /* va_list */ cannam@95: #include /* ptrdiff_t */ cannam@95: cannam@95: #if HAVE_SYS_TYPES_H cannam@95: # include cannam@95: #endif cannam@95: cannam@95: #if HAVE_STDINT_H cannam@95: # include /* uintptr_t, maybe */ cannam@95: #endif cannam@95: cannam@95: #if HAVE_INTTYPES_H cannam@95: # include /* uintptr_t, maybe */ cannam@95: #endif cannam@95: cannam@95: #ifdef __cplusplus cannam@95: extern "C" cannam@95: { cannam@95: #endif /* __cplusplus */ cannam@95: cannam@95: /* Windows annoyances -- since tests/hook.c uses some internal cannam@95: FFTW functions, we need to given them the dllexport attribute cannam@95: under Windows when compiling as a DLL (see api/fftw3.h). */ cannam@95: #if defined(FFTW_EXTERN) cannam@95: # define IFFTW_EXTERN FFTW_EXTERN cannam@95: #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \ cannam@95: && (defined(_WIN32) || defined(__WIN32__)) cannam@95: # define IFFTW_EXTERN extern __declspec(dllexport) cannam@95: #else cannam@95: # define IFFTW_EXTERN extern cannam@95: #endif cannam@95: cannam@95: /* determine precision and name-mangling scheme */ cannam@95: #define CONCAT(prefix, name) prefix ## name cannam@95: #if defined(FFTW_SINGLE) cannam@95: typedef float R; cannam@95: # define X(name) CONCAT(fftwf_, name) cannam@95: #elif defined(FFTW_LDOUBLE) cannam@95: typedef long double R; cannam@95: # define X(name) CONCAT(fftwl_, name) cannam@95: # define TRIGREAL_IS_LONG_DOUBLE cannam@95: #elif defined(FFTW_QUAD) cannam@95: typedef __float128 R; cannam@95: # define X(name) CONCAT(fftwq_, name) cannam@95: # define TRIGREAL_IS_QUAD cannam@95: #else cannam@95: typedef double R; cannam@95: # define X(name) CONCAT(fftw_, name) cannam@95: #endif cannam@95: cannam@95: /* cannam@95: integral type large enough to contain a stride (what ``int'' should cannam@95: have been in the first place. cannam@95: */ cannam@95: typedef ptrdiff_t INT; cannam@95: cannam@95: /* dummy use of unused parameters to silence compiler warnings */ cannam@95: #define UNUSED(x) (void)x cannam@95: cannam@95: #define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0]))) cannam@95: cannam@95: #define FFT_SIGN (-1) /* sign convention for forward transforms */ cannam@95: extern void X(extract_reim)(int sign, R *c, R **r, R **i); cannam@95: cannam@95: #define REGISTER_SOLVER(p, s) X(solver_register)(p, s) cannam@95: cannam@95: #define STRINGIZEx(x) #x cannam@95: #define STRINGIZE(x) STRINGIZEx(x) cannam@95: #define CIMPLIES(ante, post) (!(ante) || (post)) cannam@95: cannam@95: /* define HAVE_SIMD if any simd extensions are supported */ cannam@95: #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \ cannam@95: defined(HAVE_MIPS_PS) || defined(HAVE_AVX) cannam@95: #define HAVE_SIMD 1 cannam@95: #else cannam@95: #define HAVE_SIMD 0 cannam@95: #endif cannam@95: cannam@95: extern int X(have_simd_sse2)(void); cannam@95: extern int X(have_simd_avx)(void); cannam@95: extern int X(have_simd_altivec)(void); cannam@95: extern int X(have_simd_neon)(void); cannam@95: cannam@95: /* forward declarations */ cannam@95: typedef struct problem_s problem; cannam@95: typedef struct plan_s plan; cannam@95: typedef struct solver_s solver; cannam@95: typedef struct planner_s planner; cannam@95: typedef struct printer_s printer; cannam@95: typedef struct scanner_s scanner; cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* alloca: */ cannam@95: #if HAVE_SIMD cannam@95: # ifdef HAVE_AVX cannam@95: # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for cannam@95: * everything else */ cannam@95: # else cannam@95: /* Note that we cannot use 32-byte alignment for all SIMD. For cannam@95: example, MacOS X malloc is 16-byte aligned, but there was no cannam@95: posix_memalign in MacOS X until version 10.6. */ cannam@95: # define MIN_ALIGNMENT 16 cannam@95: # endif cannam@95: #endif cannam@95: cannam@95: #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA) cannam@95: /* use alloca if available */ cannam@95: cannam@95: #ifndef alloca cannam@95: #ifdef __GNUC__ cannam@95: # define alloca __builtin_alloca cannam@95: #else cannam@95: # ifdef _MSC_VER cannam@95: # include cannam@95: # define alloca _alloca cannam@95: # else cannam@95: # if HAVE_ALLOCA_H cannam@95: # include cannam@95: # else cannam@95: # ifdef _AIX cannam@95: #pragma alloca cannam@95: # else cannam@95: # ifndef alloca /* predefined by HP cc +Olibcalls */ cannam@95: void *alloca(size_t); cannam@95: # endif cannam@95: # endif cannam@95: # endif cannam@95: # endif cannam@95: #endif cannam@95: #endif cannam@95: cannam@95: # ifdef MIN_ALIGNMENT cannam@95: # define STACK_MALLOC(T, p, n) \ cannam@95: { \ cannam@95: p = (T)alloca((n) + MIN_ALIGNMENT); \ cannam@95: p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \ cannam@95: (~(uintptr_t)(MIN_ALIGNMENT - 1))); \ cannam@95: } cannam@95: # define STACK_FREE(n) cannam@95: # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */ cannam@95: # define STACK_MALLOC(T, p, n) p = (T)alloca(n) cannam@95: # define STACK_FREE(n) cannam@95: # endif cannam@95: cannam@95: #else /* ! HAVE_ALLOCA */ cannam@95: /* use malloc instead of alloca */ cannam@95: # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER) cannam@95: # define STACK_FREE(n) X(ifree)(n) cannam@95: #endif /* ! HAVE_ALLOCA */ cannam@95: cannam@95: /* allocation of buffers. If these grow too large use malloc(), else cannam@95: use STACK_MALLOC (hopefully reducing to alloca()). */ cannam@95: cannam@95: /* 64KiB ought to be enough for anybody */ cannam@95: #define MAX_STACK_ALLOC ((size_t)64 * 1024) cannam@95: cannam@95: #define BUF_ALLOC(T, p, n) \ cannam@95: { \ cannam@95: if (n < MAX_STACK_ALLOC) { \ cannam@95: STACK_MALLOC(T, p, n); \ cannam@95: } else { \ cannam@95: p = (T)MALLOC(n, BUFFERS); \ cannam@95: } \ cannam@95: } cannam@95: cannam@95: #define BUF_FREE(p, n) \ cannam@95: { \ cannam@95: if (n < MAX_STACK_ALLOC) { \ cannam@95: STACK_FREE(p); \ cannam@95: } else { \ cannam@95: X(ifree)(p); \ cannam@95: } \ cannam@95: } cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* define uintptr_t if it is not already defined */ cannam@95: cannam@95: #ifndef HAVE_UINTPTR_T cannam@95: # if SIZEOF_VOID_P == 0 cannam@95: # error sizeof void* is unknown! cannam@95: # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P cannam@95: typedef unsigned int uintptr_t; cannam@95: # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P cannam@95: typedef unsigned long uintptr_t; cannam@95: # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P cannam@95: typedef unsigned long long uintptr_t; cannam@95: # else cannam@95: # error no unsigned integer type matches void* sizeof! cannam@95: # endif cannam@95: #endif cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* We can do an optimization for copying pairs of (aligned) floats cannam@95: when in single precision if 2*float = double. */ cannam@95: cannam@95: #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \ cannam@95: && SIZEOF_FLOAT != 0 \ cannam@95: && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT) cannam@95: cannam@95: #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0) cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* assert.c: */ cannam@95: IFFTW_EXTERN void X(assertion_failed)(const char *s, cannam@95: int line, const char *file); cannam@95: cannam@95: /* always check */ cannam@95: #define CK(ex) \ cannam@95: (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) cannam@95: cannam@95: #ifdef FFTW_DEBUG cannam@95: /* check only if debug enabled */ cannam@95: #define A(ex) \ cannam@95: (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) cannam@95: #else cannam@95: #define A(ex) /* nothing */ cannam@95: #endif cannam@95: cannam@95: extern void X(debug)(const char *format, ...); cannam@95: #define D X(debug) cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* kalloc.c: */ cannam@95: extern void *X(kernel_malloc)(size_t n); cannam@95: extern void X(kernel_free)(void *p); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* alloc.c: */ cannam@95: cannam@95: /* objects allocated by malloc, for statistical purposes */ cannam@95: enum malloc_tag { cannam@95: EVERYTHING, cannam@95: PLANS, cannam@95: SOLVERS, cannam@95: PROBLEMS, cannam@95: BUFFERS, cannam@95: HASHT, cannam@95: TENSORS, cannam@95: PLANNERS, cannam@95: SLVDESCS, cannam@95: TWIDDLES, cannam@95: STRIDES, cannam@95: OTHER, cannam@95: MALLOC_WHAT_LAST /* must be last */ cannam@95: }; cannam@95: cannam@95: IFFTW_EXTERN void X(ifree)(void *ptr); cannam@95: extern void X(ifree0)(void *ptr); cannam@95: cannam@95: #ifdef FFTW_DEBUG_MALLOC cannam@95: cannam@95: IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what, cannam@95: const char *file, int line); cannam@95: #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__) cannam@95: IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose); cannam@95: cannam@95: #else /* ! FFTW_DEBUG_MALLOC */ cannam@95: cannam@95: IFFTW_EXTERN void *X(malloc_plain)(size_t sz); cannam@95: #define MALLOC(n, what) X(malloc_plain)(n) cannam@95: cannam@95: #endif cannam@95: cannam@95: #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP)) cannam@95: extern int X(in_thread); cannam@95: # define IN_THREAD X(in_thread) cannam@95: # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1 cannam@95: # define THREAD_OFF X(in_thread) = in_thread_save; } cannam@95: #else cannam@95: # define IN_THREAD 0 cannam@95: # define THREAD_ON cannam@95: # define THREAD_OFF cannam@95: #endif cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* low-resolution clock */ cannam@95: cannam@95: #ifdef FAKE_CRUDE_TIME cannam@95: typedef int crude_time; cannam@95: #else cannam@95: # if TIME_WITH_SYS_TIME cannam@95: # include cannam@95: # include cannam@95: # else cannam@95: # if HAVE_SYS_TIME_H cannam@95: # include cannam@95: # else cannam@95: # include cannam@95: # endif cannam@95: # endif cannam@95: cannam@95: # ifdef HAVE_BSDGETTIMEOFDAY cannam@95: # ifndef HAVE_GETTIMEOFDAY cannam@95: # define gettimeofday BSDgettimeofday cannam@95: # define HAVE_GETTIMEOFDAY 1 cannam@95: # endif cannam@95: # endif cannam@95: cannam@95: # if defined(HAVE_GETTIMEOFDAY) cannam@95: typedef struct timeval crude_time; cannam@95: # else cannam@95: typedef clock_t crude_time; cannam@95: # endif cannam@95: #endif /* else FAKE_CRUDE_TIME */ cannam@95: cannam@95: crude_time X(get_crude_time)(void); cannam@95: double X(elapsed_since)(const planner *plnr, const problem *p, cannam@95: crude_time t0); /* time in seconds since t0 */ cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* ops.c: */ cannam@95: /* cannam@95: * ops counter. The total number of additions is add + fma cannam@95: * and the total number of multiplications is mul + fma. cannam@95: * Total flops = add + mul + 2 * fma cannam@95: */ cannam@95: typedef struct { cannam@95: double add; cannam@95: double mul; cannam@95: double fma; cannam@95: double other; cannam@95: } opcnt; cannam@95: cannam@95: void X(ops_zero)(opcnt *dst); cannam@95: void X(ops_other)(INT o, opcnt *dst); cannam@95: void X(ops_cpy)(const opcnt *src, opcnt *dst); cannam@95: cannam@95: void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst); cannam@95: void X(ops_add2)(const opcnt *a, opcnt *dst); cannam@95: cannam@95: /* dst = m * a + b */ cannam@95: void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst); cannam@95: cannam@95: /* dst += m * a */ cannam@95: void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst); cannam@95: cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* minmax.c: */ cannam@95: INT X(imax)(INT a, INT b); cannam@95: INT X(imin)(INT a, INT b); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* iabs.c: */ cannam@95: INT X(iabs)(INT a); cannam@95: cannam@95: /* inline version */ cannam@95: #define IABS(x) (((x) < 0) ? (0 - (x)) : (x)) cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* md5.c */ cannam@95: cannam@95: #if SIZEOF_UNSIGNED_INT >= 4 cannam@95: typedef unsigned int md5uint; cannam@95: #else cannam@95: typedef unsigned long md5uint; /* at least 32 bits as per C standard */ cannam@95: #endif cannam@95: cannam@95: typedef md5uint md5sig[4]; cannam@95: cannam@95: typedef struct { cannam@95: md5sig s; /* state and signature */ cannam@95: cannam@95: /* fields not meant to be used outside md5.c: */ cannam@95: unsigned char c[64]; /* stuff not yet processed */ cannam@95: unsigned l; /* total length. Should be 64 bits long, but this is cannam@95: good enough for us */ cannam@95: } md5; cannam@95: cannam@95: void X(md5begin)(md5 *p); cannam@95: void X(md5putb)(md5 *p, const void *d_, size_t len); cannam@95: void X(md5puts)(md5 *p, const char *s); cannam@95: void X(md5putc)(md5 *p, unsigned char c); cannam@95: void X(md5int)(md5 *p, int i); cannam@95: void X(md5INT)(md5 *p, INT i); cannam@95: void X(md5unsigned)(md5 *p, unsigned i); cannam@95: void X(md5end)(md5 *p); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* tensor.c: */ cannam@95: #define STRUCT_HACK_KR cannam@95: #undef STRUCT_HACK_C99 cannam@95: cannam@95: typedef struct { cannam@95: INT n; cannam@95: INT is; /* input stride */ cannam@95: INT os; /* output stride */ cannam@95: } iodim; cannam@95: cannam@95: typedef struct { cannam@95: int rnk; cannam@95: #if defined(STRUCT_HACK_KR) cannam@95: iodim dims[1]; cannam@95: #elif defined(STRUCT_HACK_C99) cannam@95: iodim dims[]; cannam@95: #else cannam@95: iodim *dims; cannam@95: #endif cannam@95: } tensor; cannam@95: cannam@95: /* cannam@95: Definition of rank -infinity. cannam@95: This definition has the property that if you want rank 0 or 1, cannam@95: you can simply test for rank <= 1. This is a common case. cannam@95: cannam@95: A tensor of rank -infinity has size 0. cannam@95: */ cannam@95: #define RNK_MINFTY ((int)(((unsigned) -1) >> 1)) cannam@95: #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY) cannam@95: cannam@95: typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind; cannam@95: cannam@95: tensor *X(mktensor)(int rnk); cannam@95: tensor *X(mktensor_0d)(void); cannam@95: tensor *X(mktensor_1d)(INT n, INT is, INT os); cannam@95: tensor *X(mktensor_2d)(INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1); cannam@95: tensor *X(mktensor_3d)(INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT n2, INT is2, INT os2); cannam@95: tensor *X(mktensor_4d)(INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT n2, INT is2, INT os2, cannam@95: INT n3, INT is3, INT os3); cannam@95: tensor *X(mktensor_5d)(INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT n2, INT is2, INT os2, cannam@95: INT n3, INT is3, INT os3, cannam@95: INT n4, INT is4, INT os4); cannam@95: INT X(tensor_sz)(const tensor *sz); cannam@95: void X(tensor_md5)(md5 *p, const tensor *t); cannam@95: INT X(tensor_max_index)(const tensor *sz); cannam@95: INT X(tensor_min_istride)(const tensor *sz); cannam@95: INT X(tensor_min_ostride)(const tensor *sz); cannam@95: INT X(tensor_min_stride)(const tensor *sz); cannam@95: int X(tensor_inplace_strides)(const tensor *sz); cannam@95: int X(tensor_inplace_strides2)(const tensor *a, const tensor *b); cannam@95: int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz, cannam@95: inplace_kind k); cannam@95: tensor *X(tensor_copy)(const tensor *sz); cannam@95: int X(tensor_kosherp)(const tensor *x); cannam@95: cannam@95: tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k); cannam@95: tensor *X(tensor_copy_except)(const tensor *sz, int except_dim); cannam@95: tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk); cannam@95: tensor *X(tensor_compress)(const tensor *sz); cannam@95: tensor *X(tensor_compress_contiguous)(const tensor *sz); cannam@95: tensor *X(tensor_append)(const tensor *a, const tensor *b); cannam@95: void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b); cannam@95: int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os); cannam@95: void X(tensor_destroy)(tensor *sz); cannam@95: void X(tensor_destroy2)(tensor *a, tensor *b); cannam@95: void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d); cannam@95: void X(tensor_print)(const tensor *sz, printer *p); cannam@95: int X(dimcmp)(const iodim *a, const iodim *b); cannam@95: int X(tensor_equal)(const tensor *a, const tensor *b); cannam@95: int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* problem.c: */ cannam@95: enum { cannam@95: /* a problem that cannot be solved */ cannam@95: PROBLEM_UNSOLVABLE, cannam@95: cannam@95: PROBLEM_DFT, cannam@95: PROBLEM_RDFT, cannam@95: PROBLEM_RDFT2, cannam@95: cannam@95: /* for mpi/ subdirectory */ cannam@95: PROBLEM_MPI_DFT, cannam@95: PROBLEM_MPI_RDFT, cannam@95: PROBLEM_MPI_RDFT2, cannam@95: PROBLEM_MPI_TRANSPOSE, cannam@95: cannam@95: PROBLEM_LAST cannam@95: }; cannam@95: cannam@95: typedef struct { cannam@95: int problem_kind; cannam@95: void (*hash) (const problem *ego, md5 *p); cannam@95: void (*zero) (const problem *ego); cannam@95: void (*print) (const problem *ego, printer *p); cannam@95: void (*destroy) (problem *ego); cannam@95: } problem_adt; cannam@95: cannam@95: struct problem_s { cannam@95: const problem_adt *adt; cannam@95: }; cannam@95: cannam@95: problem *X(mkproblem)(size_t sz, const problem_adt *adt); cannam@95: void X(problem_destroy)(problem *ego); cannam@95: problem *X(mkproblem_unsolvable)(void); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* print.c */ cannam@95: struct printer_s { cannam@95: void (*print)(printer *p, const char *format, ...); cannam@95: void (*vprint)(printer *p, const char *format, va_list ap); cannam@95: void (*putchr)(printer *p, char c); cannam@95: void (*cleanup)(printer *p); cannam@95: int indent; cannam@95: int indent_incr; cannam@95: }; cannam@95: cannam@95: printer *X(mkprinter)(size_t size, cannam@95: void (*putchr)(printer *p, char c), cannam@95: void (*cleanup)(printer *p)); cannam@95: IFFTW_EXTERN void X(printer_destroy)(printer *p); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* scan.c */ cannam@95: struct scanner_s { cannam@95: int (*scan)(scanner *sc, const char *format, ...); cannam@95: int (*vscan)(scanner *sc, const char *format, va_list ap); cannam@95: int (*getchr)(scanner *sc); cannam@95: int ungotc; cannam@95: }; cannam@95: cannam@95: scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc)); cannam@95: void X(scanner_destroy)(scanner *sc); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* plan.c: */ cannam@95: cannam@95: enum wakefulness { cannam@95: SLEEPY, cannam@95: AWAKE_ZERO, cannam@95: AWAKE_SQRTN_TABLE, cannam@95: AWAKE_SINCOS cannam@95: }; cannam@95: cannam@95: typedef struct { cannam@95: void (*solve)(const plan *ego, const problem *p); cannam@95: void (*awake)(plan *ego, enum wakefulness wakefulness); cannam@95: void (*print)(const plan *ego, printer *p); cannam@95: void (*destroy)(plan *ego); cannam@95: } plan_adt; cannam@95: cannam@95: struct plan_s { cannam@95: const plan_adt *adt; cannam@95: opcnt ops; cannam@95: double pcost; cannam@95: enum wakefulness wakefulness; /* used for debugging only */ cannam@95: int could_prune_now_p; cannam@95: }; cannam@95: cannam@95: plan *X(mkplan)(size_t size, const plan_adt *adt); cannam@95: void X(plan_destroy_internal)(plan *ego); cannam@95: IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness); cannam@95: void X(plan_null_destroy)(plan *ego); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* solver.c: */ cannam@95: typedef struct { cannam@95: int problem_kind; cannam@95: plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr); cannam@95: void (*destroy)(solver *ego); cannam@95: } solver_adt; cannam@95: cannam@95: struct solver_s { cannam@95: const solver_adt *adt; cannam@95: int refcnt; cannam@95: }; cannam@95: cannam@95: solver *X(mksolver)(size_t size, const solver_adt *adt); cannam@95: void X(solver_use)(solver *ego); cannam@95: void X(solver_destroy)(solver *ego); cannam@95: void X(solver_register)(planner *plnr, solver *s); cannam@95: cannam@95: /* shorthand */ cannam@95: #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt) cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* planner.c */ cannam@95: cannam@95: typedef struct slvdesc_s { cannam@95: solver *slv; cannam@95: const char *reg_nam; cannam@95: unsigned nam_hash; cannam@95: int reg_id; cannam@95: int next_for_same_problem_kind; cannam@95: } slvdesc; cannam@95: cannam@95: typedef struct solution_s solution; /* opaque */ cannam@95: cannam@95: /* interpretation of L and U: cannam@95: cannam@95: - if it returns a plan, the planner guarantees that all applicable cannam@95: plans at least as impatient as U have been tried, and that each cannam@95: plan in the solution is at least as impatient as L. cannam@95: cannam@95: - if it returns 0, the planner guarantees to have tried all solvers cannam@95: at least as impatient as L, and that none of them was applicable. cannam@95: cannam@95: The structure is packed to fit into 64 bits. cannam@95: */ cannam@95: cannam@95: typedef struct { cannam@95: unsigned l:20; cannam@95: unsigned hash_info:3; cannam@95: # define BITS_FOR_TIMELIMIT 9 cannam@95: unsigned timelimit_impatience:BITS_FOR_TIMELIMIT; cannam@95: unsigned u:20; cannam@95: cannam@95: /* abstraction break: we store the solver here to pad the cannam@95: structure to 64 bits. Otherwise, the struct is padded to 64 cannam@95: bits anyway, and another word is allocated for slvndx. */ cannam@95: # define BITS_FOR_SLVNDX 12 cannam@95: unsigned slvndx:BITS_FOR_SLVNDX; cannam@95: } flags_t; cannam@95: cannam@95: /* impatience flags */ cannam@95: enum { cannam@95: BELIEVE_PCOST = 0x0001, cannam@95: ESTIMATE = 0x0002, cannam@95: NO_DFT_R2HC = 0x0004, cannam@95: NO_SLOW = 0x0008, cannam@95: NO_VRECURSE = 0x0010, cannam@95: NO_INDIRECT_OP = 0x0020, cannam@95: NO_LARGE_GENERIC = 0x0040, cannam@95: NO_RANK_SPLITS = 0x0080, cannam@95: NO_VRANK_SPLITS = 0x0100, cannam@95: NO_NONTHREADED = 0x0200, cannam@95: NO_BUFFERING = 0x0400, cannam@95: NO_FIXED_RADIX_LARGE_N = 0x0800, cannam@95: NO_DESTROY_INPUT = 0x1000, cannam@95: NO_SIMD = 0x2000, cannam@95: CONSERVE_MEMORY = 0x4000, cannam@95: NO_DHT_R2HC = 0x8000, cannam@95: NO_UGLY = 0x10000, cannam@95: ALLOW_PRUNING = 0x20000 cannam@95: }; cannam@95: cannam@95: /* hashtable information */ cannam@95: enum { cannam@95: BLESSING = 0x1, /* save this entry */ cannam@95: H_VALID = 0x2, /* valid hastable entry */ cannam@95: H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */ cannam@95: }; cannam@95: cannam@95: #define PLNR_L(plnr) ((plnr)->flags.l) cannam@95: #define PLNR_U(plnr) ((plnr)->flags.u) cannam@95: #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience) cannam@95: cannam@95: #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE) cannam@95: #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST) cannam@95: #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING) cannam@95: cannam@95: #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP) cannam@95: #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC) cannam@95: #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS) cannam@95: #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS) cannam@95: #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE) cannam@95: #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC) cannam@95: #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW) cannam@95: #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY) cannam@95: #define NO_FIXED_RADIX_LARGE_NP(plnr) \ cannam@95: (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N) cannam@95: #define NO_NONTHREADEDP(plnr) \ cannam@95: ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1) cannam@95: cannam@95: #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT) cannam@95: #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD) cannam@95: #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY) cannam@95: #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC) cannam@95: #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING) cannam@95: cannam@95: typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia; cannam@95: cannam@95: typedef enum { cannam@95: /* WISDOM_NORMAL: planner may or may not use wisdom */ cannam@95: WISDOM_NORMAL, cannam@95: cannam@95: /* WISDOM_ONLY: planner must use wisdom and must avoid searching */ cannam@95: WISDOM_ONLY, cannam@95: cannam@95: /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */ cannam@95: WISDOM_IS_BOGUS, cannam@95: cannam@95: /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */ cannam@95: WISDOM_IGNORE_INFEASIBLE, cannam@95: cannam@95: /* WISDOM_IGNORE_ALL: planner ignores all */ cannam@95: WISDOM_IGNORE_ALL cannam@95: } wisdom_state_t; cannam@95: cannam@95: typedef struct { cannam@95: void (*register_solver)(planner *ego, solver *s); cannam@95: plan *(*mkplan)(planner *ego, const problem *p); cannam@95: void (*forget)(planner *ego, amnesia a); cannam@95: void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved cannam@95: word in C++. */ cannam@95: int (*imprt)(planner *ego, scanner *sc); cannam@95: } planner_adt; cannam@95: cannam@95: /* hash table of solutions */ cannam@95: typedef struct { cannam@95: solution *solutions; cannam@95: unsigned hashsiz, nelem; cannam@95: cannam@95: /* statistics */ cannam@95: int lookup, succ_lookup, lookup_iter; cannam@95: int insert, insert_iter, insert_unknown; cannam@95: int nrehash; cannam@95: } hashtab; cannam@95: cannam@95: typedef enum { COST_SUM, COST_MAX } cost_kind; cannam@95: cannam@95: struct planner_s { cannam@95: const planner_adt *adt; cannam@95: void (*hook)(struct planner_s *plnr, plan *pln, cannam@95: const problem *p, int optimalp); cannam@95: double (*cost_hook)(const problem *p, double t, cost_kind k); cannam@95: int (*wisdom_ok_hook)(const problem *p, flags_t flags); cannam@95: void (*nowisdom_hook)(const problem *p); cannam@95: wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p); cannam@95: cannam@95: /* solver descriptors */ cannam@95: slvdesc *slvdescs; cannam@95: unsigned nslvdesc, slvdescsiz; cannam@95: const char *cur_reg_nam; cannam@95: int cur_reg_id; cannam@95: int slvdescs_for_problem_kind[PROBLEM_LAST]; cannam@95: cannam@95: wisdom_state_t wisdom_state; cannam@95: cannam@95: hashtab htab_blessed; cannam@95: hashtab htab_unblessed; cannam@95: cannam@95: int nthr; cannam@95: flags_t flags; cannam@95: cannam@95: crude_time start_time; cannam@95: double timelimit; /* elapsed_since(start_time) at which to bail out */ cannam@95: int timed_out; /* whether most recent search timed out */ cannam@95: int need_timeout_check; cannam@95: cannam@95: /* various statistics */ cannam@95: int nplan; /* number of plans evaluated */ cannam@95: double pcost, epcost; /* total pcost of measured/estimated plans */ cannam@95: int nprob; /* number of problems evaluated */ cannam@95: }; cannam@95: cannam@95: planner *X(mkplanner)(void); cannam@95: void X(planner_destroy)(planner *ego); cannam@95: cannam@95: /* cannam@95: Iterate over all solvers. Read: cannam@95: cannam@95: @article{ baker93iterators, cannam@95: author = "Henry G. Baker, Jr.", cannam@95: title = "Iterators: Signs of Weakness in Object-Oriented Languages", cannam@95: journal = "{ACM} {OOPS} Messenger", cannam@95: volume = "4", cannam@95: number = "3", cannam@95: pages = "18--25" cannam@95: } cannam@95: */ cannam@95: #define FORALL_SOLVERS(ego, s, p, what) \ cannam@95: { \ cannam@95: unsigned _cnt; \ cannam@95: for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \ cannam@95: slvdesc *p = ego->slvdescs + _cnt; \ cannam@95: solver *s = p->slv; \ cannam@95: what; \ cannam@95: } \ cannam@95: } cannam@95: cannam@95: #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \ cannam@95: { \ cannam@95: int _cnt = ego->slvdescs_for_problem_kind[kind]; \ cannam@95: while (_cnt >= 0) { \ cannam@95: slvdesc *p = ego->slvdescs + _cnt; \ cannam@95: solver *s = p->slv; \ cannam@95: what; \ cannam@95: _cnt = p->next_for_same_problem_kind; \ cannam@95: } \ cannam@95: } cannam@95: cannam@95: cannam@95: /* make plan, destroy problem */ cannam@95: plan *X(mkplan_d)(planner *ego, problem *p); cannam@95: plan *X(mkplan_f_d)(planner *ego, problem *p, cannam@95: unsigned l_set, unsigned u_set, unsigned u_reset); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* stride.c: */ cannam@95: cannam@95: /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */ cannam@95: #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE) cannam@95: #define PRECOMPUTE_ARRAY_INDICES cannam@95: #endif cannam@95: cannam@95: extern const INT X(an_INT_guaranteed_to_be_zero); cannam@95: cannam@95: #ifdef PRECOMPUTE_ARRAY_INDICES cannam@95: typedef INT *stride; cannam@95: #define WS(stride, i) (stride[i]) cannam@95: extern stride X(mkstride)(INT n, INT s); cannam@95: void X(stride_destroy)(stride p); cannam@95: /* hackery to prevent the compiler from copying the strides array cannam@95: onto the stack */ cannam@95: #define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero) cannam@95: #else cannam@95: cannam@95: typedef INT stride; cannam@95: #define WS(stride, i) (stride * i) cannam@95: #define fftwf_mkstride(n, stride) stride cannam@95: #define fftw_mkstride(n, stride) stride cannam@95: #define fftwl_mkstride(n, stride) stride cannam@95: #define fftwf_stride_destroy(p) ((void) p) cannam@95: #define fftw_stride_destroy(p) ((void) p) cannam@95: #define fftwl_stride_destroy(p) ((void) p) cannam@95: cannam@95: /* hackery to prevent the compiler from ``optimizing'' induction cannam@95: variables in codelet loops. The problem is that for each K and for cannam@95: each expression of the form P[I + STRIDE * K] in a loop, most cannam@95: compilers will try to lift an induction variable PK := &P[I + STRIDE * K]. cannam@95: For large values of K this behavior overflows the cannam@95: register set, which is likely worse than doing the index computation cannam@95: in the first place. cannam@95: cannam@95: If we guess that there are more than cannam@95: ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse cannam@95: the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to cannam@95: be 0, but the compiler does not know this. cannam@95: cannam@95: 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's cannam@95: seem to imply. cannam@95: */ cannam@95: #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16 cannam@95: #define MAKE_VOLATILE_STRIDE(nptr, x) \ cannam@95: (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \ cannam@95: 0 : \ cannam@95: ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero))) cannam@95: #endif /* PRECOMPUTE_ARRAY_INDICES */ cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* solvtab.c */ cannam@95: cannam@95: struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; cannam@95: typedef struct solvtab_s solvtab[]; cannam@95: void X(solvtab_exec)(const solvtab tbl, planner *p); cannam@95: #define SOLVTAB(s) { s, STRINGIZE(s) } cannam@95: #define SOLVTAB_END { 0, 0 } cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* pickdim.c */ cannam@95: int X(pickdim)(int which_dim, const int *buddies, int nbuddies, cannam@95: const tensor *sz, int oop, int *dp); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* twiddle.c */ cannam@95: /* little language to express twiddle factors computation */ cannam@95: enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3, cannam@95: TW_FULL = 4, TW_HALF = 5 }; cannam@95: cannam@95: typedef struct { cannam@95: unsigned char op; cannam@95: signed char v; cannam@95: short i; cannam@95: } tw_instr; cannam@95: cannam@95: typedef struct twid_s { cannam@95: R *W; /* array of twiddle factors */ cannam@95: INT n, r, m; /* transform order, radix, # twiddle rows */ cannam@95: int refcnt; cannam@95: const tw_instr *instr; cannam@95: struct twid_s *cdr; cannam@95: enum wakefulness wakefulness; cannam@95: } twid; cannam@95: cannam@95: INT X(twiddle_length)(INT r, const tw_instr *p); cannam@95: void X(twiddle_awake)(enum wakefulness wakefulness, cannam@95: twid **pp, const tw_instr *instr, INT n, INT r, INT m); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* trig.c */ cannam@95: #if defined(TRIGREAL_IS_LONG_DOUBLE) cannam@95: typedef long double trigreal; cannam@95: #elif defined(TRIGREAL_IS_QUAD) cannam@95: typedef __float128 trigreal; cannam@95: #else cannam@95: typedef double trigreal; cannam@95: #endif cannam@95: cannam@95: typedef struct triggen_s triggen; cannam@95: cannam@95: struct triggen_s { cannam@95: void (*cexp)(triggen *t, INT m, R *result); cannam@95: void (*cexpl)(triggen *t, INT m, trigreal *result); cannam@95: void (*rotate)(triggen *p, INT m, R xr, R xi, R *res); cannam@95: cannam@95: INT twshft; cannam@95: INT twradix; cannam@95: INT twmsk; cannam@95: trigreal *W0, *W1; cannam@95: INT n; cannam@95: }; cannam@95: cannam@95: triggen *X(mktriggen)(enum wakefulness wakefulness, INT n); cannam@95: void X(triggen_destroy)(triggen *p); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* primes.c: */ cannam@95: cannam@95: #define MULMOD(x, y, p) \ cannam@95: (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p)) cannam@95: cannam@95: INT X(safe_mulmod)(INT x, INT y, INT p); cannam@95: INT X(power_mod)(INT n, INT m, INT p); cannam@95: INT X(find_generator)(INT p); cannam@95: INT X(first_divisor)(INT n); cannam@95: int X(is_prime)(INT n); cannam@95: INT X(next_prime)(INT n); cannam@95: int X(factors_into)(INT n, const INT *primes); cannam@95: int X(factors_into_small_primes)(INT n); cannam@95: INT X(choose_radix)(INT r, INT n); cannam@95: INT X(isqrt)(INT n); cannam@95: INT X(modulo)(INT a, INT n); cannam@95: cannam@95: #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */ cannam@95: cannam@95: /* thresholds below which certain solvers are considered SLOW. These are guesses cannam@95: believed to be conservative */ cannam@95: #define GENERIC_MAX_SLOW 16 cannam@95: #define RADER_MAX_SLOW 32 cannam@95: #define BLUESTEIN_MAX_SLOW 24 cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* rader.c: */ cannam@95: typedef struct rader_tls rader_tl; cannam@95: cannam@95: void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl); cannam@95: R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t); cannam@95: void X(rader_tl_delete)(R *W, rader_tl **tl); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* copy/transposition routines */ cannam@95: cannam@95: /* lower bound to the cache size, for tiled routines */ cannam@95: #define CACHESIZE 8192 cannam@95: cannam@95: INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache); cannam@95: cannam@95: void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz, cannam@95: void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args), cannam@95: void *args); cannam@95: void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl); cannam@95: void X(cpy2d)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: void X(cpy2d_ci)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: void X(cpy2d_co)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: void X(cpy2d_tiled)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: void X(cpy2d_tiledbuf)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1); cannam@95: void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1); cannam@95: void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1); cannam@95: cannam@95: void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl); cannam@95: void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl); cannam@95: void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl); cannam@95: cannam@95: typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); cannam@95: typedef void (*cpy2d_func)(R *I, R *O, cannam@95: INT n0, INT is0, INT os0, cannam@95: INT n1, INT is1, INT os1, cannam@95: INT vl); cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* misc stuff */ cannam@95: void X(null_awake)(plan *ego, enum wakefulness wakefulness); cannam@95: double X(iestimate_cost)(const planner *, const plan *, const problem *); cannam@95: cannam@95: #ifdef FFTW_RANDOM_ESTIMATOR cannam@95: extern unsigned X(random_estimate_seed); cannam@95: #endif cannam@95: cannam@95: double X(measure_execution_time)(const planner *plnr, cannam@95: plan *pln, const problem *p); cannam@95: int X(alignment_of)(R *p); cannam@95: unsigned X(hash)(const char *s); cannam@95: INT X(nbuf)(INT n, INT vl, INT maxnbuf); cannam@95: int X(nbuf_redundant)(INT n, INT vl, int which, cannam@95: const INT *maxnbuf, int nmaxnbuf); cannam@95: INT X(bufdist)(INT n, INT vl); cannam@95: int X(toobig)(INT n); cannam@95: int X(ct_uglyp)(INT min_n, INT v, INT n, INT r); cannam@95: cannam@95: #if HAVE_SIMD cannam@95: R *X(taint)(R *p, INT s); cannam@95: R *X(join_taint)(R *p1, R *p2); cannam@95: #define TAINT(p, s) X(taint)(p, s) cannam@95: #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) cannam@95: #define TAINTOF(p) (((uintptr_t)(p)) & 3) cannam@95: #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2) cannam@95: #else cannam@95: #define TAINT(p, s) (p) cannam@95: #define UNTAINT(p) (p) cannam@95: #define TAINTOF(p) 0 cannam@95: #define JOIN_TAINT(p1, p2) p1 cannam@95: #endif cannam@95: cannam@95: #ifdef FFTW_DEBUG_ALIGNMENT cannam@95: # define ASSERT_ALIGNED_DOUBLE { \ cannam@95: double __foo; \ cannam@95: CK(!(((uintptr_t) &__foo) & 0x7)); \ cannam@95: } cannam@95: #else cannam@95: # define ASSERT_ALIGNED_DOUBLE cannam@95: #endif /* FFTW_DEBUG_ALIGNMENT */ cannam@95: cannam@95: cannam@95: cannam@95: /*-----------------------------------------------------------------------*/ cannam@95: /* macros used in codelets to reduce source code size */ cannam@95: cannam@95: typedef R E; /* internal precision of codelets. */ cannam@95: cannam@95: #if defined(FFTW_LDOUBLE) cannam@95: # define K(x) ((E) x##L) cannam@95: #elif defined(FFTW_QUAD) cannam@95: # define K(x) ((E) x##Q) cannam@95: #else cannam@95: # define K(x) ((E) x) cannam@95: #endif cannam@95: #define DK(name, value) const E name = K(value) cannam@95: cannam@95: /* FMA macros */ cannam@95: cannam@95: #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER)) cannam@95: /* The obvious expression a * b + c does not work. If both x = a * b cannam@95: + c and y = a * b - c appear in the source, gcc computes t = a * b, cannam@95: x = t + c, y = t - c, thus destroying the fma. cannam@95: cannam@95: This peculiar coding seems to do the right thing on all of cannam@95: gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing cannam@95: on gcc-3.4 -fno-web (because the ``web'' pass splits the variable cannam@95: `x' for the single-assignment form). cannam@95: cannam@95: However, gcc-4.0 is a formidable adversary which succeeds in cannam@95: pessimizing two fma's into one multiplication and two additions. cannam@95: It does it very early in the game---before the optimization passes cannam@95: even start. The only real workaround seems to use fake inline asm cannam@95: such as cannam@95: cannam@95: asm ("# confuse gcc %0" : "=f"(a) : "0"(a)); cannam@95: return a * b + c; cannam@95: cannam@95: in each of the FMA, FMS, FNMA, and FNMS functions. However, this cannam@95: does not solve the problem either, because two equal asm statements cannam@95: count as a common subexpression! One must use *different* fake asm cannam@95: statements: cannam@95: cannam@95: in FMA: cannam@95: asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a)); cannam@95: cannam@95: in FMS: cannam@95: asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a)); cannam@95: cannam@95: etc. cannam@95: cannam@95: After these changes, gcc recalcitrantly generates the fma that was cannam@95: in the source to begin with. However, the extra asm() cruft cannam@95: confuses other passes of gcc, notably the instruction scheduler. cannam@95: (Of course, one could also generate the fma directly via inline cannam@95: asm, but this confuses the scheduler even more.) cannam@95: cannam@95: Steven and I have submitted more than one bug report to the gcc cannam@95: mailing list over the past few years, to no effect. Thus, I give cannam@95: up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is cannam@95: out before touching this crap again. cannam@95: */ cannam@95: static __inline__ E FMA(E a, E b, E c) cannam@95: { cannam@95: E x = a * b; cannam@95: x = x + c; cannam@95: return x; cannam@95: } cannam@95: cannam@95: static __inline__ E FMS(E a, E b, E c) cannam@95: { cannam@95: E x = a * b; cannam@95: x = x - c; cannam@95: return x; cannam@95: } cannam@95: cannam@95: static __inline__ E FNMA(E a, E b, E c) cannam@95: { cannam@95: E x = a * b; cannam@95: x = - (x + c); cannam@95: return x; cannam@95: } cannam@95: cannam@95: static __inline__ E FNMS(E a, E b, E c) cannam@95: { cannam@95: E x = a * b; cannam@95: x = - (x - c); cannam@95: return x; cannam@95: } cannam@95: #else cannam@95: #define FMA(a, b, c) (((a) * (b)) + (c)) cannam@95: #define FMS(a, b, c) (((a) * (b)) - (c)) cannam@95: #define FNMA(a, b, c) (- (((a) * (b)) + (c))) cannam@95: #define FNMS(a, b, c) ((c) - ((a) * (b))) cannam@95: #endif cannam@95: cannam@95: #ifdef __cplusplus cannam@95: } /* extern "C" */ cannam@95: #endif /* __cplusplus */ cannam@95: cannam@95: #endif /* __IFFTW_H__ */