Mercurial > hg > sv-dependency-builds
diff src/fftw-3.3.3/kernel/ifftw.h @ 95:89f5e221ed7b
Add FFTW3
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Wed, 20 Mar 2013 15:35:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/fftw-3.3.3/kernel/ifftw.h Wed Mar 20 15:35:50 2013 +0000 @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2003, 2007-11 Matteo Frigo + * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +/* FFTW internal header file */ +#ifndef __IFFTW_H__ +#define __IFFTW_H__ + +#include "config.h" + +#include <stdlib.h> /* size_t */ +#include <stdarg.h> /* va_list */ +#include <stddef.h> /* ptrdiff_t */ + +#if HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif + +#if HAVE_STDINT_H +# include <stdint.h> /* uintptr_t, maybe */ +#endif + +#if HAVE_INTTYPES_H +# include <inttypes.h> /* uintptr_t, maybe */ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + +/* Windows annoyances -- since tests/hook.c uses some internal + FFTW functions, we need to given them the dllexport attribute + under Windows when compiling as a DLL (see api/fftw3.h). */ +#if defined(FFTW_EXTERN) +# define IFFTW_EXTERN FFTW_EXTERN +#elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \ + && (defined(_WIN32) || defined(__WIN32__)) +# define IFFTW_EXTERN extern __declspec(dllexport) +#else +# define IFFTW_EXTERN extern +#endif + +/* determine precision and name-mangling scheme */ +#define CONCAT(prefix, name) prefix ## name +#if defined(FFTW_SINGLE) + typedef float R; +# define X(name) CONCAT(fftwf_, name) +#elif defined(FFTW_LDOUBLE) + typedef long double R; +# define X(name) CONCAT(fftwl_, name) +# define TRIGREAL_IS_LONG_DOUBLE +#elif defined(FFTW_QUAD) + typedef __float128 R; +# define X(name) CONCAT(fftwq_, name) +# define TRIGREAL_IS_QUAD +#else + typedef double R; +# define X(name) CONCAT(fftw_, name) +#endif + +/* + integral type large enough to contain a stride (what ``int'' should + have been in the first place. +*/ +typedef ptrdiff_t INT; + +/* dummy use of unused parameters to silence compiler warnings */ +#define UNUSED(x) (void)x + +#define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0]))) + +#define FFT_SIGN (-1) /* sign convention for forward transforms */ +extern void X(extract_reim)(int sign, R *c, R **r, R **i); + +#define REGISTER_SOLVER(p, s) X(solver_register)(p, s) + +#define STRINGIZEx(x) #x +#define STRINGIZE(x) STRINGIZEx(x) +#define CIMPLIES(ante, post) (!(ante) || (post)) + +/* define HAVE_SIMD if any simd extensions are supported */ +#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \ + defined(HAVE_MIPS_PS) || defined(HAVE_AVX) +#define HAVE_SIMD 1 +#else +#define HAVE_SIMD 0 +#endif + +extern int X(have_simd_sse2)(void); +extern int X(have_simd_avx)(void); +extern int X(have_simd_altivec)(void); +extern int X(have_simd_neon)(void); + +/* forward declarations */ +typedef struct problem_s problem; +typedef struct plan_s plan; +typedef struct solver_s solver; +typedef struct planner_s planner; +typedef struct printer_s printer; +typedef struct scanner_s scanner; + +/*-----------------------------------------------------------------------*/ +/* alloca: */ +#if HAVE_SIMD +# ifdef HAVE_AVX +# define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for + * everything else */ +# else + /* Note that we cannot use 32-byte alignment for all SIMD. For + example, MacOS X malloc is 16-byte aligned, but there was no + posix_memalign in MacOS X until version 10.6. */ +# define MIN_ALIGNMENT 16 +# endif +#endif + +#if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA) + /* use alloca if available */ + +#ifndef alloca +#ifdef __GNUC__ +# define alloca __builtin_alloca +#else +# ifdef _MSC_VER +# include <malloc.h> +# define alloca _alloca +# else +# if HAVE_ALLOCA_H +# include <alloca.h> +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +void *alloca(size_t); +# endif +# endif +# endif +# endif +#endif +#endif + +# ifdef MIN_ALIGNMENT +# define STACK_MALLOC(T, p, n) \ + { \ + p = (T)alloca((n) + MIN_ALIGNMENT); \ + p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \ + (~(uintptr_t)(MIN_ALIGNMENT - 1))); \ + } +# define STACK_FREE(n) +# else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */ +# define STACK_MALLOC(T, p, n) p = (T)alloca(n) +# define STACK_FREE(n) +# endif + +#else /* ! HAVE_ALLOCA */ + /* use malloc instead of alloca */ +# define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER) +# define STACK_FREE(n) X(ifree)(n) +#endif /* ! HAVE_ALLOCA */ + +/* allocation of buffers. If these grow too large use malloc(), else + use STACK_MALLOC (hopefully reducing to alloca()). */ + +/* 64KiB ought to be enough for anybody */ +#define MAX_STACK_ALLOC ((size_t)64 * 1024) + +#define BUF_ALLOC(T, p, n) \ +{ \ + if (n < MAX_STACK_ALLOC) { \ + STACK_MALLOC(T, p, n); \ + } else { \ + p = (T)MALLOC(n, BUFFERS); \ + } \ +} + +#define BUF_FREE(p, n) \ +{ \ + if (n < MAX_STACK_ALLOC) { \ + STACK_FREE(p); \ + } else { \ + X(ifree)(p); \ + } \ +} + +/*-----------------------------------------------------------------------*/ +/* define uintptr_t if it is not already defined */ + +#ifndef HAVE_UINTPTR_T +# if SIZEOF_VOID_P == 0 +# error sizeof void* is unknown! +# elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P + typedef unsigned int uintptr_t; +# elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P + typedef unsigned long uintptr_t; +# elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P + typedef unsigned long long uintptr_t; +# else +# error no unsigned integer type matches void* sizeof! +# endif +#endif + +/*-----------------------------------------------------------------------*/ +/* We can do an optimization for copying pairs of (aligned) floats + when in single precision if 2*float = double. */ + +#define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \ + && SIZEOF_FLOAT != 0 \ + && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT) + +#define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0) + +/*-----------------------------------------------------------------------*/ +/* assert.c: */ +IFFTW_EXTERN void X(assertion_failed)(const char *s, + int line, const char *file); + +/* always check */ +#define CK(ex) \ + (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) + +#ifdef FFTW_DEBUG +/* check only if debug enabled */ +#define A(ex) \ + (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0)) +#else +#define A(ex) /* nothing */ +#endif + +extern void X(debug)(const char *format, ...); +#define D X(debug) + +/*-----------------------------------------------------------------------*/ +/* kalloc.c: */ +extern void *X(kernel_malloc)(size_t n); +extern void X(kernel_free)(void *p); + +/*-----------------------------------------------------------------------*/ +/* alloc.c: */ + +/* objects allocated by malloc, for statistical purposes */ +enum malloc_tag { + EVERYTHING, + PLANS, + SOLVERS, + PROBLEMS, + BUFFERS, + HASHT, + TENSORS, + PLANNERS, + SLVDESCS, + TWIDDLES, + STRIDES, + OTHER, + MALLOC_WHAT_LAST /* must be last */ +}; + +IFFTW_EXTERN void X(ifree)(void *ptr); +extern void X(ifree0)(void *ptr); + +#ifdef FFTW_DEBUG_MALLOC + +IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what, + const char *file, int line); +#define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__) +IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose); + +#else /* ! FFTW_DEBUG_MALLOC */ + +IFFTW_EXTERN void *X(malloc_plain)(size_t sz); +#define MALLOC(n, what) X(malloc_plain)(n) + +#endif + +#if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP)) +extern int X(in_thread); +# define IN_THREAD X(in_thread) +# define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1 +# define THREAD_OFF X(in_thread) = in_thread_save; } +#else +# define IN_THREAD 0 +# define THREAD_ON +# define THREAD_OFF +#endif + +/*-----------------------------------------------------------------------*/ +/* low-resolution clock */ + +#ifdef FAKE_CRUDE_TIME + typedef int crude_time; +#else +# if TIME_WITH_SYS_TIME +# include <sys/time.h> +# include <time.h> +# else +# if HAVE_SYS_TIME_H +# include <sys/time.h> +# else +# include <time.h> +# endif +# endif + +# ifdef HAVE_BSDGETTIMEOFDAY +# ifndef HAVE_GETTIMEOFDAY +# define gettimeofday BSDgettimeofday +# define HAVE_GETTIMEOFDAY 1 +# endif +# endif + +# if defined(HAVE_GETTIMEOFDAY) + typedef struct timeval crude_time; +# else + typedef clock_t crude_time; +# endif +#endif /* else FAKE_CRUDE_TIME */ + +crude_time X(get_crude_time)(void); +double X(elapsed_since)(const planner *plnr, const problem *p, + crude_time t0); /* time in seconds since t0 */ + +/*-----------------------------------------------------------------------*/ +/* ops.c: */ +/* + * ops counter. The total number of additions is add + fma + * and the total number of multiplications is mul + fma. + * Total flops = add + mul + 2 * fma + */ +typedef struct { + double add; + double mul; + double fma; + double other; +} opcnt; + +void X(ops_zero)(opcnt *dst); +void X(ops_other)(INT o, opcnt *dst); +void X(ops_cpy)(const opcnt *src, opcnt *dst); + +void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst); +void X(ops_add2)(const opcnt *a, opcnt *dst); + +/* dst = m * a + b */ +void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst); + +/* dst += m * a */ +void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst); + + +/*-----------------------------------------------------------------------*/ +/* minmax.c: */ +INT X(imax)(INT a, INT b); +INT X(imin)(INT a, INT b); + +/*-----------------------------------------------------------------------*/ +/* iabs.c: */ +INT X(iabs)(INT a); + +/* inline version */ +#define IABS(x) (((x) < 0) ? (0 - (x)) : (x)) + +/*-----------------------------------------------------------------------*/ +/* md5.c */ + +#if SIZEOF_UNSIGNED_INT >= 4 +typedef unsigned int md5uint; +#else +typedef unsigned long md5uint; /* at least 32 bits as per C standard */ +#endif + +typedef md5uint md5sig[4]; + +typedef struct { + md5sig s; /* state and signature */ + + /* fields not meant to be used outside md5.c: */ + unsigned char c[64]; /* stuff not yet processed */ + unsigned l; /* total length. Should be 64 bits long, but this is + good enough for us */ +} md5; + +void X(md5begin)(md5 *p); +void X(md5putb)(md5 *p, const void *d_, size_t len); +void X(md5puts)(md5 *p, const char *s); +void X(md5putc)(md5 *p, unsigned char c); +void X(md5int)(md5 *p, int i); +void X(md5INT)(md5 *p, INT i); +void X(md5unsigned)(md5 *p, unsigned i); +void X(md5end)(md5 *p); + +/*-----------------------------------------------------------------------*/ +/* tensor.c: */ +#define STRUCT_HACK_KR +#undef STRUCT_HACK_C99 + +typedef struct { + INT n; + INT is; /* input stride */ + INT os; /* output stride */ +} iodim; + +typedef struct { + int rnk; +#if defined(STRUCT_HACK_KR) + iodim dims[1]; +#elif defined(STRUCT_HACK_C99) + iodim dims[]; +#else + iodim *dims; +#endif +} tensor; + +/* + Definition of rank -infinity. + This definition has the property that if you want rank 0 or 1, + you can simply test for rank <= 1. This is a common case. + + A tensor of rank -infinity has size 0. +*/ +#define RNK_MINFTY ((int)(((unsigned) -1) >> 1)) +#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY) + +typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind; + +tensor *X(mktensor)(int rnk); +tensor *X(mktensor_0d)(void); +tensor *X(mktensor_1d)(INT n, INT is, INT os); +tensor *X(mktensor_2d)(INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1); +tensor *X(mktensor_3d)(INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT n2, INT is2, INT os2); +tensor *X(mktensor_4d)(INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT n2, INT is2, INT os2, + INT n3, INT is3, INT os3); +tensor *X(mktensor_5d)(INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT n2, INT is2, INT os2, + INT n3, INT is3, INT os3, + INT n4, INT is4, INT os4); +INT X(tensor_sz)(const tensor *sz); +void X(tensor_md5)(md5 *p, const tensor *t); +INT X(tensor_max_index)(const tensor *sz); +INT X(tensor_min_istride)(const tensor *sz); +INT X(tensor_min_ostride)(const tensor *sz); +INT X(tensor_min_stride)(const tensor *sz); +int X(tensor_inplace_strides)(const tensor *sz); +int X(tensor_inplace_strides2)(const tensor *a, const tensor *b); +int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz, + inplace_kind k); +tensor *X(tensor_copy)(const tensor *sz); +int X(tensor_kosherp)(const tensor *x); + +tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k); +tensor *X(tensor_copy_except)(const tensor *sz, int except_dim); +tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk); +tensor *X(tensor_compress)(const tensor *sz); +tensor *X(tensor_compress_contiguous)(const tensor *sz); +tensor *X(tensor_append)(const tensor *a, const tensor *b); +void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b); +int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os); +void X(tensor_destroy)(tensor *sz); +void X(tensor_destroy2)(tensor *a, tensor *b); +void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d); +void X(tensor_print)(const tensor *sz, printer *p); +int X(dimcmp)(const iodim *a, const iodim *b); +int X(tensor_equal)(const tensor *a, const tensor *b); +int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz); + +/*-----------------------------------------------------------------------*/ +/* problem.c: */ +enum { + /* a problem that cannot be solved */ + PROBLEM_UNSOLVABLE, + + PROBLEM_DFT, + PROBLEM_RDFT, + PROBLEM_RDFT2, + + /* for mpi/ subdirectory */ + PROBLEM_MPI_DFT, + PROBLEM_MPI_RDFT, + PROBLEM_MPI_RDFT2, + PROBLEM_MPI_TRANSPOSE, + + PROBLEM_LAST +}; + +typedef struct { + int problem_kind; + void (*hash) (const problem *ego, md5 *p); + void (*zero) (const problem *ego); + void (*print) (const problem *ego, printer *p); + void (*destroy) (problem *ego); +} problem_adt; + +struct problem_s { + const problem_adt *adt; +}; + +problem *X(mkproblem)(size_t sz, const problem_adt *adt); +void X(problem_destroy)(problem *ego); +problem *X(mkproblem_unsolvable)(void); + +/*-----------------------------------------------------------------------*/ +/* print.c */ +struct printer_s { + void (*print)(printer *p, const char *format, ...); + void (*vprint)(printer *p, const char *format, va_list ap); + void (*putchr)(printer *p, char c); + void (*cleanup)(printer *p); + int indent; + int indent_incr; +}; + +printer *X(mkprinter)(size_t size, + void (*putchr)(printer *p, char c), + void (*cleanup)(printer *p)); +IFFTW_EXTERN void X(printer_destroy)(printer *p); + +/*-----------------------------------------------------------------------*/ +/* scan.c */ +struct scanner_s { + int (*scan)(scanner *sc, const char *format, ...); + int (*vscan)(scanner *sc, const char *format, va_list ap); + int (*getchr)(scanner *sc); + int ungotc; +}; + +scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc)); +void X(scanner_destroy)(scanner *sc); + +/*-----------------------------------------------------------------------*/ +/* plan.c: */ + +enum wakefulness { + SLEEPY, + AWAKE_ZERO, + AWAKE_SQRTN_TABLE, + AWAKE_SINCOS +}; + +typedef struct { + void (*solve)(const plan *ego, const problem *p); + void (*awake)(plan *ego, enum wakefulness wakefulness); + void (*print)(const plan *ego, printer *p); + void (*destroy)(plan *ego); +} plan_adt; + +struct plan_s { + const plan_adt *adt; + opcnt ops; + double pcost; + enum wakefulness wakefulness; /* used for debugging only */ + int could_prune_now_p; +}; + +plan *X(mkplan)(size_t size, const plan_adt *adt); +void X(plan_destroy_internal)(plan *ego); +IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness); +void X(plan_null_destroy)(plan *ego); + +/*-----------------------------------------------------------------------*/ +/* solver.c: */ +typedef struct { + int problem_kind; + plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr); + void (*destroy)(solver *ego); +} solver_adt; + +struct solver_s { + const solver_adt *adt; + int refcnt; +}; + +solver *X(mksolver)(size_t size, const solver_adt *adt); +void X(solver_use)(solver *ego); +void X(solver_destroy)(solver *ego); +void X(solver_register)(planner *plnr, solver *s); + +/* shorthand */ +#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt) + +/*-----------------------------------------------------------------------*/ +/* planner.c */ + +typedef struct slvdesc_s { + solver *slv; + const char *reg_nam; + unsigned nam_hash; + int reg_id; + int next_for_same_problem_kind; +} slvdesc; + +typedef struct solution_s solution; /* opaque */ + +/* interpretation of L and U: + + - if it returns a plan, the planner guarantees that all applicable + plans at least as impatient as U have been tried, and that each + plan in the solution is at least as impatient as L. + + - if it returns 0, the planner guarantees to have tried all solvers + at least as impatient as L, and that none of them was applicable. + + The structure is packed to fit into 64 bits. +*/ + +typedef struct { + unsigned l:20; + unsigned hash_info:3; +# define BITS_FOR_TIMELIMIT 9 + unsigned timelimit_impatience:BITS_FOR_TIMELIMIT; + unsigned u:20; + + /* abstraction break: we store the solver here to pad the + structure to 64 bits. Otherwise, the struct is padded to 64 + bits anyway, and another word is allocated for slvndx. */ +# define BITS_FOR_SLVNDX 12 + unsigned slvndx:BITS_FOR_SLVNDX; +} flags_t; + +/* impatience flags */ +enum { + BELIEVE_PCOST = 0x0001, + ESTIMATE = 0x0002, + NO_DFT_R2HC = 0x0004, + NO_SLOW = 0x0008, + NO_VRECURSE = 0x0010, + NO_INDIRECT_OP = 0x0020, + NO_LARGE_GENERIC = 0x0040, + NO_RANK_SPLITS = 0x0080, + NO_VRANK_SPLITS = 0x0100, + NO_NONTHREADED = 0x0200, + NO_BUFFERING = 0x0400, + NO_FIXED_RADIX_LARGE_N = 0x0800, + NO_DESTROY_INPUT = 0x1000, + NO_SIMD = 0x2000, + CONSERVE_MEMORY = 0x4000, + NO_DHT_R2HC = 0x8000, + NO_UGLY = 0x10000, + ALLOW_PRUNING = 0x20000 +}; + +/* hashtable information */ +enum { + BLESSING = 0x1, /* save this entry */ + H_VALID = 0x2, /* valid hastable entry */ + H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */ +}; + +#define PLNR_L(plnr) ((plnr)->flags.l) +#define PLNR_U(plnr) ((plnr)->flags.u) +#define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience) + +#define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE) +#define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST) +#define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING) + +#define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP) +#define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC) +#define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS) +#define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS) +#define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE) +#define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC) +#define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW) +#define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY) +#define NO_FIXED_RADIX_LARGE_NP(plnr) \ + (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N) +#define NO_NONTHREADEDP(plnr) \ + ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1) + +#define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT) +#define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD) +#define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY) +#define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC) +#define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING) + +typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia; + +typedef enum { + /* WISDOM_NORMAL: planner may or may not use wisdom */ + WISDOM_NORMAL, + + /* WISDOM_ONLY: planner must use wisdom and must avoid searching */ + WISDOM_ONLY, + + /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */ + WISDOM_IS_BOGUS, + + /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */ + WISDOM_IGNORE_INFEASIBLE, + + /* WISDOM_IGNORE_ALL: planner ignores all */ + WISDOM_IGNORE_ALL +} wisdom_state_t; + +typedef struct { + void (*register_solver)(planner *ego, solver *s); + plan *(*mkplan)(planner *ego, const problem *p); + void (*forget)(planner *ego, amnesia a); + void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved + word in C++. */ + int (*imprt)(planner *ego, scanner *sc); +} planner_adt; + +/* hash table of solutions */ +typedef struct { + solution *solutions; + unsigned hashsiz, nelem; + + /* statistics */ + int lookup, succ_lookup, lookup_iter; + int insert, insert_iter, insert_unknown; + int nrehash; +} hashtab; + +typedef enum { COST_SUM, COST_MAX } cost_kind; + +struct planner_s { + const planner_adt *adt; + void (*hook)(struct planner_s *plnr, plan *pln, + const problem *p, int optimalp); + double (*cost_hook)(const problem *p, double t, cost_kind k); + int (*wisdom_ok_hook)(const problem *p, flags_t flags); + void (*nowisdom_hook)(const problem *p); + wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p); + + /* solver descriptors */ + slvdesc *slvdescs; + unsigned nslvdesc, slvdescsiz; + const char *cur_reg_nam; + int cur_reg_id; + int slvdescs_for_problem_kind[PROBLEM_LAST]; + + wisdom_state_t wisdom_state; + + hashtab htab_blessed; + hashtab htab_unblessed; + + int nthr; + flags_t flags; + + crude_time start_time; + double timelimit; /* elapsed_since(start_time) at which to bail out */ + int timed_out; /* whether most recent search timed out */ + int need_timeout_check; + + /* various statistics */ + int nplan; /* number of plans evaluated */ + double pcost, epcost; /* total pcost of measured/estimated plans */ + int nprob; /* number of problems evaluated */ +}; + +planner *X(mkplanner)(void); +void X(planner_destroy)(planner *ego); + +/* + Iterate over all solvers. Read: + + @article{ baker93iterators, + author = "Henry G. Baker, Jr.", + title = "Iterators: Signs of Weakness in Object-Oriented Languages", + journal = "{ACM} {OOPS} Messenger", + volume = "4", + number = "3", + pages = "18--25" + } +*/ +#define FORALL_SOLVERS(ego, s, p, what) \ +{ \ + unsigned _cnt; \ + for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \ + slvdesc *p = ego->slvdescs + _cnt; \ + solver *s = p->slv; \ + what; \ + } \ +} + +#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \ +{ \ + int _cnt = ego->slvdescs_for_problem_kind[kind]; \ + while (_cnt >= 0) { \ + slvdesc *p = ego->slvdescs + _cnt; \ + solver *s = p->slv; \ + what; \ + _cnt = p->next_for_same_problem_kind; \ + } \ +} + + +/* make plan, destroy problem */ +plan *X(mkplan_d)(planner *ego, problem *p); +plan *X(mkplan_f_d)(planner *ego, problem *p, + unsigned l_set, unsigned u_set, unsigned u_reset); + +/*-----------------------------------------------------------------------*/ +/* stride.c: */ + +/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */ +#if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE) +#define PRECOMPUTE_ARRAY_INDICES +#endif + +extern const INT X(an_INT_guaranteed_to_be_zero); + +#ifdef PRECOMPUTE_ARRAY_INDICES +typedef INT *stride; +#define WS(stride, i) (stride[i]) +extern stride X(mkstride)(INT n, INT s); +void X(stride_destroy)(stride p); +/* hackery to prevent the compiler from copying the strides array + onto the stack */ +#define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero) +#else + +typedef INT stride; +#define WS(stride, i) (stride * i) +#define fftwf_mkstride(n, stride) stride +#define fftw_mkstride(n, stride) stride +#define fftwl_mkstride(n, stride) stride +#define fftwf_stride_destroy(p) ((void) p) +#define fftw_stride_destroy(p) ((void) p) +#define fftwl_stride_destroy(p) ((void) p) + +/* hackery to prevent the compiler from ``optimizing'' induction + variables in codelet loops. The problem is that for each K and for + each expression of the form P[I + STRIDE * K] in a loop, most + compilers will try to lift an induction variable PK := &P[I + STRIDE * K]. + For large values of K this behavior overflows the + register set, which is likely worse than doing the index computation + in the first place. + + If we guess that there are more than + ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse + the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to + be 0, but the compiler does not know this. + + 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's + seem to imply. +*/ +#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16 +#define MAKE_VOLATILE_STRIDE(nptr, x) \ + (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \ + 0 : \ + ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero))) +#endif /* PRECOMPUTE_ARRAY_INDICES */ + +/*-----------------------------------------------------------------------*/ +/* solvtab.c */ + +struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; +typedef struct solvtab_s solvtab[]; +void X(solvtab_exec)(const solvtab tbl, planner *p); +#define SOLVTAB(s) { s, STRINGIZE(s) } +#define SOLVTAB_END { 0, 0 } + +/*-----------------------------------------------------------------------*/ +/* pickdim.c */ +int X(pickdim)(int which_dim, const int *buddies, int nbuddies, + const tensor *sz, int oop, int *dp); + +/*-----------------------------------------------------------------------*/ +/* twiddle.c */ +/* little language to express twiddle factors computation */ +enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3, + TW_FULL = 4, TW_HALF = 5 }; + +typedef struct { + unsigned char op; + signed char v; + short i; +} tw_instr; + +typedef struct twid_s { + R *W; /* array of twiddle factors */ + INT n, r, m; /* transform order, radix, # twiddle rows */ + int refcnt; + const tw_instr *instr; + struct twid_s *cdr; + enum wakefulness wakefulness; +} twid; + +INT X(twiddle_length)(INT r, const tw_instr *p); +void X(twiddle_awake)(enum wakefulness wakefulness, + twid **pp, const tw_instr *instr, INT n, INT r, INT m); + +/*-----------------------------------------------------------------------*/ +/* trig.c */ +#if defined(TRIGREAL_IS_LONG_DOUBLE) + typedef long double trigreal; +#elif defined(TRIGREAL_IS_QUAD) + typedef __float128 trigreal; +#else + typedef double trigreal; +#endif + +typedef struct triggen_s triggen; + +struct triggen_s { + void (*cexp)(triggen *t, INT m, R *result); + void (*cexpl)(triggen *t, INT m, trigreal *result); + void (*rotate)(triggen *p, INT m, R xr, R xi, R *res); + + INT twshft; + INT twradix; + INT twmsk; + trigreal *W0, *W1; + INT n; +}; + +triggen *X(mktriggen)(enum wakefulness wakefulness, INT n); +void X(triggen_destroy)(triggen *p); + +/*-----------------------------------------------------------------------*/ +/* primes.c: */ + +#define MULMOD(x, y, p) \ + (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p)) + +INT X(safe_mulmod)(INT x, INT y, INT p); +INT X(power_mod)(INT n, INT m, INT p); +INT X(find_generator)(INT p); +INT X(first_divisor)(INT n); +int X(is_prime)(INT n); +INT X(next_prime)(INT n); +int X(factors_into)(INT n, const INT *primes); +int X(factors_into_small_primes)(INT n); +INT X(choose_radix)(INT r, INT n); +INT X(isqrt)(INT n); +INT X(modulo)(INT a, INT n); + +#define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */ + +/* thresholds below which certain solvers are considered SLOW. These are guesses + believed to be conservative */ +#define GENERIC_MAX_SLOW 16 +#define RADER_MAX_SLOW 32 +#define BLUESTEIN_MAX_SLOW 24 + +/*-----------------------------------------------------------------------*/ +/* rader.c: */ +typedef struct rader_tls rader_tl; + +void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl); +R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t); +void X(rader_tl_delete)(R *W, rader_tl **tl); + +/*-----------------------------------------------------------------------*/ +/* copy/transposition routines */ + +/* lower bound to the cache size, for tiled routines */ +#define CACHESIZE 8192 + +INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache); + +void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz, + void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args), + void *args); +void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl); +void X(cpy2d)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); +void X(cpy2d_ci)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); +void X(cpy2d_co)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); +void X(cpy2d_tiled)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); +void X(cpy2d_tiledbuf)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); +void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1); +void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1); +void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1); + +void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl); +void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl); +void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl); + +typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); +typedef void (*cpy2d_func)(R *I, R *O, + INT n0, INT is0, INT os0, + INT n1, INT is1, INT os1, + INT vl); + +/*-----------------------------------------------------------------------*/ +/* misc stuff */ +void X(null_awake)(plan *ego, enum wakefulness wakefulness); +double X(iestimate_cost)(const planner *, const plan *, const problem *); + +#ifdef FFTW_RANDOM_ESTIMATOR +extern unsigned X(random_estimate_seed); +#endif + +double X(measure_execution_time)(const planner *plnr, + plan *pln, const problem *p); +int X(alignment_of)(R *p); +unsigned X(hash)(const char *s); +INT X(nbuf)(INT n, INT vl, INT maxnbuf); +int X(nbuf_redundant)(INT n, INT vl, int which, + const INT *maxnbuf, int nmaxnbuf); +INT X(bufdist)(INT n, INT vl); +int X(toobig)(INT n); +int X(ct_uglyp)(INT min_n, INT v, INT n, INT r); + +#if HAVE_SIMD +R *X(taint)(R *p, INT s); +R *X(join_taint)(R *p1, R *p2); +#define TAINT(p, s) X(taint)(p, s) +#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) +#define TAINTOF(p) (((uintptr_t)(p)) & 3) +#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2) +#else +#define TAINT(p, s) (p) +#define UNTAINT(p) (p) +#define TAINTOF(p) 0 +#define JOIN_TAINT(p1, p2) p1 +#endif + +#ifdef FFTW_DEBUG_ALIGNMENT +# define ASSERT_ALIGNED_DOUBLE { \ + double __foo; \ + CK(!(((uintptr_t) &__foo) & 0x7)); \ +} +#else +# define ASSERT_ALIGNED_DOUBLE +#endif /* FFTW_DEBUG_ALIGNMENT */ + + + +/*-----------------------------------------------------------------------*/ +/* macros used in codelets to reduce source code size */ + +typedef R E; /* internal precision of codelets. */ + +#if defined(FFTW_LDOUBLE) +# define K(x) ((E) x##L) +#elif defined(FFTW_QUAD) +# define K(x) ((E) x##Q) +#else +# define K(x) ((E) x) +#endif +#define DK(name, value) const E name = K(value) + +/* FMA macros */ + +#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER)) +/* The obvious expression a * b + c does not work. If both x = a * b + + c and y = a * b - c appear in the source, gcc computes t = a * b, + x = t + c, y = t - c, thus destroying the fma. + + This peculiar coding seems to do the right thing on all of + gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing + on gcc-3.4 -fno-web (because the ``web'' pass splits the variable + `x' for the single-assignment form). + + However, gcc-4.0 is a formidable adversary which succeeds in + pessimizing two fma's into one multiplication and two additions. + It does it very early in the game---before the optimization passes + even start. The only real workaround seems to use fake inline asm + such as + + asm ("# confuse gcc %0" : "=f"(a) : "0"(a)); + return a * b + c; + + in each of the FMA, FMS, FNMA, and FNMS functions. However, this + does not solve the problem either, because two equal asm statements + count as a common subexpression! One must use *different* fake asm + statements: + + in FMA: + asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a)); + + in FMS: + asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a)); + + etc. + + After these changes, gcc recalcitrantly generates the fma that was + in the source to begin with. However, the extra asm() cruft + confuses other passes of gcc, notably the instruction scheduler. + (Of course, one could also generate the fma directly via inline + asm, but this confuses the scheduler even more.) + + Steven and I have submitted more than one bug report to the gcc + mailing list over the past few years, to no effect. Thus, I give + up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is + out before touching this crap again. +*/ +static __inline__ E FMA(E a, E b, E c) +{ + E x = a * b; + x = x + c; + return x; +} + +static __inline__ E FMS(E a, E b, E c) +{ + E x = a * b; + x = x - c; + return x; +} + +static __inline__ E FNMA(E a, E b, E c) +{ + E x = a * b; + x = - (x + c); + return x; +} + +static __inline__ E FNMS(E a, E b, E c) +{ + E x = a * b; + x = - (x - c); + return x; +} +#else +#define FMA(a, b, c) (((a) * (b)) + (c)) +#define FMS(a, b, c) (((a) * (b)) - (c)) +#define FNMA(a, b, c) (- (((a) * (b)) + (c))) +#define FNMS(a, b, c) ((c) - ((a) * (b))) +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* __IFFTW_H__ */