sv-dependency-builds: src/fftw-3.3.3/kernel/ifftw.h annotate

annotate src/fftw-3.3.3/kernel/ifftw.h @ 169:223a55898ab9 tip default

Add null config files

author	Chris Cannam <cannam@all-day-breakfast.com>
date	Mon, 02 Mar 2020 14:03:47 +0000
parents	89f5e221ed7b
children

rev	line source
cannam@95	1 /*
cannam@95	2 * Copyright (c) 2003, 2007-11 Matteo Frigo
cannam@95	3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
cannam@95	4 *
cannam@95	5 * This program is free software; you can redistribute it and/or modify
cannam@95	6 * it under the terms of the GNU General Public License as published by
cannam@95	7 * the Free Software Foundation; either version 2 of the License, or
cannam@95	8 * (at your option) any later version.
cannam@95	9 *
cannam@95	10 * This program is distributed in the hope that it will be useful,
cannam@95	11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@95	12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@95	13 * GNU General Public License for more details.
cannam@95	14 *
cannam@95	15 * You should have received a copy of the GNU General Public License
cannam@95	16 * along with this program; if not, write to the Free Software
cannam@95	17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@95	18 *
cannam@95	19 */
cannam@95	20
cannam@95	21
cannam@95	22 /* FFTW internal header file */
cannam@95	23 #ifndef __IFFTW_H__
cannam@95	24 #define __IFFTW_H__
cannam@95	25
cannam@95	26 #include "config.h"
cannam@95	27
cannam@95	28 #include <stdlib.h> /* size_t */
cannam@95	29 #include <stdarg.h> /* va_list */
cannam@95	30 #include <stddef.h> /* ptrdiff_t */
cannam@95	31
cannam@95	32 #if HAVE_SYS_TYPES_H
cannam@95	33 # include <sys/types.h>
cannam@95	34 #endif
cannam@95	35
cannam@95	36 #if HAVE_STDINT_H
cannam@95	37 # include <stdint.h> /* uintptr_t, maybe */
cannam@95	38 #endif
cannam@95	39
cannam@95	40 #if HAVE_INTTYPES_H
cannam@95	41 # include <inttypes.h> /* uintptr_t, maybe */
cannam@95	42 #endif
cannam@95	43
cannam@95	44 #ifdef __cplusplus
cannam@95	45 extern "C"
cannam@95	46 {
cannam@95	47 #endif /* __cplusplus */
cannam@95	48
cannam@95	49 /* Windows annoyances -- since tests/hook.c uses some internal
cannam@95	50 FFTW functions, we need to give them the dllexport attribute
cannam@95	51 under Windows when compiling as a DLL (see api/fftw3.h). */
cannam@95	52 #if defined(FFTW_EXTERN)
cannam@95	53 # define IFFTW_EXTERN FFTW_EXTERN
cannam@95	54 #elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
cannam@95	55 && (defined(_WIN32) || defined(__WIN32__))
cannam@95	56 # define IFFTW_EXTERN extern __declspec(dllexport)
cannam@95	57 #else
cannam@95	58 # define IFFTW_EXTERN extern
cannam@95	59 #endif
cannam@95	60
cannam@95	61 /* determine precision and name-mangling scheme */
cannam@95	62 #define CONCAT(prefix, name) prefix ## name
cannam@95	63 #if defined(FFTW_SINGLE)
cannam@95	64 typedef float R;
cannam@95	65 # define X(name) CONCAT(fftwf_, name)
cannam@95	66 #elif defined(FFTW_LDOUBLE)
cannam@95	67 typedef long double R;
cannam@95	68 # define X(name) CONCAT(fftwl_, name)
cannam@95	69 # define TRIGREAL_IS_LONG_DOUBLE
cannam@95	70 #elif defined(FFTW_QUAD)
cannam@95	71 typedef __float128 R;
cannam@95	72 # define X(name) CONCAT(fftwq_, name)
cannam@95	73 # define TRIGREAL_IS_QUAD
cannam@95	74 #else
cannam@95	75 typedef double R;
cannam@95	76 # define X(name) CONCAT(fftw_, name)
cannam@95	77 #endif
cannam@95	78
cannam@95	79 /*
cannam@95	80 integral type large enough to contain a stride (what ``int'' should
cannam@95	81 have been in the first place).
cannam@95	82 */
cannam@95	83 typedef ptrdiff_t INT;
cannam@95	84
cannam@95	85 /* dummy use of unused parameters to silence compiler warnings */
cannam@95	86 #define UNUSED(x) (void)(x)
cannam@95	87
cannam@95	88 #define NELEM(array) ((int) (sizeof(array) / sizeof((array)[0])))
cannam@95	89
cannam@95	90 #define FFT_SIGN (-1) /* sign convention for forward transforms */
cannam@95	91 extern void X(extract_reim)(int sign, R *c, R **r, R **i);
cannam@95	92
cannam@95	93 #define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
cannam@95	94
cannam@95	95 #define STRINGIZEx(x) #x
cannam@95	96 #define STRINGIZE(x) STRINGIZEx(x)
cannam@95	97 #define CIMPLIES(ante, post) (!(ante) || (post))
cannam@95	98
cannam@95	99 /* define HAVE_SIMD if any simd extensions are supported */
cannam@95	100 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
cannam@95	101 defined(HAVE_MIPS_PS) || defined(HAVE_AVX)
cannam@95	102 #define HAVE_SIMD 1
cannam@95	103 #else
cannam@95	104 #define HAVE_SIMD 0
cannam@95	105 #endif
cannam@95	106
cannam@95	107 extern int X(have_simd_sse2)(void);
cannam@95	108 extern int X(have_simd_avx)(void);
cannam@95	109 extern int X(have_simd_altivec)(void);
cannam@95	110 extern int X(have_simd_neon)(void);
cannam@95	111
cannam@95	112 /* forward declarations */
cannam@95	113 typedef struct problem_s problem;
cannam@95	114 typedef struct plan_s plan;
cannam@95	115 typedef struct solver_s solver;
cannam@95	116 typedef struct planner_s planner;
cannam@95	117 typedef struct printer_s printer;
cannam@95	118 typedef struct scanner_s scanner;
cannam@95	119
cannam@95	120 /*-----------------------------------------------------------------------*/
cannam@95	121 /* alloca: */
cannam@95	122 #if HAVE_SIMD
cannam@95	123 # ifdef HAVE_AVX
cannam@95	124 # define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
cannam@95	125 * everything else */
cannam@95	126 # else
cannam@95	127 /* Note that we cannot use 32-byte alignment for all SIMD. For
cannam@95	128 example, MacOS X malloc is 16-byte aligned, but there was no
cannam@95	129 posix_memalign in MacOS X until version 10.6. */
cannam@95	130 # define MIN_ALIGNMENT 16
cannam@95	131 # endif
cannam@95	132 #endif
cannam@95	133
cannam@95	134 #if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
cannam@95	135 /* use alloca if available */
cannam@95	136
cannam@95	137 #ifndef alloca
cannam@95	138 #ifdef __GNUC__
cannam@95	139 # define alloca __builtin_alloca
cannam@95	140 #else
cannam@95	141 # ifdef _MSC_VER
cannam@95	142 # include <malloc.h>
cannam@95	143 # define alloca _alloca
cannam@95	144 # else
cannam@95	145 # if HAVE_ALLOCA_H
cannam@95	146 # include <alloca.h>
cannam@95	147 # else
cannam@95	148 # ifdef _AIX
cannam@95	149 #pragma alloca
cannam@95	150 # else
cannam@95	151 # ifndef alloca /* predefined by HP cc +Olibcalls */
cannam@95	152 void *alloca(size_t);
cannam@95	153 # endif
cannam@95	154 # endif
cannam@95	155 # endif
cannam@95	156 # endif
cannam@95	157 #endif
cannam@95	158 #endif
cannam@95	159
cannam@95	160 # ifdef MIN_ALIGNMENT
cannam@95	161 # define STACK_MALLOC(T, p, n) \
cannam@95	162 { \
cannam@95	163 p = (T)alloca((n) + MIN_ALIGNMENT); \
cannam@95	164 p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
cannam@95	165 (~(uintptr_t)(MIN_ALIGNMENT - 1))); \
cannam@95	166 }
cannam@95	167 # define STACK_FREE(n)
cannam@95	168 # else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
cannam@95	169 # define STACK_MALLOC(T, p, n) p = (T)alloca(n)
cannam@95	170 # define STACK_FREE(n)
cannam@95	171 # endif
cannam@95	172
cannam@95	173 #else /* ! HAVE_ALLOCA */
cannam@95	174 /* use malloc instead of alloca */
cannam@95	175 # define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
cannam@95	176 # define STACK_FREE(n) X(ifree)(n)
cannam@95	177 #endif /* ! HAVE_ALLOCA */
cannam@95	178
cannam@95	179 /* allocation of buffers. If these grow too large use malloc(), else
cannam@95	180 use STACK_MALLOC (hopefully reducing to alloca()). */
cannam@95	181
cannam@95	182 /* 64KiB ought to be enough for anybody */
cannam@95	183 #define MAX_STACK_ALLOC ((size_t)64 * 1024)
cannam@95	184
cannam@95	185 #define BUF_ALLOC(T, p, n) \
cannam@95	186 { \
cannam@95	187 if ((n) < MAX_STACK_ALLOC) { /* parenthesized: n may be an expression */ \
cannam@95	188 STACK_MALLOC(T, p, n); \
cannam@95	189 } else { \
cannam@95	190 p = (T)MALLOC(n, BUFFERS); \
cannam@95	191 } \
cannam@95	192 }
cannam@95	193
cannam@95	194 #define BUF_FREE(p, n) \
cannam@95	195 { \
cannam@95	196 if ((n) < MAX_STACK_ALLOC) { /* same size test as BUF_ALLOC */ \
cannam@95	197 STACK_FREE(p); \
cannam@95	198 } else { \
cannam@95	199 X(ifree)(p); \
cannam@95	200 } \
cannam@95	201 }
cannam@95	202
cannam@95	203 /*-----------------------------------------------------------------------*/
cannam@95	204 /* define uintptr_t if it is not already defined */
cannam@95	205
cannam@95	206 #ifndef HAVE_UINTPTR_T
cannam@95	207 # if SIZEOF_VOID_P == 0
cannam@95	208 # error sizeof void* is unknown!
cannam@95	209 # elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
cannam@95	210 typedef unsigned int uintptr_t;
cannam@95	211 # elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
cannam@95	212 typedef unsigned long uintptr_t;
cannam@95	213 # elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
cannam@95	214 typedef unsigned long long uintptr_t;
cannam@95	215 # else
cannam@95	216 # error no unsigned integer type matches void* sizeof!
cannam@95	217 # endif
cannam@95	218 #endif
cannam@95	219
cannam@95	220 /*-----------------------------------------------------------------------*/
cannam@95	221 /* We can do an optimization for copying pairs of (aligned) floats
cannam@95	222 when in single precision if 2*float = double. */
cannam@95	223
cannam@95	224 #define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
cannam@95	225 && SIZEOF_FLOAT != 0 \
cannam@95	226 && SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
cannam@95	227
cannam@95	228 #define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
cannam@95	229
cannam@95	230 /*-----------------------------------------------------------------------*/
cannam@95	231 /* assert.c: */
cannam@95	232 IFFTW_EXTERN void X(assertion_failed)(const char *s,
cannam@95	233 int line, const char *file);
cannam@95	234
cannam@95	235 /* always check */
cannam@95	236 #define CK(ex) \
cannam@95	237 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@95	238
cannam@95	239 #ifdef FFTW_DEBUG
cannam@95	240 /* check only if debug enabled */
cannam@95	241 #define A(ex) \
cannam@95	242 (void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
cannam@95	243 #else
cannam@95	244 #define A(ex) /* nothing */
cannam@95	245 #endif
cannam@95	246
cannam@95	247 extern void X(debug)(const char *format, ...);
cannam@95	248 #define D X(debug)
cannam@95	249
cannam@95	250 /*-----------------------------------------------------------------------*/
cannam@95	251 /* kalloc.c: */
cannam@95	252 extern void *X(kernel_malloc)(size_t n);
cannam@95	253 extern void X(kernel_free)(void *p);
cannam@95	254
cannam@95	255 /*-----------------------------------------------------------------------*/
cannam@95	256 /* alloc.c: */
cannam@95	257
cannam@95	258 /* objects allocated by malloc, for statistical purposes */
cannam@95	259 enum malloc_tag {
cannam@95	260 EVERYTHING,
cannam@95	261 PLANS,
cannam@95	262 SOLVERS,
cannam@95	263 PROBLEMS,
cannam@95	264 BUFFERS,
cannam@95	265 HASHT,
cannam@95	266 TENSORS,
cannam@95	267 PLANNERS,
cannam@95	268 SLVDESCS,
cannam@95	269 TWIDDLES,
cannam@95	270 STRIDES,
cannam@95	271 OTHER,
cannam@95	272 MALLOC_WHAT_LAST /* must be last */
cannam@95	273 };
cannam@95	274
cannam@95	275 IFFTW_EXTERN void X(ifree)(void *ptr);
cannam@95	276 extern void X(ifree0)(void *ptr);
cannam@95	277
cannam@95	278 #ifdef FFTW_DEBUG_MALLOC
cannam@95	279
cannam@95	280 IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
cannam@95	281 const char *file, int line);
cannam@95	282 #define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
cannam@95	283 IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
cannam@95	284
cannam@95	285 #else /* ! FFTW_DEBUG_MALLOC */
cannam@95	286
cannam@95	287 IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
cannam@95	288 #define MALLOC(n, what) X(malloc_plain)(n)
cannam@95	289
cannam@95	290 #endif
cannam@95	291
cannam@95	292 #if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && (defined(HAVE_THREADS) || defined(HAVE_OPENMP))
cannam@95	293 extern int X(in_thread);
cannam@95	294 # define IN_THREAD X(in_thread)
cannam@95	295 # define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
cannam@95	296 # define THREAD_OFF X(in_thread) = in_thread_save; }
cannam@95	297 #else
cannam@95	298 # define IN_THREAD 0
cannam@95	299 # define THREAD_ON
cannam@95	300 # define THREAD_OFF
cannam@95	301 #endif
cannam@95	302
cannam@95	303 /*-----------------------------------------------------------------------*/
cannam@95	304 /* low-resolution clock */
cannam@95	305
cannam@95	306 #ifdef FAKE_CRUDE_TIME
cannam@95	307 typedef int crude_time;
cannam@95	308 #else
cannam@95	309 # if TIME_WITH_SYS_TIME
cannam@95	310 # include <sys/time.h>
cannam@95	311 # include <time.h>
cannam@95	312 # else
cannam@95	313 # if HAVE_SYS_TIME_H
cannam@95	314 # include <sys/time.h>
cannam@95	315 # else
cannam@95	316 # include <time.h>
cannam@95	317 # endif
cannam@95	318 # endif
cannam@95	319
cannam@95	320 # ifdef HAVE_BSDGETTIMEOFDAY
cannam@95	321 # ifndef HAVE_GETTIMEOFDAY
cannam@95	322 # define gettimeofday BSDgettimeofday
cannam@95	323 # define HAVE_GETTIMEOFDAY 1
cannam@95	324 # endif
cannam@95	325 # endif
cannam@95	326
cannam@95	327 # if defined(HAVE_GETTIMEOFDAY)
cannam@95	328 typedef struct timeval crude_time;
cannam@95	329 # else
cannam@95	330 typedef clock_t crude_time;
cannam@95	331 # endif
cannam@95	332 #endif /* else FAKE_CRUDE_TIME */
cannam@95	333
cannam@95	334 crude_time X(get_crude_time)(void);
cannam@95	335 double X(elapsed_since)(const planner *plnr, const problem *p,
cannam@95	336 crude_time t0); /* time in seconds since t0 */
cannam@95	337
cannam@95	338 /*-----------------------------------------------------------------------*/
cannam@95	339 /* ops.c: */
cannam@95	340 /*
cannam@95	341 * ops counter. The total number of additions is add + fma
cannam@95	342 * and the total number of multiplications is mul + fma.
cannam@95	343 * Total flops = add + mul + 2 * fma
cannam@95	344 */
cannam@95	345 typedef struct {
cannam@95	346 double add;
cannam@95	347 double mul;
cannam@95	348 double fma;
cannam@95	349 double other;
cannam@95	350 } opcnt;
cannam@95	351
cannam@95	352 void X(ops_zero)(opcnt *dst);
cannam@95	353 void X(ops_other)(INT o, opcnt *dst);
cannam@95	354 void X(ops_cpy)(const opcnt *src, opcnt *dst);
cannam@95	355
cannam@95	356 void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
cannam@95	357 void X(ops_add2)(const opcnt *a, opcnt *dst);
cannam@95	358
cannam@95	359 /* dst = m * a + b */
cannam@95	360 void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
cannam@95	361
cannam@95	362 /* dst += m * a */
cannam@95	363 void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
cannam@95	364
cannam@95	365
cannam@95	366 /*-----------------------------------------------------------------------*/
cannam@95	367 /* minmax.c: */
cannam@95	368 INT X(imax)(INT a, INT b);
cannam@95	369 INT X(imin)(INT a, INT b);
cannam@95	370
cannam@95	371 /*-----------------------------------------------------------------------*/
cannam@95	372 /* iabs.c: */
cannam@95	373 INT X(iabs)(INT a);
cannam@95	374
cannam@95	375 /* inline version */
cannam@95	376 #define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
cannam@95	377
cannam@95	378 /*-----------------------------------------------------------------------*/
cannam@95	379 /* md5.c */
cannam@95	380
cannam@95	381 #if SIZEOF_UNSIGNED_INT >= 4
cannam@95	382 typedef unsigned int md5uint;
cannam@95	383 #else
cannam@95	384 typedef unsigned long md5uint; /* at least 32 bits as per C standard */
cannam@95	385 #endif
cannam@95	386
cannam@95	387 typedef md5uint md5sig[4];
cannam@95	388
cannam@95	389 typedef struct {
cannam@95	390 md5sig s; /* state and signature */
cannam@95	391
cannam@95	392 /* fields not meant to be used outside md5.c: */
cannam@95	393 unsigned char c[64]; /* stuff not yet processed */
cannam@95	394 unsigned l; /* total length. Should be 64 bits long, but this is
cannam@95	395 good enough for us */
cannam@95	396 } md5;
cannam@95	397
cannam@95	398 void X(md5begin)(md5 *p);
cannam@95	399 void X(md5putb)(md5 *p, const void *d_, size_t len);
cannam@95	400 void X(md5puts)(md5 *p, const char *s);
cannam@95	401 void X(md5putc)(md5 *p, unsigned char c);
cannam@95	402 void X(md5int)(md5 *p, int i);
cannam@95	403 void X(md5INT)(md5 *p, INT i);
cannam@95	404 void X(md5unsigned)(md5 *p, unsigned i);
cannam@95	405 void X(md5end)(md5 *p);
cannam@95	406
cannam@95	407 /*-----------------------------------------------------------------------*/
cannam@95	408 /* tensor.c: */
cannam@95	409 #define STRUCT_HACK_KR
cannam@95	410 #undef STRUCT_HACK_C99
cannam@95	411
cannam@95	412 typedef struct {
cannam@95	413 INT n;
cannam@95	414 INT is; /* input stride */
cannam@95	415 INT os; /* output stride */
cannam@95	416 } iodim;
cannam@95	417
cannam@95	418 typedef struct {
cannam@95	419 int rnk;
cannam@95	420 #if defined(STRUCT_HACK_KR)
cannam@95	421 iodim dims[1];
cannam@95	422 #elif defined(STRUCT_HACK_C99)
cannam@95	423 iodim dims[];
cannam@95	424 #else
cannam@95	425 iodim *dims;
cannam@95	426 #endif
cannam@95	427 } tensor;
cannam@95	428
cannam@95	429 /*
cannam@95	430 Definition of rank -infinity.
cannam@95	431 This definition has the property that if you want rank 0 or 1,
cannam@95	432 you can simply test for rank <= 1. This is a common case.
cannam@95	433
cannam@95	434 A tensor of rank -infinity has size 0.
cannam@95	435 */
cannam@95	436 #define RNK_MINFTY ((int)(((unsigned) -1) >> 1))
cannam@95	437 #define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
cannam@95	438
cannam@95	439 typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
cannam@95	440
cannam@95	441 tensor *X(mktensor)(int rnk);
cannam@95	442 tensor *X(mktensor_0d)(void);
cannam@95	443 tensor *X(mktensor_1d)(INT n, INT is, INT os);
cannam@95	444 tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
cannam@95	445 INT n1, INT is1, INT os1);
cannam@95	446 tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
cannam@95	447 INT n1, INT is1, INT os1,
cannam@95	448 INT n2, INT is2, INT os2);
cannam@95	449 tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
cannam@95	450 INT n1, INT is1, INT os1,
cannam@95	451 INT n2, INT is2, INT os2,
cannam@95	452 INT n3, INT is3, INT os3);
cannam@95	453 tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
cannam@95	454 INT n1, INT is1, INT os1,
cannam@95	455 INT n2, INT is2, INT os2,
cannam@95	456 INT n3, INT is3, INT os3,
cannam@95	457 INT n4, INT is4, INT os4);
cannam@95	458 INT X(tensor_sz)(const tensor *sz);
cannam@95	459 void X(tensor_md5)(md5 p, const tensor t);
cannam@95	460 INT X(tensor_max_index)(const tensor *sz);
cannam@95	461 INT X(tensor_min_istride)(const tensor *sz);
cannam@95	462 INT X(tensor_min_ostride)(const tensor *sz);
cannam@95	463 INT X(tensor_min_stride)(const tensor *sz);
cannam@95	464 int X(tensor_inplace_strides)(const tensor *sz);
cannam@95	465 int X(tensor_inplace_strides2)(const tensor a, const tensor b);
cannam@95	466 int X(tensor_strides_decrease)(const tensor sz, const tensor vecsz,
cannam@95	467 inplace_kind k);
cannam@95	468 tensor X(tensor_copy)(const tensor sz);
cannam@95	469 int X(tensor_kosherp)(const tensor *x);
cannam@95	470
cannam@95	471 tensor X(tensor_copy_inplace)(const tensor sz, inplace_kind k);
cannam@95	472 tensor X(tensor_copy_except)(const tensor sz, int except_dim);
cannam@95	473 tensor X(tensor_copy_sub)(const tensor sz, int start_dim, int rnk);
cannam@95	474 tensor X(tensor_compress)(const tensor sz);
cannam@95	475 tensor X(tensor_compress_contiguous)(const tensor sz);
cannam@95	476 tensor X(tensor_append)(const tensor a, const tensor *b);
cannam@95	477 void X(tensor_split)(const tensor sz, tensor a, int a_rnk, tensor *b);
cannam@95	478 int X(tensor_tornk1)(const tensor t, INT n, INT is, INT os);
cannam@95	479 void X(tensor_destroy)(tensor *sz);
cannam@95	480 void X(tensor_destroy2)(tensor a, tensor b);
cannam@95	481 void X(tensor_destroy4)(tensor a, tensor b, tensor c, tensor d);
cannam@95	482 void X(tensor_print)(const tensor sz, printer p);
cannam@95	483 int X(dimcmp)(const iodim a, const iodim b);
cannam@95	484 int X(tensor_equal)(const tensor a, const tensor b);
cannam@95	485 int X(tensor_inplace_locations)(const tensor sz, const tensor vecsz);
cannam@95	486
cannam@95	487 /*-----------------------------------------------------------------------*/
cannam@95	488 /* problem.c: */
cannam@95	489 enum {
cannam@95	490 /* a problem that cannot be solved */
cannam@95	491 PROBLEM_UNSOLVABLE,
cannam@95	492
cannam@95	493 PROBLEM_DFT,
cannam@95	494 PROBLEM_RDFT,
cannam@95	495 PROBLEM_RDFT2,
cannam@95	496
cannam@95	497 /* for mpi/ subdirectory */
cannam@95	498 PROBLEM_MPI_DFT,
cannam@95	499 PROBLEM_MPI_RDFT,
cannam@95	500 PROBLEM_MPI_RDFT2,
cannam@95	501 PROBLEM_MPI_TRANSPOSE,
cannam@95	502
cannam@95	503 PROBLEM_LAST
cannam@95	504 };
cannam@95	505
cannam@95	506 typedef struct {
cannam@95	507 int problem_kind;
cannam@95	508 void (*hash) (const problem *ego, md5 *p);
cannam@95	509 void (*zero) (const problem *ego);
cannam@95	510 void (*print) (const problem *ego, printer *p);
cannam@95	511 void (*destroy) (problem *ego);
cannam@95	512 } problem_adt;
cannam@95	513
cannam@95	514 struct problem_s {
cannam@95	515 const problem_adt *adt;
cannam@95	516 };
cannam@95	517
cannam@95	518 problem *X(mkproblem)(size_t sz, const problem_adt *adt);
cannam@95	519 void X(problem_destroy)(problem *ego);
cannam@95	520 problem *X(mkproblem_unsolvable)(void);
cannam@95	521
cannam@95	522 /*-----------------------------------------------------------------------*/
cannam@95	523 /* print.c */
cannam@95	524 struct printer_s {
cannam@95	525 void (*print)(printer *p, const char *format, ...);
cannam@95	526 void (*vprint)(printer *p, const char *format, va_list ap);
cannam@95	527 void (*putchr)(printer *p, char c);
cannam@95	528 void (*cleanup)(printer *p);
cannam@95	529 int indent;
cannam@95	530 int indent_incr;
cannam@95	531 };
cannam@95	532
cannam@95	533 printer *X(mkprinter)(size_t size,
cannam@95	534 void (*putchr)(printer *p, char c),
cannam@95	535 void (*cleanup)(printer *p));
cannam@95	536 IFFTW_EXTERN void X(printer_destroy)(printer *p);
cannam@95	537
cannam@95	538 /*-----------------------------------------------------------------------*/
cannam@95	539 /* scan.c */
cannam@95	540 struct scanner_s {
cannam@95	541 int (*scan)(scanner *sc, const char *format, ...);
cannam@95	542 int (*vscan)(scanner *sc, const char *format, va_list ap);
cannam@95	543 int (*getchr)(scanner *sc);
cannam@95	544 int ungotc;
cannam@95	545 };
cannam@95	546
cannam@95	547 scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
cannam@95	548 void X(scanner_destroy)(scanner *sc);
cannam@95	549
cannam@95	550 /*-----------------------------------------------------------------------*/
cannam@95	551 /* plan.c: */
cannam@95	552
cannam@95	553 enum wakefulness {
cannam@95	554 SLEEPY,
cannam@95	555 AWAKE_ZERO,
cannam@95	556 AWAKE_SQRTN_TABLE,
cannam@95	557 AWAKE_SINCOS
cannam@95	558 };
cannam@95	559
cannam@95	560 typedef struct {
cannam@95	561 void (*solve)(const plan *ego, const problem *p);
cannam@95	562 void (*awake)(plan *ego, enum wakefulness wakefulness);
cannam@95	563 void (*print)(const plan *ego, printer *p);
cannam@95	564 void (*destroy)(plan *ego);
cannam@95	565 } plan_adt;
cannam@95	566
cannam@95	567 struct plan_s {
cannam@95	568 const plan_adt *adt;
cannam@95	569 opcnt ops;
cannam@95	570 double pcost;
cannam@95	571 enum wakefulness wakefulness; /* used for debugging only */
cannam@95	572 int could_prune_now_p;
cannam@95	573 };
cannam@95	574
cannam@95	575 plan *X(mkplan)(size_t size, const plan_adt *adt);
cannam@95	576 void X(plan_destroy_internal)(plan *ego);
cannam@95	577 IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
cannam@95	578 void X(plan_null_destroy)(plan *ego);
cannam@95	579
cannam@95	580 /*-----------------------------------------------------------------------*/
cannam@95	581 /* solver.c: */
cannam@95	582 typedef struct {
cannam@95	583 int problem_kind;
cannam@95	584 plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
cannam@95	585 void (*destroy)(solver *ego);
cannam@95	586 } solver_adt;
cannam@95	587
cannam@95	588 struct solver_s {
cannam@95	589 const solver_adt *adt;
cannam@95	590 int refcnt;
cannam@95	591 };
cannam@95	592
cannam@95	593 solver *X(mksolver)(size_t size, const solver_adt *adt);
cannam@95	594 void X(solver_use)(solver *ego);
cannam@95	595 void X(solver_destroy)(solver *ego);
cannam@95	596 void X(solver_register)(planner *plnr, solver *s);
cannam@95	597
cannam@95	598 /* shorthand */
cannam@95	599 #define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
cannam@95	600
cannam@95	601 /*-----------------------------------------------------------------------*/
cannam@95	602 /* planner.c */
cannam@95	603
cannam@95	604 typedef struct slvdesc_s {
cannam@95	605 solver *slv;
cannam@95	606 const char *reg_nam;
cannam@95	607 unsigned nam_hash;
cannam@95	608 int reg_id;
cannam@95	609 int next_for_same_problem_kind;
cannam@95	610 } slvdesc;
cannam@95	611
cannam@95	612 typedef struct solution_s solution; /* opaque */
cannam@95	613
cannam@95	614 /* interpretation of L and U:
cannam@95	615
cannam@95	616 - if it returns a plan, the planner guarantees that all applicable
cannam@95	617 plans at least as impatient as U have been tried, and that each
cannam@95	618 plan in the solution is at least as impatient as L.
cannam@95	619
cannam@95	620 - if it returns 0, the planner guarantees to have tried all solvers
cannam@95	621 at least as impatient as L, and that none of them was applicable.
cannam@95	622
cannam@95	623 The structure is packed to fit into 64 bits.
cannam@95	624 */
cannam@95	625
cannam@95	626 typedef struct {
cannam@95	627 unsigned l:20;
cannam@95	628 unsigned hash_info:3;
cannam@95	629 # define BITS_FOR_TIMELIMIT 9
cannam@95	630 unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
cannam@95	631 unsigned u:20;
cannam@95	632
cannam@95	633 /* abstraction break: we store the solver here to pad the
cannam@95	634 structure to 64 bits. Otherwise, the struct is padded to 64
cannam@95	635 bits anyway, and another word is allocated for slvndx. */
cannam@95	636 # define BITS_FOR_SLVNDX 12
cannam@95	637 unsigned slvndx:BITS_FOR_SLVNDX;
cannam@95	638 } flags_t;
cannam@95	639
cannam@95	640 /* impatience flags */
cannam@95	641 enum {
cannam@95	642 BELIEVE_PCOST = 0x0001,
cannam@95	643 ESTIMATE = 0x0002,
cannam@95	644 NO_DFT_R2HC = 0x0004,
cannam@95	645 NO_SLOW = 0x0008,
cannam@95	646 NO_VRECURSE = 0x0010,
cannam@95	647 NO_INDIRECT_OP = 0x0020,
cannam@95	648 NO_LARGE_GENERIC = 0x0040,
cannam@95	649 NO_RANK_SPLITS = 0x0080,
cannam@95	650 NO_VRANK_SPLITS = 0x0100,
cannam@95	651 NO_NONTHREADED = 0x0200,
cannam@95	652 NO_BUFFERING = 0x0400,
cannam@95	653 NO_FIXED_RADIX_LARGE_N = 0x0800,
cannam@95	654 NO_DESTROY_INPUT = 0x1000,
cannam@95	655 NO_SIMD = 0x2000,
cannam@95	656 CONSERVE_MEMORY = 0x4000,
cannam@95	657 NO_DHT_R2HC = 0x8000,
cannam@95	658 NO_UGLY = 0x10000,
cannam@95	659 ALLOW_PRUNING = 0x20000
cannam@95	660 };
cannam@95	661
cannam@95	662 /* hashtable information */
cannam@95	663 enum {
cannam@95	664 BLESSING = 0x1, /* save this entry */
cannam@95	665 H_VALID = 0x2, /* valid hashtable entry */
cannam@95	666 H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */
cannam@95	667 };
cannam@95	668
cannam@95	669 #define PLNR_L(plnr) ((plnr)->flags.l)
cannam@95	670 #define PLNR_U(plnr) ((plnr)->flags.u)
cannam@95	671 #define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
cannam@95	672
cannam@95	673 #define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
cannam@95	674 #define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
cannam@95	675 #define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
cannam@95	676
cannam@95	677 #define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
cannam@95	678 #define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
cannam@95	679 #define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
cannam@95	680 #define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
cannam@95	681 #define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
cannam@95	682 #define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
cannam@95	683 #define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
cannam@95	684 #define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
cannam@95	685 #define NO_FIXED_RADIX_LARGE_NP(plnr) \
cannam@95	686 (PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
cannam@95	687 #define NO_NONTHREADEDP(plnr) \
cannam@95	688 ((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
cannam@95	689
cannam@95	690 #define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
cannam@95	691 #define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
cannam@95	692 #define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
cannam@95	693 #define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
cannam@95	694 #define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
cannam@95	695
cannam@95	696 typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
cannam@95	697
cannam@95	698 typedef enum {
cannam@95	699 /* WISDOM_NORMAL: planner may or may not use wisdom */
cannam@95	700 WISDOM_NORMAL,
cannam@95	701
cannam@95	702 /* WISDOM_ONLY: planner must use wisdom and must avoid searching */
cannam@95	703 WISDOM_ONLY,
cannam@95	704
cannam@95	705 /* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
cannam@95	706 WISDOM_IS_BOGUS,
cannam@95	707
cannam@95	708 /* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
cannam@95	709 WISDOM_IGNORE_INFEASIBLE,
cannam@95	710
cannam@95	711 /* WISDOM_IGNORE_ALL: planner ignores all */
cannam@95	712 WISDOM_IGNORE_ALL
cannam@95	713 } wisdom_state_t;
cannam@95	714
cannam@95	715 typedef struct {
cannam@95	716 void (*register_solver)(planner *ego, solver *s);
cannam@95	717 plan *(*mkplan)(planner *ego, const problem *p);
cannam@95	718 void (*forget)(planner *ego, amnesia a);
cannam@95	719 void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
cannam@95	720 word in C++. */
cannam@95	721 int (*imprt)(planner *ego, scanner *sc);
cannam@95	722 } planner_adt;
cannam@95	723
cannam@95	724 /* hash table of solutions */
cannam@95	725 typedef struct {
cannam@95	726 solution *solutions;
cannam@95	727 unsigned hashsiz, nelem;
cannam@95	728
cannam@95	729 /* statistics */
cannam@95	730 int lookup, succ_lookup, lookup_iter;
cannam@95	731 int insert, insert_iter, insert_unknown;
cannam@95	732 int nrehash;
cannam@95	733 } hashtab;
cannam@95	734
cannam@95	735 typedef enum { COST_SUM, COST_MAX } cost_kind;
cannam@95	736
cannam@95	737 struct planner_s {
cannam@95	738 const planner_adt *adt;
cannam@95	739 void (*hook)(struct planner_s *plnr, plan *pln,
cannam@95	740 const problem *p, int optimalp);
cannam@95	741 double (*cost_hook)(const problem *p, double t, cost_kind k);
cannam@95	742 int (*wisdom_ok_hook)(const problem *p, flags_t flags);
cannam@95	743 void (*nowisdom_hook)(const problem *p);
cannam@95	744 wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
cannam@95	745
cannam@95	746 /* solver descriptors */
cannam@95	747 slvdesc *slvdescs;
cannam@95	748 unsigned nslvdesc, slvdescsiz;
cannam@95	749 const char *cur_reg_nam;
cannam@95	750 int cur_reg_id;
cannam@95	751 int slvdescs_for_problem_kind[PROBLEM_LAST];
cannam@95	752
cannam@95	753 wisdom_state_t wisdom_state;
cannam@95	754
cannam@95	755 hashtab htab_blessed;
cannam@95	756 hashtab htab_unblessed;
cannam@95	757
cannam@95	758 int nthr;
cannam@95	759 flags_t flags;
cannam@95	760
cannam@95	761 crude_time start_time;
cannam@95	762 double timelimit; /* elapsed_since(start_time) at which to bail out */
cannam@95	763 int timed_out; /* whether most recent search timed out */
cannam@95	764 int need_timeout_check;
cannam@95	765
cannam@95	766 /* various statistics */
cannam@95	767 int nplan; /* number of plans evaluated */
cannam@95	768 double pcost, epcost; /* total pcost of measured/estimated plans */
cannam@95	769 int nprob; /* number of problems evaluated */
cannam@95	770 };
cannam@95	771
cannam@95	772 planner *X(mkplanner)(void);
cannam@95	773 void X(planner_destroy)(planner *ego);
cannam@95	774
cannam@95	775 /*
cannam@95	776 Iterate over all solvers. Read:
cannam@95	777
cannam@95	778 @article{ baker93iterators,
cannam@95	779 author = "Henry G. Baker, Jr.",
cannam@95	780 title = "Iterators: Signs of Weakness in Object-Oriented Languages",
cannam@95	781 journal = "{ACM} {OOPS} Messenger",
cannam@95	782 volume = "4",
cannam@95	783 number = "3",
cannam@95	784 pages = "18--25"
cannam@95	785 }
cannam@95	786 */
cannam@95	787 #define FORALL_SOLVERS(ego, s, p, what) /* run `what' once per entry of (ego)->slvdescs, with p = the slvdesc and s = p->slv; ego is evaluated more than once */ \
cannam@95	788 { \
cannam@95	789 unsigned _cnt; /* index into (ego)->slvdescs */ \
cannam@95	790 for (_cnt = 0; _cnt < (ego)->nslvdesc; ++_cnt) { \
cannam@95	791 slvdesc *p = (ego)->slvdescs + _cnt; \
cannam@95	792 solver *s = p->slv; \
cannam@95	793 what; \
cannam@95	794 } \
cannam@95	795 }
cannam@95	796
cannam@95	797 #define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) /* run `what' for each slvdesc on the chain for problem kind `kind', with p = the slvdesc and s = p->slv; ego is evaluated more than once */ \
cannam@95	798 { \
cannam@95	799 int _cnt = (ego)->slvdescs_for_problem_kind[kind]; /* head of the chain */ \
cannam@95	800 while (_cnt >= 0) { /* a negative index terminates the chain */ \
cannam@95	801 slvdesc *p = (ego)->slvdescs + _cnt; \
cannam@95	802 solver *s = p->slv; \
cannam@95	803 what; \
cannam@95	804 _cnt = p->next_for_same_problem_kind; /* advance only after `what' has run */ \
cannam@95	805 } \
cannam@95	806 }
cannam@95	807
cannam@95	808
cannam@95	809 /* make plan, destroy problem */
cannam@95	810 plan *X(mkplan_d)(planner *ego, problem *p); /* returns a plan pointer; what is returned on failure is not visible here */
cannam@95	811 plan *X(mkplan_f_d)(planner *ego, problem *p,
cannam@95	812 unsigned l_set, unsigned u_set, unsigned u_reset); /* l_set/u_set/u_reset: presumably planner flag bits to set or reset for this call -- confirm in the definition */
cannam@95	813
cannam@95	814 /*-----------------------------------------------------------------------*/
cannam@95	815 /* stride.c: */
cannam@95	816
cannam@95	817 /* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
cannam@95	818 #if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE) /* x86 targets only, and never for long-double builds */
cannam@95	819 #define PRECOMPUTE_ARRAY_INDICES
cannam@95	820 #endif
cannam@95	821
cannam@95	822 extern const INT X(an_INT_guaranteed_to_be_zero); /* added to / xor-ed into strides by MAKE_VOLATILE_STRIDE; per its name the value is 0 */
cannam@95	823
cannam@95	824 #ifdef PRECOMPUTE_ARRAY_INDICES
cannam@95	825 typedef INT *stride; /* here a stride is a table that WS indexes by i */
cannam@95	826 #define WS(stride, i) ((stride)[(i)])
cannam@95	827 extern stride X(mkstride)(INT n, INT s);
cannam@95	828 void X(stride_destroy)(stride p);
cannam@95	829 /* hackery to prevent the compiler from copying the strides array
cannam@95	830 onto the stack */
cannam@95	831 #define MAKE_VOLATILE_STRIDE(nptr, x) ((x) = (x) + X(an_INT_guaranteed_to_be_zero)) /* nptr is unused in this variant */
cannam@95	832 #else
cannam@95	833
cannam@95	834 typedef INT stride; /* here a stride is a plain integer that WS multiplies by i */
cannam@95	835 #define WS(stride, i) ((stride) * (i)) /* arguments parenthesized so that expression arguments keep their grouping */
cannam@95	836 #define fftwf_mkstride(n, stride) (stride)
cannam@95	837 #define fftw_mkstride(n, stride) (stride)
cannam@95	838 #define fftwl_mkstride(n, stride) (stride)
cannam@95	839 #define fftwf_stride_destroy(p) ((void) (p))
cannam@95	840 #define fftw_stride_destroy(p) ((void) (p))
cannam@95	841 #define fftwl_stride_destroy(p) ((void) (p))
cannam@95	842
cannam@95	843 /* hackery to prevent the compiler from ``optimizing'' induction
cannam@95	844 variables in codelet loops. The problem is that for each K and for
cannam@95	845 each expression of the form P[I + STRIDE * K] in a loop, most
cannam@95	846 compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
cannam@95	847 For large values of K this behavior overflows the
cannam@95	848 register set, which is likely worse than doing the index computation
cannam@95	849 in the first place.
cannam@95	850
cannam@95	851 If we guess that there are more than
cannam@95	852 ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
cannam@95	853 the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
cannam@95	854 be 0, but the compiler does not know this.
cannam@95	855
cannam@95	856 16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
cannam@95	857 seem to imply.
cannam@95	858 */
cannam@95	859 #define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
cannam@95	860 #define MAKE_VOLATILE_STRIDE(nptr, x) \
cannam@95	861 ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
cannam@95	862 0 : \
cannam@95	863 ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
cannam@95	864 #endif /* PRECOMPUTE_ARRAY_INDICES */
cannam@95	865
cannam@95	866 /*-----------------------------------------------------------------------*/
cannam@95	867 /* solvtab.c */
cannam@95	868
cannam@95	869 struct solvtab_s { void (*reg)(planner *); const char *reg_nam; }; /* one table entry: a function taking a planner pointer, plus a name string */
cannam@95	870 typedef struct solvtab_s solvtab[]; /* a table is an array of entries of unspecified length */
cannam@95	871 void X(solvtab_exec)(const solvtab tbl, planner *p);
cannam@95	872 #define SOLVTAB(s) { s, STRINGIZE(s) } /* entry for s; STRINGIZE is defined outside this view, presumably yielding the name of s as a string */
cannam@95	873 #define SOLVTAB_END { 0, 0 } /* all-zero entry; presumably the table terminator -- confirm in solvtab_exec */
cannam@95	874
cannam@95	875 /*-----------------------------------------------------------------------*/
cannam@95	876 /* pickdim.c */
cannam@95	877 int X(pickdim)(int which_dim, const int *buddies, int nbuddies, /* buddies: array of nbuddies ints, judging by the parameter pair */
cannam@95	878 const tensor *sz, int oop, int *dp); /* dp: presumably receives the chosen dimension; the int result presumably reports success -- confirm in pickdim.c */
cannam@95	879
cannam@95	880 /*-----------------------------------------------------------------------*/
cannam@95	881 /* twiddle.c */
cannam@95	882 /* little language to express twiddle factors computation */
cannam@95	883 enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
cannam@95	884 TW_FULL = 4, TW_HALF = 5 }; /* presumably the opcodes stored in tw_instr.op -- confirm in twiddle.c */
cannam@95	885
cannam@95	886 typedef struct { /* one instruction of the twiddle "little language" */
cannam@95	887 unsigned char op; /* presumably one of the TW_* codes -- confirm in twiddle.c */
cannam@95	888 signed char v; /* operand; meaning depends on op and is not visible here */
cannam@95	889 short i; /* operand; meaning depends on op and is not visible here */
cannam@95	890 } tw_instr;
cannam@95	891
cannam@95	892 typedef struct twid_s {
cannam@95	893 R *W; /* array of twiddle factors */
cannam@95	894 INT n, r, m; /* transform order, radix, # twiddle rows */
cannam@95	895 int refcnt; /* presumably a reference count -- confirm in twiddle.c */
cannam@95	896 const tw_instr *instr; /* tw_instr program associated with this entry */
cannam@95	897 struct twid_s *cdr; /* link to another twid; presumably the next entry of a list -- confirm */
cannam@95	898 enum wakefulness wakefulness;
cannam@95	899 } twid;
cannam@95	900
cannam@95	901 INT X(twiddle_length)(INT r, const tw_instr *p);
cannam@95	902 void X(twiddle_awake)(enum wakefulness wakefulness,
cannam@95	903 twid **pp, const tw_instr *instr, INT n, INT r, INT m); /* pp is a pointer to the caller's twid pointer, so the callee can replace it; n, r, m mirror the twid fields of the same names */
cannam@95	904
cannam@95	905 /*-----------------------------------------------------------------------*/
cannam@95	906 /* trig.c */
cannam@95	907 #if defined(TRIGREAL_IS_LONG_DOUBLE) /* trigreal: floating type chosen at build time */
cannam@95	908 typedef long double trigreal;
cannam@95	909 #elif defined(TRIGREAL_IS_QUAD)
cannam@95	910 typedef __float128 trigreal; /* __float128 is a compiler extension type, not ISO C */
cannam@95	911 #else
cannam@95	912 typedef double trigreal; /* default when neither macro is defined */
cannam@95	913 #endif
cannam@95	914
cannam@95	915 typedef struct triggen_s triggen;
cannam@95	916
cannam@95	917 struct triggen_s { /* three function-pointer members followed by their numeric state */
cannam@95	918 void (*cexp)(triggen *t, INT m, R *result); /* writes through an R pointer */
cannam@95	919 void (*cexpl)(triggen *t, INT m, trigreal *result); /* same parameter shape as cexp, but writes through a trigreal pointer */
cannam@95	920 void (*rotate)(triggen *p, INT m, R xr, R xi, R *res); /* takes an (xr, xi) pair and writes through res; presumably rotates it by the m-th root of unity -- confirm in trig.c */
cannam@95	921
cannam@95	922 INT twshft;
cannam@95	923 INT twradix;
cannam@95	924 INT twmsk;
cannam@95	925 trigreal *W0, *W1;
cannam@95	926 INT n;
cannam@95	927 };
cannam@95	928
cannam@95	929 triggen *X(mktriggen)(enum wakefulness wakefulness, INT n); /* returns a triggen pointer; presumably newly allocated for order n -- confirm in trig.c */
cannam@95	930 void X(triggen_destroy)(triggen *p); /* presumably releases a triggen obtained from mktriggen -- confirm */
cannam@95	931
cannam@95	932 /*-----------------------------------------------------------------------*/
cannam@95	933 /* primes.c: */
cannam@95	934
cannam@95	935 #define MULMOD(x, y, p) /* (x * y) mod p: when x + y <= 92681 the product of non-negative operands is at most 46340.5^2 < 2^31, so it is formed directly; otherwise safe_mulmod is called. Arguments are evaluated more than once. */ \
cannam@95	936 (((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
cannam@95	937
cannam@95	938 INT X(safe_mulmod)(INT x, INT y, INT p); /* fallback used by MULMOD when the direct product is not known to be safe */
cannam@95	939 INT X(power_mod)(INT n, INT m, INT p); /* presumably n^m mod p -- confirm in primes.c */
cannam@95	940 INT X(find_generator)(INT p); /* presumably a generator of the multiplicative group mod p -- confirm */
cannam@95	941 INT X(first_divisor)(INT n);
cannam@95	942 int X(is_prime)(INT n); /* int result, presumably used as a truth value */
cannam@95	943 INT X(next_prime)(INT n);
cannam@95	944 int X(factors_into)(INT n, const INT *primes); /* primes: read-only INT array; how its end is marked is not visible here */
cannam@95	945 int X(factors_into_small_primes)(INT n);
cannam@95	946 INT X(choose_radix)(INT r, INT n);
cannam@95	947 INT X(isqrt)(INT n); /* presumably an integer square root -- confirm the rounding in primes.c */
cannam@95	948 INT X(modulo)(INT a, INT n);
cannam@95	949
cannam@95	950 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
cannam@95	951
cannam@95	952 /* thresholds below which certain solvers are considered SLOW. These are guesses
cannam@95	953 believed to be conservative */
cannam@95	954 #define GENERIC_MAX_SLOW 16
cannam@95	955 #define RADER_MAX_SLOW 32
cannam@95	956 #define BLUESTEIN_MAX_SLOW 24
cannam@95	957
cannam@95	958 /*-----------------------------------------------------------------------*/
cannam@95	959 /* rader.c: */
cannam@95	960 typedef struct rader_tls rader_tl; /* opaque here: struct rader_tls is not defined in this view */
cannam@95	961
cannam@95	962 void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl); /* tl points at the caller's list pointer so the callee can update it; (k1, k2, k3) presumably form the lookup key for W -- confirm in rader.c */
cannam@95	963 R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t); /* returns an R pointer; the not-found result is not visible here */
cannam@95	964 void X(rader_tl_delete)(R *W, rader_tl **tl);
cannam@95	965
cannam@95	966 /*-----------------------------------------------------------------------*/
cannam@95	967 /* copy/transposition routines */
cannam@95	968
cannam@95	969 /* lower bound to the cache size, for tiled routines */
cannam@95	970 #define CACHESIZE 8192 /* unit (bytes vs. elements) is not stated here -- confirm where it is used */
cannam@95	971
cannam@95	972 INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache); /* presumably derives a tile size from CACHESIZE so that the given number of tiles fit -- confirm in the definition */
cannam@95	973
cannam@95	974 void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz, /* the callback takes the same four bound names; presumably it is invoked once per tile of at most tilesz -- confirm in the definition */
cannam@95	975 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
cannam@95	976 void *args); /* untyped pointer, matching the callback's last parameter */
cannam@95	977 void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl); /* I/O are R buffers; is0/os0 presumably input/output strides and vl a vector length -- confirm in the definitions */
cannam@95	978 void X(cpy2d)(R *I, R *O,
cannam@95	979 INT n0, INT is0, INT os0,
cannam@95	980 INT n1, INT is1, INT os1,
cannam@95	981 INT vl);
cannam@95	982 void X(cpy2d_ci)(R *I, R *O, /* same parameter list as cpy2d */
cannam@95	983 INT n0, INT is0, INT os0,
cannam@95	984 INT n1, INT is1, INT os1,
cannam@95	985 INT vl);
cannam@95	986 void X(cpy2d_co)(R *I, R *O, /* same parameter list as cpy2d */
cannam@95	987 INT n0, INT is0, INT os0,
cannam@95	988 INT n1, INT is1, INT os1,
cannam@95	989 INT vl);
cannam@95	990 void X(cpy2d_tiled)(R *I, R *O, /* same parameter list as cpy2d */
cannam@95	991 INT n0, INT is0, INT os0,
cannam@95	992 INT n1, INT is1, INT os1,
cannam@95	993 INT vl);
cannam@95	994 void X(cpy2d_tiledbuf)(R *I, R *O, /* same parameter list as cpy2d */
cannam@95	995 INT n0, INT is0, INT os0,
cannam@95	996 INT n1, INT is1, INT os1,
cannam@95	997 INT vl);
cannam@95	998 void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1, /* two input and two output buffers; no vl parameter */
cannam@95	999 INT n0, INT is0, INT os0,
cannam@95	1000 INT n1, INT is1, INT os1);
cannam@95	1001 void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1, /* same parameter list as cpy2d_pair */
cannam@95	1002 INT n0, INT is0, INT os0,
cannam@95	1003 INT n1, INT is1, INT os1);
cannam@95	1004 void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1, /* same parameter list as cpy2d_pair */
cannam@95	1005 INT n0, INT is0, INT os0,
cannam@95	1006 INT n1, INT is1, INT os1);
cannam@95	1007
cannam@95	1008 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl); /* single buffer argument, so presumably in place; s0/s1 presumably the two strides -- confirm in the definition */
cannam@95	1009 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl); /* same parameter list as transpose */
cannam@95	1010 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl); /* same parameter list as transpose */
cannam@95	1011
cannam@95	1012 typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl); /* pointer type matching the parameter list of the transpose* prototypes */
cannam@95	1013 typedef void (*cpy2d_func)(R *I, R *O, /* pointer type matching the cpy2d variants that take a vl parameter */
cannam@95	1014 INT n0, INT is0, INT os0,
cannam@95	1015 INT n1, INT is1, INT os1,
cannam@95	1016 INT vl);
cannam@95	1017
cannam@95	1018 /*-----------------------------------------------------------------------*/
cannam@95	1019 /* misc stuff */
cannam@95	1020 void X(null_awake)(plan *ego, enum wakefulness wakefulness); /* per its name, presumably an awake routine that does nothing -- confirm in the definition */
cannam@95	1021 double X(iestimate_cost)(const planner *, const plan *, const problem *); /* all three arguments are read-only pointers; returns a cost as double */
cannam@95	1022
cannam@95	1023 #ifdef FFTW_RANDOM_ESTIMATOR /* the seed is only declared in builds that define this macro */
cannam@95	1024 extern unsigned X(random_estimate_seed);
cannam@95	1025 #endif
cannam@95	1026
cannam@95	1027 double X(measure_execution_time)(const planner *plnr, /* returns a double; the time unit is not visible here */
cannam@95	1028 plan *pln, const problem *p); /* pln is the only non-const pointer argument */
cannam@95	1029 int X(alignment_of)(R *p); /* presumably an alignment code derived from the address p -- confirm in the definition */
cannam@95	1030 unsigned X(hash)(const char *s); /* hash of a C string */
cannam@95	1031 INT X(nbuf)(INT n, INT vl, INT maxnbuf);
cannam@95	1032 int X(nbuf_redundant)(INT n, INT vl, int which,
cannam@95	1033 const INT *maxnbuf, int nmaxnbuf); /* maxnbuf: array of nmaxnbuf INTs, judging by the parameter pair */
cannam@95	1034 INT X(bufdist)(INT n, INT vl);
cannam@95	1035 int X(toobig)(INT n); /* int result, presumably used as a truth value */
cannam@95	1036 int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
cannam@95	1037
cannam@95	1038 #if HAVE_SIMD /* SIMD builds carry a tag in the two low bits of R pointers */
cannam@95	1039 R *X(taint)(R *p, INT s);
cannam@95	1040 R *X(join_taint)(R *p1, R *p2);
cannam@95	1041 #define TAINT(p, s) X(taint)(p, s)
cannam@95	1042 #define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3)) /* clears the two low address bits */
cannam@95	1043 #define TAINTOF(p) (((uintptr_t)(p)) & 3) /* yields the two low address bits */
cannam@95	1044 #define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
cannam@95	1045 #else /* without SIMD the macros are pass-throughs and never evaluate s or p2 */
cannam@95	1046 #define TAINT(p, s) (p)
cannam@95	1047 #define UNTAINT(p) (p)
cannam@95	1048 #define TAINTOF(p) 0
cannam@95	1049 #define JOIN_TAINT(p1, p2) (p1)
cannam@95	1050 #endif
cannam@95	1051
cannam@95	1052 #ifdef FFTW_DEBUG_ALIGNMENT
cannam@95	1053 # define ASSERT_ALIGNED_DOUBLE { /* checks that a local double sits at an address that is a multiple of 8 */ \
cannam@95	1054 double __foo; \
cannam@95	1055 CK(!(((uintptr_t) &__foo) & 0x7)); /* CK is defined outside this view; presumably an assertion */ \
cannam@95	1056 }
cannam@95	1057 #else
cannam@95	1058 # define ASSERT_ALIGNED_DOUBLE /* expands to nothing */
cannam@95	1059 #endif /* FFTW_DEBUG_ALIGNMENT */
cannam@95	1060
cannam@95	1061
cannam@95	1062
cannam@95	1063 /*-----------------------------------------------------------------------*/
cannam@95	1064 /* macros used in codelets to reduce source code size */
cannam@95	1065
cannam@95	1066 typedef R E; /* internal precision of codelets. */
cannam@95	1067
cannam@95	1068 #if defined(FFTW_LDOUBLE)
cannam@95	1069 # define K(x) ((E) x##L) /* pastes an L suffix onto the constant, then casts to E */
cannam@95	1070 #elif defined(FFTW_QUAD)
cannam@95	1071 # define K(x) ((E) x##Q) /* pastes a Q suffix onto the constant, then casts to E */
cannam@95	1072 #else
cannam@95	1073 # define K(x) ((E) x) /* no suffix: just the cast */
cannam@95	1074 #endif
cannam@95	1075 #define DK(name, value) const E name = K(value) /* declares a const E called `name' initialised from K(value); the caller supplies the semicolon */
cannam@95	1076
cannam@95	1077 /* FMA macros */
cannam@95	1078
cannam@95	1079 #if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER)) /* GCC on PowerPC gets the inline functions below; everything else gets the plain macros in the #else branch */
cannam@95	1080 /* The obvious expression a * b + c does not work. If both x = a * b
cannam@95	1081 + c and y = a * b - c appear in the source, gcc computes t = a * b,
cannam@95	1082 x = t + c, y = t - c, thus destroying the fma.
cannam@95	1083
cannam@95	1084 This peculiar coding seems to do the right thing on all of
cannam@95	1085 gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
cannam@95	1086 on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
cannam@95	1087 `x' for the single-assignment form).
cannam@95	1088
cannam@95	1089 However, gcc-4.0 is a formidable adversary which succeeds in
cannam@95	1090 pessimizing two fma's into one multiplication and two additions.
cannam@95	1091 It does it very early in the game---before the optimization passes
cannam@95	1092 even start. The only real workaround seems to use fake inline asm
cannam@95	1093 such as
cannam@95	1094
cannam@95	1095 asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
cannam@95	1096 return a * b + c;
cannam@95	1097
cannam@95	1098 in each of the FMA, FMS, FNMA, and FNMS functions. However, this
cannam@95	1099 does not solve the problem either, because two equal asm statements
cannam@95	1100 count as a common subexpression! One must use different fake asm
cannam@95	1101 statements:
cannam@95	1102
cannam@95	1103 in FMA:
cannam@95	1104 asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
cannam@95	1105
cannam@95	1106 in FMS:
cannam@95	1107 asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
cannam@95	1108
cannam@95	1109 etc.
cannam@95	1110
cannam@95	1111 After these changes, gcc recalcitrantly generates the fma that was
cannam@95	1112 in the source to begin with. However, the extra asm() cruft
cannam@95	1113 confuses other passes of gcc, notably the instruction scheduler.
cannam@95	1114 (Of course, one could also generate the fma directly via inline
cannam@95	1115 asm, but this confuses the scheduler even more.)
cannam@95	1116
cannam@95	1117 Steven and I have submitted more than one bug report to the gcc
cannam@95	1118 mailing list over the past few years, to no effect. Thus, I give
cannam@95	1119 up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
cannam@95	1120 out before touching this crap again.
cannam@95	1121 */
cannam@95	1122 static __inline__ E FMA(E a, E b, E c) /* returns a * b + c */
cannam@95	1123 {
cannam@95	1124 E x = a * b; /* product and sum are deliberately separate statements; see the long comment above */
cannam@95	1125 x = x + c;
cannam@95	1126 return x;
cannam@95	1127 }
cannam@95	1128
cannam@95	1129 static __inline__ E FMS(E a, E b, E c) /* returns a * b - c */
cannam@95	1130 {
cannam@95	1131 E x = a * b; /* product and difference are deliberately separate statements; see the long comment above */
cannam@95	1132 x = x - c;
cannam@95	1133 return x;
cannam@95	1134 }
cannam@95	1135
cannam@95	1136 static __inline__ E FNMA(E a, E b, E c) /* returns -(a * b + c) */
cannam@95	1137 {
cannam@95	1138 E x = a * b; /* product first, then negated sum, as separate statements; see the long comment above */
cannam@95	1139 x = - (x + c);
cannam@95	1140 return x;
cannam@95	1141 }
cannam@95	1142
cannam@95	1143 static __inline__ E FNMS(E a, E b, E c) /* returns -(a * b - c), i.e. c - a * b */
cannam@95	1144 {
cannam@95	1145 E x = a * b; /* product first, then negated difference, as separate statements; see the long comment above */
cannam@95	1146 x = - (x - c);
cannam@95	1147 return x;
cannam@95	1148 }
cannam@95	1149 #else
cannam@95	1150 #define FMA(a, b, c) (((a) * (b)) + (c)) /* a*b + c */
cannam@95	1151 #define FMS(a, b, c) (((a) * (b)) - (c)) /* a*b - c */
cannam@95	1152 #define FNMA(a, b, c) (- (((a) * (b)) + (c))) /* -(a*b + c) */
cannam@95	1153 #define FNMS(a, b, c) ((c) - ((a) * (b))) /* c - a*b, written without the outer negation used by the inline version */
cannam@95	1154 #endif
cannam@95	1155
cannam@95	1156 #ifdef __cplusplus
cannam@95	1157 } /* extern "C" */
cannam@95	1158 #endif /* __cplusplus */
cannam@95	1159
cannam@95	1160 #endif /* __IFFTW_H__ */

Mercurial > hg > sv-dependency-builds

annotate src/fftw-3.3.3/kernel/ifftw.h @ 169:223a55898ab9 tip default